In [1]:
from os import path
import csv
import sys
import os
sys.path.insert(0, os.path.abspath('../..'))
import numpy as np

# Frequency table

### The FrequencyTable class counts how often each element occurs. For example, suppose we have an array of integers and would like to find the frequency of each value.

### By creating a "FrequencyTable", we can get preliminary statistics by printing either the table itself or its summary:

In [2]:
from probability.empirical_distributions import FrequencyTable

# Raw sample: 200 integer observations, each a digit between 0 and 9.
samples = [2, 1, 9, 5, 9, 3, 7, 4, 0, 7, 8, 8, 5, 7, 4, 4, 8, 6, 3, 1, 6, 4,
       1, 2, 5, 5, 4, 2, 3, 3, 3, 6, 2, 6, 2, 8, 5, 5, 4, 6, 6, 6, 5, 2,
       3, 3, 2, 7, 7, 8, 2, 7, 0, 5, 4, 5, 0, 6, 3, 9, 4, 1, 4, 8, 0, 6,
       5, 0, 0, 9, 2, 4, 0, 4, 3, 5, 3, 0, 2, 6, 5, 9, 6, 2, 3, 0, 4, 2,
       0, 0, 8, 3, 7, 0, 2, 1, 4, 3, 6, 2, 5, 6, 1, 6, 5, 4, 4, 9, 3, 5,
       8, 6, 3, 7, 8, 8, 1, 0, 8, 2, 4, 1, 4, 1, 1, 2, 1, 5, 3, 3, 7, 9,
       5, 8, 0, 1, 5, 7, 1, 1, 3, 3, 0, 3, 4, 4, 9, 8, 7, 0, 4, 5, 2, 2,
       7, 6, 6, 9, 0, 7, 6, 6, 2, 0, 8, 3, 8, 0, 8, 7, 7, 5, 8, 0, 6, 6,
       6, 6, 1, 6, 3, 8, 0, 2, 2, 9, 1, 9, 4, 1, 6, 9, 2, 4, 6, 3, 5, 8,
       4, 3]

# Build the frequency table from the raw sample; no name is given, so the
# random variable gets its default name.
f_table = FrequencyTable(samples)
# Printing the table itself gives a one-line overview (variable name and total).
print(f_table)
# summary() reports the same information spread over several lines.
print(f_table.summary())

Frequency table (rv:'X1', total:200)
Frequency table 
random variable:'X1'
total:200


### It can also generate a table of frequencies through its 'to_table' method:

In [3]:
# Build the plain-text (Markdown source) form of the table, then write it to
# stdout so the raw table markup is visible.
raw_table = f_table.to_table()
print(raw_table)

|X1       |frequency|
|---------|---------|
|        0|21.0     |
|        1|17.0     |
|        2|22.0     |
|        3|24.0     |
|        4|23.0     |
|        5|21.0     |
|        6|26.0     |
|        7|15.0     |
|        8|19.0     |
|        9|12.0     |
|**total**|200.0      |


### If you'd like, use Jupyter's Markdown display to turn it into a properly rendered table. Note that here we request the probabilities rather than the raw frequencies:

In [4]:
from IPython.display import Markdown

# Request probabilities instead of raw counts, then hand the Markdown text to
# Jupyter so it is rendered as a formatted table. The Markdown object must stay
# the last expression of the cell to be displayed.
probability_table = f_table.to_table(normalised=True)
Markdown(probability_table)

|X1       |probability|
|---------|-----------|
|        0|0.105      |
|        1|0.085      |
|        2|0.11       |
|        3|0.12       |
|        4|0.115      |
|        5|0.105      |
|        6|0.13       |
|        7|0.075      |
|        8|0.095      |
|        9|0.06       |
|**total**|1.0        |

### To show the top four, we can use the 'most_common' method and turn the result into a FrequencyTable again:

In [5]:
# most_common(4) yields (value, frequency) pairs for the four most frequent
# values; dict() turns those pairs directly into a mapping, so no identity
# comprehension is needed.
r = dict(f_table.most_common(4))
# Rebuild a FrequencyTable from that mapping and render the raw frequencies.
Markdown(FrequencyTable(r).to_table(normalised=False))


|X1       |frequency|
|---------|---------|
|        2|22.0     |
|        3|24.0     |
|        4|23.0     |
|        6|26.0     |
|**total**|95.0       |

### The random variable's default name is "X1". The 'FrequencyTable' constructor accepts the random variable name as an argument. You can also retrieve the levels of the random variable (in our example, the digits zero to nine):

In [6]:
# Build the table again, this time naming the random variable explicitly.
f_table = FrequencyTable(samples, name="digits")

# Describe the random variable itself: its name, its levels and its details.
print(
    f"RV name: {f_table.name}",
    f"RV levels: {list(f_table.keys())}",
    f"RV details: {f_table.discrete_rv}",
    sep="\n",
)

# Query a single level ('4'), followed by the overall number of observations.
print(
    f"frequency of '4': {f_table.frequency(4)}",
    f"probability of '4': {f_table.probability(4)}",
    f"total: {f_table.total}",
    sep="\n",
)

RV name: digits
RV levels: [2, 1, 9, 5, 3, 7, 4, 0, 8, 6]
RV details: 'digits'
frequency of '4': 23
probability of '4': 0.115
total: 200


## Let us try a different sample. This time, we use a long text and find the ten most common characters:

In [7]:
# Sample passage used as the raw input for the text-based frequency examples.
# The paragraphs are separated by blank lines, so the string contains newline
# characters as well as spaces.
long_text= """The number of daily reported cases has been rising steadily and some of that has been put down to an increase in the number of people being tested.

Put simply, the more you test the more new cases you will find. But the jump of more than one thousand in a day is a significant new spike.

The health secretary says the government is concerned and has renewed official calls for more vigilance on social distancing.

What Matt Hancock and health officials are worried about is that the UK might follow the same path as France and Spain, where increases in infections amongst younger adults led after a few weeks to higher numbers of admissions to hospitals for older and more vulnerable patients.

The number of people seriously ill in hospital with Covid-19 has fallen and there were just two new daily reported deaths.

Medical leaders and ministers can only hope that the spread of the virus amongst younger people does not get passed on to the elderly and those with underlying health problems."""

# A string is iterated character by character, so passing the text directly
# counts individual characters (spaces included).
freq_chars = FrequencyTable(long_text, name="character")
top_ten = freq_chars.most_common(10)
print("char \t freq \t prob")
# most_common yields (character, frequency) pairs; no positional index is
# needed, so the pairs are iterated directly.
for character, frequency in top_ten:
    probability = freq_chars.probability(character)
    print(f"{character} \t {frequency} \t {probability:.3f}")

char 	 freq 	 prob
  	 168 	 0.168
e 	 106 	 0.106
a 	 67 	 0.067
t 	 64 	 0.064
n 	 62 	 0.062
o 	 59 	 0.059
s 	 59 	 0.059
i 	 52 	 0.052
r 	 44 	 0.044
h 	 42 	 0.042


## Or, if we need the frequency of words, we first convert the text string to a list of words:

In [8]:
# NOTE(review): split(' ') breaks only on single spaces, so punctuation and the
# newlines between paragraphs stay attached to the neighbouring tokens. It is
# kept as is because changing the tokenisation would change the counts.
freq_words = FrequencyTable(long_text.split(' '), name="word")
top_ten = freq_words.most_common(10)
# NOTE(review): the header still says "char" although the rows are words; the
# string is left unchanged so a re-run matches the recorded output.
print("char \t freq \t prob")
# most_common yields (word, frequency) pairs; no positional index is needed.
for word, frequency in top_ten:
    probability = freq_words.probability(word)
    print(f"{word} \t {frequency} \t {probability:.3f}")

char 	 freq 	 prob
the 	 10 	 0.059
and 	 8 	 0.047
of 	 7 	 0.041
more 	 5 	 0.030
has 	 4 	 0.024
to 	 4 	 0.024
in 	 4 	 0.024
number 	 3 	 0.018
that 	 3 	 0.018
people 	 3 	 0.018


### We load the 'audiology' file using NumPy's [`genfromtxt`](https://numpy.org/doc/stable/reference/generated/numpy.genfromtxt.html) function.

### Note that 'genfromtxt' may hand the raw fields to the converter as byte strings, so in order to load the selected column (index 3) properly, we use a converter that decodes byte strings as UTF-8. Because the converter returns text, NumPy treats the values as unicode strings (dtype "U"). The converter also handles the missing values ('?') and turns them into 'NA'.

### After that, turning the data into a "FrequencyTable" works as in the previous examples:

In [9]:
def _decode_field(raw):
    """Return one raw field as text, mapping the missing marker '?' to 'NA'.

    Depending on the NumPy version and its ``encoding`` setting, ``genfromtxt``
    passes a converter either ``bytes`` or ``str``. Calling ``.decode``
    unconditionally raises ``AttributeError`` for ``str`` input, so bytes are
    decoded as UTF-8 and text is passed through unchanged.
    """
    text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
    return 'NA' if text == '?' else text


fname = 'data/audiology.standardized.data'
# Read only the column at index 3 of the comma-separated file and route every
# field of that column through the converter above.
data = np.genfromtxt(fname,
                     usecols=[3],
                     delimiter=",",
                     converters={3: _decode_field})
# Count the distinct values of the loaded column and render their probabilities.
test = FrequencyTable(data, name="count")
Markdown(test.to_table(normalised=True))

|count    |probability|
|---------|-----------|
|       NA|0.02       |
|   absent|0.25       |
| elevated|0.145      |
|   normal|0.585      |
|**total**|1.0        |