# 23.2. Computing Entropy (10 points)
- Collection of files to work with: Pizza and Chili Corpus
- Compute the k-th order empirical entropy $H_k$ of each file in the dataset for k = 0, 1, 2, 3, 4.
- Compute the approximate size of the compressed file if each symbol is encoded with $H_k$ bits, i.e. (file length) × $H_k$.

In [1]:
import os
from collections import Counter
import numpy as np
import math
import pandas as pd
import plotly.express as px
import time

In [2]:
def load_data(path):
    """Read the file at ``path`` and return its content as a string.

    The file is decoded so that every byte becomes exactly one character:

    * ``latin-1`` maps each of the 256 byte values to a distinct code point,
      so no two different bytes are merged into the same symbol (with
      ``windows-1252`` plus ``errors='replace'`` the undefined bytes 0x81,
      0x8D, 0x8F, 0x90 and 0x9D all became U+FFFD).
    * ``newline=''`` disables universal-newline translation, so ``\\r\\n``
      and ``\\r`` are kept as-is and ``len(result)`` equals the file size in
      bytes.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded file content, one character per input byte.
    """
    with open(path, 'r', encoding='latin-1', newline='') as f:
        return f.read()

In [3]:
# Root folder of the dataset collections, relative to the working directory.
# Raw string: the backslashes are literal path separators, not escape
# sequences (the previous f-string had no placeholders and relied on the
# invalid escapes "\." and "\d" being passed through unchanged).
directory = r'..\..\datasets'

In [4]:
def get_alphabet(data):
    """Return the distinct symbols of ``data`` in order of first appearance.

    Args:
        data: Iterable of hashable symbols (e.g. a string).

    Returns:
        A list containing each distinct symbol once.
    """
    # dict.fromkeys keeps the first-seen order of the keys while dropping
    # duplicates.
    unique_symbols = dict.fromkeys(data)
    return list(unique_symbols)

In [5]:
def get_k_words_with_freq(text, k):
    """Count every length-``k`` substring (k-gram) of ``text``.

    A window of length ``k`` is slid over ``text`` one position at a time, so
    overlapping occurrences are all counted. For ``k == 0`` the only key is
    the empty string, counted ``len(text) + 1`` times. When ``k`` exceeds
    ``len(text)`` the result is empty.

    Args:
        text: The string to scan.
        k: Window length; must be non-negative.

    Returns:
        A tuple ``(counts, elapsed)`` where ``counts`` maps each k-gram to
        its number of occurrences and ``elapsed`` is the counting time in
        seconds.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # so short runs do not report a duration of 0.0.
    tic = time.perf_counter()

    res = {}
    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        res[window] = res.get(window, 0) + 1

    toc = time.perf_counter()

    return (res, toc - tic)

In [6]:
def calc_with_zero(nom, den):
    """Divide ``nom`` by ``den``, returning 0 when ``den`` equals 0.

    Args:
        nom: Numerator.
        den: Denominator.

    Returns:
        ``nom / den``, or ``0`` if ``den == 0``.
    """
    return nom / den if den != 0 else 0

In [7]:
def get_sub_k_with_freq(data, keys, k):
    """Count the (k+1)-grams of ``data``, grouped by their length-``k`` prefix.

    Args:
        data: The string to scan.
        keys: Iterable of k-grams used to pre-create the groups. Every
            length-``k`` prefix of a (k+1)-gram occurring in ``data`` must be
            present, otherwise a ``KeyError`` is raised. Keys that are never
            followed by another symbol keep an empty group.
        k: Prefix length; must be non-negative.

    Returns:
        A tuple ``(groups, elapsed)`` where ``groups`` maps each key to a
        dict ``{(k+1)-gram: count}`` and ``elapsed`` is the counting time in
        seconds.

    Raises:
        ValueError: If ``k`` is negative.
        KeyError: If a prefix found in ``data`` is missing from ``keys``.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # perf_counter is monotonic and high-resolution, unlike time.time().
    tic = time.perf_counter()

    window_len = k + 1

    # The loop variable is deliberately not called ``k`` so it cannot be
    # confused with the ``k`` parameter.
    res = {key: {} for key in keys}

    for start in range(len(data) - window_len + 1):
        window = data[start:start + window_len]
        # The inner dict is mutated in place; no write-back into res needed.
        group = res[window[:k]]
        group[window] = group.get(window, 0) + 1

    toc = time.perf_counter()

    return res, toc - tic

In [8]:
def calculate_entropy_for_state(state_k_dic, state_k_plus_1_dic):
    """Compute the k-th order empirical entropy from k-gram statistics.

    For every state ``w`` (a k-gram) and every observed continuation ``wc``
    the term ``(n_wc / N) * log2(n_wc / n_w_followed)`` is accumulated, where
    ``N`` is the total number of k-gram occurrences and ``n_w_followed`` is
    the number of occurrences of ``w`` that are followed by a symbol. Using
    ``n_w_followed`` (rather than the raw count of ``w``) keeps the
    conditional probabilities of each state summing to 1 even for the k-gram
    at the very end of the text, which has no successor.

    Args:
        state_k_dic: Mapping ``{k-gram: count}``.
        state_k_plus_1_dic: Mapping ``{k-gram: {(k+1)-gram: count}}``; must
            contain an entry for every key of ``state_k_dic``.

    Returns:
        A tuple ``(entropy, elapsed, per_state)``: the entropy in bits per
        symbol, the computation time in seconds, and a dict with the
        (non-positive) contribution of each state before negation.

    Raises:
        KeyError: If a state is missing from ``state_k_plus_1_dic``.
    """
    # perf_counter is monotonic and high-resolution, unlike time.time().
    tic = time.perf_counter()

    # Built-in sum keeps exact Python integers (no fixed-width numpy ints).
    states_n = sum(state_k_dic.values())

    res = {}
    for state in state_k_dic:
        successors = state_k_plus_1_dic[state]
        followed_n = sum(successors.values())

        suma = 0
        for successor_freq in successors.values():
            # Guard kept from the original helper: weight is 0 if N is 0.
            weight = successor_freq / states_n if states_n != 0 else 0
            suma += weight * math.log2(successor_freq / followed_n)
        res[state] = suma

    # fsum avoids accumulating rounding error over many tiny contributions.
    entropy = -math.fsum(res.values())

    toc = time.perf_counter()
    return entropy, toc - tic, res

In [9]:
def run(path, k):
    """Load the file at ``path`` and build its order-``k`` entropy report.

    Args:
        path: Path of the file to analyse.
        k: Context length passed to the k-gram counters.

    Returns:
        A dict with the entropy value ("finate"), the order ``k``, the
        input ``path``, the entropy multiplied by the number of characters
        read ("aproximate_size"), that number of characters multiplied by 8
        ("original_size"), the three stage timings, and the per-state
        entropy contributions.
    """
    text = load_data(path)
    symbol_count = len(text)

    # Stage 1: occurrences of every k-gram.
    k_counts, k_elapsed = get_k_words_with_freq(text, k)

    # Stage 2: occurrences of every (k+1)-gram, grouped by its k-gram prefix.
    successor_counts, successor_elapsed = get_sub_k_with_freq(
        text, list(k_counts), k
    )

    # Stage 3: combine both tables into the entropy value.
    entropy, entropy_elapsed, per_state = calculate_entropy_for_state(
        k_counts, successor_counts
    )

    report = {}
    report["finate"] = entropy
    report["k"] = k
    report["path"] = path
    report["aproximate_size"] = entropy * symbol_count
    report["original_size"] = symbol_count * 8
    report["k_freq_time"] = k_elapsed
    report["k_plus_1_freq_time"] = successor_elapsed
    report["calculation_time"] = entropy_elapsed
    report["state_entropies"] = per_state
    return report

In [10]:
# Collect the path of every entry found one level below each sub-folder of
# `directory` (i.e. directory/<collection>/<file>).
all_directories = []

# sorted(): os.listdir returns names in arbitrary order, and later cells
# select inputs by position, so a fixed order keeps runs reproducible.
# The loop variables avoid the name `dir`, which would shadow the builtin
# for the rest of the session.
for collection_name in sorted(os.listdir(directory)):
    collection_path = os.path.join(directory, collection_name)
    for file_name in sorted(os.listdir(collection_path)):
        all_directories.append(os.path.join(collection_path, file_name))

In [11]:
# Preview the second collected path (the slice keeps it wrapped in a list).
all_directories[1:2]

['..\\..\\datasets\\english\\english.50MB']

In [12]:
# Run the entropy computation for the first two collected files and for
# every order k in range(from_k, to_k); each report is stored in `res`
# under a running integer key.
# NOTE(review): the task statement at the top of this notebook asks for
# k = 0..4, whereas this range yields k = 1..5 - confirm which is intended.
from_k, to_k = 1, 6
res, counter = {}, 0

for path in all_directories[:2]:
    result = None
    for k in range(from_k, to_k):
        # Progress line so long runs show which file/order is in flight.
        print(f'{path} - {k}')
        full_path = path
        result = res[counter] = run(full_path, k)
        counter += 1

..\..\datasets\dna\dna.50MB - 1
..\..\datasets\dna\dna.50MB - 2
..\..\datasets\dna\dna.50MB - 3
..\..\datasets\dna\dna.50MB - 4
..\..\datasets\dna\dna.50MB - 5
..\..\datasets\english\english.50MB - 1
..\..\datasets\english\english.50MB - 2
..\..\datasets\english\english.50MB - 3
..\..\datasets\english\english.50MB - 4
..\..\datasets\english\english.50MB - 5


In [15]:
# One row per stored run() report; the integer keys of `res` become the index.
result_dataframe = pd.DataFrame.from_dict(res, orient="index")

In [18]:
# Display the results table.
result_dataframe

Unnamed: 0,finate,k,path,aproximate_size,original_size,k_freq_time,k_plus_1_freq_time,calculation_time,state_entropies
0,1.934922,1,..\..\datasets\dna\dna.50MB,101445600.0,419430400,18.526829,33.898585,0.0,"{'G': -0.4182296166786281, 'A': -0.57031832158..."
1,1.924513,2,..\..\datasets\dna\dna.50MB,100899900.0,419430400,24.53994,37.697706,0.000999,"{'GA': -0.11694824052442548, 'AT': -0.14553541..."
2,1.919584,3,..\..\datasets\dna\dna.50MB,100641500.0,419430400,24.279142,39.537709,0.001987,"{'GAT': -0.02592923929629321, 'ATC': -0.023003..."
3,1.912851,4,..\..\datasets\dna\dna.50MB,100288500.0,419430400,24.733479,43.666369,0.004957,"{'GATC': -0.004477112476998099, 'ATCA': -0.009..."
4,1.903402,5,..\..\datasets\dna\dna.50MB,99793060.0,419430400,26.128554,38.320786,0.010001,"{'GATCA': -0.0017566910040682701, 'ATCAA': -0...."
5,3.606487,1,..\..\datasets\english\english.50MB,189083800.0,419430400,18.76726,32.866286,0.005,"{' ': -0.0956065483004109, 'T': -0.00488728286..."
6,2.921758,2,..\..\datasets\english\english.50MB,153184300.0,419430400,23.815711,42.093274,0.070873,"{' ': -0.01029982260502319, ' T': -0.00054962..."
7,2.385559,3,..\..\datasets\english\english.50MB,125072000.0,419430400,28.463911,49.414789,0.402959,"{' ': -0.001002157849502695, ' T': -0.00021..."
8,2.01251,4,..\..\datasets\english\english.50MB,105513500.0,419430400,31.833898,56.284305,1.494567,"{' ': -0.0005175874720459757, ' T': -3.70..."
9,1.763829,5,..\..\datasets\english\english.50MB,92475460.0,419430400,33.28804,53.137946,3.668184,"{' ': -0.0002271166314822888, ' T': -2...."


# Graphs

In [17]:
# Line chart of the "finate" column (the entropy value stored by run())
# against the order k, with one line per input path.
fig = px.line(result_dataframe, x="k", y="finate", color='path')
fig.show()