https://homel.vsb.cz/~vas218/acs.html

In [11]:
import os
from collections import Counter
import numpy as np
import math
import pandas as pd

In [12]:
def load_data(path):
    content = ''
    with open(path, 'r', encoding='windows-1252', errors='replace') as f:
        content = f.read()
    return content

In [13]:
def calc_freq(content):
    c = Counter(list(content))
    return c

In [14]:
def calc_p(counter, n):
    counter = dict(counter)
    res = {}
    for k, v in counter.items():
        res[k] = v / n  
    return res

In [15]:
def get_n(counter):
    counter = dict(counter)
    return np.sum(list(counter.values()))

In [16]:
def calc_H(p):
    H = 0
    for k, v in p.items():
        #Shannon equation!
        H += p[k] * math.log2(p[k])
    return -H

In [22]:
def run_with_data(data):
    counter = calc_freq(data)
    n = get_n(counter)
    p = calc_p(counter, n)
    H = calc_H(p)
    _H = n * H
    res = {
        "alphabet_size": len(list(counter.keys())),
        "H": H,
        "approximate_size": ("%.3f" % _H),
        "chars": n,
        "original_size": n * 8,
        "ratio": (n * 8) / _H,
        "frequencies_of_symbols": dict(counter)
    }
    return res

In [23]:
def run(path):
    data = load_data(path)
    return run_with_data(data)

In [24]:
directory = f'..\..\datasets'

In [25]:
res = {}

for dir in os.listdir(directory):
    for nested_dir in os.listdir(os.path.sep.join([directory, dir])):
        full_path = os.path.sep.join([directory, dir, nested_dir])
        H = run(full_path)
        res[full_path] = H

In [26]:
df = pd.DataFrame.from_dict(res, orient='index')

In [27]:
df.head()

Unnamed: 0,alphabet_size,H,approximate_size,chars,original_size,ratio,frequencies_of_symbols
..\..\datasets\dna\dna.50MB,16,1.981943,103910918.17,52428800,419430400,4.036442,"{'G': 11008713, 'A': 15203237, 'T': 15190093, ..."
..\..\datasets\english\english.50MB,175,4.528864,237442921.024,52428800,419430400,1.766447,"{' ': 1007651, 'T': 138587, 'h': 2750886, 'i':..."
..\..\datasets\proteins\proteins.50MB,25,4.195469,219963384.304,52428800,419430400,1.906819,"{'M': 1327185, 'G': 3614402, 'K': 2655712, 'S'..."
..\..\datasets\sources\sources.50MB,223,5.536963,290296322.774,52428800,419430400,1.444835,"{'/': 418112, '*': 875368, ' ': 1763309, ' ': ..."


In [40]:
[print("%.16f" % x) for x in df["approximate_size"].values];

72025559.9586011320352554
164582891.2518661618232727
152466999.6567884385585785
201218077.6573725938796997


[None, None, None, None]

In [28]:
run_with_data("aaabbbccc")

{'alphabet_size': 3,
 'H': 1.584962500721156,
 'approximate_size': '14.265',
 'chars': 9,
 'original_size': 72,
 'ratio': 5.04743802857166,
 'frequencies_of_symbols': {'a': 3, 'b': 3, 'c': 3}}