### Character Counts from Books

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import string


In [20]:
# Languages to analyse, in the order their results are produced.
languages = [
    'finnish',
    'french',
    'italian',
    'dzongkha',
    'greek',
    'polish',
    'telugu',
    'russian',
]

# One input path per language, in the same order as `languages`.
paths = [f'../data/{name}.txt' for name in languages]

In [26]:
def make_character_pair_df(path):
    """Count adjacent character pairs (bigrams) in the text file at ``path``.

    The text is cleaned before counting: ASCII digits are removed, newlines
    become spaces, ASCII and typographic punctuation is stripped, and the
    result is lower-cased.  When ``'dzongkha'`` or ``'greek'`` appears in
    ``path``, runs of Latin letters are removed as well.

    Parameters
    ----------
    path : str
        Location of the text file.  It is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per distinct character pair.  Column ``'index'`` holds the
        two-character pair (with spaces shown as ``_``) and column ``0``
        holds how many times that pair occurs.
    """
    # Decode explicitly: the corpora contain non-Latin scripts and the
    # platform default encoding is not UTF-8 on every system.
    with open(path, 'r', encoding='utf-8') as file:
        data = file.read()

    clean_data = re.sub(r'[0-9]+', '', data)
    clean_data = clean_data.replace("\n", " ")

    # string.punctuation only covers ASCII; add the typographic quotes,
    # guillemets and em dash as well.
    extended_punct = string.punctuation + "‘«“’—”»"
    clean_data = clean_data.translate(str.maketrans('', '', extended_punct))

    if 'dzongkha' in path or 'greek' in path:
        # Remove runs of Latin letters together with the whitespace in
        # front of them, so only the non-Latin text is counted.
        clean_data = re.sub(r'\s*[A-Za-z]+\b', '', clean_data)
        print('cleaning non english')
    clean_data = clean_data.lower()

    vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 2))
    counts = vectorizer.fit_transform([clean_data])

    # Show spaces as underscores so each pair stays readable as a label.
    pairs = [pair.replace(" ", "_") for pair in vectorizer.get_feature_names_out()]

    # A single document gives a 1 x n_pairs matrix; transposing and
    # resetting the index yields one row per pair ('index' = pair, 0 = count).
    return pd.DataFrame(counts.toarray(), columns=pairs).transpose().reset_index()

In [27]:
def extract_heatmap_data(df):
    """Convert a character-pair count table into a list of heatmap entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a column ``'index'`` of two-character strings and a
        column ``0`` of counts.

    Returns
    -------
    list[dict]
        One dict per row, in row order: ``"x"`` is the first character of
        the pair, ``"y"`` the second, and ``"color"`` the count as a plain
        ``int``.
    """
    # Walk both columns once instead of doing three .iloc lookups per row.
    # int() turns NumPy integers into plain ints, which json can serialise.
    return [
        {"x": pair[0], "y": pair[1], "color": int(count)}
        for pair, count in zip(df['index'], df[0])
    ]

In [28]:
def extract_multiple_heatmap_data(path_list):
    """Build heatmap entries for every text file in ``path_list``.

    Each path is printed as it is processed, turned into a character-pair
    count table by ``make_character_pair_df`` and converted to heatmap
    entries by ``extract_heatmap_data``.

    Returns a list containing one list of entries per path, in the order
    the paths were given.
    """
    per_file_entries = []
    for path in path_list:
        print(path)
        per_file_entries.append(extract_heatmap_data(make_character_pair_df(path)))
    return per_file_entries

In [29]:
# Process every language file; the result holds one list of heatmap entries
# per path, in the same order as `paths`.
super_data = extract_multiple_heatmap_data(paths)

../data/finnish.txt
../data/french.txt
../data/italian.txt
../data/dzongkha.txt
cleaning non english
../data/greek.txt
cleaning non english
../data/polish.txt
../data/telugu.txt
../data/russian.txt


In [30]:
import json

# Serialise all heatmap entries to a JSON string with a one-space indent.
json_object = json.dumps(super_data, indent=1)

# Write the JSON text to the data directory, replacing any existing file.
with open("../data/big_heatmap.json", "w") as outfile:
    outfile.write(json_object)