In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%pip install decorator==5.0.9

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

Note: you may need to restart the kernel to use updated packages.


In [3]:
import imp
import sys

sys.path.append('../src/')

import text_cleanup.text_cleanup as thesisCleanUp
import preprocessing.text_preprocessing as thesisTextPreprocessing
import data.reader as thesisDataReader
import utils.utils as thesisUtils
import features.tf_idf.n_gram as thesisTfIdfNgramFeatures
import similarities.cosine as thesisCosineSimilarity
import similarities.bm25 as thesisBM25Similarity
import features.count_vectorizer.n_gram as thesisCountVectorizerNgramFeatures

imp.reload(thesisCleanUp)
imp.reload(thesisTextPreprocessing)
imp.reload(thesisDataReader)
imp.reload(thesisUtils)
imp.reload(thesisCosineSimilarity)
imp.reload(thesisTfIdfNgramFeatures)
imp.reload(thesisBM25Similarity)
imp.reload(thesisCountVectorizerNgramFeatures)

<module 'features.count_vectorizer.n_gram' from '../src/features/count_vectorizer/n_gram.py'>

In [4]:
zwickau_corpus = thesisDataReader.get_zwickau_corpus()
london_corpus = thesisDataReader.get_london_corpus()
breslau_corpus = thesisDataReader.get_breslau_corpus()

In [5]:
statistics_df_zwickau_to_london = thesisCosineSimilarity.create_statistics_df(
    zwickau_corpus, 
    london_corpus, 
    'zwickau_to_london'
)
statistics_df_zwickau_to_breslau = thesisCosineSimilarity.create_statistics_df(
    zwickau_corpus, 
    breslau_corpus, 
    'zwickau_to_breslau'
)

In [6]:
statistics_df_london_to_zwickau = thesisCosineSimilarity.create_statistics_df(
    london_corpus, 
    zwickau_corpus, 
    'london_to_zwickau'
)
statistics_df_london_to_breslau = thesisCosineSimilarity.create_statistics_df(
    london_corpus, 
    breslau_corpus, 
    'london_to_breslau'
)

In [7]:
statistics_df_breslau_to_zwickau = thesisCosineSimilarity.create_statistics_df(
    breslau_corpus, 
    zwickau_corpus, 
    'breslau_to_zwickau'
)
statistics_df_breslau_to_london = thesisCosineSimilarity.create_statistics_df(
    breslau_corpus, 
    london_corpus, 
    'breslau_to_london'
)

In [8]:
zwickau_to_breslau_bm25_stats_df = thesisBM25Similarity.create_statistics_df(
    thesisCountVectorizerNgramFeatures.create_zwikau_5_gram_corpus(),
    thesisCountVectorizerNgramFeatures.create_breslau_5_gram_corpus(),
    'zwickau_to_breslau'
)
zwickau_to_london_bm25_stats_df = thesisBM25Similarity.create_statistics_df(
    thesisCountVectorizerNgramFeatures.create_zwikau_5_gram_corpus(),
    thesisCountVectorizerNgramFeatures.create_london_5_gram_corpus(),
    'zwickau_to_london'
)

In [9]:
london_to_breslau_bm25_stats_df = thesisBM25Similarity.create_statistics_df(
    thesisCountVectorizerNgramFeatures.create_london_5_gram_corpus(),
    thesisCountVectorizerNgramFeatures.create_breslau_5_gram_corpus(),
    'london_to_breslau'
)
london_to_zwickau_bm25_stats_df = thesisBM25Similarity.create_statistics_df(
    thesisCountVectorizerNgramFeatures.create_london_5_gram_corpus(),
    thesisCountVectorizerNgramFeatures.create_zwikau_5_gram_corpus(),
    'london_to_zwickau'
)

In [10]:
breslau_to_zwickau_bm25_stats_df = thesisBM25Similarity.create_statistics_df(
    thesisCountVectorizerNgramFeatures.create_breslau_5_gram_corpus(),
    thesisCountVectorizerNgramFeatures.create_zwikau_5_gram_corpus(),
    'breslau_to_zwickau'
)
breslau_to_london_bm25_stats_df = thesisBM25Similarity.create_statistics_df(
    thesisCountVectorizerNgramFeatures.create_breslau_5_gram_corpus(),
    thesisCountVectorizerNgramFeatures.create_london_5_gram_corpus(),
    'breslau_to_london'
)

In [11]:
statistics_df_combined = pd.concat([
    statistics_df_zwickau_to_london, 
    statistics_df_zwickau_to_breslau,
    statistics_df_london_to_zwickau,
    statistics_df_london_to_breslau,
    statistics_df_breslau_to_zwickau,
    statistics_df_breslau_to_london,
    zwickau_to_breslau_bm25_stats_df,
    zwickau_to_london_bm25_stats_df,
    london_to_breslau_bm25_stats_df,
    london_to_zwickau_bm25_stats_df,
    breslau_to_zwickau_bm25_stats_df,
    breslau_to_london_bm25_stats_df
])

In [12]:
statistics_df_combined

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,# of 0 similarities,p_length,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version
0,5_gram,0,inner,0.008343,0.015417,0.0,0.000000,0.002362,0.010302,0.148716,138,31,321,0.148716,53,,,,,zwickau_to_london
1,5_gram,1,inner,0.031249,0.018082,0.0,0.017713,0.028674,0.040268,0.103347,3,878,6,0.103347,1587,,,,,zwickau_to_london
2,5_gram,2,inner,0.023848,0.014911,0.0,0.013357,0.020934,0.031907,0.097735,4,377,212,0.097735,422,,,,,zwickau_to_london
3,5_gram,3,inner,0.029136,0.016467,0.0,0.017094,0.027478,0.039089,0.103975,2,725,5,0.103975,1248,,,,,zwickau_to_london
4,5_gram,4,inner,0.026172,0.016658,0.0,0.014303,0.022741,0.034621,0.107332,2,707,283,0.107332,1603,,,,,zwickau_to_london
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,bm25,66,cross,0.092881,0.078620,0.0,0.053620,0.077171,0.111674,1.000000,1,241,201,1.000000,285,,,,,breslau_to_london
138,bm25,67,cross,0.121218,0.088245,0.0,0.073623,0.102282,0.144638,1.000000,1,555,202,1.000000,446,,,,,breslau_to_london
139,bm25,68,cross,0.065310,0.068873,0.0,0.031353,0.056117,0.083235,1.000000,4,209,203,1.000000,350,,,,,breslau_to_london
140,bm25,69,cross,0.118411,0.080344,0.0,0.074913,0.105575,0.144949,1.000000,1,701,204,1.000000,462,,,,,breslau_to_london


In [13]:
statistics_df_combined_copy = statistics_df_combined.copy()

In [14]:
# zwickau_inner_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'inner') &
#     (statistics_df_combined['version'] == 'zwickau')
# ]
# zwickau_cross_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'cross') &
#     (statistics_df_combined['version'] == 'zwickau')
# ]

In [15]:
def check_inner_cross_mean(statistics_df, version_name):
    features = statistics_df.feature_name.unique()
    
    for feature in features:
        total = 0
        
        version_inner_df = statistics_df.loc[
            (statistics_df['cross/inner'] == 'inner') &
            (statistics_df['version'] == version_name) & 
            (statistics_df['feature_name'] == feature)
        ]
        version_cross_df = statistics_df.loc[
            (statistics_df['cross/inner'] == 'cross') &
            (statistics_df['version'] == version_name) & 
            (statistics_df['feature_name'] == feature)
        ]
        
        for i, d in version_cross_df.iterrows():
            cross_mean_val = d['mean']
            p_index = d['p_#']
            inner_mean_val = version_inner_df.loc[
                (version_inner_df['p_#'] == p_index) &
                (version_inner_df['feature_name'] == feature),
                'mean'
            ].values[0]
            
            cross_inner_mean_diff = cross_mean_val - inner_mean_val
            if cross_inner_mean_diff > 0:
                statistics_df.loc[
                    (statistics_df['version'] == version_name) &
                    (statistics_df['cross/inner'] == 'inner') &
                    (statistics_df['p_#'] == p_index) & 
                    (statistics_df['feature_name'] == feature),
                    'inner_mean_is_low'
                ] = True
            else:
                statistics_df.loc[
                    (statistics_df['version'] == version_name) &
                    (statistics_df['cross/inner'] == 'inner') &
                    (statistics_df['p_#'] == p_index) & 
                    (statistics_df['feature_name'] == feature),
                    'inner_mean_is_low'
                ] = False
                total += 1
            statistics_df.loc[
                (statistics_df['version'] == version_name) &
                (statistics_df['cross/inner'] == 'inner') &
                (statistics_df['p_#'] == p_index) & 
                (statistics_df['feature_name'] == feature),
                'cross_inner_mean_diff'
                ] = cross_inner_mean_diff
            
        print(f'version: {version_name}, for feature: {feature}, number of inner lowe that cross is: {total}')

In [16]:
check_inner_cross_mean(statistics_df_combined_copy, 'zwickau_to_london')
check_inner_cross_mean(statistics_df_combined_copy, 'zwickau_to_breslau')

version: zwickau_to_london, for feature: 5_gram, number of inner lowe that cross is: 22
version: zwickau_to_london, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 7
version: zwickau_to_london, for feature: bm25, number of inner lowe that cross is: 1
version: zwickau_to_breslau, for feature: 5_gram, number of inner lowe that cross is: 0
version: zwickau_to_breslau, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 1
version: zwickau_to_breslau, for feature: bm25, number of inner lowe that cross is: 0


In [17]:
check_inner_cross_mean(statistics_df_combined_copy, 'london_to_zwickau')
check_inner_cross_mean(statistics_df_combined_copy, 'london_to_breslau')

version: london_to_zwickau, for feature: 5_gram, number of inner lowe that cross is: 284
version: london_to_zwickau, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 315
version: london_to_zwickau, for feature: bm25, number of inner lowe that cross is: 7
version: london_to_breslau, for feature: 5_gram, number of inner lowe that cross is: 1
version: london_to_breslau, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 1
version: london_to_breslau, for feature: bm25, number of inner lowe that cross is: 0


In [18]:
check_inner_cross_mean(statistics_df_combined_copy, 'breslau_to_zwickau')
check_inner_cross_mean(statistics_df_combined_copy, 'breslau_to_london')

version: breslau_to_zwickau, for feature: 5_gram, number of inner lowe that cross is: 71
version: breslau_to_zwickau, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 71
version: breslau_to_zwickau, for feature: bm25, number of inner lowe that cross is: 0
version: breslau_to_london, for feature: 5_gram, number of inner lowe that cross is: 71
version: breslau_to_london, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 71
version: breslau_to_london, for feature: bm25, number of inner lowe that cross is: 0


In [19]:
statistics_df_combined_copy

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,...,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version,inner_mean_is_low,cross_inner_mean_diff
0,5_gram,0,inner,0.008343,0.015417,0.0,0.000000,0.002362,0.010302,0.148716,...,321,0.148716,53,,,,,zwickau_to_london,True,0.001215
1,5_gram,1,inner,0.031249,0.018082,0.0,0.017713,0.028674,0.040268,0.103347,...,6,0.103347,1587,,,,,zwickau_to_london,True,0.001848
2,5_gram,2,inner,0.023848,0.014911,0.0,0.013357,0.020934,0.031907,0.097735,...,212,0.097735,422,,,,,zwickau_to_london,True,0.001138
3,5_gram,3,inner,0.029136,0.016467,0.0,0.017094,0.027478,0.039089,0.103975,...,5,0.103975,1248,,,,,zwickau_to_london,True,0.002081
4,5_gram,4,inner,0.026172,0.016658,0.0,0.014303,0.022741,0.034621,0.107332,...,283,0.107332,1603,,,,,zwickau_to_london,True,0.001349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,bm25,66,cross,0.092881,0.078620,0.0,0.053620,0.077171,0.111674,1.000000,...,201,1.000000,285,,,,,breslau_to_london,,
138,bm25,67,cross,0.121218,0.088245,0.0,0.073623,0.102282,0.144638,1.000000,...,202,1.000000,446,,,,,breslau_to_london,,
139,bm25,68,cross,0.065310,0.068873,0.0,0.031353,0.056117,0.083235,1.000000,...,203,1.000000,350,,,,,breslau_to_london,,
140,bm25,69,cross,0.118411,0.080344,0.0,0.074913,0.105575,0.144949,1.000000,...,204,1.000000,462,,,,,breslau_to_london,,


In [None]:
# total = 0
# for i, d in zwickau_cross_df.iterrows():
#     cross_mean_val = d['mean']
#     p_index = d['p_#']
#     inner_mean_val = zwickau_inner_df.loc[zwickau_inner_df['p_#'] == p_index, 'mean'].values[0]
#     if cross_mean_val > inner_mean_val:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'zwickau') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = True
#     else:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'zwickau') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = False
#         total += 1
# print(f'number of inner mean lowwer than cross is: {total}')

In [None]:
# london_inner_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'inner') &
#     (statistics_df_combined['version'] == 'london')
# ]
# london_cross_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'cross') &
#     (statistics_df_combined['version'] == 'london')
# ]

In [None]:
# total = 0
# for i, d in london_cross_df.iterrows():
#     cross_mean_val = d['mean']
#     p_index = d['p_#']
#     inner_mean_val = london_inner_df.loc[london_inner_df['p_#'] == p_index, 'mean'].values[0]
#     if cross_mean_val > inner_mean_val:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'london') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = True
#     else:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'london') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = False
#         total += 1
# print(f'number of inner mean lowwer than cross is: {total}')

In [20]:
statistics_df_combined_copy.to_csv('../computed_data/text_to_text/statistics_df.csv')

In [None]:
statistics_df_zwickau.query("`cross/inner` == 'inner'")

In [None]:
statistics_df_zwickau.loc[(statistics_df_zwickau['p_#'] == 0) & (statistics_df_zwickau['cross/inner'] == 'inner')]

In [None]:
statistics_df_combined_copy[
    (statistics_df_combined_copy['feature_name'] == '5_gram') |
    (statistics_df_combined_copy['feature_name'] == 'count_vectorizer_5_gram')
].sort_values('p_#')

In [None]:
statistics_df_combined_copy[
    (statistics_df_combined_copy['feature_name'] == '5_gram') |
    (statistics_df_combined_copy['feature_name'] == 'count_vectorizer_5_gram')
].sort_values('p_#').to_csv('../computed_data/text_to_text/5_gram_cv_itidf.csv')

In [None]:
statistics_df[statistics_df['feature_name'] == '5_gram'].sort_values('p_#').to_csv('../computed_data/text_to_text/5_gram_zwickau_stats_df.csv')

In [None]:
statistics_df[statistics_df['feature_name'] == '5_gram'].set_index(['p_#', 'cross/inner', ]).sort_values('p_#')

In [41]:
breslau_to_london_copy_df = statistics_df_combined_copy[
    (statistics_df_combined_copy['feature_name'] == '5_gram') &
    (statistics_df_combined_copy['cross/inner'] == 'inner') &
    (statistics_df_combined_copy['version'] == 'breslau_to_london')
]

In [43]:
breslau_to_london_copy_df.sort_values('mean', ascending=False)

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,...,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version,inner_mean_is_low,cross_inner_mean_diff
62,5_gram,62,inner,0.108341,0.047088,0.023299,0.073488,0.099550,0.134982,0.262217,...,61,0.262217,2931,,,,,breslau_to_london,False,-0.031107
61,5_gram,61,inner,0.101552,0.045344,0.020580,0.073869,0.094379,0.120315,0.262217,...,62,0.262217,2519,,,,,breslau_to_london,False,-0.025298
59,5_gram,59,inner,0.097720,0.041298,0.030417,0.070602,0.088431,0.115694,0.253560,...,61,0.253560,2931,,,,,breslau_to_london,False,-0.026111
43,5_gram,43,inner,0.089277,0.037345,0.022789,0.065123,0.078725,0.113324,0.211170,...,61,0.211170,2931,,,,,breslau_to_london,False,-0.023276
39,5_gram,39,inner,0.088731,0.030577,0.030606,0.066163,0.086966,0.114683,0.167552,...,40,0.167552,1122,,,,,breslau_to_london,False,-0.019788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,5_gram,68,inner,0.040693,0.026103,0.004355,0.023004,0.034270,0.052816,0.139836,...,67,0.139836,626,,,,,breslau_to_london,False,-0.012337
0,5_gram,0,inner,0.039432,0.023920,0.009441,0.024446,0.034146,0.048725,0.154090,...,2,0.154090,2977,,,,,breslau_to_london,False,-0.008485
70,5_gram,70,inner,0.038211,0.016445,0.005362,0.025016,0.037643,0.049672,0.077819,...,33,0.077819,1361,,,,,breslau_to_london,False,-0.008119
10,5_gram,10,inner,0.038003,0.019467,0.012000,0.023438,0.035445,0.046311,0.100943,...,17,0.100943,1162,,,,,breslau_to_london,False,-0.010086


In [36]:
breslau_to_london_copy_df.sum()

feature_name                     5_gram5_gram5_gram5_gram5_gram5_gram5_gram5_gr...
p_#                                                                           2485
cross/inner                      innerinnerinnerinnerinnerinnerinnerinnerinneri...
mean                                                                      4.369927
std                                                                       2.054861
min                                                                       1.067496
25%                                                                       2.913761
50%                                                                       4.003055
75%                                                                        5.43002
max                                                                      11.096751
# of 0 similarities                                                              0
p_length                                                                     63279
most

In [42]:
breslau_to_london_copy_df.describe()

Unnamed: 0,p_#,mean,std,min,25%,50%,75%,max,# of 0 similarities,p_length,most_similar_p_#,most_similar_score,most_similar_p_length,cross_inner_mean_diff
count,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0
mean,35.0,0.061548,0.028942,0.015035,0.041039,0.056381,0.076479,0.156292,0.0,891.253521,39.408451,0.156292,1404.464789,-0.015697
std,20.639767,0.017415,0.007701,0.007183,0.01367,0.016438,0.021346,0.045555,0.0,586.934694,21.229748,0.045555,819.408983,0.005506
min,0.0,0.02132,0.012438,0.001261,0.012978,0.019856,0.026341,0.077571,0.0,222.0,0.0,0.077571,277.0,-0.031107
25%,17.5,0.046582,0.022962,0.009401,0.030873,0.042978,0.057618,0.12054,0.0,491.5,22.5,0.12054,749.0,-0.0192
50%,35.0,0.060332,0.02772,0.01521,0.038342,0.055388,0.075586,0.15164,0.0,742.0,35.0,0.15164,1162.0,-0.015165
75%,52.5,0.071678,0.034926,0.020022,0.051373,0.065881,0.090162,0.191817,0.0,1167.5,61.0,0.191817,2266.0,-0.011516
max,70.0,0.108341,0.047088,0.030606,0.073869,0.09955,0.134982,0.262217,0.0,2977.0,69.0,0.262217,2977.0,-0.005344


In [44]:
statistics_df_combined_copy[
    (statistics_df_combined_copy['feature_name'] == '5_gram') &
    (statistics_df_combined_copy['cross/inner'] == 'inner') &
    (statistics_df_combined_copy['version'] == 'london_to_breslau')
].sum()

feature_name                     5_gram5_gram5_gram5_gram5_gram5_gram5_gram5_gr...
p_#                                                                          50403
cross/inner                      innerinnerinnerinnerinnerinnerinnerinnerinneri...
mean                                                                     11.685203
std                                                                       7.078118
min                                                                       0.411462
25%                                                                       6.903892
50%                                                                      10.343138
75%                                                                      14.899616
max                                                                      52.441663
# of 0 similarities                                                            678
p_length                                                                    193244
most

# Word counters

In [None]:
from collections import Counter
# # here is neat graph for count: https://www.absentdata.com/python-graphs/python-word-frequency/
def create_words_frequency(corpus):    
    word_counter =  Counter(' '.join(corpus).split())
    return sorted(word_counter.items(), key=lambda item: item[1], reverse=True)

In [None]:
zwickau_corpus = thesisDataReader.get_zwickau_corpus()
london_corpus = thesisDataReader.get_london_corpus()
breslau_corpus = thesisDataReader.get_breslau_corpus()

In [None]:
def create_data(
    dictionary, 
    corpus_1_name, 
    corpus_2_name, 
    corpus_3_name,
    feature_name
):
    data = []
    for i in dictionary:
        corpus_1_counter = 0 if corpus_1_name not in dictionary[i] else dictionary[i][corpus_1_name]
        corpus_2_counter = 0 if corpus_2_name not in dictionary[i] else dictionary[i][corpus_2_name]
        corpus_3_counter = 0 if corpus_3_name not in dictionary[i] else dictionary[i][corpus_3_name]
        
        all_counters = [corpus_1_counter, corpus_2_counter, corpus_3_counter]
        all_names = [corpus_1_name, corpus_2_name, corpus_3_name]
        max_version_name = all_names[np.argmax(all_counters)]
        
        data.append([
            feature_name,
            i,
            corpus_1_counter,
            corpus_2_counter,
            corpus_3_counter,
            max_version_name
#             corpus_1_name if corpus_1_counter > corpus_2_counter else 'equal' if corpus_1_counter == corpus_2_counter else corpus_2_name
        ])
    return data

In [None]:
def creat_word_frequency_data(
    corpus_1, 
    corpus_1_name, 
    corpus_2, 
    corpus_2_name,
    corpus_3,
    corpus_3_name
):    
    dictionary = {}
    
    counter_sorted_corpus_1 = create_words_frequency(corpus_1)
    counter_sorted_corpus_2 = create_words_frequency(corpus_2)
    counter_sorted_corpus_3 = create_words_frequency(corpus_3)
    
    def add_to_dictionary(counters, corpus_name):
        for i in counters:
            word = i[0]
            count = i[1]
            if word not in dictionary:
                dictionary[word] = {}
            dictionary[word][corpus_name] = count

    add_to_dictionary(counter_sorted_corpus_1, corpus_1_name)
    add_to_dictionary(counter_sorted_corpus_2, corpus_2_name)
    add_to_dictionary(counter_sorted_corpus_3, corpus_3_name)

    data = []
    return create_data(
        dictionary, 
        corpus_1_name, 
        corpus_2_name,
        corpus_3_name,
        'word_counter'
    )

In [None]:
def creat_n_gram_frequency_data(
    corpus_1, 
    corpus_1_name, 
    corpus_2, 
    corpus_2_name, 
    corpus_3,
    corpus_3_name,
    n_gram
):    
    dictionary = {}
    
    counter_sorted_corpus_1 = create_n_gram_frequency(n_gram, corpus_1)
    counter_sorted_corpus_2 = create_n_gram_frequency(n_gram, corpus_2)
    counter_sorted_corpus_3 = create_n_gram_frequency(n_gram, corpus_3)
    
    def add_to_dictionary(counters, corpus_name):
        for i in counters:
            word = i[0].replace(' ', '_')
            count = i[1]
            if word not in dictionary:
                dictionary[word] = {}
            dictionary[word][corpus_name] = count

    add_to_dictionary(counter_sorted_corpus_1, corpus_1_name)
    add_to_dictionary(counter_sorted_corpus_2, corpus_2_name)
    add_to_dictionary(counter_sorted_corpus_3, corpus_3_name)
    
    return create_data(
        dictionary, 
        corpus_1_name, 
        corpus_2_name,  
        corpus_3_name,
        f'count_vectorizer_{n_gram}_gram'
    )
#     data = []
#     for i in dictionary:
#         corpus_1_counter = 0 if corpus_1_name not in dictionary[i] else dictionary[i][corpus_1_name]
#         corpus_2_counter = 0 if corpus_2_name not in dictionary[i] else dictionary[i][corpus_2_name]
#         data.append([
#             f'count_vectorizer_{n_gram}_gram',
#             i,
#             corpus_1_counter,
#             corpus_2_counter
#         ])
        
#     return data

In [None]:
word_freq = creat_word_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london',
    breslau_corpus,
    'breslau'
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
def create_n_gram_frequency(n_gram, corpus):
    vec = CountVectorizer(ngram_range=(n_gram, n_gram), analyzer='char').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis = 0)
    words_freq = [(word, sum_words[0, i]) for word, i in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
    return words_freq

In [None]:
create_n_gram_frequency(5, zwickau_corpus)

In [None]:
def creat_5_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name, 5)

In [None]:
def creat_6_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name, 6)

In [None]:
def creat_7_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name, 7)

In [None]:
def creat_8_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name, 8)

In [None]:
def creat_9_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name, 9)

In [None]:
def creat_10_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, corpus_3, corpus_3_name, 10)

In [None]:
gram_5_frequency = creat_5_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london',
    breslau_corpus,
    'breslau'
)
gram_6_frequency = creat_6_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london',
    breslau_corpus,
    'breslau'
)
gram_7_frequency = creat_7_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london',
    breslau_corpus,
    'breslau'
)
gram_8_frequency = creat_8_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london',
    breslau_corpus,
    'breslau'
)
gram_9_frequency = creat_9_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london',
    breslau_corpus,
    'breslau'
)
gram_10_frequency = creat_10_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london',
    breslau_corpus,
    'breslau'
)

In [None]:
len(gram_10_frequency[0])

In [None]:
df_columns = [
    'feature_name', 
    'word', 
    'zwickau', 
    'london',
    'breslau',
    'version with higher score'
]

words_df = pd.DataFrame(
    word_freq + gram_5_frequency + gram_6_frequency + gram_7_frequency + gram_8_frequency + gram_9_frequency + gram_10_frequency,
    columns=df_columns
)
words_df

In [None]:
# words_df['feature_name'] == 'word_counter']].to_csv('../computed_data/text_to_text/5_gram_zwickau_stats_df.csv')

In [None]:
# words_df[words_df['feature_name'] == 'word_counter'].to_csv('../computed_data/text_to_text/count_words/word_counter.csv')

In [None]:
for feature_name in [
    'word_counter',
    'count_vectorizer_5_gram',
    'count_vectorizer_6_gram',
    'count_vectorizer_7_gram',
    'count_vectorizer_8_gram',
    'count_vectorizer_9_gram',
    'count_vectorizer_10_gram'
]:
    words_df[
        words_df['feature_name'] == feature_name
    ].to_csv(f'../computed_data/text_to_text/count_words/{feature_name}.csv')