In [1]:
%pip install decorator==5.0.9

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

Note: you may need to restart the kernel to use updated packages.


In [2]:
import imp
import sys

sys.path.append('../src/')

import text_cleanup.text_cleanup as thesisCleanUp
import preprocessing.text_preprocessing as thesisTextPreprocessing
import data.reader as thesisDataReader
import utils.utils as thesisUtils
import features.tf_idf.n_gram as thesisTfIdfNgramFeatures
import similarities.cosine as thesisCosineSimilarity

imp.reload(thesisCleanUp)
imp.reload(thesisTextPreprocessing)
imp.reload(thesisDataReader)
imp.reload(thesisUtils)
imp.reload(thesisCosineSimilarity)
imp.reload(thesisTfIdfNgramFeatures)

<module 'features.tf_idf.n_gram' from '../src/features/tf_idf/n_gram.py'>

In [3]:
zwickau_corpus = thesisDataReader.get_zwickau_corpus()
london_corpus = thesisDataReader.get_london_corpus()

In [4]:
statistics_df_zwickau = thesisCosineSimilarity.create_statistics_df(zwickau_corpus, london_corpus, 'zwickau')

In [5]:
statistics_df_london = thesisCosineSimilarity.create_statistics_df(london_corpus, zwickau_corpus, 'london')

In [6]:
statistics_df_combined = pd.concat([statistics_df_zwickau, statistics_df_london])

In [7]:
statistics_df_combined

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,# of 0 similarities,p_length,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version
0,2_gram,0,inner,0.231428,0.049144,0.103870,0.199567,0.234173,0.266041,0.382694,0,31,321,0.382694,53,,,,,zwickau
1,2_gram,1,inner,0.688952,0.104741,0.274285,0.624526,0.710362,0.764685,0.866673,0,878,5,0.866673,1248,,,,,zwickau
2,2_gram,2,inner,0.625779,0.096483,0.250700,0.572698,0.644933,0.700207,0.797541,0,377,5,0.797541,1248,,,,,zwickau
3,2_gram,3,inner,0.659646,0.096405,0.289649,0.601198,0.677046,0.733958,0.826515,0,725,5,0.826515,1248,,,,,zwickau
4,2_gram,4,inner,0.655271,0.098697,0.314683,0.595564,0.673477,0.730636,0.816763,0,707,281,0.816763,1334,,,,,zwickau
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,count_vectorizer_5_gram,313,cross,0.109089,0.045656,0.007279,0.079608,0.107231,0.131828,0.316795,0,3226,297,0.316795,2488,False,,,,london
3176,count_vectorizer_5_gram,314,cross,0.095552,0.053794,0.013822,0.057207,0.085922,0.120639,0.341544,0,1325,152,0.341544,1518,False,,,,london
3177,count_vectorizer_5_gram,315,cross,0.102737,0.058298,0.004337,0.062212,0.093139,0.129564,0.343818,0,821,87,0.343818,1222,False,,,,london
3178,count_vectorizer_5_gram,316,cross,0.057517,0.033607,0.000000,0.032537,0.051247,0.077881,0.169342,3,463,115,0.169342,272,False,,,,london


In [8]:
statistics_df_combined_copy = statistics_df_combined.copy()

In [9]:
# zwickau_inner_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'inner') &
#     (statistics_df_combined['version'] == 'zwickau')
# ]
# zwickau_cross_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'cross') &
#     (statistics_df_combined['version'] == 'zwickau')
# ]

In [10]:
def check_inner_cross_mean(statistics_df, version_name):
    features = statistics_df.feature_name.unique()
    
    for feature in features:
        total = 0
        
        version_inner_df = statistics_df.loc[
            (statistics_df['cross/inner'] == 'inner') &
            (statistics_df['version'] == version_name) & 
            (statistics_df['feature_name'] == feature)
        ]
        version_cross_df = statistics_df.loc[
            (statistics_df['cross/inner'] == 'cross') &
            (statistics_df['version'] == version_name) & 
            (statistics_df['feature_name'] == feature)
        ]
        
        for i, d in version_cross_df.iterrows():
            cross_mean_val = d['mean']
            p_index = d['p_#']
            inner_mean_val = version_inner_df.loc[
                (version_inner_df['p_#'] == p_index) &
                (version_inner_df['feature_name'] == feature),
                'mean'
            ].values[0]
            
            cross_inner_mean_diff = cross_mean_val - inner_mean_val
            if cross_inner_mean_diff > 0:
                statistics_df.loc[
                    (statistics_df['version'] == version_name) &
                    (statistics_df['cross/inner'] == 'inner') &
                    (statistics_df['p_#'] == p_index) & 
                    (statistics_df['feature_name'] == feature),
                    'inner_mean_is_low'
                ] = True
            else:
                statistics_df.loc[
                    (statistics_df['version'] == version_name) &
                    (statistics_df['cross/inner'] == 'inner') &
                    (statistics_df['p_#'] == p_index) & 
                    (statistics_df['feature_name'] == feature),
                    'inner_mean_is_low'
                ] = False
                total += 1
            statistics_df.loc[
                (statistics_df['version'] == version_name) &
                (statistics_df['cross/inner'] == 'inner') &
                (statistics_df['p_#'] == p_index) & 
                (statistics_df['feature_name'] == feature),
                'cross_inner_mean_diff'
                ] = cross_inner_mean_diff
            
        print(f'version: {version_name}, for feature: {feature}, number of inner lowe that cross is: {total}')

In [11]:
check_inner_cross_mean(statistics_df_combined_copy, 'zwickau')

version: zwickau, for feature: 2_gram, number of inner lowe that cross is: 4
version: zwickau, for feature: 3_gram, number of inner lowe that cross is: 3
version: zwickau, for feature: 4_gram, number of inner lowe that cross is: 12
version: zwickau, for feature: 5_gram, number of inner lowe that cross is: 22
version: zwickau, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 7


In [12]:
check_inner_cross_mean(statistics_df_combined_copy, 'london')

version: london, for feature: 2_gram, number of inner lowe that cross is: 318
version: london, for feature: 3_gram, number of inner lowe that cross is: 318
version: london, for feature: 4_gram, number of inner lowe that cross is: 316
version: london, for feature: 5_gram, number of inner lowe that cross is: 284
version: london, for feature: count_vectorizer_5_gram, number of inner lowe that cross is: 315


In [13]:
# total = 0
# for i, d in zwickau_cross_df.iterrows():
#     cross_mean_val = d['mean']
#     p_index = d['p_#']
#     inner_mean_val = zwickau_inner_df.loc[zwickau_inner_df['p_#'] == p_index, 'mean'].values[0]
#     if cross_mean_val > inner_mean_val:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'zwickau') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = True
#     else:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'zwickau') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = False
#         total += 1
# print(f'number of inner mean lowwer than cross is: {total}')

In [14]:
# london_inner_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'inner') &
#     (statistics_df_combined['version'] == 'london')
# ]
# london_cross_df = statistics_df_combined.loc[
#     (statistics_df_combined['cross/inner'] == 'cross') &
#     (statistics_df_combined['version'] == 'london')
# ]

In [15]:
# total = 0
# for i, d in london_cross_df.iterrows():
#     cross_mean_val = d['mean']
#     p_index = d['p_#']
#     inner_mean_val = london_inner_df.loc[london_inner_df['p_#'] == p_index, 'mean'].values[0]
#     if cross_mean_val > inner_mean_val:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'london') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = True
#     else:
#         statistics_df_combined_copy.loc[
#             (statistics_df_combined_copy['version'] == 'london') &
#             (statistics_df_combined_copy['cross/inner'] == 'inner') &
#             (statistics_df_combined_copy['p_#'] == p_index),
#             'inner_mean_is_low'
#         ] = False
#         total += 1
# print(f'number of inner mean lowwer than cross is: {total}')

In [54]:
statistics_df_combined_copy.to_csv('../computed_data/text_to_text/statistics_df.csv')

In [21]:
statistics_df_zwickau.query("`cross/inner` == 'inner'")

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,# of 0 similarities,p_length,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version
0,2_gram,0,inner,0.231428,0.049144,0.103870,0.199567,0.234173,0.266041,0.382694,0,31,321,0.382694,53,,,,,zwickau
1,2_gram,1,inner,0.688952,0.104741,0.274285,0.624526,0.710362,0.764685,0.866673,0,878,5,0.866673,1248,,,,,zwickau
2,2_gram,2,inner,0.625779,0.096483,0.250700,0.572698,0.644933,0.700207,0.797541,0,377,5,0.797541,1248,,,,,zwickau
3,2_gram,3,inner,0.659646,0.096405,0.289649,0.601198,0.677046,0.733958,0.826515,0,725,5,0.826515,1248,,,,,zwickau
4,2_gram,4,inner,0.655271,0.098697,0.314683,0.595564,0.673477,0.730636,0.816763,0,707,281,0.816763,1334,,,,,zwickau
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1605,count_vectorizer_5_gram,317,inner,0.063064,0.027083,0.000000,0.046415,0.062365,0.077461,0.175285,2,556,299,0.175285,2764,,,,,zwickau
1606,count_vectorizer_5_gram,318,inner,0.109734,0.043691,0.004167,0.079846,0.106301,0.136133,0.260851,0,1535,313,0.260851,1912,,,,,zwickau
1607,count_vectorizer_5_gram,319,inner,0.078253,0.029624,0.004151,0.057275,0.076135,0.096259,0.191496,0,1418,123,0.191496,1337,,,,,zwickau
1608,count_vectorizer_5_gram,320,inner,0.053711,0.025757,0.003082,0.034590,0.051501,0.071044,0.146063,0,520,297,0.146063,2488,,,,,zwickau


In [None]:
statistics_df_zwickau.loc[(statistics_df_zwickau['p_#'] == 0) & (statistics_df_zwickau['cross/inner'] == 'inner')]

In [16]:
statistics_df_combined_copy[
    (statistics_df_combined_copy['feature_name'] == '5_gram') |
    (statistics_df_combined_copy['feature_name'] == 'count_vectorizer_5_gram')
].sort_values('p_#')

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,...,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version,inner_mean_is_low,cross_inner_mean_diff
966,5_gram,0,inner,0.008343,0.015417,0.000000,0.00000,0.002362,0.010302,0.148716,...,321,0.148716,53,,,,,zwickau,True,0.001215
2898,count_vectorizer_5_gram,0,cross,0.016631,0.026600,0.000000,0.00000,0.007410,0.023127,0.220219,...,19,0.220219,360,False,,,,zwickau,,
2576,5_gram,0,cross,0.009559,0.017142,0.000000,0.00000,0.003247,0.011943,0.160069,...,8,0.160069,550,False,,,,zwickau,,
2862,count_vectorizer_5_gram,0,cross,0.020757,0.032964,0.000000,0.00000,0.000000,0.030692,0.158941,...,274,0.158941,853,False,,,,london,,
1272,count_vectorizer_5_gram,0,inner,0.019303,0.031500,0.000000,0.00000,0.000000,0.027472,0.205132,...,19,0.205132,360,,,,,london,True,0.001454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1608,count_vectorizer_5_gram,320,inner,0.053711,0.025757,0.003082,0.03459,0.051501,0.071044,0.146063,...,297,0.146063,2488,,,,,zwickau,True,0.003665
1287,5_gram,321,inner,0.010207,0.016764,0.000000,0.00000,0.005519,0.013074,0.148716,...,0,0.148716,31,,,,,zwickau,True,0.000088
2897,5_gram,321,cross,0.010295,0.013363,0.000000,0.00000,0.006274,0.013622,0.082978,...,19,0.082978,360,False,,,,zwickau,,
3219,count_vectorizer_5_gram,321,cross,0.018136,0.022027,0.000000,0.00000,0.011561,0.024641,0.156363,...,19,0.156363,360,False,,,,zwickau,,


In [17]:
statistics_df_combined_copy[
    (statistics_df_combined_copy['feature_name'] == '5_gram') |
    (statistics_df_combined_copy['feature_name'] == 'count_vectorizer_5_gram')
].sort_values('p_#').to_csv('../computed_data/text_to_text/5_gram_cv_itidf.csv')

In [None]:
statistics_df[statistics_df['feature_name'] == '5_gram'].sort_values('p_#').to_csv('../computed_data/text_to_text/5_gram_zwickau_stats_df.csv')

In [None]:
statistics_df[statistics_df['feature_name'] == '5_gram'].set_index(['p_#', 'cross/inner', ]).sort_values('p_#')

# Word counters

In [5]:
from collections import Counter
# # here is neat graph for count: https://www.absentdata.com/python-graphs/python-word-frequency/
def create_words_frequency(corpus):    
    word_counter =  Counter(' '.join(corpus).split())
    return sorted(word_counter.items(), key=lambda item: item[1], reverse=True)

In [6]:
zwickau_corpus = thesisDataReader.get_zwickau_corpus()
london_corpus = thesisDataReader.get_london_corpus()

In [7]:
def create_data(dictionary, corpus_1_name, corpus_2_name, feature_name):
    data = []
    for i in dictionary:
        corpus_1_counter = 0 if corpus_1_name not in dictionary[i] else dictionary[i][corpus_1_name]
        corpus_2_counter = 0 if corpus_2_name not in dictionary[i] else dictionary[i][corpus_2_name]
        data.append([
            feature_name,
            i,
            corpus_1_counter,
            corpus_2_counter,
            corpus_1_name if corpus_1_counter > corpus_2_counter else 'equal' if corpus_1_counter == corpus_2_counter else corpus_2_name
        ])
    return data

In [8]:
def creat_word_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name):    
    dictionary = {}
    
    counter_sorted_corpus_1 = create_words_frequency(corpus_1)
    counter_sorted_corpus_2 = create_words_frequency(corpus_2)
    
    def add_to_dictionary(counters, corpus_name):
        for i in counters:
            word = i[0]
            count = i[1]
            if word not in dictionary:
                dictionary[word] = {}
            dictionary[word][corpus_name] = count

    add_to_dictionary(counter_sorted_corpus_1, corpus_1_name)
    add_to_dictionary(counter_sorted_corpus_2, corpus_2_name)

    data = []
    return create_data(dictionary, corpus_1_name, corpus_2_name, 'word_counter')

In [9]:
def creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, n_gram):    
    dictionary = {}
    
    counter_sorted_corpus_1 = create_n_gram_frequency(n_gram, corpus_1)
    counter_sorted_corpus_2 = create_n_gram_frequency(n_gram, corpus_2)
    
    def add_to_dictionary(counters, corpus_name):
        for i in counters:
            word = i[0].replace(' ', '_')
            count = i[1]
            if word not in dictionary:
                dictionary[word] = {}
            dictionary[word][corpus_name] = count

    add_to_dictionary(counter_sorted_corpus_1, corpus_1_name)
    add_to_dictionary(counter_sorted_corpus_2, corpus_2_name)
    return create_data(dictionary, corpus_1_name, corpus_2_name,  f'count_vectorizer_{n_gram}_gram')
#     data = []
#     for i in dictionary:
#         corpus_1_counter = 0 if corpus_1_name not in dictionary[i] else dictionary[i][corpus_1_name]
#         corpus_2_counter = 0 if corpus_2_name not in dictionary[i] else dictionary[i][corpus_2_name]
#         data.append([
#             f'count_vectorizer_{n_gram}_gram',
#             i,
#             corpus_1_counter,
#             corpus_2_counter
#         ])
        
#     return data

In [10]:
word_freq = creat_word_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london'
)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [12]:
def create_n_gram_frequency(n_gram, corpus):
    vec = CountVectorizer(ngram_range=(n_gram, n_gram), analyzer='char').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis = 0)
    words_freq = [(word, sum_words[0, i]) for word, i in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
    return words_freq

In [13]:
create_n_gram_frequency(5, zwickau_corpus)

[(' est ', 593),
 ('s et ', 357),
 ('ibus ', 319),
 ('m et ', 283),
 ('itur ', 280),
 (' que ', 263),
 (' quod', 262),
 (' mont', 261),
 ('quod ', 250),
 ('t in ', 238),
 (' cont', 234),
 ('iuita', 234),
 (' ciui', 229),
 ('ciuit', 229),
 (' per ', 224),
 ('sunt ', 221),
 ('ntem ', 209),
 (' sunt', 209),
 (' leuc', 209),
 ('contr', 207),
 ('ontra', 207),
 ('ntra ', 205),
 (' terr', 201),
 ('e et ', 186),
 ('orum ', 183),
 ('m in ', 181),
 (' qui ', 180),
 ('sque ', 171),
 (' et i', 171),
 ('monte', 163),
 ('usque', 162),
 (' non ', 159),
 (' usqu', 159),
 ('uitat', 159),
 ('terra', 157),
 (' habe', 153),
 ('a et ', 152),
 (' et s', 150),
 ('entem', 149),
 ('m est', 149),
 ('e ad ', 145),
 (' cum ', 143),
 ('s in ', 143),
 (' dici', 142),
 ('itate', 141),
 ('dicit', 139),
 (' ubi ', 139),
 ('ntur ', 137),
 ('um et', 137),
 (' et a', 133),
 ('is et', 133),
 ('t et ', 130),
 (' sed ', 130),
 ('i et ', 129),
 ('erat ', 128),
 ('et in', 127),
 ('e in ', 126),
 ('citur', 124),
 (' et p', 123

In [14]:
def creat_5_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, 5)

In [15]:
def creat_6_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, 6)

In [16]:
def creat_7_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, 7)

In [17]:
def creat_8_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, 8)

In [18]:
def creat_9_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, 9)

In [19]:
def creat_10_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name):
    return creat_n_gram_frequency_data(corpus_1, corpus_1_name, corpus_2, corpus_2_name, 10)

In [20]:
gram_5_frequency = creat_5_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london'
)
gram_6_frequency = creat_6_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london'
)
gram_7_frequency = creat_7_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london'
)
gram_8_frequency = creat_8_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london'
)
gram_9_frequency = creat_9_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london'
)
gram_10_frequency = creat_10_gram_frequency_data(
    zwickau_corpus,
    'zwickau',
    london_corpus,
    'london'
)

In [21]:
df_columns = [
    'feature_name', 
    'word', 
    'zwickau', 
    'london',
    'version with higher score'
]

words_df = pd.DataFrame(
    word_freq + gram_5_frequency + gram_6_frequency + gram_7_frequency + gram_8_frequency + gram_9_frequency + gram_10_frequency,
    columns=df_columns
)
words_df

Unnamed: 0,feature_name,word,zwickau,london,version with higher score
0,word_counter,et,1521,1685,london
1,word_counter,in,1038,1149,london
2,word_counter,est,600,668,london
3,word_counter,de,459,531,london
4,word_counter,ad,426,441,london
...,...,...,...,...,...
801433,count_vectorizer_10_gram,_dietarum_,0,1,london
801434,count_vectorizer_10_gram,dietarum_a,0,1,london
801435,count_vectorizer_10_gram,ietarum_am,0,1,london
801436,count_vectorizer_10_gram,etarum_ame,0,1,london


In [106]:
words_df['feature_name'] == 'word_counter']].to_csv('../computed_data/text_to_text/5_gram_zwickau_stats_df.csv')

Unnamed: 0,feature_name,word,zwickau,london
4,word_counter,ad,426,441


In [None]:
words_df[words_df['feature_name'] == 'word_counter'].to_csv('../computed_data/text_to_text/count_words/word_counter.csv')

In [24]:
for feature_name in [
    'word_counter',
    'count_vectorizer_5_gram',
    'count_vectorizer_6_gram',
    'count_vectorizer_7_gram',
    'count_vectorizer_8_gram',
    'count_vectorizer_9_gram',
    'count_vectorizer_10_gram'
]:
    words_df[
        words_df['feature_name'] == feature_name
    ].to_csv(f'../computed_data/text_to_text/count_words/{feature_name}.csv')