In [1]:
%pip install decorator==5.0.9

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

Note: you may need to restart the kernel to use updated packages.


In [2]:
import imp
import sys

sys.path.append('../src/')

import text_cleanup.text_cleanup as thesisCleanUp
import preprocessing.text_preprocessing as thesisTextPreprocessing
import data.reader as thesisDataReader
import utils.utils as thesisUtils
import features.tf_idf.n_gram as thesisTfIdfNgramFeatures
import similarities.cosine as thesisCosineSimilarity

imp.reload(thesisCleanUp)
imp.reload(thesisTextPreprocessing)
imp.reload(thesisDataReader)
imp.reload(thesisUtils)
imp.reload(thesisCosineSimilarity)

<module 'similarities.cosine' from '../src/similarities/cosine.py'>

In [3]:
zwickau_corpus = thesisDataReader.get_zwickau_corpus()
london_corpus = thesisDataReader.get_london_corpus()

In [4]:
statistics_df_zwickau = thesisCosineSimilarity.create_statistics_df(zwickau_corpus, london_corpus, 'zwickau')

In [5]:
statistics_df_london = thesisCosineSimilarity.create_statistics_df(london_corpus, zwickau_corpus, 'london')

In [6]:
statistics_df_combined = pd.concat([statistics_df_zwickau, statistics_df_london])

In [7]:
statistics_df_combined

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,# of 0 similarities,p_length,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version
0,5_gram,0,inner,0.008343,0.015417,0.000000,0.000000,0.002362,0.010302,0.148716,138,31,321,0.148716,53,,,,,zwickau
1,5_gram,1,inner,0.031249,0.018082,0.000000,0.017713,0.028674,0.040268,0.103347,3,878,6,0.103347,1587,,,,,zwickau
2,5_gram,2,inner,0.023848,0.014911,0.000000,0.013357,0.020934,0.031907,0.097735,4,377,212,0.097735,422,,,,,zwickau
3,5_gram,3,inner,0.029136,0.016467,0.000000,0.017094,0.027478,0.039089,0.103975,2,725,5,0.103975,1248,,,,,zwickau
4,5_gram,4,inner,0.026172,0.016658,0.000000,0.014303,0.022741,0.034621,0.107332,2,707,283,0.107332,1603,,,,,zwickau
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,5_gram,313,cross,0.048970,0.024866,0.004035,0.032811,0.046879,0.059116,0.175417,0,3226,297,0.175417,2488,False,,,,london
632,5_gram,314,cross,0.047719,0.032325,0.002782,0.024351,0.041076,0.064282,0.214930,0,1325,115,0.214930,272,False,,,,london
633,5_gram,315,cross,0.046201,0.030950,0.001738,0.025761,0.039880,0.057878,0.199030,0,821,115,0.199030,272,False,,,,london
634,5_gram,316,cross,0.028303,0.018283,0.000000,0.014738,0.024134,0.038626,0.098014,3,463,115,0.098014,272,False,,,,london


In [8]:
statistics_df_combined_copy = statistics_df_combined.copy()

In [9]:
zwickau_inner_df = statistics_df_combined.loc[
    (statistics_df_combined['cross/inner'] == 'inner') &
    (statistics_df_combined['version'] == 'zwickau')
]
zwickau_cross_df = statistics_df_combined.loc[
    (statistics_df_combined['cross/inner'] == 'cross') &
    (statistics_df_combined['version'] == 'zwickau')
]

In [10]:
total = 0
for i, d in zwickau_cross_df.iterrows():
    cross_mean_val = d['mean']
    p_index = d['p_#']
    inner_mean_val = zwickau_inner_df.loc[zwickau_inner_df['p_#'] == p_index, 'mean'].values[0]
    if cross_mean_val > inner_mean_val:
        statistics_df_combined_copy.loc[
            (statistics_df_combined_copy['version'] == 'zwickau') &
            (statistics_df_combined_copy['cross/inner'] == 'inner') &
            (statistics_df_combined_copy['p_#'] == p_index),
            'inner_mean_is_low'
        ] = True
    else:
        statistics_df_combined_copy.loc[
            (statistics_df_combined_copy['version'] == 'zwickau') &
            (statistics_df_combined_copy['cross/inner'] == 'inner') &
            (statistics_df_combined_copy['p_#'] == p_index),
            'inner_mean_is_low'
        ] = False
        total += 1
print(f'number of inner mean lowwer than cross is: {total}')

number of inner mean lowwer than cross is: 22


In [11]:
london_inner_df = statistics_df_combined.loc[
    (statistics_df_combined['cross/inner'] == 'inner') &
    (statistics_df_combined['version'] == 'london')
]
london_cross_df = statistics_df_combined.loc[
    (statistics_df_combined['cross/inner'] == 'cross') &
    (statistics_df_combined['version'] == 'london')
]

In [12]:
total = 0
for i, d in london_cross_df.iterrows():
    cross_mean_val = d['mean']
    p_index = d['p_#']
    inner_mean_val = london_inner_df.loc[london_inner_df['p_#'] == p_index, 'mean'].values[0]
    if cross_mean_val > inner_mean_val:
        statistics_df_combined_copy.loc[
            (statistics_df_combined_copy['version'] == 'london') &
            (statistics_df_combined_copy['cross/inner'] == 'inner') &
            (statistics_df_combined_copy['p_#'] == p_index),
            'inner_mean_is_low'
        ] = True
    else:
        statistics_df_combined_copy.loc[
            (statistics_df_combined_copy['version'] == 'london') &
            (statistics_df_combined_copy['cross/inner'] == 'inner') &
            (statistics_df_combined_copy['p_#'] == p_index),
            'inner_mean_is_low'
        ] = False
        total += 1
print(f'number of inner mean lowwer than cross is: {total}')

number of inner mean lowwer than cross is: 284


In [13]:
statistics_df_combined_copy.to_csv('../computed_data/text_to_text/statistics_df.csv')

In [14]:
statistics_df_zwickau.query("`cross/inner` == 'inner'")

Unnamed: 0,feature_name,p_#,cross/inner,mean,std,min,25%,50%,75%,max,# of 0 similarities,p_length,most_similar_p_#,most_similar_score,most_similar_p_length,most_similar_dropped,most_similar_dropped_p_#,most_similar_dropperd_score,most_similar_dropped_p_length,version
0,5_gram,0,inner,0.008343,0.015417,0.000000,0.000000,0.002362,0.010302,0.148716,138,31,321,0.148716,53,,,,,zwickau
1,5_gram,1,inner,0.031249,0.018082,0.000000,0.017713,0.028674,0.040268,0.103347,3,878,6,0.103347,1587,,,,,zwickau
2,5_gram,2,inner,0.023848,0.014911,0.000000,0.013357,0.020934,0.031907,0.097735,4,377,212,0.097735,422,,,,,zwickau
3,5_gram,3,inner,0.029136,0.016467,0.000000,0.017094,0.027478,0.039089,0.103975,2,725,5,0.103975,1248,,,,,zwickau
4,5_gram,4,inner,0.026172,0.016658,0.000000,0.014303,0.022741,0.034621,0.107332,2,707,283,0.107332,1603,,,,,zwickau
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,5_gram,317,inner,0.024994,0.014451,0.000000,0.015916,0.022584,0.031138,0.120139,2,556,252,0.120139,87,,,,,zwickau
318,5_gram,318,inner,0.048843,0.023006,0.001005,0.033565,0.047335,0.061470,0.131941,0,1535,313,0.131941,1912,,,,,zwickau
319,5_gram,319,inner,0.034576,0.016283,0.003571,0.022721,0.032784,0.043333,0.108028,0,1418,123,0.108028,1337,,,,,zwickau
320,5_gram,320,inner,0.023162,0.012739,0.000427,0.013082,0.021871,0.031558,0.067314,0,520,297,0.067314,2488,,,,,zwickau


In [None]:
statistics_df_zwickau.loc[(statistics_df_zwickau['p_#'] == 0) & (statistics_df_zwickau['cross/inner'] == 'inner')]

In [None]:
statistics_df[statistics_df['feature_name'] == '5_gram'].sort_values('p_#')

In [None]:
statistics_df[statistics_df['feature_name'] == '5_gram'].sort_values('p_#').to_csv('../computed_data/text_to_text/5_gram_zwickau_stats_df.csv')

In [None]:
statistics_df[statistics_df['feature_name'] == '5_gram'].set_index(['p_#', 'cross/inner', ]).sort_values('p_#')