In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [125]:
%pip install decorator==5.0.9

import imp
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

Note: you may need to restart the kernel to use updated packages.


In [127]:
# importing local modules

import sys
sys.path.append('../src/')

import text_cleanup.text_cleanup as thesisCleanUp
import preprocessing.text_preprocessing as thesisTextPreprocessing
import data.reader as thesisDataReader
import utils.utils as thesisUtils
import features.tf_idf.n_gram as thesisTfIdfNgramFeatures
import similarities.cosine as thesisCosineSimilarities

imp.reload(thesisCleanUp)
imp.reload(thesisTextPreprocessing)
imp.reload(thesisDataReader)
imp.reload(thesisUtils)
imp.reload(thesisTfIdfNgramFeatures)

<module 'features.tf_idf.n_gram' from '../src/features/tf_idf/n_gram.py'>

In [4]:
zwickau_corpus = thesisDataReader.get_zwickau_corpus()
london_corpus = thesisDataReader.get_london_corpus()
breslau_corpus = thesisDataReader.get_breslau_corpus()

In [5]:
zwickau_london_similarities = np.load('../computed_data/similarities/cross_version/zwickau_london_similarities.npy')
zwickau_breslau_similarities = np.load('../computed_data/similarities/cross_version/zwickau_breslau_similarities.npy')

london_zwickau_similarities = np.load('../computed_data/similarities/cross_version/london_zwickau_similarities.npy')
london_breslau_similarities = np.load('../computed_data/similarities/cross_version/london_breslau_similarities.npy')

breslau_zwickau_similarities = np.load('../computed_data/similarities/cross_version/breslau_zwickau_similarities.npy')
breslau_london_similarities = np.load('../computed_data/similarities/cross_version/breslau_london_similarities.npy')

In [19]:
zwickau_london_max_similarity_per_p = thesisUtils.get_max_similarity_per_p(zwickau_london_similarities)
zwickau_breslau_max_similarity_per_p = thesisUtils.get_max_similarity_per_p(zwickau_breslau_similarities)

london_zwickau_max_similarity_per_p = thesisUtils.get_max_similarity_per_p(london_zwickau_similarities)
london_breslau_max_similarity_per_p = thesisUtils.get_max_similarity_per_p(london_breslau_similarities)

breslau_zwickau_max_similarity_per_p = thesisUtils.get_max_similarity_per_p(breslau_zwickau_similarities)
breslau_london_max_similarity_per_p = thesisUtils.get_max_similarity_per_p(breslau_london_similarities)

In [130]:
zwickau_breslau_best_smlrt = thesisCosineSimilarities.get_cross_version_best_similarities(zwickau_corpus, breslau_corpus)
zwickau_london_best_smlrt = thesisCosineSimilarities.get_cross_version_best_similarities(zwickau_corpus, london_corpus)

In [167]:
zwickau_p_aligment_columns = [
    'zwickau text',
    'london text',
    'london p#',
    'score',
    'breslau text',
    'breslau p#',
    'score'
]
zwickau_p_aligment = []

for i, zwickau_breslau_best_match in enumerate(zwickau_breslau_best_smlrt):
    zwickau_london_best_match = zwickau_london_best_smlrt[i]
    london_best_p, london_best_score = zwickau_london_best_match['5_gram']
#     london_best_score = = zwickau_london_best_match['5_gram'][1]
    
    breslau_best_p, breslau_best_score = zwickau_breslau_best_match['5_gram']
#     breslau_best_score = zwickau_breslau_best_match['5_gram'][1]

    data = [
        zwickau_corpus[i],
        london_corpus[london_best_p],
        london_best_p,
        london_best_score,
        
        breslau_corpus[breslau_best_p],
        breslau_best_p,
        breslau_best_score
    ]
    
    zwickau_p_aligment.append(data)

zwickau_p_aligment_df = pd.DataFrame(zwickau_p_aligment, columns=zwickau_p_aligment_columns)

In [154]:
london_breslau_best_smlrt = thesisCosineSimilarities.get_cross_version_best_similarities(london_corpus, breslau_corpus)
london_zwickau_best_smlrt = thesisCosineSimilarities.get_cross_version_best_similarities(london_corpus, zwickau_corpus)

In [173]:
london_p_aligment_columns = [
    'london text',
    'zwicau text',
    'zwicau p#',
    'score',
    'breslau text',
    'breslau p#',
    'score'
]
london_p_aligment = []

for i, london_breslau_best_match in enumerate(london_breslau_best_smlrt):
    london_zwickau_best_match = london_zwickau_best_smlrt[i]
    zwickau_best_p, zwickau_best_score = london_zwickau_best_match['5_gram']
    
    breslau_best_p, breslau_best_score = london_breslau_best_match['5_gram']
    
    london_p_aligment.append([
        london_corpus[i],
        zwickau_corpus[zwickau_best_p],
        zwickau_best_p,
        zwickau_best_score,
        
        breslau_corpus[breslau_best_p],
        breslau_best_p,
        breslau_best_score
    ])

london_p_aligment_df = pd.DataFrame(london_p_aligment, columns=london_p_aligment_columns)

In [157]:
breslau_london_best_smlrt = thesisCosineSimilarities.get_cross_version_best_similarities(breslau_corpus, london_corpus)
breslau_zwickau_best_smlrt = thesisCosineSimilarities.get_cross_version_best_similarities(breslau_corpus, zwickau_corpus)

In [169]:
breslau_p_aligment_columns = [
    'breslau text',
    'zwicau text',
    'zwicau p#',
    'score',
    'london text',
    'london p#',
    'score'
]
breslau_p_aligment = []

for i, breslau_london_best_match in enumerate(breslau_london_best_smlrt):
    breslau_zwickau_best_match = breslau_zwickau_best_smlrt[i]
    zwickau_best_p, zwickau_best_score = breslau_zwickau_best_match['5_gram']
    
    london_best_p, london_best_score = breslau_london_best_match['5_gram']
    
    breslau_p_aligment.append([
        breslau_corpus[i],
        zwickau_corpus[zwickau_best_p],
        zwickau_best_p,
        zwickau_best_score,
        
        london_corpus[london_best_p],
        london_best_p,
        london_best_score
    ])

breslau_p_aligment_df = pd.DataFrame(breslau_p_aligment, columns=breslau_p_aligment_columns)

In [170]:
zwickau_p_aligment_df.to_csv('../computed_data/p_aligment/zwickau_london_breslau.csv')

In [174]:
london_p_aligment_df.to_csv('../computed_data/p_aligment/london_zwickau_breslau.csv')

In [172]:
breslau_p_aligment_df.to_csv('../computed_data/p_aligment/breslau_london_zwickau.csv')

In [16]:
london_to_breslau_best_match = np.argmax(london_breslau_max_similarity_per_p)
print(f'london to breslau best match is p: {london_to_breslau_best_match} with score: {london_breslau_max_similarity_per_p[london_to_breslau_best_match]}')
print(f'london to zwickau for the same p has score: {london_zwickau_max_similarity_per_p[london_to_breslau_best_match]}')
print(f'p: \n{london_corpus[london_to_breslau_best_match]}')

london to breslau best match is p: 224 with score: 0.6236674556825493
london to zwickau for the same p has score: 0.7423196556996194
p: 
uidi ego in ecclesia ista miraculum gloriosum soldanus enim uidens ecclesie huius ornatum et tabulas et columpnas omnes preciosas ualde precepit omnia deponi et portari in babiloniam uolens inde palatium suum edificare mira res artificibus cum instrumentis accedentibus ipso adhuc soldano astante cum multis aliis de sano et integro pariete quem nec acus uidebatur posse penetrare serpens mire magnitudinis exiuit primeque tabule que occurrit morsum dedit tabula per transuersum crepuit secundam adiit terciamque et quartam et deinceps usque ad 19 et omnibus similiter accidit omnibus stupentibus et ipso soldano et continuo propositum reuocante serpens disparuit remansit igitur ecclesia et remanet usque hodie sicut prius uestigia tamen corporis serpentis apparent usque hodie in singulis tabulis quas transiuit quasi conbustio quedam igne facta et super omnia 

In [20]:
zwickau_to_breslau_best_match = np.argmax(zwickau_breslau_max_similarity_per_p)
print(f'zwickau to breslau best match is p: {zwickau_to_breslau_best_match} with score: {zwickau_breslau_max_similarity_per_p[zwickau_to_breslau_best_match]}')
print(f'zwickau to london for the same p: {zwickau_london_max_similarity_per_p[zwickau_to_breslau_best_match]}')
print(f'p: \n{zwickau_corpus[zwickau_to_breslau_best_match]}')

zwickau to breslau best match is p: 215 with score: 0.6339557742781724
zwickau to london for the same p: 0.742964011370067
p: 
uidi ego in ecclesia illa miraculum gloriosum soldanus enim uidens huius ecclesie ornatum et tabulas parietum et columpnas preciosas iussit omnia deponi et deportari in babilonemuolens inde suum palacium decorare et mira res ministris cum instrumentis accedentibus adhuc ipso soldano astante de sano et integro pariete quem nec acus uidebatur posse penetrare serpens mire magnitudinis exiuit et prime tabule que occurrit morsum dedit et tabula per medium latitudinis confracta est secundam adiit et terciam et quartam et cetera per ordinem usque 59am et omnibus similiter accidit omnibus stupentibus et ipso soldano et ab incepto animum reuocante serpens decerpsit remansit igitur ecclesia et hodie remanet sicut prius uestigia tamen tractus corporis serpentis in singulis tabulis apparuit notabiliter quasi combustio quedam facta et similiter omnia uidetur mirabile quomod

In [39]:
zwickau_breslau_similarities_copy = np.copy(zwickau_breslau_similarities)
zwickau_breslau_similarities_copy[zwickau_to_breslau_best_match][0] = 0
best_breslau_for_zwickau = np.argmax(zwickau_breslau_similarities_copy[zwickau_to_breslau_best_match])
print(f'best breslau for zwickau: {best_breslau_for_zwickau}')
breslau_corpus[best_breslau_for_zwickau]

best breslau for zwickau: 61


'ibi uidi cellam beati ieronimi lectum eius et sepulcrum claustrum monachorum locum interfectionis innocentum ecclesiam beate paule et eustochium cisternam in quam stella dux magorum dicitur cecidisse de bethlesem contra orientem ad uii leucas sunt engadi montes et loca tutissima latibula dauid supra mare mortuum ubi erant uinee balsami inde ad iii leucas contra austrum achile ubi etiam latuit dauid de engadi ad duas ceyla quam conseruauit dauid a philisteis de ceyla ad ui leucas contra austrum ziph casale ubi habitabant traditores dauid huic adiacet desertum maon ad austrum in quo habitabat nabal carmelus inter ceyla et bethlehem medio loco tecua ubi dormit amos propheta cui ad orientem adiacet desertum tecue postea sequitur mons seyr et desertum magnum in quo morati sunt filii israel 9 annis de bethlehem contra austrum iiii leucas bechgar uilla in loco eminenti sita iuxta quam stans cum sociis 12 uidi ab oriente arabiam totam et mare mortuum usque ad montem seis et or loca latibuorum

In [43]:
london_breslau_similarities_copy = np.copy(london_breslau_similarities)
london_breslau_similarities_copy[london_to_breslau_best_match][0] = 0
best_breslau_for_london = np.argmax(london_breslau_similarities_copy[london_to_breslau_best_match])
print(f'best breslau for london: {best_breslau_for_london}')
breslau_corpus[best_breslau_for_london]

best breslau for london: 61


'ibi uidi cellam beati ieronimi lectum eius et sepulcrum claustrum monachorum locum interfectionis innocentum ecclesiam beate paule et eustochium cisternam in quam stella dux magorum dicitur cecidisse de bethlesem contra orientem ad uii leucas sunt engadi montes et loca tutissima latibula dauid supra mare mortuum ubi erant uinee balsami inde ad iii leucas contra austrum achile ubi etiam latuit dauid de engadi ad duas ceyla quam conseruauit dauid a philisteis de ceyla ad ui leucas contra austrum ziph casale ubi habitabant traditores dauid huic adiacet desertum maon ad austrum in quo habitabat nabal carmelus inter ceyla et bethlehem medio loco tecua ubi dormit amos propheta cui ad orientem adiacet desertum tecue postea sequitur mons seyr et desertum magnum in quo morati sunt filii israel 9 annis de bethlehem contra austrum iiii leucas bechgar uilla in loco eminenti sita iuxta quam stans cum sociis 12 uidi ab oriente arabiam totam et mare mortuum usque ad montem seis et or loca latibuorum

In [None]:
# def 

In [42]:
for i, n in enumerate(breslau_corpus):
    print(f'p: {i}, len: {len(n)}')

p: 0, len: 914
p: 1, len: 1566
p: 2, len: 2977
p: 3, len: 339
p: 4, len: 462
p: 5, len: 945
p: 6, len: 529
p: 7, len: 1365
p: 8, len: 1035
p: 9, len: 590
p: 10, len: 371
p: 11, len: 246
p: 12, len: 616
p: 13, len: 280
p: 14, len: 341
p: 15, len: 476
p: 16, len: 1048
p: 17, len: 1162
p: 18, len: 553
p: 19, len: 1073
p: 20, len: 369
p: 21, len: 392
p: 22, len: 1283
p: 23, len: 723
p: 24, len: 1228
p: 25, len: 588
p: 26, len: 1254
p: 27, len: 749
p: 28, len: 630
p: 29, len: 603
p: 30, len: 314
p: 31, len: 432
p: 32, len: 1178
p: 33, len: 1361
p: 34, len: 615
p: 35, len: 742
p: 36, len: 707
p: 37, len: 317
p: 38, len: 308
p: 39, len: 1830
p: 40, len: 1122
p: 41, len: 1507
p: 42, len: 745
p: 43, len: 1474
p: 44, len: 934
p: 45, len: 1210
p: 46, len: 650
p: 47, len: 404
p: 48, len: 618
p: 49, len: 1107
p: 50, len: 810
p: 51, len: 827
p: 52, len: 674
p: 53, len: 507
p: 54, len: 513
p: 55, len: 1173
p: 56, len: 1542
p: 57, len: 963
p: 58, len: 297
p: 59, len: 2266
p: 60, len: 1329
p: 61, len: 

In [54]:
# l = np.array(london_breslau_max_similarity_per_p)
# l[np.argsort(london_breslau_max_similarity_per_p)]
london_breslau_ordered_similarities_indexes = np.flip(np.argsort(london_breslau_max_similarity_per_p))
london_breslau_ordered_similarities_indexes

array([224, 112,  28, 170,  29, 163,  21,  80,   3, 204,  52,  25, 161,
       100, 201,  23, 203,  83,  30, 176, 167, 200, 101, 147,   5,  98,
        26, 197, 118,  31,  86,  50, 156, 198, 164, 113,  65,  94, 162,
        87,  82, 117, 222,   4,  68, 202,  33,  59,   2,  97, 199,   6,
       144, 171, 174,  49, 168, 103, 180, 172,  71, 119, 151, 146, 107,
        22,  53,   7, 183,  47, 177, 150, 231, 137,  85,  66, 102, 165,
       115, 123, 135, 108, 105,  67,   1, 166, 114, 106, 235,  57,  48,
        96, 230, 152, 314,  73, 246, 233,  91, 153, 142, 191, 185, 247,
       138,  95, 110, 182, 132, 315, 261, 104, 187, 213,  11, 188, 122,
       192,  51, 134, 120, 242, 291, 245, 250, 225, 129,  24, 130, 148,
       131, 243,  76,  77, 234, 220,  79, 218, 317, 298, 292, 216,  63,
       275, 228, 237, 186,  37,  27,  74, 125, 217, 211, 208,  13, 221,
       178, 272, 195, 254, 223, 179, 212,  99, 289,  56,  32, 244, 136,
       255, 205,  55, 169, 273, 209, 158,  18, 193,  42,  75,  6

In [95]:
london_breslau_similarities[41]

array([1.        , 0.03359564, 0.0356074 , 0.03450299, 0.00534642,
       0.01600341, 0.02441912, 0.02898681, 0.01872077, 0.03455249,
       0.01226232, 0.02044366, 0.04558316, 0.04069613, 0.03164737,
       0.01300878, 0.01610632, 0.06739508, 0.04786875, 0.0330308 ,
       0.02372537, 0.03625821, 0.03793494, 0.02637321, 0.02830618,
       0.05481313, 0.0370618 , 0.04919291, 0.01781526, 0.01427227,
       0.01860352, 0.00935268, 0.02988496, 0.05382501, 0.03631061,
       0.03201566, 0.03145205, 0.03992291, 0.0203532 , 0.01249072,
       0.0392049 , 0.03023563, 0.05715835, 0.02890434, 0.0527551 ,
       0.04551411, 0.02097619, 0.02560018, 0.03761932, 0.02247766,
       0.04015935, 0.04218998, 0.01207797, 0.03570971, 0.01015048,
       0.01538134, 0.02354456, 0.02326966, 0.02780154, 0.00703262,
       0.04687988, 0.02519928, 0.06222957, 0.06588533, 0.02291273,
       0.01104232, 0.00791228, 0.0133437 , 0.01731927, 0.00309612,
       0.02411912, 0.00852977])

In [107]:
london_breslau_similarities_copy = np.copy(london_breslau_similarities)

for i in london_breslau_ordered_similarities_indexes:
    print(london_corpus[i])
    print('*'*40)
    london_breslau_best_without_self = london_breslau_similarities_copy[i][1:]
#     london_breslau_similarities_copy[i][0] = 0
#     print(len(london_breslau_best_without_self))
    best_breslau_for_london = np.argmax(london_breslau_best_without_self)
    print(f'best breslau for london: {best_breslau_for_london}')
    print(breslau_corpus[best_breslau_for_london])
    print('*'*40)
    

uidi ego in ecclesia ista miraculum gloriosum soldanus enim uidens ecclesie huius ornatum et tabulas et columpnas omnes preciosas ualde precepit omnia deponi et portari in babiloniam uolens inde palatium suum edificare mira res artificibus cum instrumentis accedentibus ipso adhuc soldano astante cum multis aliis de sano et integro pariete quem nec acus uidebatur posse penetrare serpens mire magnitudinis exiuit primeque tabule que occurrit morsum dedit tabula per transuersum crepuit secundam adiit terciamque et quartam et deinceps usque ad 19 et omnibus similiter accidit omnibus stupentibus et ipso soldano et continuo propositum reuocante serpens disparuit remansit igitur ecclesia et remanet usque hodie sicut prius uestigia tamen corporis serpentis apparent usque hodie in singulis tabulis quas transiuit quasi conbustio quedam igne facta et super omnia mirabile uidetur quomodo serpens ille sic procedere potuit per transuersum in pariete qui erat planissimus et politissimus sicut uitrum
*

In [None]:
london_breslau_similarities_copy = np.copy(london_breslau_similarities)
london_breslau_similarities_copy[london_to_breslau_best_match][0] = 0
best_breslau_for_london = np.argmax(london_breslau_similarities_copy[london_to_breslau_best_match])
print(f'best breslau for london: {best_breslau_for_london}')
breslau_corpus[best_breslau_for_london]