In [54]:
# Inject a CSS rule that forces the notebook's `.container` element to 100%
# width, so wide cell outputs are not squeezed into the default page column.
# `display`/`HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder

import sys
import imp
import time

sys.path.append('../src/')

import data.reader as dataReader
import utils.utils as thesisUtils
import similarities.cosine as thesisCosineSimilarity
import google_sheets.client as googleSheetsClient
import vocabulary.vocabulary as thesisVocabulary
import features.factory as thesisFactoryFeatures
import features.model_features as thesisModelFeatures


# Reload every project module imported above so that edits made under ../src/
# are picked up without restarting the kernel.
# `importlib.reload` replaces `imp.reload`: the `imp` module is deprecated and
# was removed in Python 3.12.
# NOTE(review): the `import imp` line in the import list above should be
# dropped once nothing else in the notebook relies on it.
import importlib

importlib.reload(dataReader)
importlib.reload(thesisUtils)
importlib.reload(thesisVocabulary)
importlib.reload(googleSheetsClient)
importlib.reload(thesisModelFeatures)
importlib.reload(thesisFactoryFeatures)
importlib.reload(thesisCosineSimilarity)

<module 'similarities.cosine' from '../src/similarities/cosine.py'>

In [56]:
# Load the London and Zwickau corpora through the project's reader, then build
# the Burchard corpus from the pair (London passed first, Zwickau second).
london_corpus, zwickau_corpus = (
    dataReader.CorpusByNewLine.london(),
    dataReader.CorpusByNewLine.zwickau(),
)
burchard_corpus_by_london = dataReader.BurchardCorpus(
    london_corpus,
    zwickau_corpus,
)

In [57]:
# One shared feature factory, constructed from the two corpora; the later
# cells call its methods to obtain feature tables and vectorized inputs.
featuresFactory = thesisFactoryFeatures.FeaturesFactory(london_corpus=london_corpus, zwickau_corpus=zwickau_corpus)

In [58]:
# Build a LeftoversCorpus in each direction: the argument order is swapped
# between the two calls (London vs. Zwickau, then Zwickau vs. London).
# NOTE(review): presumably the first argument is the version whose leftovers
# are collected - confirm against data.reader.LeftoversCorpus.
london_leftovers, zwickau_leftovers = (
    dataReader.LeftoversCorpus(london_corpus, zwickau_corpus),
    dataReader.LeftoversCorpus(zwickau_corpus, london_corpus),
)

In [59]:
# Feature tables for the two comparisons used below: Burchard vs. Zwickau and
# Burchard vs. London. The factory calls print "n_gram_feature_name: ..."
# lines as a side effect.
burchard_VS_zwickau_features_df, burchard_VS_london_features_df = (
    featuresFactory.burchard_by_london_VS_zwickau(),
    featuresFactory.burchard_by_london_VS_london(),
)
# london_VS_zwickau_features_df = featuresFactory.london_VS_zwickau()

n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram


In [60]:
# Run the project's stratified k-fold "wrong prediction" experiment on the
# Burchard-vs-Zwickau feature table using AdaBoost; run() prints a
# "score is: ..." line for each evaluation.
# random_state is pinned so repeated runs build the same ensemble, consistent
# with the seeded RandomForestClassifier used for the Burchard-vs-London
# experiment.
# NOTE(review): whether the fold split itself is seeded depends on
# GetModelStratifiedKFoldWrongPredictionExperiment - confirm in features.model_features.
burchard_VS_zwickau_wrong_predictions_experiminet = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
    burchard_VS_zwickau_features_df,
    AdaBoostClassifier(learning_rate=1, n_estimators=2000, random_state=0)
)
burchard_VS_zwickau_wrong_predictions_experiminet.run()

score is: 0.8
score is: 0.9714285714285714
score is: 0.8571428571428571
score is: 0.7714285714285715
score is: 0.8571428571428571
score is: 0.8
score is: 0.8571428571428571
score is: 0.9428571428571428
score is: 0.8
score is: 0.9117647058823529


In [61]:
# Display the Burchard rows that the Burchard-vs-Zwickau experiment
# misclassified (the cell's last expression is rendered as its output).
burchard_VS_zwickau_wrong_predictions_experiminet.get_burchard_wrong_predictions()

[Row 9 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 12 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 45 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 63 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 64 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 81 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 100 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 108 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 114 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 118 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 121 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 135 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 174 has been classified as 1(Zwickau) and should be 2(Burchard)]

In [62]:
# Same stratified k-fold "wrong prediction" experiment, this time on the
# Burchard-vs-London feature table with a seeded random forest; run() prints
# a "score is: ..." line for each evaluation.
burchard_VS_london_wrong_predictions_experiminet = (
    thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
        burchard_VS_london_features_df,
        RandomForestClassifier(n_estimators=300, max_depth=11, random_state=0),
    )
)
burchard_VS_london_wrong_predictions_experiminet.run()

score is: 0.8333333333333334
score is: 0.8611111111111112
score is: 0.75
score is: 0.8055555555555556
score is: 0.8333333333333334
score is: 0.5833333333333334
score is: 0.8888888888888888
score is: 0.75
score is: 0.7222222222222222
score is: 0.75


In [63]:
# Display the Burchard rows that the Burchard-vs-London experiment
# misclassified (the cell's last expression is rendered as its output).
burchard_VS_london_wrong_predictions_experiminet.get_burchard_wrong_predictions()

[Row 15 has been classified as 0(London) and should be 2(Burchard),
 Row 22 has been classified as 0(London) and should be 2(Burchard),
 Row 44 has been classified as 0(London) and should be 2(Burchard),
 Row 67 has been classified as 0(London) and should be 2(Burchard),
 Row 81 has been classified as 0(London) and should be 2(Burchard),
 Row 100 has been classified as 0(London) and should be 2(Burchard),
 Row 101 has been classified as 0(London) and should be 2(Burchard),
 Row 114 has been classified as 0(London) and should be 2(Burchard),
 Row 121 has been classified as 0(London) and should be 2(Burchard),
 Row 146 has been classified as 0(London) and should be 2(Burchard)]

In [64]:
# london_VS_zwickau_wrong_predictions_experiment = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
#     london_VS_zwickau_features_df,
#     xgb.XGBClassifier(gamma = 0.4, max_depth = 3, min_child_weight = 5)
# )
# london_VS_zwickau_wrong_predictions_experiment.run()

In [65]:
# Build the London inputs (X_london) from the factory's vectorizer output, then
# fit the Burchard-vs-Zwickau AdaBoost model on the full feature table so it
# can be applied to London in the next cell.
# random_state is pinned so re-running the notebook yields the same fitted
# ensemble (and therefore the same London predictions), consistent with the
# seeded RandomForestClassifier used for the Burchard-vs-London model.
X_london, y_london = thesisModelFeatures.create_X_y(featuresFactory.london_by_burchard_by_london_VS_zwickau_vectorizer())
X, y = thesisModelFeatures.create_X_y(burchard_VS_zwickau_features_df)
burchard_VS_zwickau_best_classifier = AdaBoostClassifier(learning_rate=1, n_estimators=2000, random_state=0).fit(X, y)

n_gram_feature_name: 2_5_gram


In [66]:
# Apply the fitted Burchard-vs-Zwickau classifier to the London inputs.
london_predictions_by_burchard_vs_zwickau_classifier = burchard_VS_zwickau_best_classifier.predict(X_london)

In [67]:
# Build the Zwickau inputs (X_zwickau) from the factory's vectorizer output,
# then fit the seeded Burchard-vs-London random forest on the full feature
# table so it can be applied to Zwickau in the next cell.
X_zwickau, y_zwickau = thesisModelFeatures.create_X_y(
    featuresFactory.zwickau_by_burchard_by_london_VS_london_vectorizer()
)
X, y = thesisModelFeatures.create_X_y(burchard_VS_london_features_df)
burchard_VS_london_best_classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=11,
    random_state=0,
)
# fit() trains the estimator in place and returns the estimator itself.
burchard_VS_london_best_classifier.fit(X, y)

n_gram_feature_name: 2_5_gram


In [68]:
# Apply the fitted Burchard-vs-London classifier to the Zwickau inputs.
zwickau_predictions_by_burchard_vs_london_classifier = burchard_VS_london_best_classifier.predict(X_zwickau)

In [69]:
# Print the Zwickau predictions as a list of human-readable version names
# instead of numeric class labels.
print(
    list(
        map(
            lambda label: thesisModelFeatures.version_label_to_human_readable(int(label)),
            zwickau_predictions_by_burchard_vs_london_classifier,
        )
    )
)

['London', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'Burchard', 'London', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'Burchard', 'London', 'Burchard', 'London', 'London', 'Burchard', 'Burchard', 'London', 'Burchard', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'Burchard', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'Burchard', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'Burchard', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'Burchard', 'Burchard', 'London', 'Burchard', 'London', 'Burchard', 'Burchard', 'London', 'London', 'Burchard', 'London', 'Burchard', '

In [89]:
def _as_readable_rows(predictions):
    """Return one single-item list per prediction, holding its human-readable version name."""
    return [
        [thesisModelFeatures.version_label_to_human_readable(int(label))]
        for label in predictions
    ]


# Bundle everything the Google Sheets client is given: the corpora, the wrong
# predictions of both experiments, and the cross-version predictions converted
# to rows of readable names.
burchard_spreadsheet = googleSheetsClient.BurchardResults(
    burchard_corpus=burchard_corpus_by_london,
    london_left_overs_corpus=london_leftovers,
    zwickau_left_overs_corpus=zwickau_leftovers,
    # Burchard rows misclassified by each experiment.
    burchard_vs_london_burchard_wrong_predictions=burchard_VS_london_wrong_predictions_experiminet.get_burchard_wrong_predictions(),
    burchard_vs_zwickau_burchard_wrong_predictions=burchard_VS_zwickau_wrong_predictions_experiminet.get_burchard_wrong_predictions(),
    # London / Zwickau rows misclassified by the corresponding experiment.
    burchard_vs_london_london_wrong_predictions=burchard_VS_london_wrong_predictions_experiminet.get_london_wrong_predictions(),
    burchard_vs_zwickau_zwickau_wrong_predictions=burchard_VS_zwickau_wrong_predictions_experiminet.get_zwickau_wrong_predictions(),
    # Predictions of each fitted classifier on the version it was not trained on.
    london_predictions_by_burchard_vs_zwickau_classifier=_as_readable_rows(london_predictions_by_burchard_vs_zwickau_classifier),
    zwickau_predictions_by_burchard_vs_london_classifier=_as_readable_rows(zwickau_predictions_by_burchard_vs_london_classifier),
)

In [71]:
# burchard_spreadsheet.write_london_predictions_by_burchard_vs_zwickau_classifier()

In [72]:
# burchard_spreadsheet.write_zwickau_predictions_by_burchard_vs_london_classifier()

In [94]:
# Write the header row(s) through the Google Sheets client.
# NOTE(review): this presumably modifies the remote spreadsheet - re-running the
# cell repeats the write.
burchard_spreadsheet.write_headers()

In [93]:
# burchard_spreadsheet.write_london_left_overs_paragraphs()
# burchard_spreadsheet.colorize_london_leftovers_shared_parts()
# burchard_spreadsheet.write_london_wrong_predictions()
# burchard_spreadsheet.write_london_predictions_by_burchard_vs_zwickau_classifier()

In [98]:
# Zwickau-side spreadsheet writes. Only the last call is currently enabled; the
# commented-out calls are the other Zwickau write steps.
# burchard_spreadsheet.write_zwickau_left_overs_paragraphs()
# burchard_spreadsheet.colorize_zwickau_leftovers_shared_parts()
# burchard_spreadsheet.write_zwickau_wrong_predictions()
burchard_spreadsheet.write_zwickau_predictions_by_burchard_vs_london_classifier()

In [None]:
# burchard_spreadsheet.write_london_wrong_predictions()

In [None]:
# burchard_spreadsheet.write_zwickau_wrong_predictions()

In [None]:
# burchard_spreadsheet.write_headers()

In [None]:
# burchard_spreadsheet.write_london_left_overs_paragraphs()

# Short results for Yoni (after our face-to-face meeting)

In [None]:
# Create the 5-gram cross-version similarity objects in both directions
# (London -> Zwickau and Zwickau -> London). The scores are computed in a
# later cell via calculate().
london_zwickau_similarities = thesisCosineSimilarity.CrossVersionSimilarity5Gram(london_corpus, zwickau_corpus)
# london_zwickau_similarities.calculate()

# Fixed: the first argument was written as `zwickau _corpus` (stray space),
# which is a SyntaxError and prevented the cell from running.
zwickau_london_similarities = thesisCosineSimilarity.CrossVersionSimilarity5Gram(zwickau_corpus, london_corpus)
# zwickau_london_similarities.calculate()

In [None]:
# Compute the similarities for both directions. The save() calls are left
# disabled.
# NOTE(review): `best_matches` and `get_best_match_of_text` are used in later
# cells - presumably they require calculate() to have run first; confirm.
# london_zwickau_similarities.save()
# zwickau_london_similarities.save()
london_zwickau_similarities.calculate()
zwickau_london_similarities.calculate()

In [None]:
# Sample passage (hard-coded text) to look up with the Zwickau -> London
# similarity object; the cell displays whatever get_best_match_of_text returns.
t = "ebron ciuitas de betgara leucis et plus est ebron ciuitas in qua primus homo conditus et sepultus est cum coniuge et maiores patriarce cum coniugibus suis apud eorum sepulcra ibidem fui uerum est quod ebron ila uetus in qua olim habitauerunt gigantes cariatarbe dicta sita in monte quem expugnauit calep filius iepore et in qua regnauit dauid annis penitus est destructa ebron distat ab ierusalem per miliaria"

zwickau_london_similarities.get_best_match_of_text(t)

In [None]:
# Second sample passage (hard-coded text), looked up with the London -> Zwickau
# similarity object; the cell displays whatever get_best_match_of_text returns.
t2 = "de betleem leucis contra austrum uia que ducit ebron est betakar uila in alto sita loco cui ad austrum adiacet alia uila rama nomine excelsa ualde in qua in cole quodam sublimi stans cum aliis multis uidi totam terram arabie usque ad montem seir et omnia loca circa mare mortuum et loca latibulorum dauid iordanem insuper usque secim et usque ad montem abarim contra occidentem uero uidi in eodem loco totum litus maris magni ab ioppe usque gasam et bersabee usque ad desertum sur totam insuper terram pilistiim a ramaca sopim per get et acaron et asotum et iamnam et ascalonem cum omni planicie sub monte iuda de rama plusquam leuca ad dexteram prope uiam regiam que ducit ebron est manbre ubi habitabat abraham longo tempore ubi sedens ad hilicem manbre in ostio tabernaculi sui uidit uiros descendentes per uiam quos etiam recepit hospitio ut dicitur genesis ilex ila hodie ostenditur ante oscium tabernaculi abrahe uerum est quod ila uetus aruit sed de radice eius alia nata est de cuius fructu et ligno tuli mecum in bona quantitate et habet folia modico maiora quam lentiscus sed fructum omnino sicut quercus"

london_zwickau_similarities.get_best_match_of_text(t2)

In [None]:
# Keep only the Zwickau -> London best matches whose score lies in the
# inclusive band [0.2, 0.4].
res = [
    match
    for match in zwickau_london_similarities.best_matches
    if 0.2 <= match.score <= 0.4
]

In [None]:
# res

In [None]:
# Spot-check entry 225 of the Zwickau corpus via `corpus_without_processing`
# (presumably the text before preprocessing - confirm in data.reader).
zwickau_corpus.corpus_without_processing[225]

In [None]:
# The same index (225) in the Zwickau `corpus` attribute, for comparison with
# the `corpus_without_processing` entry.
zwickau_corpus.corpus[225]

In [None]:
# Spot-check entry 223 of the London corpus.
# NOTE(review): presumably the London counterpart of Zwickau entry 225 - confirm.
london_corpus.corpus[223]

In [None]:
# Look up the `similarity_scores` entry of the London leftovers for one
# specific text, used here as the lookup key.
london_leftovers.similarity_scores['ueteribus historiis legamus beatus libris positi menpiticos apolonius caucasum scitas indiam bragmanos iartam uanerabantur cerubin futuri aput uenerabilis quociens tociens linteaminibus']

In [None]:
# Number of entries in the London leftovers corpus.
len(london_leftovers.corpus)