In [78]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [86]:
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder

import sys
import imp

sys.path.append('../src/')
import data.reader as dataReader
import utils.utils as thesisUtils
import similarities.cosine as thesisCosineSimilarity
import vocabulary.vocabulary as thesisVocabulary
import features.model_features as thesisModelFeatures
import data.corpus_stats as thesisCorpusStats

imp.reload(dataReader)
imp.reload(thesisUtils)
imp.reload(thesisCorpusStats)
imp.reload(thesisVocabulary)
imp.reload(thesisModelFeatures)
imp.reload(thesisCosineSimilarity)

<module 'similarities.cosine' from '../src/similarities/cosine.py'>

In [88]:
london_corpus = dataReader.CorpusByNewLine.london()
zwickau_corpus = dataReader.CorpusByNewLine.zwickau()

In [8]:
london_zwickau_similarities = thesisCosineSimilarity.CrossVersionSimilarity5Gram(london_corpus, zwickau_corpus)
# london_zwickau_similarities.calculate()

zwickau_london_similarities = thesisCosineSimilarity.CrossVersionSimilarity5Gram(zwickau_corpus, london_corpus)
# zwickau_london_similarities.calculate()

In [9]:
# london_zwickau_similarities.save()
# zwickau_london_similarities.save()
london_zwickau_similarities.load()
zwickau_london_similarities.load()

In [89]:
burchard_corpus_by_london = dataReader.BurchardCorpus(london_corpus, zwickau_corpus)
burchard_corpus_by_zwickau = dataReader.BurchardCorpus(zwickau_corpus, london_corpus)

In [11]:
london_leftovers = dataReader.LeftoversCorpus(london_corpus, zwickau_corpus)
zwickau_leftovers = dataReader.LeftoversCorpus(zwickau_corpus, london_corpus)

In [12]:
print(len(london_leftovers.corpus_for_predictions()))
print(len(zwickau_leftovers.corpus_for_predictions()))

157
146


In [13]:
def filter_short_p(corpus):
    return list(filter(lambda x: len(x.split()) > 20, corpus))

In [14]:
def build_burchard_corpus_with_predictions(burchard_corpus, wrong_predictions_by_london, wrong_predictions_by_zwickau):
    is_burchard = True
    temp_corpus = [ [p, is_burchard, is_burchard] for p in burchard_corpus ]
    
    for prediction in wrong_predictions_by_london:
        temp_corpus[prediction.index][1] = False
    for prediction in wrong_predictions_by_zwickau:
        temp_corpus[prediction.index][2] = False

    return temp_corpus

In [15]:
london_leftofvers_long = filter_short_p(london_leftovers.corpus) # list(filter(lambda x: len(x.split()) > 20, london_leftovers.corpus))
zwickau_leftofvers_long = filter_short_p(zwickau_leftovers.corpus) # list(filter(lambda x: len(x.split()) > 20, zwickau_leftovers.corpus))
burchard_lz_corpus_long = filter_short_p(burchard_corpus_lz.corpus) # list(filter(lambda x: len(x.split()) > 20, burchard_corpus_lz.corpus))
burchard_zl_corpus_long = filter_short_p(burchard_corpus_zl.corpus) # list(filter(lambda x: len(x.split()) > 20, burchard_corpus_zl.corpus))

NameError: name 'burchard_corpus_lz' is not defined

In [17]:
len(burchard_lz_corpus_long)

208

In [94]:
london_best_result_1 = np.amax([i[1] for i in burchard_lz_london_greed_rearch_resp])
print(f'london_best_result_1: {london_best_result_1}')

london_best_result_2 = np.amax([i[1] for i in burchard_zl_london_greed_rearch_resp])
print(f'london_best_result_2: {london_best_result_2}')

london_classifier_total_result = (london_best_result_1 + london_best_result_2) / 2
print(f'london_classifier_total_result: {london_classifier_total_result}')

london_best_result_1: 0.794069069069069
london_best_result_2: 0.7718468468468469
london_classifier_total_result: 0.782957957957958


In [95]:
zwickau_best_result_1 = np.amax([i[1] for i in burchard_lz_zwickau_greed_rearch_resp])
print(f'zwickau_best_result_1: {zwickau_best_result_1}')

zwickau_best_result_2 = np.amax([i[1] for i in burchard_zl_zwickau_greed_rearch_resp])
print(f'zwickau_best_result_2: {zwickau_best_result_2}')

zwickau_classifier_total_result = (zwickau_best_result_1 + zwickau_best_result_2) / 2
print(f'zwickau_classifier_total_result: {zwickau_classifier_total_result}')

zwickau_best_result_1: 0.8646825396825397
zwickau_best_result_2: 0.8326984126984126
zwickau_classifier_total_result: 0.8486904761904761


In [96]:
more_original_version = 'not found'
if london_classifier_total_result < zwickau_classifier_total_result: more_original_version = 'london'
elif zwickau_classifier_total_result < london_classifier_total_result: more_original_version = 'zwickau'

print(f'Due to classifier ability to distinguish between 2 version, version candidate to be closer to burchard is: {more_original_version}')

Due to classifier ability to distinguish between 2 version, version candidate to be closer to burchard is: london


In [None]:
burchard_lz_zwickau_features_df = thesisModelFeatures.create_features_df(
    None,
    zwickau_leftofvers_long,
    burchard_lz_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)
burchard_zl_zwickau_features_df = thesisModelFeatures.create_features_df(
    None,
    zwickau_leftofvers_long,
    burchard_zl_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)
burchard_lz_london_features_df = thesisModelFeatures.create_features_df(
    london_leftofvers_long,
    None,
    burchard_lz_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)
burchard_zl_london_features_df = thesisModelFeatures.create_features_df(
    london_leftofvers_long,
    None,
    burchard_zl_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)

In [125]:
burchard_lz_zwickau_wrong_predictions_experiment = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
    burchard_lz_zwickau_features_df, 
    AdaBoostClassifier(learning_rate=1, n_estimators=2000)
)
burchard_lz_zwickau_wrong_predictions_experiment.run()

score is: 0.8055555555555556
score is: 0.8888888888888888
score is: 0.8888888888888888
score is: 0.7777777777777778
score is: 0.8571428571428571
score is: 0.8571428571428571
score is: 0.8571428571428571
score is: 0.9142857142857143
score is: 0.9142857142857143
score is: 0.8857142857142857


In [126]:
burchard_zl_zwickau_wrong_predictions_experiment = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
    burchard_zl_zwickau_features_df, 
    AdaBoostClassifier(learning_rate=1, n_estimators=2000)
)
burchard_zl_zwickau_wrong_predictions_experiment.run()

score is: 0.8055555555555556
score is: 0.9444444444444444
score is: 0.8055555555555556
score is: 0.7714285714285715
score is: 0.8285714285714286
score is: 0.8
score is: 0.8
score is: 0.8857142857142857
score is: 0.8
score is: 0.8857142857142857


In [127]:
burchard_lz_london_wrong_predictions_experiment = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
    burchard_lz_london_features_df, 
    xgb.XGBClassifier(gamma = 0.4, max_depth = 9, min_child_weight = 3)
)
burchard_lz_london_wrong_predictions_experiment.run()

score is: 0.8918918918918919
score is: 0.7567567567567568
score is: 0.7837837837837838
score is: 0.8378378378378378
score is: 0.8648648648648649
score is: 0.75
score is: 0.8611111111111112
score is: 0.75
score is: 0.6944444444444444
score is: 0.75


In [128]:
burchard_zl_london_wrong_predictions_experiment = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
    burchard_zl_london_features_df, 
    RandomForestClassifier(criterion = "entropy" , max_depth=12, n_estimators=200, random_state=0)
)
burchard_zl_london_wrong_predictions_experiment.run()

score is: 0.8378378378378378
score is: 0.8108108108108109
score is: 0.7027027027027027
score is: 0.7837837837837838
score is: 0.8333333333333334
score is: 0.6666666666666666
score is: 0.8611111111111112
score is: 0.7777777777777778
score is: 0.6944444444444444
score is: 0.75


In [197]:
burchard_corpus_lz_truly_predicted = burchard_corpus_lz.get_burchard_predicted_truly(
    burchard_lz_london_wrong_predictions_experiment.get_burchard_wrong_predictions(),
    burchard_lz_london_wrong_predictions_experiment.get_burchard_wrong_predictions()
)
burchard_corpus_zl_truly_predicted = burchard_corpus_zl.get_burchard_predicted_truly(
    burchard_lz_london_wrong_predictions_experiment.get_burchard_wrong_predictions(),
    burchard_lz_london_wrong_predictions_experiment.get_burchard_wrong_predictions()
)

##### burchard paragraphs that was identify as burchard from london and zwickau side

In [194]:
truly_predicted_from_both_sides = []
for i in burchard_corpus_lz_truly_predicted:
    predicted_from_another_side = any(x for x in burchard_corpus_zl_truly_predicted if x['index'] == i['index'])
    if predicted_from_another_side: truly_predicted_from_both_sides.append(i['index'])
print(truly_predicted_from_both_sides)
len(truly_predicted_from_both_sides)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 24, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 103, 104, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 122, 123, 124, 125, 128, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 148, 149, 150, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 179, 180, 181, 182, 183, 185, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 205, 206]


179

##### london leftovers predicted as burchard from london and zwickau side

In [220]:
print([i['index'] for i in london_leftovers.leftovers_predicted_falsy(
    burchard_lz_london_wrong_predictions_experiment.get_london_wrong_predictions(),
    burchard_zl_london_wrong_predictions_experiment.get_london_wrong_predictions()
)])

[6, 7, 18, 20, 28, 35, 42, 43, 44, 52, 54, 58, 65, 76, 77, 82, 84, 85, 88, 93, 107, 113, 114, 116, 119, 121, 122, 123, 127, 130, 131, 132, 133, 137, 141, 142, 145, 147, 149, 152, 154, 155]


In [None]:
# TODO: we can run zwickau_burcrhard classifier on london texts

In [65]:
# load best models
# run models predictions on 2 burchard candidate version
# from results of predictions make assumption of real burchard
# 

# zwickau burchard

In [11]:
burchard_lz_zwickau_features_df = thesisModelFeatures.create_features_df(
    None,
    zwickau_leftofvers_long,
    burchard_lz_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)

n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram


In [12]:
burchard_lz_zwickau_result = thesisModelFeatures.run_models(
    burchard_lz_zwickau_features_df
)

running: SVM_linear


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


running: SVM_RBF
running: DecisionTreeClassifier
running: GaussianProcessClassifier
running: RandomForestClassifier
running: MLPClassifier
running: GaussianNB
running: KNeighborsClassifier
running: AdaBoostClassifier
running: XGBClassifier


In [13]:
burchard_lz_zwickau_result

(                           precision_macro  recall_macro  f1_macro  f1_micro  \
 SVM_linear                        0.293810      0.500000  0.370096  0.587619   
 SVM_RBF                           0.747899      0.649048  0.634857  0.700476   
 DecisionTreeClassifier            0.712525      0.711667  0.707613  0.715238   
 GaussianProcessClassifier         0.710851      0.687262  0.687785  0.706429   
 RandomForestClassifier            0.744441      0.707024  0.709057  0.734444   
 MLPClassifier                     0.712422      0.685952  0.686963  0.703730   
 GaussianNB                        0.628596      0.529167  0.466058  0.599206   
 KNeighborsClassifier              0.698802      0.549881  0.482861  0.624206   
 AdaBoostClassifier                0.805553      0.793690  0.796080  0.805079   
 XGBClassifier                     0.825755      0.812143  0.815890  0.825317   
 
                            f1_weighted  accuracy  
 SVM_linear                    0.435047  0.587619  
 SV

In [42]:
burchard_lz_zwickau_result[0]

Unnamed: 0,precision_macro,recall_macro,f1_macro,f1_micro,f1_weighted,accuracy
SVM_linear,0.29381,0.5,0.370096,0.587619,0.435047,0.587619
SVM_RBF,0.747899,0.649048,0.634857,0.700476,0.66174,0.700476
DecisionTreeClassifier,0.712525,0.711667,0.707613,0.715238,0.714894,0.715238
GaussianProcessClassifier,0.710851,0.687262,0.687785,0.706429,0.700188,0.706429
RandomForestClassifier,0.744441,0.707024,0.709057,0.734444,0.723394,0.734444
MLPClassifier,0.712422,0.685952,0.686963,0.70373,0.698473,0.70373
GaussianNB,0.628596,0.529167,0.466058,0.599206,0.512433,0.599206
KNeighborsClassifier,0.698802,0.549881,0.482861,0.624206,0.530702,0.624206
AdaBoostClassifier,0.805553,0.79369,0.79608,0.805079,0.803349,0.805079
XGBClassifier,0.825755,0.812143,0.81589,0.825317,0.822931,0.825317


In [14]:
burchard_lz_zwickau_greed_rearch_resp = []
for cls in [
    'SVC', 
    'DecisionTreeClassifier', 
    'GaussianProcessClassifier', 
    'RandomForestClassifier', 
    'GaussianNB', 
    'KNeighborsClassifier', 
    'AdaBoostClassifier', 
    'XGBClassifier'
]:
    grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_lz_zwickau_features_df, [cls])
    burchard_lz_zwickau_greed_rearch_resp.append([cls, grid_search_cv_result[1][0].best_score_])
    print(grid_search_cv_result[1][0].best_score_)

testing classifiers: ['SVC']
running: SVC
0.7262698412698413
testing classifiers: ['DecisionTreeClassifier']
running: DecisionTreeClassifier
0.7492857142857142
testing classifiers: ['GaussianProcessClassifier']
running: GaussianProcessClassifier
0.7064285714285714
testing classifiers: ['RandomForestClassifier']
running: RandomForestClassifier


 0.72865079 0.74563492 0.74007937 0.72873016 0.73142857 0.72857143
 0.73142857 0.73714286 0.72865079 0.74563492 0.59063492 0.59047619
 0.58761905 0.58761905 0.58761905 0.58761905 0.58761905 0.58761905
 0.74       0.73436508 0.7315873  0.72587302 0.7315873  0.74293651
 0.74277778 0.74293651 0.74       0.73436508 0.7315873  0.72587302
 0.7315873  0.74293651 0.74277778 0.74293651 0.60761905 0.59912698
 0.59055556 0.58769841 0.58761905 0.58761905 0.58761905 0.58761905
 0.73150794 0.73436508 0.74015873 0.74015873 0.72880952 0.7402381
 0.72031746 0.73444444 0.73150794 0.73436508 0.74015873 0.74015873
 0.72880952 0.7402381  0.72031746 0.73444444 0.62444444 0.6018254
 0.59619048 0.58761905 0.59047619 0.58769841 0.58761905 0.59047619
 0.72031746 0.71174603 0.72603175 0.74015873 0.73150794 0.73444444
 0.72880952 0.73722222 0.72031746 0.71174603 0.72603175 0.74015873
 0.73150794 0.73444444 0.72880952 0.73722222 0.63015873 0.61880952
 0.61031746 0.6018254  0.60460317 0.59611111 0.59888889 0.601666

0.7656349206349206
testing classifiers: ['GaussianNB']
running: GaussianNB
0.613015873015873
testing classifiers: ['KNeighborsClassifier']
running: KNeighborsClassifier
0.6242063492063493
testing classifiers: ['AdaBoostClassifier']
running: AdaBoostClassifier
0.8646825396825397
testing classifiers: ['XGBClassifier']
running: XGBClassifier
0.8535714285714284


In [15]:
burchard_lz_zwickau_greed_rearch_resp

[['SVC', 0.7262698412698413],
 ['DecisionTreeClassifier', 0.7492857142857142],
 ['GaussianProcessClassifier', 0.7064285714285714],
 ['RandomForestClassifier', 0.7656349206349206],
 ['GaussianNB', 0.613015873015873],
 ['KNeighborsClassifier', 0.6242063492063493],
 ['AdaBoostClassifier', 0.8646825396825397],
 ['XGBClassifier', 0.8535714285714284]]

##### create and save with argumests

In [66]:
grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_lz_zwickau_features_df, ['AdaBoostClassifier'])
burchard_lz_zwickau_greed_rearch_resp.append([cls, grid_search_cv_result[1][0].best_score_])
print(grid_search_cv_result[1][0].best_score_)

testing classifiers: ['AdaBoostClassifier']
running: AdaBoostClassifier
0.8646825396825397


In [67]:
grid_search_cv_result[1][0].best_estimator_

AdaBoostClassifier(learning_rate=1, n_estimators=2000)

In [68]:
X, y = thesisModelFeatures.create_X_y(burchard_lz_zwickau_features_df)

In [71]:
adaBoostClassifier = AdaBoostClassifier(learning_rate=1, n_estimators=2000).fit(X, y)

In [72]:
thesisModelFeatures.save_zwickau_vs_burchard_best_model(
    adaBoostClassifier, 
    'burchard_lz_AdaBoostClassifier(learning_rate=1, n_estimators=2000)_0.86468'
)

In [16]:
burchard_zl_zwickau_features_df = thesisModelFeatures.create_features_df(
    None,
    zwickau_leftofvers_long,
    burchard_zl_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)

n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram


In [17]:
burchard_zl_zwickau_result = thesisModelFeatures.run_models(
    burchard_zl_zwickau_features_df
)

running: SVM_linear


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


running: SVM_RBF
running: DecisionTreeClassifier
running: GaussianProcessClassifier
running: RandomForestClassifier
running: MLPClassifier
running: GaussianNB
running: KNeighborsClassifier
running: AdaBoostClassifier
running: XGBClassifier


In [18]:
burchard_zl_zwickau_result

(                           precision_macro  recall_macro  f1_macro  f1_micro  \
 SVM_linear                        0.293214      0.500000  0.369617  0.586429   
 SVM_RBF                           0.748904      0.621071  0.602430  0.673810   
 DecisionTreeClassifier            0.667688      0.666190  0.663477  0.671508   
 GaussianProcessClassifier         0.711243      0.683095  0.684108  0.702143   
 RandomForestClassifier            0.736234      0.693095  0.695071  0.718889   
 MLPClassifier                     0.708911      0.680952  0.680834  0.696349   
 GaussianNB                        0.646911      0.527262  0.467682  0.595000   
 KNeighborsClassifier              0.693926      0.555238  0.495901  0.625873   
 AdaBoostClassifier                0.786987      0.765952  0.767423  0.781508   
 XGBClassifier                     0.813956      0.788333  0.792804  0.807143   
 
                            f1_weighted  accuracy  
 SVM_linear                    0.433623  0.586429  
 SV

In [43]:
burchard_zl_zwickau_result[0]

Unnamed: 0,precision_macro,recall_macro,f1_macro,f1_micro,f1_weighted,accuracy
SVM_linear,0.293214,0.5,0.369617,0.586429,0.433623,0.586429
SVM_RBF,0.748904,0.621071,0.60243,0.67381,0.630755,0.67381
DecisionTreeClassifier,0.667688,0.66619,0.663477,0.671508,0.67146,0.671508
GaussianProcessClassifier,0.711243,0.683095,0.684108,0.702143,0.695814,0.702143
RandomForestClassifier,0.736234,0.693095,0.695071,0.718889,0.708448,0.718889
MLPClassifier,0.708911,0.680952,0.680834,0.696349,0.690898,0.696349
GaussianNB,0.646911,0.527262,0.467682,0.595,0.512376,0.595
KNeighborsClassifier,0.693926,0.555238,0.495901,0.625873,0.540473,0.625873
AdaBoostClassifier,0.786987,0.765952,0.767423,0.781508,0.776693,0.781508
XGBClassifier,0.813956,0.788333,0.792804,0.807143,0.801836,0.807143


In [19]:
burchard_zl_zwickau_greed_rearch_resp = []
for cls in [
    'SVC', 
    'DecisionTreeClassifier', 
    'GaussianProcessClassifier', 
    'RandomForestClassifier', 
    'GaussianNB', 
    'KNeighborsClassifier', 
    'AdaBoostClassifier'
]:
    grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_zl_zwickau_features_df, [cls])
    burchard_zl_zwickau_greed_rearch_resp.append([cls, grid_search_cv_result[1][0].best_score_])
    print(grid_search_cv_result[1][0].best_score_)

testing classifiers: ['SVC']
running: SVC
0.7216666666666667
testing classifiers: ['DecisionTreeClassifier']
running: DecisionTreeClassifier
0.7738095238095237
testing classifiers: ['GaussianProcessClassifier']
running: GaussianProcessClassifier
0.705
testing classifiers: ['RandomForestClassifier']
running: RandomForestClassifier


 0.71904762 0.71896825 0.68753968 0.7047619  0.7018254  0.71047619
 0.70753968 0.71325397 0.71904762 0.71896825 0.58365079 0.58642857
 0.58642857 0.58642857 0.58642857 0.58642857 0.58642857 0.58642857
 0.71888889 0.7131746  0.72468254 0.71611111 0.72174603 0.71603175
 0.71611111 0.72174603 0.71888889 0.7131746  0.72468254 0.71611111
 0.72174603 0.71603175 0.71611111 0.72174603 0.59214286 0.58642857
 0.58642857 0.58642857 0.58642857 0.58642857 0.58642857 0.58642857
 0.6902381  0.70738095 0.71325397 0.71611111 0.72468254 0.72174603
 0.71611111 0.71904762 0.6902381  0.70738095 0.71325397 0.71611111
 0.72468254 0.72174603 0.71611111 0.71904762 0.59492063 0.58365079
 0.58642857 0.58928571 0.58928571 0.58928571 0.58928571 0.58928571
 0.71031746 0.71031746 0.72452381 0.71611111 0.71888889 0.7302381
 0.72166667 0.71611111 0.71031746 0.71031746 0.72452381 0.71611111
 0.71888889 0.7302381  0.72166667 0.71611111 0.6115873  0.60333333
 0.58912698 0.58920635 0.59769841 0.60055556 0.59777778 0.60063

0.7446031746031746
testing classifiers: ['GaussianNB']
running: GaussianNB
0.6032539682539683
testing classifiers: ['KNeighborsClassifier']
running: KNeighborsClassifier
0.6258730158730159
testing classifiers: ['AdaBoostClassifier']
running: AdaBoostClassifier
0.8326984126984126


In [31]:
for cls in ['XGBClassifier']:
    grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_zl_zwickau_features_df, [cls])
    burchard_zl_zwickau_greed_rearch_resp.append([cls, grid_search_cv_result[1][0].best_score_])
    print(grid_search_cv_result[1][0].best_score_)

testing classifiers: ['XGBClassifier']
running: XGBClassifier
0.826984126984127


In [20]:
burchard_zl_zwickau_greed_rearch_resp

[['SVC', 0.7216666666666667],
 ['DecisionTreeClassifier', 0.7738095238095237],
 ['GaussianProcessClassifier', 0.705],
 ['RandomForestClassifier', 0.7446031746031746],
 ['GaussianNB', 0.6032539682539683],
 ['KNeighborsClassifier', 0.6258730158730159],
 ['AdaBoostClassifier', 0.8326984126984126]]

In [73]:
grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_zl_zwickau_features_df, ['AdaBoostClassifier'])
print(grid_search_cv_result[1][0].best_estimator_)

testing classifiers: ['AdaBoostClassifier']
running: AdaBoostClassifier
AdaBoostClassifier(learning_rate=1, n_estimators=2000)


In [74]:
print(grid_search_cv_result[1][0].best_score_)

0.8326984126984126


In [75]:
X, y = thesisModelFeatures.create_X_y(burchard_zl_zwickau_features_df)

In [76]:
adaBoostClassifier_burchard_zl = AdaBoostClassifier(learning_rate=1, n_estimators=2000).fit(X, y)

In [77]:
thesisModelFeatures.save_zwickau_vs_burchard_best_model(
    adaBoostClassifier_burchard_zl, 
    'burchard_zl_AdaBoostClassifier(learning_rate=1, n_estimators=2000)_0.83269'
)

# london burchard

In [21]:
burchard_lz_london_features_df = thesisModelFeatures.create_features_df(
    london_leftofvers_long,
    None,
    burchard_lz_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)

n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram


In [22]:
burchard_lz_london_result = thesisModelFeatures.run_models(
    burchard_lz_london_features_df
)

running: SVM_linear


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


running: SVM_RBF
running: DecisionTreeClassifier
running: GaussianProcessClassifier
running: RandomForestClassifier
running: MLPClassifier
running: GaussianNB
running: KNeighborsClassifier
running: AdaBoostClassifier
running: XGBClassifier


In [23]:
burchard_lz_london_result

(                           precision_macro  recall_macro  f1_macro  f1_micro  \
 SVM_linear                        0.284947      0.500000  0.362989  0.569895   
 SVM_RBF                           0.805178      0.658185  0.635549  0.703754   
 DecisionTreeClassifier            0.636666      0.632887  0.629784  0.640841   
 GaussianProcessClassifier         0.737535      0.719196  0.717911  0.733859   
 RandomForestClassifier            0.801534      0.739554  0.736178  0.766892   
 MLPClassifier                     0.747724      0.729881  0.727008  0.742192   
 GaussianNB                        0.604570      0.564821  0.541575  0.602628   
 KNeighborsClassifier              0.638506      0.592857  0.572866  0.624700   
 AdaBoostClassifier                0.718949      0.713929  0.711432  0.720345   
 XGBClassifier                     0.766798      0.748899  0.748292  0.761111   
 
                            f1_weighted  accuracy  
 SVM_linear                    0.413811  0.569895  
 SV

In [44]:
burchard_lz_london_result[0]

Unnamed: 0,precision_macro,recall_macro,f1_macro,f1_micro,f1_weighted,accuracy
SVM_linear,0.284947,0.5,0.362989,0.569895,0.413811,0.569895
SVM_RBF,0.805178,0.658185,0.635549,0.703754,0.657934,0.703754
DecisionTreeClassifier,0.636666,0.632887,0.629784,0.640841,0.63781,0.640841
GaussianProcessClassifier,0.737535,0.719196,0.717911,0.733859,0.726839,0.733859
RandomForestClassifier,0.801534,0.739554,0.736178,0.766892,0.748578,0.766892
MLPClassifier,0.747724,0.729881,0.727008,0.742192,0.735265,0.742192
GaussianNB,0.60457,0.564821,0.541575,0.602628,0.564061,0.602628
KNeighborsClassifier,0.638506,0.592857,0.572866,0.6247,0.592579,0.6247
AdaBoostClassifier,0.718949,0.713929,0.711432,0.720345,0.71798,0.720345
XGBClassifier,0.766798,0.748899,0.748292,0.761111,0.755762,0.761111


In [24]:
burchard_lz_london_greed_rearch_resp = []
for cls in [
    'SVC', 
    'DecisionTreeClassifier', 
    'GaussianProcessClassifier', 
    'RandomForestClassifier', 
    'GaussianNB', 
    'KNeighborsClassifier', 
    'AdaBoostClassifier', 
    'XGBClassifier'
]:
    grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_lz_london_features_df, [cls])
    burchard_lz_london_greed_rearch_resp.append([cls, grid_search_cv_result[1][0].best_score_])
    print(grid_search_cv_result[1][0].best_score_)

testing classifiers: ['SVC']
running: SVC
0.753003003003003
testing classifiers: ['DecisionTreeClassifier']
running: DecisionTreeClassifier
0.6713963963963964
testing classifiers: ['GaussianProcessClassifier']
running: GaussianProcessClassifier
0.7557057057057057
testing classifiers: ['RandomForestClassifier']
running: RandomForestClassifier


 0.73956456 0.73956456 0.74504505 0.74782282 0.7475976  0.73400901
 0.74219219 0.73408408 0.73956456 0.73956456 0.57815315 0.56989489
 0.56989489 0.56989489 0.56989489 0.56989489 0.56989489 0.56989489
 0.76396396 0.76674174 0.76674174 0.76411411 0.75870871 0.75307808
 0.75315315 0.7475976  0.76396396 0.76674174 0.76674174 0.76411411
 0.75870871 0.75307808 0.75315315 0.7475976  0.58355856 0.57267267
 0.57545045 0.56989489 0.56989489 0.56989489 0.56989489 0.56989489
 0.75315315 0.75315315 0.75600601 0.74774775 0.75045045 0.75307808
 0.75848348 0.76118619 0.75315315 0.75315315 0.75600601 0.74774775
 0.75045045 0.75307808 0.75848348 0.76118619 0.59737237 0.57267267
 0.58085586 0.5725976  0.5725976  0.5725976  0.5753003  0.5753003
 0.74504505 0.75578078 0.76666667 0.75840841 0.74767267 0.75585586
 0.75848348 0.75855856 0.74504505 0.75578078 0.76666667 0.75840841
 0.74767267 0.75585586 0.75848348 0.75855856 0.58385886 0.58348348
 0.58348348 0.5725976  0.5725976  0.5753003  0.578003   0.57800

0.7750750750750751
testing classifiers: ['GaussianNB']
running: GaussianNB
0.6271771771771772
testing classifiers: ['KNeighborsClassifier']
running: KNeighborsClassifier
0.6246996996996997
testing classifiers: ['AdaBoostClassifier']
running: AdaBoostClassifier
0.7584084084084084
testing classifiers: ['XGBClassifier']
running: XGBClassifier
0.794069069069069


In [25]:
burchard_lz_london_greed_rearch_resp

[['SVC', 0.753003003003003],
 ['DecisionTreeClassifier', 0.6713963963963964],
 ['GaussianProcessClassifier', 0.7557057057057057],
 ['RandomForestClassifier', 0.7750750750750751],
 ['GaussianNB', 0.6271771771771772],
 ['KNeighborsClassifier', 0.6246996996996997],
 ['AdaBoostClassifier', 0.7584084084084084],
 ['XGBClassifier', 0.794069069069069]]

In [78]:
grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_lz_london_features_df, ['XGBClassifier'])
print(grid_search_cv_result[1][0].best_score_)
print(grid_search_cv_result[1][0].best_estimator_)

testing classifiers: ['XGBClassifier']
running: XGBClassifier
0.794069069069069
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.4, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=9, max_leaves=0, min_child_weight=3,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)


In [79]:
print(grid_search_cv_result[1][0].best_params_)

{'gamma': 0.4, 'max_depth': 9, 'min_child_weight': 3}


In [85]:
X, y = thesisModelFeatures.create_X_y(burchard_lz_london_features_df)
y_encoded = LabelEncoder().fit_transform(y)
XGBClassifier_burchard_lz = xgb.XGBClassifier(gamma = 0.4, max_depth = 9, min_child_weight = 3).fit(X, y_encoded)
thesisModelFeatures.save_london_vs_burchard_best_model(
    XGBClassifier_burchard_lz,
    'burchard_lz_XGBClassifier(gamma = 0.4, max_depth = 9, min_child_weight = 3)_0.79406'
)

In [26]:
burchard_zl_london_features_df = thesisModelFeatures.create_features_df(
    london_leftofvers_long,
    None,
    burchard_zl_corpus_long,
    n_gram = (2,5),
    features = { 'tfidf', 'inner_mean_cosine_similarity_score' }
)

n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram


In [27]:
burchard_zl_london_result = thesisModelFeatures.run_models(
    burchard_zl_london_features_df
)

running: SVM_linear


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


running: SVM_RBF
running: DecisionTreeClassifier
running: GaussianProcessClassifier
running: RandomForestClassifier
running: MLPClassifier
running: GaussianNB
running: KNeighborsClassifier
running: AdaBoostClassifier
running: XGBClassifier


In [28]:
burchard_zl_london_result

(                           precision_macro  recall_macro  f1_macro  f1_micro  \
 SVM_linear                        0.284347      0.500000  0.362497  0.568694   
 SVM_RBF                           0.756748      0.610565  0.574762  0.658934   
 DecisionTreeClassifier            0.693412      0.686696  0.684231  0.694820   
 GaussianProcessClassifier         0.751322      0.721696  0.721269  0.738363   
 RandomForestClassifier            0.761028      0.704464  0.701412  0.730631   
 MLPClassifier                     0.744924      0.728125  0.726809  0.741066   
 GaussianNB                        0.572023      0.546905  0.523554  0.585135   
 KNeighborsClassifier              0.598200      0.566577  0.547757  0.593844   
 AdaBoostClassifier                0.705674      0.691101  0.686170  0.700075   
 XGBClassifier                     0.747152      0.728958  0.727342  0.746697   
 
                            f1_weighted  accuracy  
 SVM_linear                    0.412394  0.568694  
 SV

In [45]:
burchard_zl_london_result[0]

Unnamed: 0,precision_macro,recall_macro,f1_macro,f1_micro,f1_weighted,accuracy
SVM_linear,0.284347,0.5,0.362497,0.568694,0.412394,0.568694
SVM_RBF,0.756748,0.610565,0.574762,0.658934,0.601055,0.658934
DecisionTreeClassifier,0.693412,0.686696,0.684231,0.69482,0.691294,0.69482
GaussianProcessClassifier,0.751322,0.721696,0.721269,0.738363,0.730126,0.738363
RandomForestClassifier,0.761028,0.704464,0.701412,0.730631,0.713913,0.730631
MLPClassifier,0.744924,0.728125,0.726809,0.741066,0.734884,0.741066
GaussianNB,0.572023,0.546905,0.523554,0.585135,0.546757,0.585135
KNeighborsClassifier,0.5982,0.566577,0.547757,0.593844,0.566142,0.593844
AdaBoostClassifier,0.705674,0.691101,0.68617,0.700075,0.693903,0.700075
XGBClassifier,0.747152,0.728958,0.727342,0.746697,0.737281,0.746697


In [29]:
burchard_zl_london_greed_rearch_resp = []
for cls in [
    'SVC', 
    'DecisionTreeClassifier', 
    'GaussianProcessClassifier', 
    'RandomForestClassifier', 
    'GaussianNB', 
    'KNeighborsClassifier', 
    'AdaBoostClassifier', 
    'XGBClassifier'
]:
    grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_zl_london_features_df, [cls])
    burchard_zl_london_greed_rearch_resp.append([cls, grid_search_cv_result[1][0].best_score_])
    print(grid_search_cv_result[1][0].best_score_)

testing classifiers: ['SVC']
running: SVC
0.7575075075075076
testing classifiers: ['DecisionTreeClassifier']
running: DecisionTreeClassifier
0.7225225225225226
testing classifiers: ['GaussianProcessClassifier']
running: GaussianProcessClassifier
0.7383633633633634
testing classifiers: ['RandomForestClassifier']
running: RandomForestClassifier


 0.74121622 0.73581081 0.70570571 0.74414414 0.73858859 0.74136637
 0.73581081 0.73843844 0.74121622 0.73581081 0.57132132 0.5740991
 0.56869369 0.56869369 0.56869369 0.56869369 0.56869369 0.56869369
 0.72214715 0.74136637 0.74954955 0.74136637 0.74677177 0.74129129
 0.74662162 0.73843844 0.72214715 0.74136637 0.74954955 0.74136637
 0.74677177 0.74129129 0.74662162 0.73843844 0.58506006 0.5795045
 0.5713964  0.5740991  0.56869369 0.57417417 0.56869369 0.56869369
 0.73558559 0.74114114 0.73573574 0.75225225 0.75773273 0.74932432
 0.7548048  0.74662162 0.73558559 0.74114114 0.73573574 0.75225225
 0.75773273 0.74932432 0.7548048  0.74662162 0.58513514 0.58791291
 0.57687688 0.5795045  0.5713964  0.5713964  0.5740991  0.5713964
 0.71073574 0.72462462 0.74662162 0.75495495 0.75487988 0.74662162
 0.7466967  0.75503003 0.71073574 0.72462462 0.74662162 0.75495495
 0.75487988 0.74662162 0.7466967  0.75503003 0.61253754 0.59039039
 0.59309309 0.58753754 0.58498498 0.57957958 0.57957958 0.5713964

0.7718468468468469
testing classifiers: ['GaussianNB']
running: GaussianNB
0.6096096096096096
testing classifiers: ['KNeighborsClassifier']
running: KNeighborsClassifier
0.6374624624624624
testing classifiers: ['AdaBoostClassifier']
running: AdaBoostClassifier
0.7580330330330332
testing classifiers: ['XGBClassifier']
running: XGBClassifier
0.7660660660660661


In [30]:
burchard_zl_london_greed_rearch_resp

[['SVC', 0.7575075075075076],
 ['DecisionTreeClassifier', 0.7225225225225226],
 ['GaussianProcessClassifier', 0.7383633633633634],
 ['RandomForestClassifier', 0.7718468468468469],
 ['GaussianNB', 0.6096096096096096],
 ['KNeighborsClassifier', 0.6374624624624624],
 ['AdaBoostClassifier', 0.7580330330330332],
 ['XGBClassifier', 0.7660660660660661]]

In [86]:
grid_search_cv_result = thesisModelFeatures.run_grid_search_cv(burchard_zl_london_features_df, ['RandomForestClassifier'])
print(grid_search_cv_result[1][0].best_score_)
print(grid_search_cv_result[1][0].best_estimator_)
print(grid_search_cv_result[1][0].best_params_)

testing classifiers: ['RandomForestClassifier']
running: RandomForestClassifier


 0.74121622 0.73581081 0.70570571 0.74414414 0.73858859 0.74136637
 0.73581081 0.73843844 0.74121622 0.73581081 0.57132132 0.5740991
 0.56869369 0.56869369 0.56869369 0.56869369 0.56869369 0.56869369
 0.72214715 0.74136637 0.74954955 0.74136637 0.74677177 0.74129129
 0.74662162 0.73843844 0.72214715 0.74136637 0.74954955 0.74136637
 0.74677177 0.74129129 0.74662162 0.73843844 0.58506006 0.5795045
 0.5713964  0.5740991  0.56869369 0.57417417 0.56869369 0.56869369
 0.73558559 0.74114114 0.73573574 0.75225225 0.75773273 0.74932432
 0.7548048  0.74662162 0.73558559 0.74114114 0.73573574 0.75225225
 0.75773273 0.74932432 0.7548048  0.74662162 0.58513514 0.58791291
 0.57687688 0.5795045  0.5713964  0.5713964  0.5740991  0.5713964
 0.71073574 0.72462462 0.74662162 0.75495495 0.75487988 0.74662162
 0.7466967  0.75503003 0.71073574 0.72462462 0.74662162 0.75495495
 0.75487988 0.74662162 0.7466967  0.75503003 0.61253754 0.59039039
 0.59309309 0.58753754 0.58498498 0.57957958 0.57957958 0.5713964

0.7718468468468469
RandomForestClassifier(criterion='entropy', max_depth=12, n_estimators=200,
                       random_state=0)
{'criterion': 'entropy', 'max_depth': 12, 'max_features': 'auto', 'n_estimators': 200, 'random_state': 0}


In [92]:
X, y = thesisModelFeatures.create_X_y(burchard_zl_london_features_df)
RandomForestClassifier_burchard_zl = RandomForestClassifier(criterion = "entropy" , max_depth=12, n_estimators=200, random_state=0).fit(X, y)
thesisModelFeatures.save_london_vs_burchard_best_model(
    XGBClassifier_burchard_lz,
    "burchard_zl_RandomForestClassifier(criterion='entropy', max_depth=12, n_estimators=200, random_state=0)_0.77184"
)

In [58]:
london_best_result_1 = np.amax([i[1] for i in burchard_lz_london_greed_rearch_resp])
print(f'london_best_result_1" {london_best_result_1}')

london_best_result_2 = np.amax([i[1] for i in burchard_zl_london_greed_rearch_resp])
print(f'london_best_result_2: {london_best_result_2}')

london_classifier_total_result = (london_best_result_1 + london_best_result_2) / 2
print(f'london_classifier_total_result: {london_classifier_total_result}')

london_best_result_1" 0.794069069069069
london_best_result_2: 0.7718468468468469
london_classifier_total_result: 0.782957957957958


In [60]:
zwickau_best_result_1 = np.amax([i[1] for i in burchard_lz_zwickau_greed_rearch_resp])
print(f'zwickau_best_result_1: {zwickau_best_result_1}')

zwickau_best_result_2 = np.amax([i[1] for i in burchard_zl_zwickau_greed_rearch_resp])
print(f'zwickau_best_result_2: {zwickau_best_result_2}')

zwickau_classifier_total_result = (zwickau_best_result_1 + zwickau_best_result_2) / 2
print(f'zwickau_classifier_total_result: {zwickau_classifier_total_result}')

zwickau_best_result_1: 0.8646825396825397
zwickau_best_result_2: 0.8326984126984126
zwickau_classifier_total_result: 0.8486904761904761


In [63]:
more_original_version = 'not found'
if london_classifier_total_result < zwickau_classifier_total_result: more_original_version = 'london'
elif zwickau_classifier_total_result < london_classifier_total_result: more_original_version = 'zwickau'

print(f'Due to classifier ability to distinguish between 2 version, version candidate to be closer to burchard is: {more_original_version}')

Due to classifier ability to distinguish between 2 version, version candidate to be closer to burchard is: london


In [41]:
print(london_zwickau_similarities.get_bidirectional_matches_by_threshold(0.5, zwickau_london_similarities)[0])
print(zwickau_london_similarities.get_bidirectional_matches_by_threshold(0.5, london_zwickau_similarities)[0])

1 -> 1: 0.8131045786315674
1 -> 1: 0.8111619165215265


In [36]:
print(london_zwickau_similarities.get_bidirectional_matches_by_threshold(0.5, zwickau_london_similarities)[0].original_text)
print()
print(london_zwickau_similarities.get_bidirectional_matches_by_threshold(0.5, zwickau_london_similarities)[0].match_text)

cum in ueteribus historiis legamus sicut dicit beatus ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex libris nouerant coram positi uiderent ut plato menpiticos uates et egiptum apolonius qui persas intrauit transiuit caucasum albanos scitas massagetas indiam bragmanos quoque ut iartam uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant uanerabantur antiqui sancta sanctorum quia ibi erat arca testamenti et cerubin cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra futuri nonne aput nos uenerabilis est sepulcrum dulcis iesu quod quociens quis ingreditur tociens inuolutum sindone mentis uidet oculis saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum linteaminibus mulieribus ostendentem

cum sicut dicit ieronimus quosdam inueniamus lustrasse prouintias maria transfreta

In [37]:
' '.join(
    thesisUtils.get_shared_words(
        london_zwickau_similarities.get_bidirectional_matches_by_threshold(0.5, zwickau_london_similarities)[0].original_text,
        london_zwickau_similarities.get_bidirectional_matches_by_threshold(0.5, zwickau_london_similarities)[0].match_text
    )
)

'cum in sicut dicit ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex nouerant coram uiderent ut plato uates et egiptum qui persas intrauit transiuit albanos massagetas quoque ut uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant antiqui sancta sanctorum quia ibi erat arca testamenti et cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra nonne nos est sepulcrum dulcis iesu quod quis ingreditur inuolutum sindone mentis uidet oculis saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum mulieribus ostendentem'

In [39]:
' '.join(
    thesisUtils.get_shared_words(
        london_zwickau_similarities.get_bidirectional_matches_by_threshold(0.5, zwickau_london_similarities)[0].match_text,
        london_zwickau_similarities.get_bidirectional_matches_by_threshold(0.5, zwickau_london_similarities)[0].original_text,
    )
)

'cum sicut dicit ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex nouerant coram uiderent ut plato uates et egiptum qui persas intrauit transiuit albanos massagetas quoque ut uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant antiqui sancta sanctorum quia ibi erat arca testamenti et cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra nonne est nos sepulcrum dulcis iesu quod quis ingreditur inuolutum sindone mentis oculis uidet saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum mulieribus ostendentem'

In [93]:
display(thesisCorpusStats.CorpusStats(london_leftovers).get())
display(thesisCorpusStats.CorpusStats(zwickau_leftovers).get())

Unnamed: 0,paragraphs,Total words,Unique words,Avg. paragraph length (words),Avg. paragraph length (characters)
leftovers_london,317,11700,3838,36.908517,207.539432


Unnamed: 0,paragraphs,Total words,Unique words,Avg. paragraph length (words),Avg. paragraph length (characters)
leftovers_zwickau,321,8869,3176,27.629283,157.015576


In [56]:
def general_corpus_date(corpus):
    print(f'len: {len(corpus)}')
    print(f'total words: {sum([ len(i.split()) for i in corpus])}')
    print(f'unique words: {len(set(thesisUtils.flatten([ i.split() for i in corpus])))}')
    print(f'avg paragraph len (words): {np.average([ len(i.split()) for i in corpus])}')
    print(f'avg paragraph len (characters): {np.average([ len("".join(i.split())) for i in corpus])}')    

In [57]:
general_corpus_date(london_corpus.corpus)

len: 317
total words: 30040
unique words: 6507
avg paragraph len (words): 94.76340694006309
avg paragraph len (characters): 506.3659305993691


In [58]:
general_corpus_date(zwickau_corpus.corpus)

len: 321
total words: 26856
unique words: 6060
avg paragraph len (words): 83.66355140186916
avg paragraph len (characters): 447.7725856697819


In [55]:
np.average([ len("".join(i.split())) for i in london_corpus.corpus])

506.3659305993691

In [16]:
import difflib

In [20]:
print(london_corpus.corpus[0])
print(zwickau_corpus.corpus[0])

liber de terra sancta
incipit descriptio terre sancte


In [24]:
a = london_corpus.corpus[0]
b = zwickau_corpus.corpus[0]

In [21]:
s = difflib.SequenceMatcher(None, london_corpus.corpus[0], zwickau_corpus.corpus[0])

In [22]:
for block in s.get_matching_blocks():
    print(block)

Match(a=1, b=0, size=1)
Match(a=5, b=7, size=3)
Match(a=8, b=18, size=5)
Match(a=14, b=24, size=6)
Match(a=21, b=31, size=0)


In [26]:
for i in difflib.context_diff(a.split(), b.split()): print(i)

*** 

--- 

***************

*** 1,4 ****

! liber
! de
! terra
! sancta
--- 1,4 ----

! incipit
! descriptio
! terre
! sancte


In [27]:
london_zwickau_similarities.get_bidirectional_strongly_similar(zwickau_london_similarities)

1 -> 1: 0.8131045786315674
2 -> 2: 0.8958853131371243
3 -> 3: 0.8903220617363937
4 -> 4: 0.8165486916831831
5 -> 5: 0.8642997597862219
6 -> 6: 0.6452430863004893
7 -> 7: 0.7753975488026391
8 -> 8: 0.8293117327749867
9 -> 9: 0.6902343875522335
10 -> 10: 0.5224296396123066
11 -> 11: 0.7390791727469789
13 -> 13: 0.5481486770843644
14 -> 14: 0.668196985320916
16 -> 16: 0.5306360466634273
18 -> 18: 0.5985020501523044
19 -> 19: 0.5091682598222478
21 -> 20: 0.6043295704212351
23 -> 22: 0.6749838320249644
25 -> 24: 0.6175246851311899
26 -> 25: 0.6336833611166691
27 -> 26: 0.5081848038683509
28 -> 27: 0.6529561596139642
29 -> 28: 0.6865907305169173
30 -> 29: 0.7774978841416829
31 -> 30: 0.5248658504923325
32 -> 31: 0.7403961494289866
33 -> 33: 0.6203753656148526
36 -> 35: 0.7211054725447279
37 -> 36: 0.6021472459426359
39 -> 39: 0.7231274168937603
40 -> 40: 0.8063132048187789
41 -> 41: 0.569759580499171
42 -> 42: 0.7524752862513171
43 -> 43: 0.757534896036682
46 -> 46: 0.5308608045105072
47 -> 

In [28]:
london_corpus.corpus[1]

'cum in ueteribus historiis legamus sicut dicit beatus ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex libris nouerant coram positi uiderent ut plato menpiticos uates et egiptum apolonius qui persas intrauit transiuit caucasum albanos scitas massagetas indiam bragmanos quoque ut iartam uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant uanerabantur antiqui sancta sanctorum quia ibi erat arca testamenti et cerubin cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra futuri nonne aput nos uenerabilis est sepulcrum dulcis iesu quod quociens quis ingreditur tociens inuolutum sindone mentis uidet oculis saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum linteaminibus mulieribus ostendentem'

In [29]:
zwickau_corpus.corpus[1]

'cum sicut dicit ieronimus quosdam inueniamus lustrasse prouintias maria transfretasse ut ea que ex scripturis nouerant coram uiderent ut plato mempiticos uates et egiptum appolonius qui persas intrauit transiuit cancasum albanos sticas massagetas quoque ut iarcam uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant uenerabantur antiqui sancta sanctorum quia ibi erat arca testamenti et cernibulum cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra nonne uenerabilius est apud nos sepulcrum dulcis iesu quod quotiens quis ingreditur totiens inuolutum sindone mentis oculis uidet saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum lintiaminibus mulieribus ostendentem'

In [30]:
burchard_corpus_by_london.corpus[0]

'cum in sicut dicit ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex nouerant coram uiderent ut plato uates et egiptum qui persas intrauit transiuit albanos massagetas quoque ut uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant antiqui sancta sanctorum quia ibi erat arca testamenti et cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra nonne nos est sepulcrum dulcis iesu quod quis ingreditur inuolutum sindone mentis uidet oculis saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum mulieribus ostendentem'

In [46]:
import re

In [54]:
word = 'erant'
re.search(r'\b' + word + r'\b', london_corpus.corpus[1])

<re.Match object; span=(636, 641), match='erant'>

In [52]:
r = re.search(r'\b' + word + r'\b', zwickau_corpus.corpus[1])
print(r)
print(r.end())

<re.Match object; span=(0, 3), match='cum'>
3


In [74]:
[(m.start(0), m.end(0)) for m in re.finditer(r'\b' + 'cum' + r'\b', london_corpus.corpus[1])]

[(0, 3), (569, 572), (876, 879)]

In [75]:
[(m.start(0), m.end(0)) for m in re.finditer(r'\b' + 'cum' + r'\b', zwickau_corpus.corpus[1])]

[(0, 3), (526, 529), (827, 830)]

In [56]:
zwickau_corpus.corpus[1][593:598]

'erant'

In [42]:
a = london_corpus.corpus[1].split()
b = zwickau_corpus.corpus[1].split()

candidate = []
for block in difflib.SequenceMatcher(None, a, b).get_matching_blocks():
    print(block)
    start = block.a
    end = start + block.size
    candidate = candidate + a[start:end]

Match(a=0, b=0, size=1)
Match(a=5, b=1, size=2)
Match(a=8, b=3, size=2)
Match(a=10, b=6, size=8)
Match(a=19, b=15, size=2)
Match(a=22, b=17, size=3)
Match(a=26, b=21, size=3)
Match(a=30, b=25, size=4)
Match(a=35, b=30, size=1)
Match(a=37, b=32, size=1)
Match(a=40, b=33, size=2)
Match(a=43, b=36, size=27)
Match(a=71, b=64, size=9)
Match(a=81, b=74, size=13)
Match(a=95, b=87, size=1)
Match(a=97, b=91, size=1)
Match(a=100, b=92, size=4)
Match(a=105, b=97, size=2)
Match(a=108, b=100, size=3)
Match(a=111, b=104, size=1)
Match(a=113, b=105, size=14)
Match(a=128, b=120, size=2)
Match(a=130, b=122, size=0)


In [34]:
london_corpus.corpus[1][13:17]

'bus '

In [39]:
a[0:1]

['cum']

In [45]:
" ".join(candidate)

'cum sicut dicit ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex nouerant coram uiderent ut plato uates et egiptum qui persas intrauit transiuit albanos massagetas quoque ut uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant antiqui sancta sanctorum quia ibi erat arca testamenti et cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra nonne nos sepulcrum dulcis iesu quod quis ingreditur inuolutum sindone mentis uidet saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum mulieribus ostendentem'

In [61]:
burchard_corpus_by_london.corpus

['cum in sicut dicit ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex nouerant coram uiderent ut plato uates et egiptum qui persas intrauit transiuit albanos massagetas quoque ut uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant antiqui sancta sanctorum quia ibi erat arca testamenti et cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra nonne nos est sepulcrum dulcis iesu quod quis ingreditur inuolutum sindone mentis uidet oculis saluatorem et paululum procedens uidet lapidem reuolutum angelum in eo sedentem et sudarium cum mulieribus ostendentem',
 'quis cristianus hiis uisis non uenire in betleem contemplans puerum in presepio uagientem mariam in diuersorio sub rupe concaua que usque hodie cernitur parientem angelos presentibus pastoribus gloriam deo et decantantes et quod amplius mirandum est quomodo tres 

In [69]:
burchard2_corpus_by_london = dataReader.BurchardCorpus(london_corpus, zwickau_corpus)

In [70]:
burchard2_corpus_by_london.corpus

['cum in sicut dicit ieronimus quosdam lustrasse prouintias maria transfretasse ut ea que ex nouerant coram uiderent ut plato uates et egiptum qui persas intrauit transiuit albanos massagetas quoque ut uideret et tandem egiptum intrauit ut famosam mensam solis uideret in sabulo quid mirum si cristiani terram ilam quam cristi sonant ecclesie uniuerse uidere et uisitare desiderant antiqui sancta sanctorum quia ibi erat arca testamenti et cum propitiatorio et manna et uirga aaron que fronduerat que omnia erant umbra nonne nos est sepulcrum dulcis iesu quod quis ingreditur inuolutum sindone mentis uidet oculis saluatorem et paululum procedens uidet lapidem reuolutum angelum eo sedentem et sudarium cum mulieribus ostendentem',
 'quis cristianus hiis uisis non uenire in betleem contemplans puerum in presepio uagientem mariam in diuersorio sub rupe concaua que usque hodie cernitur parientem angelos presentibus pastoribus gloriam deo et decantantes quod amplius mirandum est quomodo tres ili ma

In [66]:
a1 = "remove word from this word"
a2 = a1.replace("word", '')
print(a2) 

remove  from this 


In [67]:
re.sub(r'\b' + 'word' + r'\b', '', "remove word from this word", count = 1).replace('  ', ' ').strip()

'remove from this word'