In [11]:
# Widen the notebook container to the full browser width.
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneOut

import sys
import importlib
import time

# Make the project packages that live under ../src importable from this notebook.
sys.path.append('../src/')

import data.reader as dataReader
import utils.utils as thesisUtils
import similarities.cosine as thesisCosineSimilarity
import google_sheets.client as googleSheetsClient
import vocabulary.vocabulary as thesisVocabulary
import features.factory as thesisFactoryFeatures
import features.model_features as thesisModelFeatures


# Reload the project modules so that re-running this cell picks up source edits
# without restarting the kernel. importlib.reload replaces imp.reload: the `imp`
# module is deprecated since Python 3.4 and no longer exists in Python 3.12+.
# The reload order is unchanged.
importlib.reload(dataReader)
importlib.reload(thesisUtils)
importlib.reload(thesisVocabulary)
importlib.reload(googleSheetsClient)
importlib.reload(thesisModelFeatures)
importlib.reload(thesisFactoryFeatures)
importlib.reload(thesisCosineSimilarity)

<module 'similarities.cosine' from '../src/similarities/cosine.py'>

In [13]:
# Load the London and the Zwickau corpus through the project's reader module.
london_corpus, zwickau_corpus = (
    dataReader.CorpusByNewLine.london(),
    dataReader.CorpusByNewLine.zwickau(),
)

# Build the Burchard corpus from both of them (London is passed first, Zwickau second).
burchard_corpus_by_london = dataReader.BurchardCorpus(london_corpus, zwickau_corpus)

In [14]:
# Feature factory used by the experiment and prediction cells below; it receives both corpora.
featuresFactory = thesisFactoryFeatures.FeaturesFactory(
    london_corpus=london_corpus,
    zwickau_corpus=zwickau_corpus,
)

In [15]:
# One leftovers corpus per version; the two calls differ only in the order
# in which the London and Zwickau corpora are passed.
london_leftovers, zwickau_leftovers = (
    dataReader.LeftoversCorpus(london_corpus, zwickau_corpus),
    dataReader.LeftoversCorpus(zwickau_corpus, london_corpus),
)

In [16]:
# Build the two feature tables consumed by the experiments below:
# Burchard vs. Zwickau first, then Burchard vs. London.
# Note: featuresFactory.london_VS_zwickau() is not called in this notebook (left disabled).
burchard_VS_zwickau_features_df, burchard_VS_london_features_df = (
    featuresFactory.burchard_by_london_VS_zwickau(),
    featuresFactory.burchard_by_london_VS_london(),
)

n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram
n_gram_feature_name: 2_5_gram


In [17]:
# Sanity check: number of entries in the Burchard-vs-Zwickau feature table.
len(burchard_VS_zwickau_features_df)

349

In [18]:
# Wrong-prediction experiment for Burchard vs. Zwickau: AdaBoost evaluated with a
# leave-one-out splitter.
# random_state is pinned so that repeated runs yield the same list of wrong
# predictions, consistent with the seeded RandomForest experiment for Burchard vs. London.
# NOTE(review): the variable name is misspelled ("experiminet") but is kept as-is
# because later cells reference it.
burchard_VS_zwickau_wrong_predictions_experiminet = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
    burchard_VS_zwickau_features_df,
    AdaBoostClassifier(learning_rate=1, n_estimators=2000, random_state=0),
    KFold=LeaveOneOut()
)
burchard_VS_zwickau_wrong_predictions_experiminet.run()

X len is: $349
$1. score is: 1.0
$2. score is: 0.0
$3. score is: 1.0
$4. score is: 1.0
$5. score is: 1.0
$6. score is: 0.0
$7. score is: 1.0
$8. score is: 0.0
$9. score is: 1.0
$10. score is: 1.0
$11. score is: 1.0
$12. score is: 1.0
$13. score is: 1.0
$14. score is: 0.0
$15. score is: 1.0
$16. score is: 1.0
$17. score is: 1.0
$18. score is: 1.0
$19. score is: 1.0
$20. score is: 1.0
$21. score is: 1.0
$22. score is: 1.0
$23. score is: 1.0
$24. score is: 1.0
$25. score is: 1.0
$26. score is: 1.0
$27. score is: 0.0
$28. score is: 1.0
$29. score is: 1.0
$30. score is: 1.0
$31. score is: 0.0
$32. score is: 0.0
$33. score is: 1.0
$34. score is: 1.0
$35. score is: 1.0
$36. score is: 1.0
$37. score is: 0.0
$38. score is: 1.0
$39. score is: 1.0
$40. score is: 1.0
$41. score is: 1.0
$42. score is: 0.0
$43. score is: 1.0
$44. score is: 1.0
$45. score is: 1.0
$46. score is: 1.0
$47. score is: 1.0
$48. score is: 1.0
$49. score is: 0.0
$50. score is: 1.0
$51. score is: 1.0
$52. score is: 1.0
$53. s

In [19]:
# Show the wrong predictions of the Burchard-vs-Zwickau experiment whose expected label is Burchard.
burchard_VS_zwickau_wrong_predictions_experiminet.get_burchard_wrong_predictions()

[Row 1 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 3 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 12 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 13 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 31 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 45 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 63 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 64 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 81 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 89 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 100 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 114 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 118 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 121 has been classified as 1(Zwickau) and should be 2(Burchard),
 Row 135 has been classified as 

In [20]:
# Wrong-prediction experiment for Burchard vs. London: a seeded random forest
# evaluated with a leave-one-out splitter.
burchard_VS_london_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=11,
    random_state=0,
)
burchard_VS_london_wrong_predictions_experiminet = thesisModelFeatures.GetModelStratifiedKFoldWrongPredictionExperiment(
    burchard_VS_london_features_df,
    burchard_VS_london_forest,
    KFold=LeaveOneOut(),
)
burchard_VS_london_wrong_predictions_experiminet.run()

X len is: $360
$1. score is: 1.0
$2. score is: 1.0
$3. score is: 1.0
$4. score is: 1.0
$5. score is: 1.0
$6. score is: 1.0
$7. score is: 0.0
$8. score is: 0.0
$9. score is: 1.0
$10. score is: 0.0
$11. score is: 1.0
$12. score is: 1.0
$13. score is: 1.0
$14. score is: 1.0
$15. score is: 1.0
$16. score is: 0.0
$17. score is: 0.0
$18. score is: 1.0
$19. score is: 0.0
$20. score is: 1.0
$21. score is: 0.0
$22. score is: 1.0
$23. score is: 1.0
$24. score is: 0.0
$25. score is: 1.0
$26. score is: 1.0
$27. score is: 1.0
$28. score is: 1.0
$29. score is: 0.0
$30. score is: 1.0
$31. score is: 1.0
$32. score is: 1.0
$33. score is: 1.0
$34. score is: 0.0
$35. score is: 1.0
$36. score is: 0.0
$37. score is: 0.0
$38. score is: 1.0
$39. score is: 0.0
$40. score is: 1.0
$41. score is: 0.0
$42. score is: 0.0
$43. score is: 0.0
$44. score is: 0.0
$45. score is: 0.0
$46. score is: 1.0
$47. score is: 1.0
$48. score is: 1.0
$49. score is: 1.0
$50. score is: 1.0
$51. score is: 1.0
$52. score is: 1.0
$53. s

In [21]:
# Show the wrong predictions of the Burchard-vs-London experiment whose expected label is Burchard.
burchard_VS_london_wrong_predictions_experiminet.get_burchard_wrong_predictions()

[Row 15 has been classified as 0(London) and should be 2(Burchard),
 Row 22 has been classified as 0(London) and should be 2(Burchard),
 Row 44 has been classified as 0(London) and should be 2(Burchard),
 Row 45 has been classified as 0(London) and should be 2(Burchard),
 Row 67 has been classified as 0(London) and should be 2(Burchard),
 Row 100 has been classified as 0(London) and should be 2(Burchard),
 Row 101 has been classified as 0(London) and should be 2(Burchard),
 Row 108 has been classified as 0(London) and should be 2(Burchard),
 Row 114 has been classified as 0(London) and should be 2(Burchard)]

In [22]:
# Features of the London text, produced by the factory's
# london_by_burchard_by_london_VS_zwickau_vectorizer()
# (presumably vectorized with the Burchard-vs-Zwickau vocabulary — TODO confirm).
X_london, y_london = thesisModelFeatures.create_X_y(featuresFactory.london_by_burchard_by_london_VS_zwickau_vectorizer())

# Fit the final Burchard-vs-Zwickau classifier on the whole feature table.
# Hyperparameters mirror the leave-one-out experiment; random_state is pinned so the
# fitted model, and therefore the London predictions derived from it, are reproducible.
X, y = thesisModelFeatures.create_X_y(burchard_VS_zwickau_features_df)
burchard_VS_zwickau_best_classifier = AdaBoostClassifier(learning_rate=1, n_estimators=2000, random_state=0).fit(X, y)

n_gram_feature_name: 2_5_gram


In [23]:
# Apply the fitted Burchard-vs-Zwickau classifier to the London feature matrix.
london_predictions_by_burchard_vs_zwickau_classifier = burchard_VS_zwickau_best_classifier.predict(X_london)

In [24]:
# Features of the Zwickau text, produced by the factory's
# zwickau_by_burchard_by_london_VS_london_vectorizer().
zwickau_features = featuresFactory.zwickau_by_burchard_by_london_VS_london_vectorizer()
X_zwickau, y_zwickau = thesisModelFeatures.create_X_y(zwickau_features)

# Fit the final Burchard-vs-London random forest on the whole feature table,
# with the same hyperparameters and seed as the leave-one-out experiment.
X, y = thesisModelFeatures.create_X_y(burchard_VS_london_features_df)
burchard_VS_london_best_classifier = RandomForestClassifier(
    n_estimators=300,
    max_depth=11,
    random_state=0,
).fit(X, y)

n_gram_feature_name: 2_5_gram


In [25]:
# Apply the fitted Burchard-vs-London classifier to the Zwickau feature matrix.
zwickau_predictions_by_burchard_vs_london_classifier = burchard_VS_london_best_classifier.predict(X_zwickau)

In [26]:
# Print the Zwickau predictions as human-readable version names:
# each predicted label is cast to int and then translated by the project helper.
print(
    list(
        map(
            thesisModelFeatures.version_label_to_human_readable,
            map(int, zwickau_predictions_by_burchard_vs_london_classifier),
        )
    )
)

['London', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'Burchard', 'London', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'Burchard', 'London', 'Burchard', 'London', 'London', 'Burchard', 'Burchard', 'London', 'Burchard', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'Burchard', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'Burchard', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'Burchard', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'London', 'London', 'London', 'London', 'London', 'Burchard', 'London', 'London', 'London', 'Burchard', 'Burchard', 'Burchard', 'Burchard', 'London', 'Burchard', 'London', 'Burchard', 'Burchard', 'London', 'London', 'Burchard', 'London', 'Burchard', '

In [27]:
def _as_label_rows(predicted_labels):
    """Convert predicted labels into one-element rows holding the human-readable version name."""
    return [
        [thesisModelFeatures.version_label_to_human_readable(int(label))]
        for label in predicted_labels
    ]


# Gather the corpora, both experiments' wrong predictions and both classifiers'
# predictions into the Google Sheets results client.
burchard_spreadsheet = googleSheetsClient.BurchardResults(
    burchard_corpus=burchard_corpus_by_london,
    london_left_overs_corpus=london_leftovers,
    zwickau_left_overs_corpus=zwickau_leftovers,

    # Wrong predictions whose expected label is Burchard, one list per experiment.
    burchard_vs_london_burchard_wrong_predictions=burchard_VS_london_wrong_predictions_experiminet.get_burchard_wrong_predictions(),
    burchard_vs_zwickau_burchard_wrong_predictions=burchard_VS_zwickau_wrong_predictions_experiminet.get_burchard_wrong_predictions(),

    # Wrong predictions for the other class of each experiment.
    burchard_vs_london_london_wrong_predictions=burchard_VS_london_wrong_predictions_experiminet.get_london_wrong_predictions(),
    burchard_vs_zwickau_zwickau_wrong_predictions=burchard_VS_zwickau_wrong_predictions_experiminet.get_zwickau_wrong_predictions(),

    # Cross predictions, each wrapped as a one-element row.
    london_predictions_by_burchard_vs_zwickau_classifier=_as_label_rows(london_predictions_by_burchard_vs_zwickau_classifier),
    zwickau_predictions_by_burchard_vs_london_classifier=_as_label_rows(zwickau_predictions_by_burchard_vs_london_classifier),
)

In [28]:
# Write the collected results to the spreadsheet.
# NOTE(review): in the recorded run this call ended with
# "BrokenPipeError: [Errno 32] Broken pipe"; the following cells call individual
# write_*/colorize_* steps instead. Confirm whether write() is safe to re-run after a
# partial failure before relying on Restart & Run All.
burchard_spreadsheet.write()

sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping
sleeping


BrokenPipeError: [Errno 32] Broken pipe

In [30]:
# Write the London leftovers paragraphs as a separate step
# (presumably part of what write() was doing when it failed — TODO confirm).
burchard_spreadsheet.write_london_left_overs_paragraphs()

In [33]:
# Remaining London steps, run one by one: colorize the shared parts of the leftovers,
# then write the London wrong predictions and the London predictions made by the
# Burchard-vs-Zwickau classifier.
burchard_spreadsheet.colorize_london_leftovers_shared_parts()
burchard_spreadsheet.write_london_wrong_predictions()
burchard_spreadsheet.write_london_predictions_by_burchard_vs_zwickau_classifier()

sleeping
sleeping
sleeping
sleeping
sleeping
sleeping


In [36]:
# Zwickau counterpart of the London steps: write the leftovers paragraphs, colorize
# their shared parts, then write the Zwickau wrong predictions and the Zwickau
# predictions made by the Burchard-vs-London classifier.
burchard_spreadsheet.write_zwickau_left_overs_paragraphs()
burchard_spreadsheet.colorize_zwickau_leftovers_shared_parts()
burchard_spreadsheet.write_zwickau_wrong_predictions()
burchard_spreadsheet.write_zwickau_predictions_by_burchard_vs_london_classifier()

sleeping
sleeping
sleeping
sleeping
sleeping
