In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%pip install decorator==5.0.9

import imp
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns

Note: you may need to restart the kernel to use updated packages.


In [20]:
# importing local modules

import sys
sys.path.append('../src/')

import text_cleanup.text_cleanup as thesisCleanUp
import preprocessing.text_preprocessing as thesisTextPreprocessing
import data.reader as thesisDataReader
import utils.utils as thesisUtils
import features.tf_idf.n_gram as thesisTfIdfNgramFeatures
import features.count_vectorizer.n_gram as thesisCountVectorizerNgramFeatures
import similarities.cosine as thesisCosineSimilarities

imp.reload(thesisCleanUp)
imp.reload(thesisTextPreprocessing)
imp.reload(thesisDataReader)
imp.reload(thesisUtils)
imp.reload(thesisCountVectorizerNgramFeatures)
imp.reload(thesisCosineSimilarities)

<module 'similarities.cosine' from '../src/similarities/cosine.py'>

In [4]:
zwickau_corpus = thesisDataReader.get_zwickau_corpus()
london_corpus = thesisDataReader.get_london_corpus()

In [40]:
nodes = []

for i, p in enumerate(zwickau_corpus):
    nodes.append({ 
        "data": { 
            "label": i,
            "id": f'zwickau_{i}', 
            "version": "zwickau"
        },
        "position": { "x": (1*i)+(i*30), "y": 0 }
    })
    
for i, p in enumerate(london_corpus):
    nodes.append({ 
        "data": { 
            "label": i,
            "id": f'london_{i}', 
            "version": "london"
        },
        "position": { "x": (1*i)+(i*30), "y": 200 }
    })

In [41]:
import json
with open('../src/graphs/nodes.json', 'w') as f:
    json.dump(nodes, f)

In [22]:
zwickau_to_london_best_similarities = thesisCosineSimilarities.zwickau_to_london_best_similarities()
london_to_zwickau_best_similarities = thesisCosineSimilarities.london_to_zwickau_best_similarities()

In [23]:
london_to_zwickau_best_similarities

[{'2_gram': (321, 0.4999977369463185),
  '3_gram': (321, 0.28104994954778606),
  '4_gram': (194, 0.1284659108996759),
  '5_gram': (218, 0.09413564509476889),
  'count_vectorizer_5_gram': (274, 0.15894111747616882)},
 {'2_gram': (1, 0.9760541106487549),
  '3_gram': (1, 0.8898156444104706),
  '4_gram': (1, 0.822044488342356),
  '5_gram': (1, 0.7829597577102049),
  'count_vectorizer_5_gram': (1, 0.8408213630139318)},
 {'2_gram': (2, 0.9882488278751222),
  '3_gram': (2, 0.9547237431328962),
  '4_gram': (2, 0.9218141125166134),
  '5_gram': (2, 0.8975286768073327),
  'count_vectorizer_5_gram': (2, 0.926251509677495)},
 {'2_gram': (3, 0.9793378530587341),
  '3_gram': (3, 0.9336609162758183),
  '4_gram': (3, 0.8946592680398504),
  '5_gram': (3, 0.8622085112751442),
  'count_vectorizer_5_gram': (3, 0.8994940483773678)},
 {'2_gram': (4, 0.981200885176096),
  '3_gram': (4, 0.9181876552632372),
  '4_gram': (4, 0.849403821366589),
  '5_gram': (4, 0.7891999341624071),
  'count_vectorizer_5_gram': (4

In [31]:
edges = []

for i, p in enumerate(zwickau_to_london_best_similarities):
    most_similar = p['5_gram']
    edges.append({ 
        "data": { 
            "source": f'zwickau_{i}', 
            "target": f'london_{most_similar[0]}', 
            "weight":most_similar[1], 
            "arrow": 'triangle'
        } 
    })
    
for i, p in enumerate(london_to_zwickau_best_similarities):
    most_similar = p['5_gram']
    edges.append({ 
        "data": { 
            "source": f'london_{i}', 
            "target": f'zwickau_{most_similar[0]}', 
            "weight":most_similar[1], 
            "arrow": 'triangle'
        } 
    })

In [32]:
edges

[{'data': {'source': 'zwickau_0',
   'target': 'london_8',
   'weight': 0.16006926335170077,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_1',
   'target': 'london_1',
   'weight': 0.783605972171981,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_2',
   'target': 'london_2',
   'weight': 0.8969272593860224,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_3',
   'target': 'london_3',
   'weight': 0.860358094160387,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_4',
   'target': 'london_4',
   'weight': 0.785025107503545,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_5',
   'target': 'london_5',
   'weight': 0.8400810955761492,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_6',
   'target': 'london_6',
   'weight': 0.6322286778840712,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_7',
   'target': 'london_7',
   'weight': 0.7502278301502107,
   'arrow': 'triangle'}},
 {'data': {'source': 'zwickau_8',
   'target': 'london_8',

In [33]:
with open('../src/graphs/edges.json', 'w') as f:
    json.dump(edges, f)