In [158]:
import pickle
import os 
from tqdm.notebook import tqdm
import re
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1500000
nlp.disable_pipes('ner', 'parser')

# gensim LDA

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel

# Visualise
import warnings
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

['ner', 'parser']

# Save Lemmatized words

In [128]:
city_pair_folder = "../../../../enwiki_city_pairs/"
for file in tqdm(os.scandir(city_pair_folder), total=len(os.listdir(city_pair_folder)), leave=True, desc='Folders'): # tqdm(os.listdir(subfolder), total=len(os.listdir(subfolder)), leave=True, desc='Text Files'):                     
    if not file.is_dir():
        lemmatize_file(file.path, city_pair=file.name[:-4])

Folders:   0%|          | 0/12 [00:00<?, ?it/s]

933


  0%|          | 0/2 [00:00<?, ?it/s]

7419


  0%|          | 0/9 [00:00<?, ?it/s]

2320


  0%|          | 0/4 [00:00<?, ?it/s]

2523


  0%|          | 0/4 [00:00<?, ?it/s]

1037


  0%|          | 0/3 [00:00<?, ?it/s]

1130


  0%|          | 0/3 [00:00<?, ?it/s]

7662


  0%|          | 0/9 [00:00<?, ?it/s]

21193


  0%|          | 0/23 [00:00<?, ?it/s]

3294


  0%|          | 0/5 [00:00<?, ?it/s]

3264


  0%|          | 0/5 [00:00<?, ?it/s]

In [127]:
def lemmatize_file(file_path, city_pair):
    with open(file_path, 'r', encoding='utf-16') as f:
        city_pair_text_list = [x.strip().lower() for x in f.read().split('\n') if len(x) and 'title=' not in x]
    print(len(city_pair_text_list))
    
    nr_of_chunks = len(city_pair_text_list)//1000 + 1
    chunk_size = (len(city_pair_text_list)-1)//nr_of_chunks
    chunked_text = [' '.join(city_pair_text_list[offs:offs+chunk_size]) for offs in range(0, len(city_pair_text_list), chunk_size)]
    
    processed_text = [text for text in tqdm(nlp.pipe(chunked_text, n_process=2, batch_size=1, disable=["ner", "parser"]), total=len(chunked_text))]
    lemmatized_text = [[word.lemma_ for word in text if word.pos_ == 'NOUN' and not word.is_punct and not word.is_stop] for text in processed_text]
    regexed_text = [[re.sub(r'\W+', '', word) for word in text] for text in lemmatized_text]
    flattened_words = [item for sublist in regexed_text for item in sublist]
    
    with open(f'../../../../enwiki_city_pairs_nouns/NOUNS_{city_pair}.pickle', 'wb') as fp:
        pickle.dump(flattened_words, fp)

# Open Lemmatized words

In [245]:
word_list_location = '../../../../enwiki_city_pairs_nouns/'
words = []
city_pairs = []
for file in os.scandir(word_list_location):
    with open(file.path, 'rb') as fp:
        words.append(pickle.load(fp))
        city_pairs.append(file.name.split('__')[1])

In [166]:
city_pairs

['berlin_milan',
 'london_berlin',
 'london_madrid',
 'london_milan',
 'madrid_berlin',
 'madrid_milan',
 'paris_berlin',
 'paris_london',
 'paris_madrid',
 'paris_milan']

In [246]:
for l in words:
    print(l[:5])

['tenure', 'academia', 'vevey', 'milan', 'month']
['station', 'spoorwegen', 'railway', 'stop', 'zuid']
['fanbase', 'gooner', 'club', 'nickname', 'gunner']
['plenty', 'air', 'connection', 'city', 'dubai']
['organization', 'country', 'society', 'vienna', 'society']
['partner', 'kind', 'sport', 'football', 'partnership']
['station', 'spoorwegen', 'railway', 'stop', 'zuid']
['revision', 'work', 'composer', 'instrumentation', 'saxophone']
['arrangement', 'public', 'collection', 'gallery', 'collection']
['plenty', 'air', 'connection', 'city', 'dubai']


# Vectorize words (dictionary & corpus)

In [241]:
def vectorize(lemmatized_text, filter_extremes=True, MIN_DF = 1, MAX_DF = 0.6):
    """
    -->
        function that vectorizes preprocessed (lemmatized) text.

        Parameters:
        -----------
            lemmatized_text: list, str -> contains the key words to be matched
            MIN_DF: int (default = 1) -> minimum document frequency
            MAX_DF: int (default = 0.6) -> maximum document frequency

    """
    
    # Get Vocabulary
    dictionary = Dictionary(lemmatized_text)
    
    if filter_extremes:
        dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF)
    
    corpus = [dictionary.doc2bow(text) for text in lemmatized_text]
    
    return(dictionary, corpus)

In [248]:
words[0][:10]

['tenure',
 'academia',
 'vevey',
 'milan',
 'month',
 'florence',
 'year',
 'travel',
 'incident',
 'berlin']

In [249]:
dictionary, corpus = vectorize(words, filter_extremes=False)

In [250]:
len(dictionary)

61461

In [178]:
len(corpus[1])

16456

# Train LDA model

In [251]:
PATH_TO_MALLET = r'C:/mallet/bin/mallet.bat'

def train_model(lemmatized_text, dictionary=[], corpus=[], MIN_DF = 10, MAX_DF = 0.9, N_TOPICS = 10, N_ITERATIONS = 1000):
    # usually 1000 iterations will do
    
#     # Call vectorization function if either dictionary or corpus is missing as parameter
#     if not type(dictionary) == gensim.corpora.dictionary.Dictionary or not corpus:
#         dictionary, corpus = vectorize(lemmatized_text, MIN_DF, MAX_DF)
    
    
    lda_model = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                optimize_interval=10,
                iterations=N_ITERATIONS)
    
    coherence_score = CoherenceModel(model=lda_model, texts=lemmatized_text, dictionary=dictionary, coherence='c_v').get_coherence()
    
    return(lda_model, coherence_score, dictionary, corpus)

In [252]:
%%time

lda_model, coherence_score, dictionary, corpus = train_model(words, dictionary, corpus, N_TOPICS=7)

In [253]:
coherence_score

0.43825568993625336

# Word topic distribution

In [254]:
MAX_WORDS = 20
N_TOPICS = 7

for i in range(N_TOPICS):
    words = lda_model.show_topic(i, topn=MAX_WORDS)
    print(i, ':', [(x[0], round(x[1], 3)) for x in words])

0 : [('club', 0.022), ('season', 0.017), ('team', 0.015), ('champion', 0.012), ('league', 0.012), ('match', 0.011), ('milan', 0.011), ('goal', 0.011), ('title', 0.011), ('year', 0.009), ('time', 0.008), ('competition', 0.007), ('opera', 0.007), ('final', 0.007), ('city', 0.007), ('player', 0.007), ('round', 0.006), ('game', 0.006), ('win', 0.006), ('cup', 0.006)]
1 : [('milan', 0.035), ('opera', 0.03), ('fashion', 0.025), ('model', 0.013), ('week', 0.012), ('role', 0.008), ('debut', 0.008), ('design', 0.007), ('house', 0.007), ('season', 0.007), ('naple', 0.006), ('year', 0.006), ('world', 0.006), ('festival', 0.006), ('florence', 0.005), ('city', 0.005), ('tour', 0.005), ('venice', 0.005), ('stage', 0.005), ('exhibition', 0.005)]
2 : [('festival', 0.026), ('opera', 0.023), ('berlin', 0.02), ('film', 0.019), ('music', 0.01), ('orchestra', 0.009), ('vienna', 0.009), ('state', 0.008), ('debut', 0.007), ('deutsche', 0.006), ('album', 0.006), ('professor', 0.006), ('world', 0.006), ('unive

# Document topic distributions

In [238]:
transformed_docs = lda_model.load_document_topics()

In [239]:
import pandas as pd

transformed_docs = lda_model.load_document_topics()
topic_distributions = pd.DataFrame([[x[1] for x in doc] for doc in transformed_docs], 
             columns=['topic_{}'.format(i) for i in range(N_TOPICS)])
topic_distributions['city_pairs'] = city_pairs
topic_distributions.set_index("city_pairs", inplace=True)
topic_distributions
# topic_distributions.mean().sort_values(ascending=False)

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
city_pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
berlin_milan,0.114218,0.114833,0.003972,3.2e-05,0.699728,0.067213,4e-06
london_berlin,4e-05,0.372355,0.045556,3e-06,3e-06,0.582043,
london_madrid,0.075852,1e-05,1e-05,0.776493,1e-05,0.147624,1e-06
london_milan,0.067127,0.000242,9e-06,1.1e-05,0.825628,0.106982,1e-06
madrid_berlin,0.195878,2.4e-05,0.014328,0.703813,2.4e-05,0.085928,4e-06
madrid_milan,0.999554,2e-05,2e-05,0.000296,2.1e-05,8.5e-05,3e-06
paris_berlin,6e-06,0.043976,0.399791,0.016124,2e-06,0.540101,
paris_london,0.00934,0.005654,0.000533,0.06566,0.085956,0.524398,0.308458
paris_madrid,0.15886,7e-06,6e-06,0.704471,0.000354,0.136302,
paris_milan,0.097534,6e-06,6e-06,8e-06,0.818767,0.083679,


# Visualise

In [255]:
warnings.filterwarnings('ignore')
pyLDAvis.enable_notebook()

lda_conv = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_model)
gensimvis.prepare(lda_conv, corpus, dictionary)


In [161]:
# for file in os.scandir(word_list_location):
#     print(file.path)
#     filename = file.path
#     print(filename.split('__'))
#     file = filename.replace('.pickle', '__.pickle')
#     file = filename.replace('NOUNS___', 'NOUNS__')
#     os.rename(filename, file)
#os.rename()

In [None]:
    # dictionary, corpus = vectorize(lemmatized_text, MIN_DF=1, MAX_DF=0.6)
    # model, coherence, dictionary, corpus = train_model(lemmatized_text=lemmatized_text, dictionary=[], corpus=[], N_TOPICS=4)