In [3]:
import pickle
import os 
from tqdm.notebook import tqdm
import re
import spacy

# Load the small English pipeline with the NER and parser components disabled;
# later cells only read lemma_, pos_, is_punct and is_stop from the tokens.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.max_length = 1500000

# gensim LDA

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel

# Visualise
import warnings
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

  from imp import reload


# Save Lemmatized words

In [128]:
# NOTE: lemmatize_file is defined in a later cell of this notebook; that cell
# has to be executed before this one.
city_pair_folder = "../../../../enwiki_city_pairs/"

# Collect the regular files first so that the progress-bar total equals the
# number of files actually processed (sub-directories are skipped), the folder
# is scanned only once, and the processing order is deterministic.
with os.scandir(city_pair_folder) as entries:
    city_pair_files = sorted(
        (entry for entry in entries if not entry.is_dir()),
        key=lambda entry: entry.name,
    )

for file in tqdm(city_pair_files, total=len(city_pair_files), leave=True, desc='Folders'):
    # file.name[:-4] drops the last four characters of the file name
    # (presumably a ".txt"-style extension -- TODO confirm).
    lemmatize_file(file.path, city_pair=file.name[:-4])

Folders:   0%|          | 0/12 [00:00<?, ?it/s]

933


  0%|          | 0/2 [00:00<?, ?it/s]

7419


  0%|          | 0/9 [00:00<?, ?it/s]

2320


  0%|          | 0/4 [00:00<?, ?it/s]

2523


  0%|          | 0/4 [00:00<?, ?it/s]

1037


  0%|          | 0/3 [00:00<?, ?it/s]

1130


  0%|          | 0/3 [00:00<?, ?it/s]

7662


  0%|          | 0/9 [00:00<?, ?it/s]

21193


  0%|          | 0/23 [00:00<?, ?it/s]

3294


  0%|          | 0/5 [00:00<?, ?it/s]

3264


  0%|          | 0/5 [00:00<?, ?it/s]

In [127]:
def lemmatize_file(file_path, city_pair):
    """
    -->
        function that extracts the lemmatized nouns of one text file and pickles them.

        Parameters:
        -----------
            file_path: str -> path of the UTF-16 encoded text file to read
            city_pair: str -> label used in the output file name (NOUNS_{city_pair}.pickle)

        Side effects:
        -------------
            prints the number of retained lines and writes one flat list of nouns to
            ../../../../enwiki_city_pairs_nouns/NOUNS_{city_pair}.pickle
            (the folder is created when it does not exist yet)

    """
    # Keep non-empty lines that do not contain 'title=', stripped and lower-cased.
    with open(file_path, 'r', encoding='utf-16') as f:
        city_pair_text_list = [x.strip().lower() for x in f.read().split('\n') if len(x) and 'title=' not in x]
    print(len(city_pair_text_list))

    # Split the lines into roughly equal chunks of at most ~1000 lines each.
    # Ceiling division avoids a tiny left-over chunk, and max(1, ...) avoids a
    # zero step in range() when the file holds a single line (or none).
    nr_of_chunks = len(city_pair_text_list) // 1000 + 1
    chunk_size = max(1, -(-len(city_pair_text_list) // nr_of_chunks))
    chunked_text = [' '.join(city_pair_text_list[offs:offs + chunk_size])
                    for offs in range(0, len(city_pair_text_list), chunk_size)]

    processed_text = [text for text in tqdm(nlp.pipe(chunked_text, n_process=2, batch_size=1, disable=["ner", "parser"]), total=len(chunked_text))]

    # Keep the lemma of every noun that is neither punctuation nor a stop word,
    # strip non-word characters, and drop anything that ends up empty.
    flattened_words = []
    for text in processed_text:
        for word in text:
            if word.pos_ == 'NOUN' and not word.is_punct and not word.is_stop:
                cleaned = re.sub(r'\W+', '', word.lemma_)
                if cleaned:
                    flattened_words.append(cleaned)

    output_path = f'../../../../enwiki_city_pairs_nouns/NOUNS_{city_pair}.pickle'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'wb') as fp:
        pickle.dump(flattened_words, fp)

# Open Lemmatized words

In [2]:
import os
# Check whether the folder that holds the pickled noun lists exists
# (the path is relative to the current working directory).
os.path.exists('../../../../enwiki_city_pairs_nouns/')

False

In [103]:
word_list_location = '../../../../enwiki_city_pairs_nouns/'

# Only regular *.pickle files are read, in sorted name order, so stray files or
# sub-folders cannot break the load and the document order is stable.
with os.scandir(word_list_location) as entries:
    word_list_files = sorted(
        (entry for entry in entries if entry.is_file() and entry.name.endswith('.pickle')),
        key=lambda entry: entry.name,
    )

words = []       # unpickled word list of each file
city_pairs = []  # label of each entry in `words`, same order
for file in word_list_files:
    # The label is the second '__'-separated field of the file name. Validate it
    # before loading so `words` and `city_pairs` can never get out of sync.
    name_parts = file.name.split('__')
    if len(name_parts) < 2:
        raise ValueError(f"unexpected word-list file name: {file.name!r}")
    # SECURITY: pickle.load can execute arbitrary code -- only load files that
    # were written by this notebook.
    with open(file.path, 'rb') as fp:
        words.append(pickle.load(fp))
    city_pairs.append(name_parts[1])

In [5]:
# Display the labels collected while loading the word lists.
city_pairs

['berlin_milan',
 'london_berlin',
 'london_madrid',
 'london_milan',
 'madrid_berlin',
 'madrid_milan',
 'paris_berlin',
 'paris_london',
 'paris_madrid',
 'paris_milan']

In [87]:
# Preview: print the first five entries of every element of `words`.
for word_list in words:
    head = word_list[:5]
    print(head)

('study', 0.017707267144319343)
('doctorate', 0.010747185261003071)
('piano', 0.007471852610030706)
('literature', 0.006857727737973388)
('philosophy', 0.005834186284544524)
('camp', 0.0053224155578300925)
('budapest', 0.005015353121801433)
('berlin', 0.004708290685772774)
('thesis', 0.004708290685772774)
('concentration', 0.0040941658137154556)
('philology', 0.003991811668372569)
('dissertation', 0.0038894575230296827)
('habilitation', 0.0037871033776867963)
('party', 0.0033776867963152507)
('sorbonne', 0.003172978505629478)
('secession', 0.0030706243602865915)
('coulondre', 0.0030706243602865915)
('comrade', 0.002968270214943705)
('instructor', 0.0028659160696008186)
('apparatus', 0.0027635619242579327)


In [102]:
# Display the complete `words` object (no slicing is applied here).
words

[('paralympic', 0.004208387752140473),
 ('breaststroke', 0.00391815411406182),
 ('murphy', 0.0034828036569438396),
 ('ury', 0.0031925700188651864),
 ('macfarlane', 0.0030474531998258597),
 ('throw', 0.002466985923668553),
 ('raf', 0.0023218691046292266),
 ('archaeopteryx', 0.0023218691046292266),
 ('bowie', 0.0020316354665505733),
 ('bafta', 0.0020316354665505733),
 ('dungeon', 0.0020316354665505733),
 ('prey', 0.0018865186475112464),
 ('wachowski', 0.0018865186475112464),
 ('luftwaffe', 0.0018865186475112464),
 ('dyke', 0.0017414018284719198),
 ('willa', 0.0017414018284719198),
 ('secker', 0.0015962850094325932),
 ('accum', 0.0015962850094325932),
 ('mining', 0.0014511681903932665),
 ('wax', 0.00130605137135394)]

# Vectorize words (dictionary & corpus)

In [113]:
def vectorize(lemmatized_text, filter_extremes=True, MIN_DF = 5, MAX_DF = 1):
    """
    -->
        function that turns preprocessed (lemmatized) text into a vocabulary and a bag-of-words corpus.

        Parameters:
        -----------
            lemmatized_text: list of lists of str -> one list of words per document
            filter_extremes: bool (default = True) -> prune the vocabulary with Dictionary.filter_extremes when True
            MIN_DF: int (default = 5) -> forwarded as `no_below` (minimum document frequency)
            MAX_DF: int or float (default = 1) -> forwarded as `no_above` (maximum document frequency)

        Returns:
        --------
            (dictionary, corpus): the Dictionary and a list holding dictionary.doc2bow(text) for every document

    """
    vocabulary = Dictionary(lemmatized_text)

    # Optional pruning of rare / overly common words.
    if filter_extremes:
        vocabulary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF)

    # One bag-of-words vector per document, in input order.
    bag_of_words = list(map(vocabulary.doc2bow, lemmatized_text))

    return vocabulary, bag_of_words

In [114]:
# Build the vocabulary and bag-of-words corpus from the loaded word lists,
# using vectorize's default MIN_DF / MAX_DF (filter_extremes=True is also its default).
dictionary, corpus = vectorize(words, filter_extremes=True)

In [117]:
# Number of entries left in the dictionary returned by vectorize.
len(dictionary)

8740

In [11]:
# Only the last expression of a cell is displayed, so the len(...) value below
# is computed but not shown; the recorded output is the type of `corpus`.
len(corpus[1])
type(corpus)

list

# Train LDA model

In [126]:
# Location of the MALLET launcher. The PATH_TO_MALLET environment variable,
# when set, overrides the original hard-coded Windows path.
PATH_TO_MALLET = os.environ.get('PATH_TO_MALLET', r'C:/mallet/bin/mallet.bat')

def train_model(lemmatized_text, dictionary=None, corpus=None, MIN_DF = 2, MAX_DF = 1, N_TOPICS = 10, N_ITERATIONS = 1000, RANDOM_SEED = 0):
    """
    -->
        function that trains a MALLET LDA model and scores it with c_v coherence.

        Parameters:
        -----------
            lemmatized_text: list of lists of str -> one list of words per document (created with the lemmatization function)
            dictionary: gensim.corpora.dictionary.Dictionary (default = None) -> output from vectorization function
            corpus: list ([dictionary.doc2bow(text)...) (default = None) -> output from vectorization function
            MIN_DF: int (default = 2) -> minimum document frequency, only used when dictionary/corpus have to be built here
            MAX_DF: int or float (default = 1) -> maximum document frequency, only used when dictionary/corpus have to be built here
            N_TOPICS: int (default = 10) -> Topics to detect
            N_ITERATIONS: int (default = 1000) -> number of training iterations
            RANDOM_SEED: int (default = 0) -> forwarded to LdaMallet as random_seed; set a non-zero value for repeatable runs

        Returns:
        --------
            (lda_model, coherence_score, dictionary, corpus)

    """

    # Build dictionary and corpus with the vectorization function when either
    # one is missing or empty (an empty dictionary cannot be trained on).
    needs_vectorizing = dictionary is None or corpus is None
    if not needs_vectorizing:
        try:
            needs_vectorizing = len(dictionary) == 0 or len(corpus) == 0
        except TypeError:
            # e.g. a streamed corpus without __len__ -- use it as given
            needs_vectorizing = False
    if needs_vectorizing:
        dictionary, corpus = vectorize(lemmatized_text, MIN_DF=MIN_DF, MAX_DF=MAX_DF)

    lda_model = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                optimize_interval=10,
                iterations=N_ITERATIONS,
                random_seed=RANDOM_SEED)

    # NOTE(review): recorded outputs elsewhere in this notebook show `nan`
    # coherence for some runs; the cause is not visible from this function.
    coherence_score = CoherenceModel(model=lda_model, texts=lemmatized_text, dictionary=dictionary, coherence='c_v').get_coherence()

    return lda_model, coherence_score, dictionary, corpus

In [132]:
%%time

# Train a 6-topic model on the existing dictionary and corpus. The call rebinds
# lda_model, coherence_score, dictionary and corpus to the returned values.
lda_model, coherence_score, dictionary, corpus = train_model(words, dictionary, corpus, N_TOPICS=6)

CPU times: total: 8.19 s
Wall time: 5min 2s


In [141]:
# Display the current value of coherence_score.
coherence_score

0.3902262937009409

# Word topic distribution

In [142]:
MAX_WORDS = 8   # words listed per topic
N_TOPICS = 6    # number of topics to list

# Print each topic id followed by its top words and their weights (3 decimals).
for topic_id in range(N_TOPICS):
    rounded_words = []
    for entry in lda_model.show_topic(topic_id, topn=MAX_WORDS):
        rounded_words.append((entry[0], round(entry[1], 3)))
    print(topic_id, rounded_words)

0 [('opera', 0.034), ('milan', 0.034), ('fashion', 0.023), ('year', 0.012), ('model', 0.012), ('work', 0.01), ('art', 0.01), ('week', 0.01)]
1 [('berlin', 0.017), ('festival', 0.017), ('opera', 0.016), ('art', 0.016), ('year', 0.013), ('film', 0.013), ('work', 0.012), ('time', 0.009)]
2 [('year', 0.016), ('art', 0.015), ('work', 0.013), ('time', 0.013), ('school', 0.008), ('exhibition', 0.007), ('war', 0.007), ('painting', 0.007)]
3 [('art', 0.017), ('opera', 0.015), ('festival', 0.012), ('city', 0.012), ('work', 0.011), ('year', 0.01), ('museum', 0.008), ('world', 0.007)]
4 [('club', 0.031), ('season', 0.025), ('team', 0.021), ('champion', 0.019), ('milan', 0.018), ('league', 0.018), ('goal', 0.017), ('time', 0.017)]
5 [('art', 0.017), ('year', 0.012), ('world', 0.01), ('music', 0.01), ('gallery', 0.009), ('film', 0.009), ('company', 0.009), ('city', 0.008)]


# Document topic distributions

In [44]:
# Display the topics of the current lda_model (print_topics called with its default arguments).
lda_model.print_topics()

[(0,
  '0.026*"year" + 0.014*"company" + 0.013*"city" + 0.012*"exhibition" + 0.009*"book" + 0.009*"art" + 0.009*"gallery" + 0.009*"theatre" + 0.008*"group" + 0.007*"woman"'),
 (1,
  '0.005*"messager" + 0.003*"westminster" + 0.002*"pastry" + 0.002*"cody" + 0.002*"crane" + 0.002*"truce" + 0.002*"val" + 0.002*"wardrobe" + 0.002*"crédit" + 0.002*"stuff"'),
 (2,
  '0.014*"college" + 0.011*"eurovision" + 0.007*"eurostar" + 0.007*"pound" + 0.005*"investment" + 0.004*"gatwick" + 0.004*"cent" + 0.004*"drury" + 0.003*"edward" + 0.003*"exhibitor"'),
 (3,
  '0.004*"bourbon" + 0.003*"park" + 0.003*"whistler" + 0.003*"creed" + 0.003*"tile" + 0.003*"obelisk" + 0.002*"côte" + 0.002*"woman" + 0.002*"hospital" + 0.002*"mather"'),
 (4,
  '0.052*"milan" + 0.043*"fashion" + 0.023*"model" + 0.015*"week" + 0.010*"race" + 0.009*"design" + 0.009*"opera" + 0.008*"designer" + 0.007*"tour" + 0.007*"naple"'),
 (5,
  '0.029*"art" + 0.017*"work" + 0.015*"time" + 0.015*"world" + 0.011*"film" + 0.010*"museum" + 0.009*

In [138]:
import pandas as pd

# Each document yields a sequence of pairs whose second item is the topic
# weight; keep only the weights, one row per document.
transformed_docs = list(lda_model.load_document_topics())
topic_weights = [[pair[1] for pair in doc] for doc in transformed_docs]

# Fail early with a readable message if rows and labels do not line up.
if len(topic_weights) != len(city_pairs):
    raise ValueError(
        f"{len(topic_weights)} document-topic rows but {len(city_pairs)} city-pair labels"
    )

# Derive the number of topic columns from the data instead of hard-coding 6,
# so the cell keeps working when the model is trained with another N_TOPICS.
n_topic_columns = len(topic_weights[0]) if topic_weights else 0

topic_distributions = (
    pd.DataFrame(topic_weights, columns=['topic_{}'.format(i) for i in range(n_topic_columns)])
    .assign(city_pairs=city_pairs)
    .set_index("city_pairs")
)
topic_distributions
# topic_distributions.mean().sort_values(ascending=False)

Unnamed: 0_level_0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5
city_pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
berlin_milan,0.385312,0.346716,0.005675,0.014046,0.02974,0.218511
london_berlin,0.01037,0.333222,0.126719,0.018533,6.6e-05,0.51109
london_madrid,0.009128,0.046449,0.105221,0.307612,0.075346,0.456244
london_milan,0.487751,0.043231,0.101632,0.002799,0.044314,0.320273
madrid_berlin,0.011195,0.312714,0.00247,0.323248,0.078263,0.27211
madrid_milan,0.089897,0.012905,0.008892,0.046982,0.756225,0.085099
paris_berlin,0.013869,0.39501,0.247337,0.031856,0.000163,0.311765
paris_london,0.030657,0.046643,0.424022,0.028611,0.001678,0.468389
paris_madrid,0.000654,0.057089,0.23646,0.361039,0.10511,0.239647
paris_milan,0.487361,0.040046,0.210085,0.0007,0.074753,0.187056


# Visualise

In [143]:
# Suppress all warnings from here on and enable pyLDAvis notebook rendering.
warnings.filterwarnings('ignore')
pyLDAvis.enable_notebook()

# Convert the MALLET wrapper model with malletmodel2ldamodel, then hand the
# converted model to pyLDAvis together with the corpus and dictionary.
# (alternative source model: my_models[0]['lda_model__2']['model'])
lda_conv = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_model)
lda_vis = gensimvis.prepare(lda_conv, corpus, dictionary)
lda_vis


# Save Models

In [123]:
%%time

my_models = []

# Make sure the output folder exists before the slow training starts, so a
# missing directory cannot abort the sweep at the first model.save call.
model_folder = '../../../../lda_models/'
os.makedirs(model_folder, exist_ok=True)

# Train one model per topic count (2, 5, 8, ..., 29), save it to disk and keep
# a record with its coherence score.
for n_topics in tqdm(range(2,30, 3)):
    # Only the first two return values are needed; slicing avoids rebinding `_`,
    # which IPython uses for the last cell output.
    model, coherence = train_model(words, dictionary, corpus, N_TOPICS=n_topics)[:2]
    keyname = f'lda_model__{n_topics}'
    model.save(f'{model_folder}{keyname}.model')

    my_models.append({ keyname: { 'model': model, 'n_topics': n_topics, 'coherence_score': coherence } })
    print(keyname)

In [122]:
# Display the list of model records (one single-key dict per trained model).
my_models

[{'lda_model__2': {'model': <gensim.models.wrappers.ldamallet.LdaMallet at 0x17d0dad1310>,
   'n_topics': 2,
   'coherence_score': nan}},
 {'lda_model__5': {'model': <gensim.models.wrappers.ldamallet.LdaMallet at 0x17d0bf79cd0>,
   'n_topics': 5,
   'coherence_score': nan}},
 {'lda_model__8': {'model': <gensim.models.wrappers.ldamallet.LdaMallet at 0x17d09b54d00>,
   'n_topics': 8,
   'coherence_score': nan}},
 {'lda_model__11': {'model': <gensim.models.wrappers.ldamallet.LdaMallet at 0x17d09b78880>,
   'n_topics': 11,
   'coherence_score': nan}},
 {'lda_model__14': {'model': <gensim.models.wrappers.ldamallet.LdaMallet at 0x17d0bfa90d0>,
   'n_topics': 14,
   'coherence_score': nan}},
 {'lda_model__17': {'model': <gensim.models.wrappers.ldamallet.LdaMallet at 0x17d09b54970>,
   'n_topics': 17,
   'coherence_score': nan}},
 {'lda_model__20': {'model': <gensim.models.wrappers.ldamallet.LdaMallet at 0x17d0bf79a00>,
   'n_topics': 20,
   'coherence_score': nan}},
 {'lda_model__23': {'model

In [97]:
# Recompute the c_v coherence of the current lda_model against `words` and `dictionary`.
coherence_score = CoherenceModel(model=lda_model, texts=words, dictionary=dictionary, coherence='c_v').get_coherence()

In [98]:
# Display the coherence value computed in the previous cell.
coherence_score

0.6355124667963096

In [73]:
# Recompute and print the c_v coherence of every stored model.
for record in my_models:
    # Every record is a single-key dict; its only value holds the details.
    details = list(record.values())[0]
    scorer = CoherenceModel(model=details['model'], texts=words, dictionary=dictionary, coherence='c_v')
    coherence_score = scorer.get_coherence()
    print(details['n_topics'], coherence_score)


2 nan
5 nan
8 nan


KeyboardInterrupt: 

# Import Models

In [None]:
# import os
# import glob

# # I used these for importing my models at a later stage
# def import_models():
#     imported_models = []
    
#     folder_path = 'best_models'
#     for filename in glob.glob(os.path.join(folder_path, '*.model')):
#         name = filename[12:-6]
#         POS_combination, = re.findall(r'\[.*?\]', name)
#         n_topics, = int(re.findall(r'\d+', filename))

#         # Get the right lemmatized text
#         for nested_dictionary in lemmatized_texts_list:
#             if POS_combination in nested_dictionary.keys():
#                 lemmatized_text = nested_dictionary[POS_combination]
        
#         model = gensim.models.LdaModel.load(f'models/{name}.model')
        
#         dictionary, corpus = vectorize(lemmatized_text, MIN_DF=1, MAX_DF=0.6)
        
#         coherence_score = CoherenceModel(model=model, texts=lemmatized_text, dictionary=dictionary, coherence='c_v').get_coherence()
        
#         print(f'loaded model: {name} with {n_topics} topics and coherence score of {coherence_score}.')
#         imported_models.append({ 'name': name, 'model': model, 'n_topics': n_topics, 'coherence_score': coherence_score, 'corpus': corpus, 'dictionary': dictionary })

#     return imported_models

# my_models = import_models()

In [161]:
# for file in os.scandir(word_list_location):
#     print(file.path)
#     filename = file.path
#     print(filename.split('__'))
#     file = filename.replace('.pickle', '__.pickle')
#     file = filename.replace('NOUNS___', 'NOUNS__')
#     os.rename(filename, file)
#os.rename()

In [None]:
    # dictionary, corpus = vectorize(lemmatized_text, MIN_DF=1, MAX_DF=0.6)
    # model, coherence, dictionary, corpus = train_model(lemmatized_text=lemmatized_text, dictionary=[], corpus=[], N_TOPICS=4)