In [10]:
import os
import pandas as pd
import warnings
from tqdm.notebook import tqdm
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import re
import time
from itertools import combinations

# Settings

In [36]:
# Settings that can be changed
basefolder = '../../../enwiki_extracted/'
corpus_path = '../output/'
corpus_name = 'city_matrix.csv'
new_corpus = False

cities_df = pd.read_csv(f'../input/List_of_cities_300k.csv', delimiter=';')
nr_of_cities = 5 # by population numbers
window_size = 0

In [39]:
list_of_cities = list(cities_df['Mua_en'][0:nr_of_cities].str.lower())
city_pairs = list(combinations(list_of_cities, 2))
list_of_cities.sort()
list_of_cities

['berlin', 'london', 'madrid', 'milan', 'paris']

In [34]:
# All possible city pairs for 100 cities
len(list(combinations([x for x in range(100)], 2)))

4950

# Get articles per city pair

In [321]:
%%time

from IPython.display import clear_output
from ipywidgets import Output
from IPython.display import display
out = Output()
display(out)

def process_wikidump(basefolder=basefolder,
                cities=list_of_cities,
                city_pairs = city_pairs,
                window_size=window_size):
#                 corpus_path=corpus_path,
#                 corpus_name=corpus_name,
#                 new_corpus=new_corpus,
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # for file in tqdm(os.listdir(basefolder), total=len(os.listdir(basefolder)), desc='files'):
        folders = [f.path for f in os.scandir(basefolder) if f.is_dir()]
        i = 0
        for folder in tqdm(folders, total=len(folders), leave=True, desc='Folders'):
            i+=1
            subfolders = [ f.path for f in os.scandir(folder) if f.is_dir()]
            for subfolder in tqdm(subfolders, total=len(subfolders), leave=False, desc='Sub Folders'):
                if 'processed' in subfolder:
                    continue

                for file in os.listdir(subfolder): # tqdm(os.listdir(subfolder), total=len(os.listdir(subfolder)), leave=True, desc='Text Files'):                            
                    with open(os.path.join(subfolder, file), "r", encoding="utf-8") as f:
                        process_file(file, f.read(), city_pairs) 
                    with out:
                        print(subfolder)
                        clear_output(wait=True)
                
                os.replace(subfolder, subfolder + '_processed')
                
            if i == 5:
                break
            

process_wikidump()

Output()

Folders:   0%|          | 0/62 [00:00<?, ?it/s]

Sub Folders:   0%|          | 0/4 [00:00<?, ?it/s]

Sub Folders:   0%|          | 0/6 [00:00<?, ?it/s]

Sub Folders:   0%|          | 0/6 [00:00<?, ?it/s]

Sub Folders:   0%|          | 0/1 [00:00<?, ?it/s]

Sub Folders:   0%|          | 0/4 [00:00<?, ?it/s]

CPU times: total: 2min 40s
Wall time: 3min 36s


In [307]:
import sys
tokenizer = RegexpTokenizer('\w+')

def process_file(filename, file_content, city_pairs):
    articles = file_content.split('</doc>')
    articles = [x.strip() for x in articles if len(x.strip())]
    
    for article in articles:
        for paragraph in article.split('\n'):
            tokenized_paragraph = tokenizer.tokenize(paragraph.lower())
        
            word_count = Counter(tokenized_paragraph)
            words = set(word_count.keys())
            
            for city_pair in city_pairs:
                detected_cities = words.intersection(set(city_pair))

                if len(detected_cities) == 2:
                    reg_str = "title=\"" + "(.*?)" + "\""
                    title = re.findall(reg_str, article)

                    reg_str = "id=\"" + "(.*?)" + "\""
                    article_id = re.findall(reg_str, article)
                    
                    with out:
                        print('\r',filename," || ", "title: ", title[0], " || ", "citypair: ", city_pair, end='                                            ')

                    content = f'title="{title[0]}", id={article_id[0]} \n{paragraph} \n\n'
                    
                    textfile = f'../../../enwiki_city_pairs/{city_pair[0]}_{city_pair[1]}.txt'
#                     if not os.path.exists(textfile):
#                         open(textfile, "x")
                        
#                     with open(textfile, 'r', encoding='utf-16') as f:
#                         if f'id={article_id[0]}' not in f.read():
                    with open(textfile, 'a+', encoding='utf-16') as f:
                        f.writelines(content)
#                         else:
#                             print('exists!')

In [5]:
textfile = f'../../../enwiki_city_pairs/paris_milan.txt'

with open(textfile, 'r', encoding='utf-16') as f:
    parismilan = [x.strip() for x in f.read().split('\n') if len(x) and 'title=' not in x]
    
parismilan[:10]

["There are plenty of air connections between Yerevan and other regional cities, including Athens, Barcelona, Beirut, Berlin, Bucharest, Brussels, Damascus, Doha, Dubai, Istanbul, Kyiv, Kuwait City, London, Milan, Minsk, Moscow, Paris, Prague, Riga, Rome, Tehran, Tel-Aviv, Tbilisi, Vienna, Venice and Warsaw, as well as daily connections to most major cities within the CIS region. Statistics show that the number of tourists arriving in the country by air transportation increases yearly. In 2018, passenger flow at the two main airports of Armenia reached a record high of 2,856,673 million people. In December 2019, yearly passenger flow exceeded 3,000,000 million people for the first time in Armenia's history.",
 "In 1478, Guido Antonio Vespucci led a Florentine diplomatic mission to Paris and invited his younger cousin, Amerigo Vespucci, to join him. Amerigo's role is not clear, but it was likely as an attache or private secretary. Along the way they had business in Bologna, Milan, and L

# Topic Modeling

In [40]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import spacy
nlp = spacy.load("en_core_web_sm")

In [366]:
# import wget
# !python -m wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip


Saved under mallet-2.0.8.zip


## Important Functions

In [13]:
def lemmatize(texts, POSfilter=[]):
    POStags=["PROPN", "NOUN", "ADJ", "VERB", "ADV"]

    # Checks if user gave their own (list of) Part of Speech tag(s)
    if POSfilter:
        if isinstance(POSfilter, list):
            for POS in POSfilter:
                if POS not in POStags:
                    print(f'POSfilter only allows a list with one or multiple from the following tags: {POStags}.')
                    return
            POStags = POSfilter
        
        elif isinstance(POSfilter, str):
            if POSfilter not in POStags:
                print(f'Provided POStag should be in the following list: {POStags}.')
                return
            POStags = [POSfilter]
        else:
            print('POSfilter should either be left out, a list of POS tags or a single POS tag.')
            return
        
    # Gets triggered if a single string is given
    if isinstance(texts, str) and len(texts):
        processed_text = nlp(texts.lower())
        lemmatized_text = [word.lemma_.lower() for word in processed_text if word.pos_ in POStags and not word.is_punct and not word.is_stop]
        regexed_text = [re.sub(r'\W+', '', word) for word in lemmatized_text]
    
    # Gets triggered if an array of strings is given
    elif isinstance(texts, (pd.Series, list)) and len(texts):
        processed_text = [text for text in tqdm(nlp.pipe(texts, n_process=-1, disable=["ner", "parser"]), total=len(texts))]
        lemmatized_text = [[word.lemma_.lower() for word in text if word.pos_ in POStags and not word.is_punct and not word.is_stop] for text in processed_text]
        regexed_text = [[re.sub(r'\W+', '', word) for word in text] for text in lemmatized_text]
    
    else:
        print('Your provided text could not be processed. Check if the format of your provided text is either a string or a list of strings.')
        return
    
    return regexed_text

lemmatized_text = lemmatize(parismilan[:10])

  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
def vectorize(lemmatized_text, MIN_DF = 1, MAX_DF = 0.6):
    # MIN_DF minium document frequency
    # MAX_DF maximum document frequency

    # Get vocabulary
    dictionary = Dictionary(lemmatized_text)
    dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF)
    
    corpus = [dictionary.doc2bow(text) for text in lemmatized_text]
    
    return(dictionary, corpus)

In [17]:
from gensim.models.wrappers import LdaMallet
PATH_TO_MALLET = r'C:/mallet/bin/mallet.bat'

def train_model(lemmatized_text, dictionary=[], corpus=[], MIN_DF = 1, MAX_DF = 0.6, N_TOPICS = 5, N_ITERATIONS = 1000):
    # usually 1000 iterations will do
    
    # Call vectorization function if either dictionary or corpus is missing as parameter
    if not type(dictionary) == gensim.corpora.dictionary.Dictionary or not corpus:
        dictionary, corpus = vectorize(lemmatized_text, MIN_DF, MAX_DF)
    
    print(N_TOPICS)
    
    lda_model = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                optimize_interval=10,
                iterations=N_ITERATIONS)
    
    coherence_score = CoherenceModel(model=lda_model, texts=lemmatized_text, dictionary=dictionary, coherence='c_v').get_coherence()
    
    return(lda_model, coherence_score, dictionary, corpus)

## Word Topic Distribution

In [43]:
MAX_WORDS = 10

for i in range(N_TOPICS):
    words = model.show_topic(i, topn=MAX_WORDS)
    print([(x[0], round(x[1], 3)) for x in words])

[('city', 0.022), ('barcelona', 0.022), ('napoleon', 0.016), ('la', 0.016), ('international', 0.011), ('london', 0.011), ('real', 0.011), ('owner', 0.011), ('germain', 0.011), ('galaxy', 0.011)]
[('card', 0.048), ('madrid', 0.029), ('napoleon', 0.029), ('match', 0.029), ('red', 0.029), ('war', 0.029), ('play', 0.019), ('request', 0.01), ('beckham', 0.01), ('peacemaker', 0.01)]


## Document Topic Distribution

In [46]:
transformed_docs = model.load_document_topics()

for i, document in enumerate(transformed_docs):
    print('Topic distributions for document {}'.format(i))
    print('\t', [topic for topic in document])

Topic distributions for document 0
	 [(0, 0.4337883703631253), (1, 0.1628256929806356), (2, 0.1773988097267572), (3, 0.225987126929482)]
Topic distributions for document 1
	 [(0, 0.40355466170712806), (1, 0.21564374468325456), (2, 0.15102766396483033), (3, 0.229773929644787)]
Topic distributions for document 2
	 [(0, 0.39401957732231), (1, 0.220972288904759), (2, 0.1703822301782096), (3, 0.21462590359472147)]
Topic distributions for document 3
	 [(0, 0.3668454753226894), (1, 0.20059171240504356), (2, 0.1567951791783993), (3, 0.2757676330938677)]
Topic distributions for document 4
	 [(0, 0.3874072337391567), (1, 0.22110848103015396), (2, 0.1699773841879418), (3, 0.2215069010427475)]
Topic distributions for document 5
	 [(0, 0.38747894997849935), (1, 0.19766838356049), (2, 0.17198409777825072), (3, 0.24286856868275988)]
Topic distributions for document 6
	 [(0, 0.40942740417301976), (1, 0.263816158685384), (2, 0.16178818539519574), (3, 0.16496825174640056)]
Topic distributions for docume

## Sample Text

In [32]:
parismilan[:10]

["There are plenty of air connections between Yerevan and other regional cities, including Athens, Barcelona, Beirut, Berlin, Bucharest, Brussels, Damascus, Doha, Dubai, Istanbul, Kyiv, Kuwait City, London, Milan, Minsk, Moscow, Paris, Prague, Riga, Rome, Tehran, Tel-Aviv, Tbilisi, Vienna, Venice and Warsaw, as well as daily connections to most major cities within the CIS region. Statistics show that the number of tourists arriving in the country by air transportation increases yearly. In 2018, passenger flow at the two main airports of Armenia reached a record high of 2,856,673 million people. In December 2019, yearly passenger flow exceeded 3,000,000 million people for the first time in Armenia's history.",
 "In 1478, Guido Antonio Vespucci led a Florentine diplomatic mission to Paris and invited his younger cousin, Amerigo Vespucci, to join him. Amerigo's role is not clear, but it was likely as an attache or private secretary. Along the way they had business in Bologna, Milan, and L

## Visualisation

In [20]:
pyLDAvis.enable_notebook()

dictionary, corpus = vectorize(lemmatized_text, MIN_DF=1, MAX_DF=0.6)
model, coherence, dictionary, corpus = train_model(lemmatized_text=lemmatized_text, dictionary=[], corpus=[], N_TOPICS=4)

lda_conv = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
gensimvis.prepare(lda_conv, corpus, dictionary)

4


  result = func(self.values, **kwargs)
  default_term_info = default_term_info.sort_values(
  result = func(self.values, **kwargs)


In [2]:
# 

# MIN_DF = 1 # minium document frequency
# MAX_DF = 0.6 # maximum document frequency

# tokenized_texts = [['politics', 'is', 'quite', 'intresting', 'but', 'polticians', 'are','not'],
#                     ['cats', 'like', 'to', 'explore'],
#                     ['johnson', 'trump', 'politicians', 'of', 'the', 'new', 'era'] ,
#                      ['war', 'is', 'not', 'essential', 'to', 'politics'],
#                      ['dogs', 'and', 'cats', 'fight', 'sometimes'],
#                      ['where', 'can', 'i', 'buy', 'guns', 'for', 'the', 'war'],
#                     ['my','favorite', 'animal', 'is', 'my', 'cat'],
#                     ['my', 'least', 'favorite', 'my', 'dog'],
#                     ['world', 'war', 'two', 'started', 'in', '1939']
#                     ]
# dictionary = Dictionary(tokenized_texts) # get the vocabulary
# dictionary.filter_extremes(no_below=MIN_DF, 
#                            no_above=MAX_DF)
# corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

In [3]:

# #models.wrappers import LdaMallet

# PATH_TO_MALLET = r'C:/mallet/bin/mallet.bat'

# N_TOPICS = 2 # k
# N_ITERATIONS = 1000 # usually 1000 will do

# lda = LdaMallet(PATH_TO_MALLET,
#                 corpus=corpus,
#                 id2word=dictionary,
#                 num_topics=N_TOPICS,
#                 optimize_interval=10,
#                 iterations=N_ITERATIONS)