# Matrix Construction

In [1]:
# load packages
import re, string
import pandas as pd
import numpy as np
import os
import time
from tqdm.notebook import tqdm
import unidecode
import matplotlib.pyplot as plt
%matplotlib inline

# import from script
from preprocessing_functions import *

## Create Language Dictionary

In [2]:
# Wikipedia language edition to process; supported codes: 'fr', 'en'
language = "fr"

In [3]:
# Name of the column that holds the city names for the selected language
name_col = 'Mua_{}'.format(language)

# Load the cities table (semicolon-separated csv with city information)
fp = '../../input/List_of_cities_300k.csv'
cities = pd.read_csv(fp, sep=';')

# Complete city names, each passed through unidecode
city_l = list(map(unidecode.unidecode, cities[name_col]))

In [4]:
# Root data directory and the language-specific wiki dump folder inside it
indir = '../../../../data/'
inputfp = os.path.join(indir, '{}wiki/'.format(language))


In [5]:
#- loop over the .csv files in the input directory and combine them into one dataframe
# The per-file frames are collected first and concatenated once: calling
# pd.concat inside the loop re-copies every previously loaded row on each
# iteration, and seeding the result with an empty frame forces object dtypes.
frames = []
for file in os.listdir(inputfp):
    # skip directory entries that are not csv dumps (e.g. checkpoint folders)
    if not file.lower().endswith('.csv'):
        continue
    fp = os.path.join(inputfp, file)
    print(fp)
    frames.append(pd.read_csv(fp))

if frames:
    # per-file row indices are kept as-is (no ignore_index), as before
    df = pd.concat(frames)
else:
    # no csv files found: fall back to an empty frame with the expected columns
    df = pd.DataFrame(columns = ['article_id', 'title', 'text'])

../../../../data/frwiki/frwikidump_ms5.csv
../../../../data/frwiki/frwikidump_ms4.csv
../../../../data/frwiki/frwikidump_ms6.csv
../../../../data/frwiki/frwikidump_ms7.csv
../../../../data/frwiki/frwikidump_ms3.csv
../../../../data/frwiki/frwikidump_ms2.csv
../../../../data/frwiki/frwikidump_ms1.csv
../../../../data/frwiki/frwikidump_ms9.csv
../../../../data/frwiki/frwikidump_ms8.csv
../../../../data/frwiki/frwikidump_ms11.csv
../../../../data/frwiki/frwikidump_ms10.csv
../../../../data/frwiki/frwikidump_ms12.csv
../../../../data/frwiki/frwikidump_ms13.csv


In [6]:
# (number of rows, number of columns) of the combined dataframe
df.shape

(274318, 3)

In [7]:
# preview the first rows of the combined dataframe
df.head()

Unnamed: 0,article_id,title,text
0,2977301,Antoine Marie Philippe Asinari de Saint-Marsan,\nAntoine Marie Philippe Asinari de Saint-Mars...
1,2977313,Anne de Montafie,"\nAnne de Montafie\n\nAnne de Montafie, comtes..."
2,2977316,Grand Siecle (histoire de France),\nGrand Siecle (histoire de France)\n\nLe term...
3,2977365,Alessandro Benedetti,\nAlessandro Benedetti\n\nAlessandro Benedetti...
4,2977409,Institut wallon de formation en alternance et ...,\nInstitut wallon de formation en alternance e...


In [8]:
# Time the corpus processing with time.perf_counter(): it is a monotonic,
# high-resolution clock, so the measured duration is not distorted by system
# clock adjustments during this long-running call (time.time() can be).
t0 = time.perf_counter()

# process the article texts against the list of city names
french_matrix = process_corpus(df.text, city_l)

t1 = time.perf_counter()
total = t1-t0
print(f"It took {total}s to process the corpus.")


Articles processed:   0%|          | 0/274318 [00:00<?, ?it/s]

It took 8747.869585990906s to process the corpus.


In [9]:
# display the matrix returned by process_corpus
french_matrix

Unnamed: 0_level_0,Paris,Londres,Madrid,Berlin,Milan,Barcelone,Athenes,Rome,Birmingham,Lisbonne,...,Cordoue,La Corogne,Craiova,Caserte,Coventry,Brasov,Bonn,La Valette,Gand,Gdynia
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Paris,0.0,10733.0,2404.0,4723.0,2713.0,1696.0,957.0,6116.0,171.0,807.0,...,44.0,39.0,24.0,5.0,22.0,12.0,329.0,51.0,753.0,5.0
Londres,10733.0,0.0,913.0,2484.0,1022.0,648.0,520.0,1673.0,712.0,428.0,...,14.0,7.0,1.0,2.0,159.0,2.0,115.0,20.0,189.0,4.0
Madrid,2404.0,913.0,0.0,482.0,730.0,2760.0,130.0,1051.0,28.0,500.0,...,181.0,173.0,2.0,11.0,4.0,2.0,31.0,3.0,46.0,3.0
Berlin,4723.0,2484.0,482.0,0.0,500.0,419.0,251.0,1129.0,54.0,175.0,...,5.0,6.0,7.0,3.0,15.0,6.0,670.0,0.0,71.0,10.0
Milan,2713.0,1022.0,730.0,500.0,0.0,591.0,138.0,2574.0,14.0,176.0,...,18.0,22.0,10.0,18.0,4.0,2.0,23.0,5.0,180.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Brasov,12.0,2.0,2.0,6.0,2.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bonn,329.0,115.0,31.0,670.0,23.0,25.0,20.0,100.0,1.0,12.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,1.0
La Valette,51.0,20.0,3.0,0.0,5.0,6.0,2.0,28.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gand,753.0,189.0,46.0,71.0,180.0,34.0,13.0,99.0,3.0,16.0,...,2.0,3.0,0.0,1.0,1.0,0.0,14.0,0.0,0.0,0.0


In [12]:
# Output directory and file name for the French matrix
OUTDIR = "../../output/"
FILENAME = "french_matrix_20220601.csv"

# Hand the matrix and its destination to write_matrix
write_matrix(matrix=french_matrix, outdir=OUTDIR, filename=FILENAME)

Matrix has been written to: ../../output/french_matrix_20220601.csv


In [13]:
# Read the saved matrix back from OUTDIR/FILENAME and use its 'index' column as the row index
french_matrix = pd.read_csv(os.path.join(OUTDIR, FILENAME)).set_index('index')

In [16]:
# show the first 20 rows of french_matrix
french_matrix.head(20)

Unnamed: 0_level_0,Paris,Londres,Madrid,Berlin,Milan,Barcelone,Athenes,Rome,Birmingham,Lisbonne,...,Cordoue,La Corogne,Craiova,Caserte,Coventry,Brasov,Bonn,La Valette,Gand,Gdynia
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Paris,0.0,10733.0,2404.0,4723.0,2713.0,1696.0,957.0,6116.0,171.0,807.0,...,44.0,39.0,24.0,5.0,22.0,12.0,329.0,51.0,753.0,5.0
Londres,10733.0,0.0,913.0,2484.0,1022.0,648.0,520.0,1673.0,712.0,428.0,...,14.0,7.0,1.0,2.0,159.0,2.0,115.0,20.0,189.0,4.0
Madrid,2404.0,913.0,0.0,482.0,730.0,2760.0,130.0,1051.0,28.0,500.0,...,181.0,173.0,2.0,11.0,4.0,2.0,31.0,3.0,46.0,3.0
Berlin,4723.0,2484.0,482.0,0.0,500.0,419.0,251.0,1129.0,54.0,175.0,...,5.0,6.0,7.0,3.0,15.0,6.0,670.0,0.0,71.0,10.0
Milan,2713.0,1022.0,730.0,500.0,0.0,591.0,138.0,2574.0,14.0,176.0,...,18.0,22.0,10.0,18.0,4.0,2.0,23.0,5.0,180.0,1.0
Barcelone,1696.0,648.0,2760.0,419.0,591.0,0.0,208.0,551.0,38.0,239.0,...,122.0,118.0,3.0,2.0,3.0,1.0,25.0,6.0,34.0,4.0
Athenes,957.0,520.0,130.0,251.0,138.0,208.0,0.0,690.0,21.0,76.0,...,3.0,8.0,1.0,2.0,7.0,1.0,20.0,2.0,13.0,4.0
Rome,6116.0,1673.0,1051.0,1129.0,2574.0,551.0,690.0,0.0,105.0,301.0,...,44.0,11.0,4.0,56.0,8.0,2.0,100.0,28.0,99.0,1.0
Birmingham,171.0,712.0,28.0,54.0,14.0,38.0,21.0,105.0,0.0,6.0,...,0.0,0.0,0.0,0.0,91.0,0.0,1.0,1.0,3.0,0.0
Lisbonne,807.0,428.0,500.0,175.0,176.0,239.0,76.0,301.0,6.0,0.0,...,28.0,34.0,0.0,0.0,3.0,0.0,12.0,2.0,16.0,2.0


In [14]:
# create french city_link dictionary from the matrix
# NOTE(review): create_citylink is not defined in this notebook (presumably it
# comes from the preprocessing_functions star import) -- confirm its contract.
# The result is iterated with .items() when it is written to disk.
citylink_fr = create_citylink(french_matrix)


In [15]:
# save dictionary
# NOTE: this rebinds FILENAME (previously the matrix csv name) and fp; any cell
# that reloads the matrix via FILENAME must be run before this one.
FILENAME = "citypairs_french_20220601.txt"
fp = os.path.join(OUTDIR, FILENAME)

# Write one "key: value" line per dictionary entry. The encoding is given
# explicitly so the file does not depend on the platform's default encoding.
with open(fp, 'w', encoding='utf-8') as f:
    for key, value in citylink_fr.items():
        f.write('%s: %s\n' % (key, value))


***

## Matrix

- ~loop over .csv files create dataframes~
- ~split df.text into paragraph lists~
- loop over paragraph lists with window
<font color='blue'>The window is currently a single paragraph. Open issue: how to avoid double counting within-paragraph occurrences when using a moving window (2-paragraph co-occurrences minus single-paragraph co-occurrences?).</font>
- ~(tokenize here)~ <font color='red'>IF THE TEXT IS TOKENIZED BEFORE DOING THE CITY COMPARISON, THEN MULTI-WORD CITY NAMES WILL NOT BE FOUND!!</font>
- count occurrences of cities within window
<font color='blue'>Currently placenames can only co-occur once per paragraph.</font>
- ~create co-occurrences based on occurrences~

- ~trial with 10 cities on like 1 ms~ (176 seconds)

##### <font color='blue'>Currently placenames can only co-occur once per paragraph.</font>
- Currently a boolean check is used: if the city is found in the paragraph, it counts +1. Instead of `re.search(CITYSTRING)` we could use `re.findall(CITYSTRING)` and then append the city name as often as `len(re.findall(CITYSTRING))`.




In [141]:
# for paragraph in processed_paragraph:
def city_appearance(text, dictionary):
    """Find which city-name variants from ``dictionary`` occur in ``text``.

    Every key of ``dictionary`` is searched for as a literal, case-sensitive
    string that is not directly preceded or followed by a word character.
    Each variant is reported at most once, however often it occurs.

    Parameters
    ----------
    text : str
        The text to search.
    dictionary : dict
        Maps each city-name variant (key) to its standard form (value).

    Returns
    -------
    cities_variants : list
        The keys of ``dictionary`` found in ``text``, in the dictionary's
        iteration order.
    cities_standard : list
        ``dictionary[variant]`` for every found variant, in the same order.
    """
    cities_variants = []
    for word in dictionary:
        # re.escape makes the name match literally: without it a '.' in a name
        # matches any character and an unbalanced '(' raises re.error.
        # The lookarounds behave like \b for names that start and end with a
        # word character, and also work for names with punctuation at an edge
        # (e.g. a name ending in ')'), where \b would never match.
        pattern = r"(?<!\w)" + re.escape(word) + r"(?!\w)"
        if re.search(pattern, text):
            cities_variants.append(word)

    # map every found variant to its standard form
    cities_standard = [dictionary[city] for city in cities_variants]

    return cities_variants, cities_standard


In [161]:
# Time the trial run with time.perf_counter(): it is a monotonic,
# high-resolution clock, so the measured duration is not distorted by system
# clock adjustments while the corpus is processed (time.time() can be).
t0 = time.perf_counter()

# process the article texts against the list of city names
new_matrix = process_corpus(df.text, city_l)

t1 = time.perf_counter()
total = t1-t0
print(f"It took {total}s to process the corpus.")


It took 176.7749147415161s to process the corpus.


In [124]:
# # - loop over paragraph lists with window
# window = 2
# ls = ['hello', 'goodbye', 'never again'] # should be para list
# for n in range(len(ls)-window +1): 
#     print(ls[n:n+window])
#     # here combine the two paragraphs together?

In [None]:
# - count occurrences of cities within window
# count city_name in paragraph_window

In [None]:
# - create co-occurrences based on occurrences
# cooccurrence[citya_cityb] = min(count(city_a), count(city_b)) only if city_a != city_b
# something along those lines, since that would give a co-occurrence of 2 in a case where amsterdam is mentioned thrice and rotterdam twice.