# Matrix Construction

In [1]:
# load packages
import re, string
import pandas as pd
import numpy as np
import os
import time
from tqdm.notebook import tqdm
import unidecode
import matplotlib.pyplot as plt
%matplotlib inline

# import from script
from preprocessing_functions import *

In [113]:
# count = 0
# for city_1, city_2 in city_link:
#     print(f"{city_1}, {city_2}, ({city_link[(city_1, city_2)]})")
#     count += 1
#     if count > 20:
#         break

## Create Language Dictionary

In [2]:
language = 'fr'

In [3]:
# cities dataframe
fp = '../input/List_of_cities_300k.csv' # path to csv with city information
cities = pd.read_csv(fp, sep=';')
name_col = f'Mua_{language}'

# list of complete city names
city_l = [unidecode.unidecode(city) for city in cities[name_col]]

In [4]:
indir = f'../../../data/'
language = 'fr' # ['fr', 'en']
inputfp = os.path.join(indir, f'{language}wiki/')


In [5]:
#- loop over .csv files create dataframes
df = pd.DataFrame(columns = ['article_id', 'title', 'text'])
# iterate over directory for each file path, create a dataframe
for file in os.listdir(inputfp):
    fp = os.path.join(inputfp, file)
    print(fp)
    df_temp = pd.read_csv(fp)
    df = pd.concat([df, df_temp])

../../../data/frwiki/frwikidump_ms5.csv
../../../data/frwiki/frwikidump_ms4.csv
../../../data/frwiki/frwikidump_ms6.csv
../../../data/frwiki/frwikidump_ms7.csv
../../../data/frwiki/frwikidump_ms3.csv
../../../data/frwiki/frwikidump_ms2.csv
../../../data/frwiki/frwikidump_ms1.csv
../../../data/frwiki/frwikidump_ms9.csv
../../../data/frwiki/frwikidump_ms8.csv
../../../data/frwiki/frwikidump_ms11.csv
../../../data/frwiki/frwikidump_ms10.csv
../../../data/frwiki/frwikidump_ms12.csv
../../../data/frwiki/frwikidump_ms13.csv


In [6]:
df.shape

(274318, 3)

In [7]:
df.head()

Unnamed: 0,article_id,title,text
0,2977301,Antoine Marie Philippe Asinari de Saint-Marsan,\nAntoine Marie Philippe Asinari de Saint-Mars...
1,2977313,Anne de Montafie,"\nAnne de Montafie\n\nAnne de Montafie, comtes..."
2,2977316,Grand Siecle (histoire de France),\nGrand Siecle (histoire de France)\n\nLe term...
3,2977365,Alessandro Benedetti,\nAlessandro Benedetti\n\nAlessandro Benedetti...
4,2977409,Institut wallon de formation en alternance et ...,\nInstitut wallon de formation en alternance e...


In [None]:
t0 = time.time()

french_matrix = process_corpus(df.text, city_l)

t1 = time.time()
total = t1-t0
print(f"It took {total}s to process the corpus.")


In [None]:
# save french matrix
outdir = "../output/"
filename = "french_matrix_20220523.csv"

def write_matrix(matrix, outdir, filename): 
    outfp = os.path.join(outdir, filename)
    
    if os.path.exists(outfp):
        print(f"File {outfp} already exists.")
        print("Are you sure you want to continue and overwrite the file?")
        decision = input('Continue? [y/n]')
        if decision == 'y':
            df.to_csv(outputfp, index = True)
            print(f"Matrix has been written to: {outfp}")
        elif decision == 'n': 
            print("The process has been halted.")
        else:
            print("You did not enter a valid option.\nThe process has halted.")
    else:
        df.to_csv(outputfp, index = True)
        print(f"Matrix has been written to: {outfp}")
        
    return

In [None]:
# create french city_link dictionary
citylink_fr = create_citylink(french_matrix)

# create df based on this dictionary
dictionary = citylink_fr

city_pairs = pd.DataFrame(columns = ["city1", "city2", "co-occurence"])
i = 0
for element1, element2 in dictionary: 
    city_pairs[["city1", "city2", "co-occurence"]][i] = element1, element2, citylink_fr[(element1, element2)]
    i += 1

***

In [130]:
def create_city_dict(city_list):
    """function that creates a dictionary of name variants to the standard form"""
    
    city_dict = dict()
    for city in city_list:
        keys = city.split('-')
        for key in keys:
            city_dict[key] = city
            
    return city_dict

def city_matrix(city_list):
    matrix = np.zeros((len(city_list), len(city_list)))
    matrix = pd.DataFrame(matrix, columns = city_list)
    matrix['index'] = city_list
    matrix.set_index('index', inplace = True)
    
    return matrix

## Matrix

- loop over .csv files create dataframes
- split df.text into paragraph lists
- loop over paragraph lists with window
- (tokenize here)
- count occurences of cities within window
- create co-occurences based on occurences

- trial with 10 cities on like 1 ms

<font color='red'>IF THE TEXT IS TOKENIZED BEFORE DOING THE CITY COMPARISON THAN MULTIPLE WORD CITIES WILL NOT BE FOUND!!</font>

In [141]:
# for paragraph in processed_paragraph:
def city_appearance(text, dictionary):
    
    # instantiate empty list of standardised city names and city name variations
    cities_variants = []
    cities_standard = []
    # for each word in the text check if the word is a key word in the dictionary(one of the variants)
    for word in dictionary:
        pattern = r"\b" + word + r"\b" #add word boundaries to dictionary word
        match = re.search(pattern, text)
        if match:
            cities_variants.append(word)
            
    # for each word in the variant replace name with the standard form
    for city in cities_variants:
        city_standard = city.replace(city, dictionary[city])
        cities_standard.append(city_standard)
    
    return cities_variants, cities_standard        

In [142]:
# - split df.text into paragraph lists
# for each article in the dataframe create list of paragraphs



def process_article(article, dictionary, matrix):
    paragraphs = article.splitlines()
    for paragraph in paragraphs:
        if not paragraph:
            continue
    
        cities_variants, cities_standard = city_appearance(paragraph, dictionary)

        if len(set(cities_standard)) < 2: 
            continue
            
        else: 
            # create the co-occurences that appear
            for city_i in cities_standard:
                for city_j in cities_standard:
                    if city_i != city_j: 
                        matrix.at[city_i, city_j] += 1
            

    return matrix

In [159]:
def process_corpus(corpus, city_list): 
    """function that processes the entire corpus and creates co-occurence matrix"""
    
    dictionary = create_city_dict(city_list)
    matrix = city_matrix(city_list)
    
    for article in corpus:
        process_article(article, dictionary, matrix)
    
    return matrix

In [161]:
t0 = time.time()

new_matrix = process_corpus(df.text, city_l)

t1 = time.time()
total = t1-t0
print(f"It took {total}s to process the corpus.")


It took 176.7749147415161s to process the corpus.


In [165]:
# create city_links on the basis of matrix
    # loop on basis of tongjing's code

def create_citylink(matrix): 
    city_link = {}
    for i in range(len(matrix)-1):
        for j in range(i+1, len(matrix)-1):
            city_link[(matrix.index[i], matrix.columns[j])] = matrix.iloc[i,j]
    return city_link

In [124]:
# # - loop over paragraph lists with window
# window = 2
# ls = ['hello', 'goodbye', 'never again'] # should be para list
# for n in range(len(ls)-window +1): 
#     print(ls[n:n+window])
#     # here combine the two paragraphs together

In [None]:
# - count occurences of cities within window
# count city_name in paragraph_window

In [None]:
# - create co-occurences based on occurences
# cooccurence[citya_cityb] = max(count(city_a), count(city_b)) only if city_a !=city_b

In [None]:
# - trial with 10 cities on like 1 ms