<div style="background-color: lightgreen; border-radius: 5px; padding: 10px;">
    <h4>Occurence Finding, Paragraphs Collection and Matrix Construction</h4>
    <p>...</p>
</div>

In [13]:
# !pip install Unidecode

## Packages and Loading Files

In [3]:
# Load Packages
import os
import time
import re

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import unidecode


# Import Scripts

## Functions

In [None]:
# function to read all files from a directory
def read_stream(indir):
    """
    Function to read all the files in a directory (or nested directories)
    containing wikidump extracts.
    Returns a list of strings each element representing the entire contents
    of a single file

        Parameters:
        -----------
        indir (str): path to a directory containing text files
                     or directories with text files

    """
    wikidump = []
    t0 = time.time()

    for root, dirs, files in os.walk(indir):

        for filename in files:
            if not filename.startswith("."):
                fp = os.path.join(root, filename)

                with open(fp, 'r') as f:
                    wikidump.append(f.read())

    t1 = time.time()

    total = t1-t0
    print(f"It took {total}s to read {indir}.")

    return wikidump

In [40]:
# function to split dumps into flat list
def split_dump(input_dump, split_pattern = "c>"):
    """
    splits list of wikidump documents into a flat list of articles

        Parameters:
        -----------
            input_dump:    a list of strings
            split_pattern: str, optional
                string pattern at which the strings
                should be split into articles. default = 'c>'

    """

    article_list = [
        article for dump
        in tqdm(input_dump, total = len(input_dump), desc = "Progress split_dump()")
        for article in dump.split(split_pattern)]

    return article_list

In [39]:
def process_dump2(dump, key_words, message = True):
    """extracts titles and ids from articles containing key words and returns as a list"""

    articles = []
    for article in tqdm(dump, total = len(dump), desc = "Progress process_dump()"):
        article = unidecode.unidecode(article)
        if (list_in_corpus(key_words, article)):
            try:
                article_id = find_id(article)
                title = find_title(article)
                article_body = find_article(article)
                articles.append((article_id, title, article_body))
            except:
                pass
        else:
            pass

    if message:
        print(f"{len(articles)} articles out of {len(dump)} contain 2 toponyms")

    return articles

In [41]:
# function integrating the other functions
def preprocess(base_dir, outdir, key_words, remove_referral=True, overwrite_protection=True):
    """
        params:
            base_dir:             str;
                path to directory where extracted wikidump files can be found
            outdir:               str;
                path where processed files will be saved to (one file per multistream)
            language:             str;
                one of the following ['en', 'fr']
            key_words:            str, list;
                list of strings which must be included in article
            remove_referral:      bool, optional; default is True.
                if True referral pages will be removed
            overwrite_protection: bool, optional; default is True.
                if True confirmation will be asked before overwriting files
    """

    # establish that a valid language was chosen, if not abort function:
#     lang_list = ['fr', 'en']
#     if language not in lang_list:
#         print(f"Invalid language was chosen. \n Please choose one of the following: {lang_list}")
#         return

    # creating an output directory
    outdir = os.path.join(outdir, 'enwiki/')

    if not os.path.exists(outdir):
        os.mkdir(outdir)
        print(f'created directory at: {outdir}')


    # list of multistream directories in base_dir
    dir_list = os.listdir(base_dir)


#     for directory in dir_list:
    for directory in tqdm(dir_list, total = len(dir_list), desc = "Progress Total"):
        dir_fp = os.path.join(base_dir, directory)
        
        if not directory.startswith("."):
            print(f"\nStarting preprocessing on: {dir_fp}")
            wikidump = read_stream(dir_fp) # read the files in the directory

            wikidump = split_dump(wikidump) # split the files
            wikidump = process_dump2(wikidump, key_words) # extract id, title, article

            df = pd.DataFrame(wikidump, columns = ['article_id', 'title', 'text'])

            if remove_referral:
                try:
                    df['length'] = [len(text.split()) for text in df.text]
                    df['length_title'] = [len(title.split()) for title in df.title]
                    n_referral = len(df[df.length == df.length_title])

                    df = df[['article_id', 'title', 'text']][df.length != df.length_title]
                    print(f"Removing {n_referral} referral pages")

                except:
                    print(f"Referral pages were not removed from multistream {directory}")
                    pass


            # saving the output
            outfile = f'enwikidump_{directory}.csv'
            outputfp = os.path.join(outdir, outfile)

            # call write_outputcsv function
            write_outputcsv(df, outputfp, overwrite_protection = overwrite_protection)
        else:
            print(f"Skipping: {dir_fp}")

    print(f"----------\nFiles in {base_dir} have been processed\n----------")

    return

In [50]:
# matrix generation related functions
def create_city_dict(city_list):
    """
    function that creates a dictionary of name variants to the standard form
    output: a dictionary where the keys are variant names and the values are
    standard names.
    """

    # instantiate dictionary
    city_dict = dict()

    # split up the city names in the city list where a '-' occurs
    # (the symbol used to split separate placenames)
    for city in city_list:
        keys = city.split('-')
        if len(keys) > 1:
            keys.append(city)
        for key in keys:
            city_dict[key] = city

    return city_dict

In [51]:
def city_matrix(city_list):
    """generates an empty matrix with the index/columns consisting of the city names"""

    # create zero matrix with the correct dimensions
    matrix = np.zeros((len(city_list), len(city_list)))

    # transform into dataframe with the columns and index set to the list of cities
    matrix = pd.DataFrame(matrix, columns = city_list)
    matrix['index'] = city_list
    matrix.set_index('index', inplace = True)

    return matrix

In [54]:
def city_appearance(text, dictionary):
    """function to check which placenames appear in the input text per paragraph"""

    # instantiate empty list of standardised city names and city name variations
    cities_variants = []
    cities_standard = []

    # for each word in the text check if the word is a key word in the dictionary(one of the variants)
    for word in dictionary:
        pattern = r"\b" + word + r"\b" #add word boundaries to dictionary word
        match = re.search(pattern, text)
        if match:
            cities_variants.append(word)

    # for each word in the variant replace name with the standard form
    for city in cities_variants:
        city_standard = city.replace(city, dictionary[city])
        cities_standard.append(city_standard)

    return cities_variants, cities_standard

In [52]:
def process_article(article, dictionary, matrix):
    """IMPROVE DOC string
    function that processes each article in order to update the co-occurence values
    in a co-occurence matrix"""

    # split article into paragraphs (by using '\n' as end of paragraph)
    paragraphs = article.splitlines()
    for paragraph in paragraphs:
        
        # if paragraph empty skip
        if not paragraph:
            continue

        # generate list of cities that appear in the paragraph
        cities_variants, cities_standard = city_appearance(paragraph, dictionary)

        # skip if fewer than 2 cities appear
        if len(set(cities_standard)) < 2:
            continue

        else:
            # create the co-occurences that appear
            for city_i in cities_standard:
                for city_j in cities_standard:
                    if city_i != city_j: # make sure cities don't co-occure with themselves
                        matrix.at[city_i, city_j] += 1 # update value in matrix

    return matrix

In [67]:
def process_corpus(corpus, city_list):
    """function that processes the entire corpus and creates co-occurence matrix"""

    # generate dictionary and matrix and paragraphs dataframe
    dictionary = create_city_dict(city_list)
    matrix = city_matrix(city_list)
    
    # loop over each article in the corpus and update the matrix
    for article in tqdm(corpus, total = len(corpus), desc = "Articles processed"):
        process_article(article, dictionary, matrix)

    return matrix

In [55]:
def write_matrix(matrix, outdir, filename):
    """function to write matrix to csv"""
    outfp = os.path.join(outdir, filename)

    if os.path.exists(outfp):
        print(f"File {outfp} already exists.")
        print("Are you sure you want to continue and overwrite the file?")
        decision = input('Continue? [y/n]')
        if decision == 'y':
            matrix.to_csv(outfp, index = True)
            print(f"Matrix has been written to: {outfp}")
        elif decision == 'n':
            print("The process has been halted.")
        else:
            print("You did not enter a valid option.\nThe process has halted.")
    else:
        matrix.to_csv(outfp, index = True)
        print(f"Matrix has been written to: {outfp}")

    return


## Variables

In [26]:
fp = '../../input/List_of_cities_300k.csv' # path to csv with city information
cities = pd.read_csv(fp, sep=';')

name_col = f'Mua_en'
cities_list = [unidecode.unidecode(city_component) 
             for city in cities[name_col] 
             for city_component in city.split('-')]

In [5]:
data_dir = '../../../../data_clean/' # directory where selected articles will be saved, change if you want to save these elsewhere
out_dir = 'output/'
in_dir = '../../input/'
# extr_dir = path/to/wikidump/extracted

## Discard Unrelated Articles
This creates multiple .csv files with the 'title', 'title', 'text' of wikipedia articles with at least 2 toponym ocurrences.

In [None]:
wiki_dir = f"enwiki_extracted" # language specific directory
base_dir = os.path.join(extr_dir, wiki_dir)

# do the whole preprocessing/extraction thing
preprocess(base_dir, data_dir, lang, cities_list, overwrite_protection = False)

## Create Co-Occurrence Matrix
Iterates over the .csv files created in the previous chunk to identify toponym co-occurences and add these to a matrix. Function documentation within `preprocessing_functions.py`

In [43]:
# list of complete city names
city_l = [unidecode.unidecode(city) for city in cities[name_col]]

In [68]:
# # create matrix
# matrix = process_corpus(df, city_l)

# # save matrix
# FILENAME = f"en_matrix.csv"

# #write_matrix(matrix = matrix, 
# #             outdir = out_dir, 
# #             filename = FILENAME)

Articles processed:   0%|          | 0/508671 [00:00<?, ?it/s]

article_id
title
text


## Articles with city pair co-occurences

In [66]:
# folder of extracted streams
inputfp = os.path.join(data_dir, f'enwiki/')

#- loop over .csv files create dataframes
df = pd.DataFrame(columns = ['article_id', 'title', 'text'])

# iterate over directory for each file path, create a dataframe
for file in os.listdir(inputfp):
    fp = os.path.join(inputfp, file)
    df_temp = pd.read_csv(fp)
    df = pd.concat([df, df_temp])

df

Unnamed: 0,article_id,title,text
0,30143,Economy of Togo,\nEconomy of Togo\n\nThe economy of Togo has s...
1,30159,History of Tonga,\nHistory of Tonga\n\nThe history of Tonga is ...
2,30169,History of Trinidad and Tobago,\nHistory of Trinidad and Tobago\n\nThe histor...
3,30178,Tromelin Island,"\nTromelin Island\n\nTromelin Island (; , ) is..."
4,30212,Economy of Turkmenistan,\nEconomy of Turkmenistan\n\nThe economy of Tu...
...,...,...,...
17413,3963512,Elchonon Wasserman,\nElchonon Wasserman\n\nElchonon Bunim Wasserm...
17414,3963783,Naipes Heraclio Fournier,\nNaipes Heraclio Fournier\n\nNaipes Heraclio ...
17415,3963794,Banovina of Croatia,\nBanovina of Croatia\n\nThe Banovina of Croat...
17416,3963903,II Corps (Australia),\nII Corps (Australia)\n\nII Corps was an Aust...


Turning dataframe into a dictionary to improve computational speed (see speed tests at the bottom of this notebook).

In [74]:
%%time 
df_dict = df.to_dict('records')
sorted_df_dict = sorted(df_dict, key=lambda d: d['article_id']) 
# paragraphs_df = pd.DataFrame(columns= ['city_pair', 'paragraph_id', 'paragraph', 'article_id', 'title'])

CPU times: total: 1.33 s
Wall time: 1.32 s


### Chunk up articles

In [240]:
num, div = len(sorted_df_dict), 5
chunks = [num // div + (1 if x < num % div else 0)  for x in range (div)]
cum_chunks = [0]

for i, x in enumerate(chunks):
    cum_chunks.append(sum(chunks[:i+1]))

In [241]:
chunks_min_max = list(zip(cum_chunks, cum_chunks[1:]))
chunks_min_max

[(0, 101735),
 (101735, 203469),
 (203469, 305203),
 (305203, 406937),
 (406937, 508671)]

In [106]:
# Get date/time of code running, could be useful
# str(time.time()).split('.')[0]

'1659534902'

## Collecting paragraphs with city pair co-occurences

In [260]:
%%time

dictionary = create_city_dict(city_l)
paragraphs_df = pd.DataFrame(columns= ['city_1', 'city_2', 'paragraph_id', 'paragraph', 'article_id', 'title'])

count = 0
for chunk in tqdm(chunks_min_max):
    list_of_paragraphs=[]
    file_path = f"paragraphs_{chunk[0]}_{chunk[1]}.csv"
    
    if os.path.exists(file_path):
        print(f"{file_path} already exists.")
        continue
        
    for row in tqdm(sorted_df_dict[chunk[0]:chunk[1]], desc = f"{chunk[0]} - {chunk[1]}"):
        # split article into paragraphs (by using '\n' as end of paragraph)
        paragraphs = row['text'].splitlines()
        for paragraph in paragraphs:

            # if paragraph empty skip
            if not paragraph:
                continue

            # generate list of cities that appear in the paragraph
            cities_variants, cities_standard = city_appearance(paragraph, dictionary)

            # skip if fewer than 2 cities appear
            if len(set(cities_standard)) < 2:
                continue

            for city_i in cities_standard:
                for city_j in cities_standard:
                    if city_i != city_j:
                        count += 1
                        list_of_paragraphs.append([city_i, city_j, count, paragraph, row['article_id'], row['title']])
                    
    temp_paragraphs_df = pd.DataFrame(columns= ['city_1', 'city_2', 'paragraph_id', 'paragraph', 'article_id', 'title'], data = list_of_paragraphs)
    temp_paragraphs_df.to_csv(f"paragraphs_{chunk[0]}_{chunk[1]}.csv", index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

0 - 101735:   0%|          | 0/101735 [00:00<?, ?it/s]

101735 - 203469:   0%|          | 0/101734 [00:00<?, ?it/s]

203469 - 305203:   0%|          | 0/101734 [00:00<?, ?it/s]

305203 - 406937:   0%|          | 0/101734 [00:00<?, ?it/s]

406937 - 508671:   0%|          | 0/101734 [00:00<?, ?it/s]

CPU times: total: 2h 55min 2s
Wall time: 2h 57min 28s


## Merge all paragraphs

In [263]:
all_paragraphs_df = pd.DataFrame(columns= ['city_1', 'city_2', 'paragraph_id', 'paragraph', 'article_id', 'title'])

for chunk in chunks_min_max:
    temp_df2 = pd.read_csv(os.path.join(data_dir, "paragraphs", f"paragraphs_{chunk[0]}_{chunk[1]}.csv"))
    all_paragraphs_df = pd.concat([paragraphs_df, temp_df2])

In [265]:
all_paragraphs_df['paragraph'].count()

2076404

In [7]:
# last_scanned_article = temp_paragraphs_df.iloc[-1].article_id
# len(list_of_paragraphs)
subset_paragraphs_df = pd.read_csv(os.path.join(data_dir, "paragraphs", f"paragraphs_0_101735.csv"))

In [8]:
subset_paragraphs_df

Unnamed: 0,city_1,city_2,paragraph_id,paragraph,article_id,title
0,Birmingham,Florence,1,The first community of adherents of the Baha'i...,303,Alabama
1,Florence,Birmingham,2,The first community of adherents of the Baha'i...,303,Alabama
2,Paris,London,3,A major revision of the work by composer and a...,309,An American in Paris
3,London,Paris,4,A major revision of the work by composer and a...,309,An American in Paris
4,Madrid,Rome,5,Access to biocapacity in Algeria is lower than...,358,Algeria
...,...,...,...,...,...,...
488331,Dublin,London,488332,Allman-Smith played hockey for Dublin Universi...,3001932,Edward Allman-Smith
488332,London,Dublin,488333,O'Kelly and Condell met in Dublin in 1969 and ...,3001953,Tir na nOg (band)
488333,Dublin,London,488334,O'Kelly and Condell met in Dublin in 1969 and ...,3001953,Tir na nOg (band)
488334,Birmingham,Dublin,488335,"Tir na nOg reformed in 1985, releasing the sin...",3001953,Tir na nOg (band)


## Extra: Speed Comparison

In [232]:
%%time

# dictionary iteration
for row in tqdm(df_dict):
    pass

  0%|          | 0/508671 [00:00<?, ?it/s]

CPU times: total: 188 ms
Wall time: 166 ms


In [80]:
%%time

# numpy array iteration
for row in tqdm(df.values):
    pass

  0%|          | 0/508671 [00:00<?, ?it/s]

CPU times: total: 250 ms
Wall time: 201 ms


In [81]:
%%time

# itertuples
for row in tqdm(df.itertuples()):
    pass

0it [00:00, ?it/s]

CPU times: total: 453 ms
Wall time: 494 ms


In [82]:
%%time

# iterrows
for row in tqdm(df.iterrows()):
    pass

0it [00:00, ?it/s]

CPU times: total: 12.2 s
Wall time: 12.4 s
