# Select relevant extracted wikidump files

In [1]:
import re
import pandas as pd
import os
import time
from tqdm.notebook import tqdm
import unidecode

## Functions

In [2]:
# define functions for extracting metadata
def find_id(string):
    """Extract the numeric article id from an article string.

    Returns the digits of the first ``id="<digits>"`` attribute found in
    ``string``.

    Parameters
    ----------
    string : str
        Article text containing an ``id="..."`` attribute.

    Returns
    -------
    int
        The id as an integer.

    Raises
    ------
    IndexError
        If ``string`` contains no ``id="<digits>"`` attribute.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    # The capture group returns the digits directly, so no quote slicing is needed.
    match = re.search(r'id="(\d+)"', string)
    if match is None:
        # Same exception type the previous ``re.findall(...)[0]`` lookup raised.
        raise IndexError("no id attribute found in article")

    return int(match.group(1))
 
def find_url(string):
    """Extract the first English-Wikipedia url from an article string.

    Parameters
    ----------
    string : str
        Article text containing a ``https://en.wikipedia.org/...`` url.

    Returns
    -------
    str
        The first matching url. The match covers letters, digits, ``?`` and
        ``=`` after the host and stops at any other character.

    Raises
    ------
    IndexError
        If ``string`` contains no matching url.
    """
    # Raw string avoids invalid escape sequences. The upper-case range is
    # 'A-Z': the former 'A-z' also matched '[', '\', ']', '^', '_' and '`'.
    url_pattern = r'https://en\.wikipedia\.org/[a-zA-Z?=\d]+'

    urls = re.findall(url_pattern, string)

    return urls[0]

def find_title(string):
    """Extract the title from an article string.

    Takes the first ``title="..."`` attribute in ``string`` and returns the
    text between its opening quote and the next quote.

    Parameters
    ----------
    string : str
        Article text containing a ``title="..."`` attribute.

    Returns
    -------
    str
        The title; an empty ``title=""`` attribute yields ``''``.

    Raises
    ------
    IndexError
        If ``string`` contains no ``title="`` attribute.
    ValueError
        If the attribute has no closing quote before the next ``>``.
    """
    # Everything from 'title="' up to (not including) the next '>'.
    fragment = re.findall(r'title="[^>]+', string)[0]

    # Split on the opening quote, then on the closing quote. The closing quote
    # is searched for right after the opening one (the previous code skipped
    # one character, so an empty title raised ValueError).
    _, _, remainder = fragment.partition('"')
    title, closing_quote, _ = remainder.partition('"')
    if not closing_quote:
        raise ValueError("title attribute has no closing quote")

    return title

In [3]:
# function that checks if key words in corpus
def list_in_corpus(list_of_words, text_corpus):
    """Check whether at least two entries of a word list occur in a text.

    Parameters
    ----------
    list_of_words : iterable of str
        Key words (e.g. city names) to look for.
    text_corpus : str
        Text to search. Matching is a plain substring test (``word in text``),
        so it is case-sensitive and also matches inside longer words.

    Returns
    -------
    bool
        True as soon as two entries of ``list_of_words`` have been found in
        ``text_corpus``, otherwise False.
    """
    required_hits = 2  # the corpus is only included if at least two words are mentioned
    hits = 0
    for word in list_of_words:
        if word in text_corpus:
            hits += 1
            if hits >= required_hits:
                # condition met: stop instead of walking the rest of the list
                return True
    return False

In [4]:
# function to split dumps into flat list

def split_dump(input_dump, key_words = None, split_pattern = "</doc>"):
    """
    Split a list of wikidump documents into a flat list of articles.

        Parameters:
        -----------
            input_dump:    a list of strings
                each string is the content of one extracted dump file
            key_words:     list of strings, optional
                currently unused by this function (no filtering happens here);
                kept so existing calls that pass it keep working
            split_pattern: str, optional
                string pattern at which the strings
                should be split into articles. default = '</doc>'

        Returns:
        --------
            list of strings, every piece produced by ``str.split`` for every
            dump, in order. This includes the piece after the last
            ``split_pattern`` of each dump.
    """

    article_list = []
    for dump in tqdm(input_dump, total = len(input_dump), desc = "Progress"):
        # str.split never drops pieces, so the text after the final delimiter
        # is kept as well
        article_list.extend(dump.split(split_pattern))

    return article_list

In [5]:
## old list_in_corpus function
# def list_in_corpus_dep(list_of_words, text_corpus):
#     inclusion = False
#     count = 0
#     for word in list_of_words:
#         if word in text_corpus:
#             count += 1 # add 1 every time a city name is in the corpus
#         else: 
#             pass
#     if count > 1: 
#         inclusion = True 
#         # thus the corpus is only marked for inclusion if 
#         # at least two cities from the list have been mentioned
#     return inclusion, count

In [12]:
def process_dump(dump, key_words, message = True):
    """Keep the articles that mention enough key words and extract their metadata.

    Every article is transliterated with ``unidecode`` and kept only if
    ``list_in_corpus(key_words, article)`` is true. Articles whose id or title
    cannot be extracted are skipped.

    Parameters
    ----------
    dump : list of str
        Articles to process.
    key_words : list of str
        Words passed to ``list_in_corpus`` to decide whether an article is kept.
    message : bool, optional
        If True (default), print how many articles remain.

    Returns
    -------
    list of tuple
        ``(article_id, title, article)`` for every kept article, where
        ``article`` is the transliterated text.
    """

    articles = []
    for article in tqdm(dump, total = len(dump), desc = "Progress"):
        article = unidecode.unidecode(article)

        # use the key words passed in, not a notebook-level global
        if not list_in_corpus(key_words, article):
            continue

        try:
            article_id = find_id(article)
            title = find_title(article)
        except (IndexError, ValueError):
            # id or title attribute missing/malformed: skip this article
            # (best effort), but let unrelated errors surface
            continue

        articles.append((article_id, title, article))

    if message:
        total = len(dump)
        # guard against an empty dump instead of dividing by zero
        share = round(((len(articles)/total)*100), 2) if total else 0.0
        print(f"After processing {len(articles)} articles remain, "
              f"that is {share}% "
              f"of the total number of articles ({total}) in this dump.")

    return articles

## Load Data

In [7]:
# load articles from dump
# path/to/wikidump_extracted: can be overridden with the WIKIDUMP_DIR
# environment variable so the notebook also runs on machines where the
# default volume is not mounted
indir = os.environ.get("WIKIDUMP_DIR", "/Volumes/NIJMAN/THESIS/enwiki_extracted")
wikidump = [] # initialize empty list

In [None]:
t0 = time.time()
fp_list = []
# start from an empty list so that re-running this cell does not append
# every file a second time
wikidump = []

# read every non-hidden file below `indir` into memory, one string per file
for root, dirs, files in os.walk(indir):

    for filename in files:
        if filename.startswith("."):
            continue  # skip hidden files (names starting with '.')

        fp = os.path.join(root, filename)

        # explicit encoding: the extracted text is expected to be UTF-8, and
        # the platform default encoding is not guaranteed to be
        with open(fp, 'r', encoding='utf-8') as f:
            wikidump.append(f.read())

t1 = time.time()

total = t1-t0
print(f"This took {total}s.")

In [None]:
# number of dump files that were read into memory
len(wikidump)

## City List

In [7]:
# load cities csv
fp = '../input/List_of_cities_300k.csv' # path to csv with city information

# the file is read as semicolon-separated
cities = pd.read_csv(fp, sep=';')


In [8]:
# list of all cities
name_col = 'Mua_Eng' # for french 'Mua_Fr'

# Build the flat key-word list in a single pass:
#   * rows without a name are skipped (NaN has no .split)
#   * combined names are split on '-', e.g. Essen-Oberhausen
#   * accents are removed so the names match the transliterated articles
cities_ls = []
for mua_name in cities[name_col].dropna():
    for city_component in str(mua_name).split('-'):
        city_component = unidecode.unidecode(city_component).strip() # remove accents
        # an empty string is a substring of every text, and a duplicate would
        # let a single city count as two hits, so both are left out
        if city_component and city_component not in cities_ls:
            cities_ls.append(city_component)


In [9]:
# show the final key-word list
cities_ls

['Paris',
 'London',
 'Madrid',
 'Berlin',
 'Milan',
 'Barcelona',
 'Athens',
 'Rome',
 'Birmingham',
 'Lisbon',
 'Naples',
 'Katowice',
 'Manchester',
 'Hamburg',
 'Budapest',
 'Bucharest',
 'Warsaw',
 'Stuttgart',
 'Vienna',
 'Munich',
 'Brussels',
 'Stockholm',
 'Frankfurt',
 'Cologne',
 'Copenhagen',
 'Valencia',
 'Turin',
 'Glasgow',
 'Prague',
 'Lyon',
 'Sofia',
 'Liverpool',
 'Porto',
 'Seville',
 'Dublin',
 'Helsinki',
 'Amsterdam',
 'Rotterdam',
 'Dusseldorf',
 'Essen',
 'Oberhausen',
 'Lille',
 'Lodz',
 'Marseille',
 'Antwerp',
 'Bilbao',
 'Newcastle',
 'Krakow',
 'Bochum',
 'Herne',
 'Thessaloniki',
 'Nuremberg',
 'Riga',
 'Duisburg',
 'Dortmund',
 'Hanover',
 'Zurich',
 'Oslo',
 'Bremen',
 'Dresden',
 'Sheffield',
 'Palermo',
 'Poznan',
 'Gelsenkirchen',
 'Bottrop',
 'Bordeaux',
 'Wroclaw',
 'Gothenburg',
 'Zaragoza',
 'Genoa',
 'Catania',
 'The Hague',
 'Toulouse',
 'Bristol',
 'Vilnius',
 'Saarbrucken',
 'Malaga',
 'Nantes',
 'Leeds',
 'Nottingham',
 'Florence',
 'Gdansk'

***

## Trial

In [14]:
# trial = split_dump(wikidump[:1000])
# len(trial)

In [15]:
# trial_d = process_dump(trial, cities_ls)

In [16]:
# df = pd.DataFrame(trial_d, columns = ['article_id', 'title', 'text'])


In [17]:
# list_in_corpus(cities_ls, df.text[1])

## Dump 1

In [13]:
# extract articles from first half of dump
# (first half = everything before index round(len(wikidump)/2))
wikidump1 = split_dump(wikidump[:round(len(wikidump)/2)], cities_ls)

Progress:   0%|          | 0/8249 [00:00<?, ?it/s]

In [None]:
# wikilength1 = len(wikidump1)
# NOTE(review): hard-coded count, presumably copied from an earlier run of the
# commented line above; confirm it still matches len(wikidump1)
wikilength1 = 5090764
print(wikilength1)


In [None]:
# keep only the articles of the first part that pass the key-word check
p_wikidump1 = process_dump(wikidump1, cities_ls)

In [None]:
# number of articles kept from the first part
p_wikilength1 = len(p_wikidump1)
# p_wikilength1 = 'nr that should actually go here'
print(p_wikilength1)

In [None]:
# save to .csv
# only the first part exists at this point of the notebook; p_wikidump2 is
# created further down, so referencing it here would raise a NameError
dumps = [p_wikidump1]
i = 0
df = pd.DataFrame(dumps[i], columns = ['article_id', 'title', 'text'])
outputfp = f'../../../data/enwikidump{i+1}.csv'
df.to_csv(outputfp)

## Dump 2

In [None]:
# extract articles from second half of dump
# start at the same midpoint at which the first half ends, so the two parts
# do not overlap and no article is processed or counted twice
wikidump2 = split_dump(wikidump[round(len(wikidump)/2):], cities_ls)

In [None]:
# number of pieces returned by split_dump for wikidump2
wikilength2 = len(wikidump2)
# wikilength2 = nr that it should be

print(wikilength2)

In [None]:
# total number of articles across both parts
# NOTE(review): wikilength1 is hard-coded rather than computed, and the sum is
# only a valid total if wikidump1 and wikidump2 cover disjoint parts of the
# dump; confirm both
wikilength1 = 5090764 #len(wikidump1)
wikilength2 = len(wikidump2)
wikilength = wikilength1 + wikilength2

print(f"nr of articles in the enwiki dump is {wikilength}.")

In [None]:
# keep only the articles of wikidump2 that pass the key-word check
p_wikidump2 = process_dump(wikidump2, cities_ls)

In [None]:
# number of articles kept from wikidump2
p_wikilength2 = len(p_wikidump2)
# p_wikilength2 = 'nr that should actually go here'
print(p_wikilength2)

In [None]:
# total number of kept articles across both processed parts
p_wikilength = p_wikilength1 + p_wikilength2
print(p_wikilength)

In [None]:
# write the second processed part to its own csv file
i = 1
dumps = [p_wikidump1, p_wikidump2]
outputfp = f'../../../data/enwikidump{i+1}.csv'
df = pd.DataFrame(
    dumps[i],
    columns = ['article_id', 'title', 'text'],
)
df.to_csv(outputfp)

In [20]:
# dumps = [p_wikidump1, p_wikidump2]
# for i in range(2): 
#     df = pd.DataFrame(dumps[i], columns = ['article_id', 'title', 'text'])
#     outputfp = f'../../../data/enwikidump{i+1}.csv'
#     df.to_csv(outputfp)
    

***