In [1]:
import os
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from tqdm.notebook import tqdm
import warnings

# To Do List

In [300]:
# Read wikidump [CHECK]
# clean wikidump (wiki extractor) [CHECK]
# open cleaned wikidump text files [CHECK]
# create csv with word counts [CHECK]
# create co-occurrence matrix for wikidump text files [CHECK]
# Change cities to English version
# Add distance methods like Levenshtein
# Think about window size options
# Normalize outcome based on city features (size, etc.)

# Settings

In [291]:
# Settings that can be changed
basefolder = '../../../enwiki/AA/'   # folder whose files preprocess_files() walks through
corpus_path = '../output/'           # corpus_path + corpus_name is where the city matrix csv is written
corpus_name = 'city_matrix.csv'
new_corpus = False                   # True: start a new matrix; False: load and update the existing csv

# Plain string literal: the path has no placeholders, so no f-string is needed.
cities_df = pd.read_csv('../input/List_of_cities.csv', delimiter=';')
nr_of_cities = 100 # by population numbers
window_size = 0                      # passed through to preprocess_file(), which does not use it yet

# Functions

In [None]:
# Lower-cased names of the first `nr_of_cities` rows of the 'Mua' column.
list_of_cities = cities_df['Mua'].iloc[:nr_of_cities].str.lower().tolist()

In [292]:
def connect_corpus(new_corpus=True, path=None, cities=None):
    """Create a fresh city matrix or load the existing one from csv.

    Parameters
    ----------
    new_corpus : bool
        True builds a new all-zero matrix; False loads the csv at `path`.
    path : str, optional
        Location of the matrix csv. Defaults to the module-level
        `corpus_path + corpus_name`.
    cities : list of str, optional
        Row/column labels for a new matrix. Defaults to the module-level
        `list_of_cities`. Only consulted when `new_corpus` is True.

    Returns
    -------
    pandas.DataFrame or False
        The matrix, indexed by city (index name 'index'). False is returned,
        after printing the reason, when the matrix cannot be created or
        loaded, so callers can keep checking with `isinstance`.
    """
    if path is None:
        path = corpus_path + corpus_name

    if new_corpus:
        if os.path.exists(path):
            print(f"Are you sure you want to override the existing file? Remove the existing file first or change new_corpus to False.")
            return False

        if cities is None:
            cities = list_of_cities
        # Duplicate labels would make single-cell lookups ambiguous, so keep
        # the first occurrence of every city and preserve the given order.
        labels = list(dict.fromkeys(cities))
        # Build the zero matrix directly as integers; filling an empty frame
        # with fillna(0) leaves object-dtype columns behind.
        return pd.DataFrame(0,
                            index=pd.Index(labels, name='index'),
                            columns=labels,
                            dtype='int64')

    if not os.path.exists(path):
        print(f"{path} does not exist.")
        return False

    try:
        # index_col restores the city labels written by DataFrame.to_csv.
        return pd.read_csv(path, index_col='index')
    except (OSError, ValueError) as error:
        # pandas parser errors, decoding errors and a missing 'index' column
        # are all ValueError subclasses; OSError covers unreadable files.
        print(f"Could not open {path}: {error}")
        return False


def _save_checkpoint(corpus, output_file, pending):
    """Write the matrix to disk, then mark every pending file as processed."""
    corpus.to_csv(output_file)
    for done_path in pending:
        os.replace(done_path, done_path + '_processed')
    pending.clear()


def preprocess_files(basefolder=basefolder,
                     corpus_path=corpus_path,
                     corpus_name=corpus_name, 
                     cities=list_of_cities,
                     new_corpus=new_corpus,
                     window_size=window_size,
                     save_every=5):
    """Walk through the files in `basefolder` and add them to the city matrix.

    Parameters
    ----------
    basefolder : str
        Folder containing the text files to process.
    corpus_path, corpus_name : str
        Concatenated to form the csv file the matrix is saved to.
    cities : list of str
        City names handed to preprocess_file().
    new_corpus : bool
        Forwarded to connect_corpus() (start fresh vs. load existing matrix).
    window_size : int
        Forwarded to preprocess_file().
    save_every : int
        Number of processed files between intermediate saves.

    Returns
    -------
    pandas.DataFrame or None
        The updated matrix, or None when connect_corpus() did not return a
        DataFrame.

    Notes
    -----
    connect_corpus() is called with `new_corpus` only, so by default it works
    from the module-level settings rather than the arguments given here.
    All warnings raised while processing are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # Instantiate city-matrix
        corpus = connect_corpus(new_corpus=new_corpus)
        if not isinstance(corpus, pd.DataFrame):
            return

        output_file = corpus_path + corpus_name
        batch_size = max(1, int(save_every))
        pending = []  # files counted in `corpus` but not yet renamed

        # Update city-matrix
        for file in tqdm(os.listdir(basefolder), desc='files'):
            if 'processed' in file:
                continue

            full_path = os.path.join(basefolder, file)
            # isfile (not exists) so sub-directories are skipped instead of
            # crashing in open().
            if not os.path.isfile(full_path):
                continue

            with open(full_path, "r", encoding="utf-8") as f:
                content = f.read()
            corpus = preprocess_file(content, corpus=corpus, cities=cities, window_size=window_size)
            pending.append(full_path)

            # Temporary save
            if len(pending) >= batch_size:
                _save_checkpoint(corpus, output_file, pending)

        # Final save. This also renames the files processed since the last
        # checkpoint; otherwise they stay unmarked and are counted a second
        # time on the next run.
        _save_checkpoint(corpus, output_file, pending)

        return corpus

# Creates co-occurrence matrix

In [252]:
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r'\w+')

# Same word pattern the notebook's tokenizer is configured with.
_WORD_PATTERN = re.compile(r'\w+')


def preprocess_file(file_content, corpus, cities, window_size):
    """Add the city co-occurrences of one extracted wiki file to `corpus`.

    The file is split into articles on '</doc>'. For every article, each pair
    of cities that both occur in it (including a city paired with itself)
    has its cell in `corpus` increased by one.

    City names are matched as sequences of word tokens, so names made of
    several words (e.g. 'palma de mallorca') are detected as well. Article
    text is lower-cased before matching; `cities` are used as given.

    Parameters
    ----------
    file_content : str
        Raw text of one file.
    corpus : pandas.DataFrame
        Square matrix labelled by city on both axes; updated in place.
    cities : iterable of str
        City names to look for.
    window_size : int
        Currently unused; matching is per whole article.

    Returns
    -------
    pandas.DataFrame
        The same `corpus` object.
    """
    # Group the cities by how many tokens their name has, so every article
    # only needs one n-gram pass per name length.
    names_by_length = {}
    for city in cities:
        parts = tuple(_WORD_PATTERN.findall(city))
        if parts:
            names_by_length.setdefault(len(parts), {})[parts] = city

    articles = file_content.replace('\n', ' ').split('</doc>')
    for article in articles:
        article = article.strip()
        if not article:
            continue

        tokens = _WORD_PATTERN.findall(article.lower())

        detected_cities = set()
        for length, lookup in names_by_length.items():
            # zip over shifted views yields every run of `length` tokens.
            ngrams = set(zip(*(tokens[offset:] for offset in range(length))))
            detected_cities.update(lookup[gram] for gram in ngrams if gram in lookup)

        if detected_cities:
            found = sorted(detected_cities)
            # One block update instead of a cell-by-cell double loop.
            corpus.loc[found, found] += 1

    return corpus

# Code to create/update the co-occurrence matrix

In [299]:
# preprocess_files() returns None when the matrix could not be created or
# loaded, so only ask for a preview when a DataFrame actually came back.
corpus = preprocess_files()
corpus.head(10) if corpus is not None else None

files:   0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0_level_0,paris,london,madrid,berlin,milano,barcelona,athinia,roma,birmingham,lisboa,...,karlsruhe,bergamo,palma de mallorca,bologna,bielefeld,rouen,strasbourg,tallinn,szczecin,grenoble
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
paris,574,250,39,105,3,40,0,8,19,0,...,5,2,0,23,0,13,26,6,1,2
london,250,846,45,129,3,42,0,12,66,0,...,4,1,0,13,0,8,25,7,0,0
madrid,39,45,89,16,0,32,0,8,5,0,...,0,1,0,7,0,2,4,0,0,0
berlin,105,129,16,288,2,23,0,9,9,0,...,6,1,0,11,0,2,15,4,4,0
milano,3,3,0,2,6,0,0,1,0,0,...,0,1,0,2,0,0,0,0,0,0
barcelona,40,42,32,23,0,98,0,10,3,0,...,0,1,0,6,0,1,5,0,0,0
athinia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
roma,8,12,8,9,1,10,0,40,1,0,...,0,1,0,6,0,0,0,0,0,0
birmingham,19,66,5,9,0,3,0,1,99,0,...,0,0,0,1,0,0,1,0,1,0
lisboa,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0


In [3]:
## RESET FILES
# Strip the '_processed' marker from file names in basefolder so the files
# are picked up again on the next run.
for entry in os.listdir(basefolder):
    if '_processed' not in entry:
        continue
    restored_name = entry.replace('_processed', '')
    os.replace(basefolder + entry, basefolder + restored_name)

In [7]:
# TURN INTO TEXT FILES
import os
basefolder = '../../../enwiki/AA/'

# Append '.txt' to every entry whose name does not contain it yet.
for entry in os.listdir(basefolder):
    if '.txt' in entry:
        continue
    current_path = basefolder + entry
    os.replace(current_path, current_path + '.txt')

In [296]:
# Re-read the saved matrix from disk. Use corpus_name instead of a hard-coded
# file name so this cell follows the setting the matrix was written with.
c = pd.read_csv(corpus_path + corpus_name)
c.head(5)

Unnamed: 0,index,paris,london,madrid,berlin,milano,barcelona,athinia,roma,birmingham,...,karlsruhe,bergamo,palma de mallorca,bologna,bielefeld,rouen,strasbourg,tallinn,szczecin,grenoble
0,paris,574,250,39,105,3,40,0,8,19,...,5,2,0,23,0,13,26,6,1,2
1,london,250,846,45,129,3,42,0,12,66,...,4,1,0,13,0,8,25,7,0,0
2,madrid,39,45,89,16,0,32,0,8,5,...,0,1,0,7,0,2,4,0,0,0
3,berlin,105,129,16,288,2,23,0,9,9,...,6,1,0,11,0,2,15,4,4,0
4,milano,3,3,0,2,6,0,0,1,0,...,0,1,0,2,0,0,0,0,0,0


----

# NOT USED

In [210]:
# define functions for extracting metadata
def find_id(string):
    """Extract the numeric article id from the first id="<digits>" in `string`.

    Returns
    -------
    int
        The digits between the quotes.

    Raises
    ------
    IndexError
        If `string` contains no id="<digits>" attribute (same exception type
        the earlier `findall(...)[0]` lookup raised).
    """
    match = re.search(r'id="(\d+)"', string)
    if match is None:
        raise IndexError('no id="<digits>" attribute found')
    return int(match.group(1))
 
def find_url(string):
    """Extract the first English-Wikipedia url from `string`.

    The part after 'https://en.wikipedia.org/' may consist of ASCII letters,
    digits, '?' and '='.

    Raises
    ------
    IndexError
        If no such url is present (same exception type the earlier
        `findall(...)[0]` lookup raised).
    """
    # 'A-Za-z', not 'A-z': the latter also spans the punctuation between
    # 'Z' and 'a' ([ \ ] ^ _ `), which dragged e.g. a closing ']' into the url.
    match = re.search(r'https://en\.wikipedia\.org/[a-zA-Z?=\d]+', string)
    if match is None:
        raise IndexError('no https://en.wikipedia.org/ url found')
    return match.group(0)

def find_title(string):
    """Extract the article title from the first title="..." in `string`.

    The match runs from 'title="' up to (not including) the next '>'; the
    leading 'title="' and the final character of the match are dropped.

    Raises
    ------
    IndexError
        If `string` contains no title attribute (same exception type the
        earlier `findall(...)[0]` lookup raised).
    """
    match = re.search(r'title="[^>]+', string)
    if match is None:
        raise IndexError('no title="..." attribute found')
    # 7 == len('title="'); -1 drops the closing quote.
    # The title is returned only; it is no longer printed for every article.
    return match.group(0)[7:-1]

# Gets number of cities per article

In [251]:
# Disabled code: everything below is a plain string literal, so none of it is
# executed. It holds an alternative preprocess_file() that adds one column per
# article title (via find_id/find_title) instead of updating a city matrix.
# NOTE(review): the loop appends to `city_count`, which is never defined (the
# list created is `wordlist`), so this would raise NameError if re-enabled.
"""
# wiki_corpus_non_matrix.csv

tokenizer = RegexpTokenizer('\w+')

def preprocess_file(file_content, corpus, cities, window_size):
    # ""function to process content of individual file""
    
    articles = file_content.replace('\n',' ').split('</doc>')
    articles = [x.strip() for x in articles if len(x.strip())]
    
    for article in articles:
        id = find_id(article)
        title = find_title(article)
        
        tokenized = tokenizer.tokenize(article.lower())
        word_count = Counter(tokenized)

        detected_cities = set(word_count.keys()).intersection(cities)
        
        new_city = list(detected_cities - set(corpus['city']))
        corpus = pd.concat([corpus, pd.DataFrame({'city': new_city})], ignore_index=True)

        wordlist = []
        for city in corpus['city']:
            if city in word_count.keys():
                city_count.append(word_count[city])
            else:
                city_count.append(0)

        corpus[title] = wordlist

    return corpus
"""

'\n# wiki_corpus_non_matrix.csv\n\ntokenizer = RegexpTokenizer(\'\\w+\')\n\ndef preprocess_file(file_content, corpus, cities, window_size):\n    # ""function to process content of individual file""\n    \n    articles = file_content.replace(\'\n\',\' \').split(\'</doc>\')\n    articles = [x.strip() for x in articles if len(x.strip())]\n    \n    for article in articles:\n        id = find_id(article)\n        title = find_title(article)\n        \n        tokenized = tokenizer.tokenize(article.lower())\n        word_count = Counter(tokenized)\n\n        detected_cities = set(word_count.keys()).intersection(cities)\n        \n        new_city = list(detected_cities - set(corpus[\'city\']))\n        corpus = pd.concat([corpus, pd.DataFrame({\'city\': new_city})], ignore_index=True)\n\n        wordlist = []\n        for city in corpus[\'city\']:\n            if city in word_count.keys():\n                city_count.append(word_count[city])\n            else:\n                city_count.ap

In [185]:
import itertools

# Every unordered pair from a small sample of city names.
sample_cities = ['Amsterdam', 'Paris', 'Berlin', 'Dublin']
list(itertools.combinations(sample_cities, 2))

[('Amsterdam', 'Paris'),
 ('Amsterdam', 'Berlin'),
 ('Amsterdam', 'Dublin'),
 ('Paris', 'Berlin'),
 ('Paris', 'Dublin'),
 ('Berlin', 'Dublin')]