In [11]:
import os
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from tqdm.notebook import tqdm
import warnings

In [264]:
# Settings that can be changed
basefolder = '../../../enwiki/AA/'
corpus_path = '../output/'
corpus_name = 'city_matrix.csv'
new_corpus = True

cities_df = pd.read_csv(f'../input/List_of_cities.csv', delimiter=';')
nr_of_cities = 100 # by population numbers
window_size = 0

In [None]:
list_of_cities = list(cities_df['Mua'][0:nr_of_cities].str.lower())

In [210]:
# define functions for extracting metadata
def find_id(string):
    """function to extract id from article"""
    
    id_pattern = 'id=\"\d+\"'
#     id_float = '[0-9]+'
    
    short_string = re.findall(id_pattern, string)[0]
    quotation_sym = '"'
    start = short_string.index(quotation_sym) + len(quotation_sym)
    end = short_string.index(quotation_sym, start + 1)
    actual_id = int(short_string[4:-1])

#     actual_id = re.findall(id_float, short_string)
    
    return actual_id
 
def find_url(string):
    """function to extract url from article"""
    
    url_pattern = 'https://en\.wikipedia\.org/[a-zA-z\?=\d]+'
    
    url = re.findall(url_pattern, string)
    
    return url[0]

def find_title(string):
    """function to extract title from article"""
    
    pattern = 'title=\"[^>]+'
    short_string = re.findall(pattern, string)[0]

    title = short_string[7:-1]
    print(title)
    return title

In [274]:
def connect_corpus(new_corpus=True):
    """function to set up csv_matrix.csv file"""
    if new_corpus:
        corpus = pd.DataFrame(columns=list_of_cities)
        corpus['index'] = list_of_cities
        corpus.set_index('index', inplace=True)
        corpus.fillna(0, inplace=True)
    else:
        if os.path.exists(corpus_path+corpus_name):
            try:
                corpus = pd.read_csv(f'{corpus_path}{corpus_name}')
                corpus.set_index('index', inplace=True)
            except:
                print(f"Could not open{corpus_path}{corpus_name}")
                return False
        else:
            print(f"{corpus_path}{corpus_name} isn't a valid csv file.")
            return False

    return corpus


def preprocess_files(basefolder=basefolder,
                     corpus_path=corpus_path,
                     corpus_name=corpus_name, 
                     cities=list_of_cities,
                     new_corpus=new_corpus,
                     window_size=window_size):
    """function to walk through files and process their content"""
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
            
        # Instantiate Corpus
        corpus = connect_corpus(new_corpus=new_corpus)
        if not isinstance(corpus, pd.DataFrame):
            return
        
        # Update Corpus
        x = 0
        files_to_rename = []
        for file in tqdm(os.listdir(basefolder), total=len(os.listdir(basefolder)), desc='files'):
            if 'processed' in file:
                continue
                
            full_path = os.path.join(basefolder, file)
            
            if os.path.exists(full_path):
                with open(full_path, "r", encoding="utf-8") as f:
                    corpus = preprocess_file(f.read(), corpus=corpus, cities=cities, window_size=window_size)
                    files_to_rename.append(full_path)
                    x += 1
            
            # Temporary Save
            if x % 5 == 0:
                print(files_to_rename)
                corpus.to_csv(corpus_path + corpus_name)
                
                for file in files_to_rename:
                    os.replace(file, file + '_processed')
                files_to_rename = []
                
        # Save Corpus
        corpus.to_csv(corpus_path + corpus_name)
        
        return corpus

# Gets number of cities per article

In [251]:
"""
# wiki_corpus_non_matrix.csv

tokenizer = RegexpTokenizer('\w+')

def preprocess_file(file_content, corpus, cities, window_size):
    # ""function to process content of individual file""
    
    articles = file_content.replace('\n',' ').split('</doc>')
    articles = [x.strip() for x in articles if len(x.strip())]
    
    for article in articles:
        id = find_id(article)
        title = find_title(article)
        
        tokenized = tokenizer.tokenize(article.lower())
        word_count = Counter(tokenized)

        detected_cities = set(word_count.keys()).intersection(cities)
        
        new_city = list(detected_cities - set(corpus['city']))
        corpus = pd.concat([corpus, pd.DataFrame({'city': new_city})], ignore_index=True)

        wordlist = []
        for city in corpus['city']:
            if city in word_count.keys():
                city_count.append(word_count[city])
            else:
                city_count.append(0)

        corpus[title] = wordlist

    return corpus
"""

'\n# wiki_corpus_non_matrix.csv\n\ntokenizer = RegexpTokenizer(\'\\w+\')\n\ndef preprocess_file(file_content, corpus, cities, window_size):\n    # ""function to process content of individual file""\n    \n    articles = file_content.replace(\'\n\',\' \').split(\'</doc>\')\n    articles = [x.strip() for x in articles if len(x.strip())]\n    \n    for article in articles:\n        id = find_id(article)\n        title = find_title(article)\n        \n        tokenized = tokenizer.tokenize(article.lower())\n        word_count = Counter(tokenized)\n\n        detected_cities = set(word_count.keys()).intersection(cities)\n        \n        new_city = list(detected_cities - set(corpus[\'city\']))\n        corpus = pd.concat([corpus, pd.DataFrame({\'city\': new_city})], ignore_index=True)\n\n        wordlist = []\n        for city in corpus[\'city\']:\n            if city in word_count.keys():\n                city_count.append(word_count[city])\n            else:\n                city_count.ap

# Creates co-occurence matrix

In [185]:
import itertools
list(itertools.combinations(['Amsterdam', 'Paris', 'Berlin', 'Dublin'], 2))

[('Amsterdam', 'Paris'),
 ('Amsterdam', 'Berlin'),
 ('Amsterdam', 'Dublin'),
 ('Paris', 'Berlin'),
 ('Paris', 'Dublin'),
 ('Berlin', 'Dublin')]

In [252]:
tokenizer = RegexpTokenizer('\w+')

def preprocess_file(file_content, corpus, cities, window_size):
    """function to process content of individual file"""
    
    articles = file_content.replace('\n',' ').split('</doc>')
    articles = [x.strip() for x in articles if len(x.strip())]
    for article in articles:
        #         id = find_id(article)
        #         title = find_title(article)

        tokenized = tokenizer.tokenize(article.lower())
        word_count = Counter(tokenized)

        detected_cities = set(word_count.keys()).intersection(cities)
        for city1 in detected_cities:
            for city2 in detected_cities:
                corpus.at[city1, city2] = corpus.loc[city1,city2] + 1
        
        ## Corpus should already have all required cities
        # new_city = list(detected_cities - set(corpus['city']))
        # corpus = pd.concat([corpus, pd.DataFrame({'city': new_city})], ignore_index=True)

        #         wordlist = []
        #         for city in corpus['city']:
        #             if city in word_count.keys():
        #                 city_count.append(word_count[city])
        #             else:
        #                 city_count.append(0)

        #         corpus[title] = wordlist

    return corpus

In [271]:
corpus = preprocess_files()

            paris  london  madrid  berlin  milano  barcelona  athinia  roma  \
index                                                                         
paris           0       0       0       0       0          0        0     0   
london          0       0       0       0       0          0        0     0   
madrid          0       0       0       0       0          0        0     0   
berlin          0       0       0       0       0          0        0     0   
milano          0       0       0       0       0          0        0     0   
...           ...     ...     ...     ...     ...        ...      ...   ...   
rouen           0       0       0       0       0          0        0     0   
strasbourg      0       0       0       0       0          0        0     0   
tallinn         0       0       0       0       0          0        0     0   
szczecin        0       0       0       0       0          0        0     0   
grenoble        0       0       0       0       0   

files:   0%|          | 0/100 [00:00<?, ?it/s]

['../../../enwiki/AA/wiki_00', '../../../enwiki/AA/wiki_01', '../../../enwiki/AA/wiki_02', '../../../enwiki/AA/wiki_03', '../../../enwiki/AA/wiki_04']
['../../../enwiki/AA/wiki_05', '../../../enwiki/AA/wiki_06', '../../../enwiki/AA/wiki_07', '../../../enwiki/AA/wiki_08', '../../../enwiki/AA/wiki_09']
['../../../enwiki/AA/wiki_10', '../../../enwiki/AA/wiki_11', '../../../enwiki/AA/wiki_12', '../../../enwiki/AA/wiki_13', '../../../enwiki/AA/wiki_14']
['../../../enwiki/AA/wiki_15', '../../../enwiki/AA/wiki_16', '../../../enwiki/AA/wiki_17', '../../../enwiki/AA/wiki_18', '../../../enwiki/AA/wiki_19']
['../../../enwiki/AA/wiki_20', '../../../enwiki/AA/wiki_21', '../../../enwiki/AA/wiki_22', '../../../enwiki/AA/wiki_23', '../../../enwiki/AA/wiki_24']
['../../../enwiki/AA/wiki_25', '../../../enwiki/AA/wiki_26', '../../../enwiki/AA/wiki_27', '../../../enwiki/AA/wiki_28', '../../../enwiki/AA/wiki_29']
['../../../enwiki/AA/wiki_30', '../../../enwiki/AA/wiki_31', '../../../enwiki/AA/wiki_32', '..

In [272]:
corpus

Unnamed: 0_level_0,paris,london,madrid,berlin,milano,barcelona,athinia,roma,birmingham,lisboa,...,karlsruhe,bergamo,palma de mallorca,bologna,bielefeld,rouen,strasbourg,tallinn,szczecin,grenoble
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
paris,574,250,39,105,3,40,0,8,19,0,...,5,2,0,23,0,13,26,6,1,2
london,250,846,45,129,3,42,0,12,66,0,...,4,1,0,13,0,8,25,7,0,0
madrid,39,45,89,16,0,32,0,8,5,0,...,0,1,0,7,0,2,4,0,0,0
berlin,105,129,16,288,2,23,0,9,9,0,...,6,1,0,11,0,2,15,4,4,0
milano,3,3,0,2,6,0,0,1,0,0,...,0,1,0,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rouen,13,8,2,2,0,1,0,0,0,0,...,0,0,0,0,0,21,3,0,0,0
strasbourg,26,25,4,15,0,5,0,0,1,0,...,2,0,0,5,0,3,51,1,0,0
tallinn,6,7,0,4,0,0,0,0,0,0,...,1,0,0,0,0,0,1,17,1,0
szczecin,1,0,0,4,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,6,0


In [247]:
c = pd.read_csv(corpus_path+'city_matrix.csv')
c.head(10)

Unnamed: 0,index,paris,london,madrid,berlin,milano,barcelona,athinia,roma,birmingham,...,karlsruhe,bergamo,palma de mallorca,bologna,bielefeld,rouen,strasbourg,tallinn,szczecin,grenoble
0,paris,26,16,5,5,0,4,0,1,2,...,0,0,0,1,0,0,0,0,0,0
1,london,16,32,3,10,0,2,0,1,4,...,0,0,0,2,0,0,0,0,0,0
2,madrid,5,3,8,2,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,berlin,5,10,2,15,0,3,0,1,2,...,0,0,0,2,0,0,1,0,0,0
4,milano,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,barcelona,4,2,2,3,0,6,0,1,0,...,0,0,0,1,0,0,0,0,0,0
6,athinia,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,roma,1,1,0,1,0,1,0,2,0,...,0,0,0,1,0,0,0,0,0,0
8,birmingham,2,4,0,2,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
9,lisboa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [206]:
corpus_test.at['paris', 'london'] = corpus_test.loc['paris']['london'] + 2
corpus_test

Unnamed: 0_level_0,paris,london,madrid,berlin,milano,barcelona,athinia,roma,birmingham,lisboa,...,karlsruhe,bergamo,palma de mallorca,bologna,bielefeld,rouen,strasbourg,tallinn,szczecin,grenoble
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
paris,2,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
london,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
madrid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
berlin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
milano,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rouen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
strasbourg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tallinn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
szczecin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [208]:
corpus_test = pd.DataFrame(columns=list_of_cities)
corpus_test['index'] = list_of_cities
corpus_test.set_index('index', inplace=True)
corpus_test.fillna(0, inplace=True)
corpus_test
corpus_test.to_csv('city_matrix.csv')

In [187]:
list_of_cities

['paris',
 'london',
 'madrid',
 'berlin',
 'milano',
 'barcelona',
 'athinia',
 'roma',
 'birmingham',
 'lisboa',
 'napoli',
 'katowice',
 'manchester',
 'hamburg',
 'budapest',
 'bucuresti',
 'warszawa',
 'stuttgart',
 'wien',
 'münchen',
 'brussels',
 'stockholm',
 'frankfurt am main',
 'köln',
 'kobenhavn',
 'valencia',
 'torino',
 'glasgow',
 'praha',
 'lyon',
 'sofia',
 'liverpool',
 'porto',
 'sevilla',
 'dublin',
 'helsinki',
 'amsterdam',
 'rotterdam',
 'düsseldorf',
 'essen-oberhausen',
 'lille',
 'lodz',
 'marseille',
 'antwerp',
 'bilbao',
 'newcastle',
 'krakow',
 'bochum-herne',
 'thessaloniki',
 'nürnberg',
 'riga',
 'duisburg',
 'dortmund',
 'hannover',
 'zürich',
 'oslo',
 'bremen',
 'dresden',
 'sheffield',
 'palermo',
 'poznan',
 'gelsenkirchen-bottrop',
 'bordeaux',
 'wroclaw',
 'göteborg',
 'zaragoza',
 'genova',
 'catania',
 'den haag',
 'toulouse',
 'bristol',
 'vilnius',
 'saarbrücken',
 'malaga',
 'nantes',
 'leeds',
 'nottingham',
 'firenze',
 'gdansk',
 'leip

In [100]:
v = 'www'

In [7]:
def z(y):
    if y:
        print('check')
        return



def x(y=True):
    z(y)
    print(v)
    c = 'ccc'

x()
print(c)

check
www


NameError: name 'c' is not defined

In [273]:
## RESET FILES

for file in os.listdir(basefolder):
    if '_processed' in file:
        os.replace(basefolder+file, basefolder+file.replace('_processed', ''))