In [1]:
import numpy as np
import json
import pandas as pd
import time 
import os

#decompressing
import bz2
import lzma
import zstandard as zstd

#NLP
from tqdm import tqdm_notebook
import spacy
from collections import Counter
nlp = spacy.load("en_core_web_sm")

In [16]:
#files to still download
files = ['2017-07','2020-01', '2020-04','2020-11','2020-12', '2021-01']

files = ['RC_' + file + '.txt' for file in files]
files

['RC_2017-07.txt',
 'RC_2020-01.txt',
 'RC_2020-04.txt',
 'RC_2020-11.txt',
 'RC_2020-12.txt',
 'RC_2021-01.txt']

In [10]:
city_df = pd.read_csv('city_variations.csv')

#filter to top 50. This should fix the 'Nice' occurrance issue
#add lowercase to set
top50_df = city_df.head(50)

cities = set(list(top50_df['eng'])+list(top50_df['wiki'])+list(top50_df['local']))
len(cities)

67

In [12]:
features = [ 'created_utc', 'score', 'subreddit', 'link_id', 'subreddit_id', 'body']
path = r"C:\Users\kodri\Desktop\CITYNET Europe\occurrances"

In [19]:
comment_cooc = {}

total = len(files)
counter = 0 

for index, file in enumerate(files):
    counter += 1
    fp = os.path.join(path, file)
    occurrances = {}
    
    with open(fp, 'r', encoding = 'utf-8') as f:
        comments = f.read().split('\n--------\n')[:-1] #the last comment ofter the split is an empty string
        
        for idx, comment in enumerate(comments):
            city_list = []
            comment = comment.lower()
            counter +=1
            parts = comment.split(',')
            body = parts[-2]
            #split the comment body and search for toponym occurrances
            words = body.split(' ')
            for city in cities:
                if city.lower() in words:
                    city_list.append(city)
            if len(city_list)>1:            
                occurrances[idx] = city_list    
    comment_cooc[file] = occurrances
            
    print(str(100*index//total) + '% files complete')

0% files complete
16% files complete
33% files complete
50% files complete
66% files complete
83% files complete


In [22]:
#create column labels for frequency matrix
columns = ['doc', 'line']
cities = list(cities)
cities.sort()
cols = columns + cities

In [25]:
# create dummy variables and cooccurrance matrix
cooc_matrix = []
for i in comment_cooc.keys():
    doc = i[-14:-4]
    dummies = []
    for line in comment_cooc[i]:
        vector = [doc]
        vector.append(line)
        tops = comment_cooc[i][line]
        for city in cities:
            if city in tops:
                vector.append(1)
            else:
                vector.append(0)
        dummies.append(vector)
    cooc_matrix = cooc_matrix + dummies

In [29]:
cooc_df = pd.DataFrame(cooc_matrix, columns=cols)

Unnamed: 0,doc,line,Amsterdam,Antwerp,Athens,Athinia,Barcelona,Berlin,Bilbao,Birmingham,...,Stockholm,Stuttgart,Thessaloniki,Torino,Turin,Valencia,Vienna,Warsaw,Warszawa,Wien
0,RC_2017-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,RC_2017-07,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,RC_2017-07,13,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RC_2017-07,19,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,RC_2017-07,48,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96877,RC_2021-01,303263,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96878,RC_2021-01,303270,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
96879,RC_2021-01,303280,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
96880,RC_2021-01,303287,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
#find synonyms
synonyms = []
for i, row in top50_df.iterrows():
    names = set([row['local'], row['eng'], row['wiki']])
    if len(names) > 1:
        synonyms.append(names)
    else:
        synonyms.append(np.nan)
top50_df['synonyms'] = synonyms

synonyms = list(top50_df[~top50_df['synonyms'].isna()]['synonyms'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top50_df['synonyms'] = synonyms


In [37]:
#merge count columns referring to the same city

tot = len(synonyms)
counter = 1
for pair in synonyms:
    synonym = list(pair)
    df = cooc_df[synonym]
    merged = []
    for idx, row in df.iterrows():
        merged.append(row[synonym[0]]+row[synonym[1]])
    new_col = synonym[0]+'_merged'
    cooc_df[new_col] = merged
    print(str(100*counter//tot)+'% complete')
    counter+=1

5% complete
11% complete
17% complete
23% complete
29% complete
35% complete
41% complete
47% complete
52% complete
58% complete
64% complete
70% complete
76% complete
82% complete
88% complete
94% complete
100% complete


In [46]:
list(cooc_df.columns)

['doc',
 'line',
 'Amsterdam',
 'Antwerp',
 'Athens',
 'Athinia',
 'Barcelona',
 'Berlin',
 'Bilbao',
 'Birmingham',
 'Bochum-Herne',
 'Brussels',
 'Bucharest',
 'Bucuresti',
 'Budapest',
 'Cologne',
 'Copenhagen',
 'Dublin',
 'Düsseldorf',
 'Essen-Oberhausen',
 'Frankfurt am Main',
 'Glasgow',
 'Hamburg',
 'Helsinki',
 'Herne,_North_Rhine-Westphalia',
 'Katowice',
 'Kobenhavn',
 'Krakow',
 'Köln',
 'Lille',
 'Lisboa',
 'Lisbon',
 'Liverpool',
 'Lodz',
 'London',
 'Lyon',
 'Madrid',
 'Manchester',
 'Marseille',
 'Milan',
 'Milano',
 'Munich',
 'München',
 'Naples',
 'Napoli',
 'Newcastle',
 'Nuremberg',
 'Nürnberg',
 'Oberhausen',
 'Paris',
 'Porto',
 'Prague',
 'Praha',
 'Roma',
 'Rome',
 'Rotterdam',
 'Sevilla',
 'Seville',
 'Sofia',
 'Stockholm',
 'Stuttgart',
 'Thessaloniki',
 'Torino',
 'Turin',
 'Valencia',
 'Vienna',
 'Warsaw',
 'Warszawa',
 'Wien',
 'Milan_merged',
 'Athinia_merged',
 'Roma_merged',
 'Lisbon_merged',
 'Napoli_merged',
 'Bucharest_merged',
 'Warszawa_merged',
 '

In [51]:
cooc_df.rename(columns = {'Athinia_merged':'Athens_merged',
                      'Lisbon_merged':'Lisboa_merged',
                      'Napoli_merged':'Naples_merged',
                      'Bucharest_merged':'Bucaresti_merged',
                      'Warszawa_merged':'Warsaw_merged',
                      'Wien_merged':'Vienna_merged',
                      'Köln_merged':'Cologne_merged',
                      'Turin_merged':'Torino_merged',
                      'Seville_merged':'Sevilla_merged',
                      'Bochum-Herne_merged':'Herne,_North_Rhine-Westphalia_merged',
                      'Nürnberg_merged':'Nuremberg_merged',
                     }, inplace = True)

In [44]:
for i in synonyms:
    print(list(
        i))

['Milan', 'Milano']
['Athinia', 'Athens']
['Roma', 'Rome']
['Lisbon', 'Lisboa']
['Napoli', 'Naples']
['Bucharest', 'Bucuresti']
['Warszawa', 'Warsaw']
['Wien', 'Vienna']
['Munich', 'München']
['Köln', 'Cologne']
['Copenhagen', 'Kobenhavn']
['Turin', 'Torino']
['Prague', 'Praha']
['Seville', 'Sevilla']
['Oberhausen', 'Essen-Oberhausen']
['Bochum-Herne', 'Herne,_North_Rhine-Westphalia']
['Nürnberg', 'Nuremberg']


In [38]:
#check that synonym columns sum to '_merged'
for i in synonyms:
    a = list(synonyms[0])
    a.append(a[0]+'_merged')
    print(cooc_df[a].sum()[0]+cooc_df[a].sum()[1] == cooc_df[a].sum()[2])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [39]:
#remove columns that were already merged 
drop = set()
for i in synonyms:
    drop = drop.union(i)

cols = [i for i in df.columns if i not in drop]
df = cooc_df[cols]

In [40]:
cooc_df

Unnamed: 0,doc,line,Amsterdam,Antwerp,Athens,Athinia,Barcelona,Berlin,Bilbao,Birmingham,...,Wien_merged,Munich_merged,Köln_merged,Copenhagen_merged,Turin_merged,Prague_merged,Seville_merged,Oberhausen_merged,Bochum-Herne_merged,Nürnberg_merged
0,RC_2017-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,RC_2017-07,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,RC_2017-07,13,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RC_2017-07,19,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,RC_2017-07,48,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96877,RC_2021-01,303263,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96878,RC_2021-01,303270,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
96879,RC_2021-01,303280,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96880,RC_2021-01,303287,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
cooc

In [56]:
#create and save cooccurrance index
cooc_index = {}
for i in cooc_df['doc'].unique():
    temp_df = cooc_df[cooc_df['doc']==i]
    indices = list(temp_df['line'])
    cooc_index[i] = indices


In [57]:
#save cooc_index checkpoint to file
with open(r'cooc_index_b.txt', 'w') as fp:
    for item in cooc_index:
        # write each item on a new line
        fp.write("%s" % item)
        fp.write(": %s\n" % cooc_index[item])
    print('Done')

Done


In [64]:
files[0][:10]

True

In [61]:
cooc_index.keys()

dict_keys(['RC_2017-07', 'RC_2020-01', 'RC_2020-04', 'RC_2020-11', 'RC_2020-12', 'RC_2021-01'])

In [66]:
#create df with all metadata except for text body
counter = 0
path = r"C:\Users\kodri\Desktop\CITYNET Europe\occurrances"
counter = 0
tracker = 0
total = len(cooc_matrix)

with open ('metadata2.csv', 'w', encoding = 'utf-8') as w: 
    w.write(str(features[:5]).replace('[','').replace(']','').replace('\'','')+'\n')
    for file in files:
        month = file[:10]
        counter+=1
        if month in list(cooc_index.keys()):
            indices = cooc_index[month]
            fp = os.path.join(path, file)

            with open(fp, 'r', encoding = 'utf-8') as f:
                comments = f.read().split('\n--------\n')

                for idx in indices:
                    chunks = comments[idx].split(',')[:5]
                    w.write(str(chunks).replace('[','').replace(']','').replace('\'','')+'\n')
                    counter+=1
                    tracker+=1
            if tracker > 100000:
                tracker = 0
                print(str(100*counter//total) + '% done')
        else:
            print('No co-occurrances in ' + month)

In [70]:
meta_df = pd.read_csv(r"C:\Users\kodri\Desktop\CITYNET Europe\metadata2.csv")

In [71]:
meta_df

Unnamed: 0,created_utc,score,subreddit,link_id,subreddit_id
0,1498867214,4,footballmanagergames,t3_6khdia,t5_2s0w5
1,1498867370,-32,Ice_Poseidon,t3_6kjrfq,t5_3aelr
2,1498867522,1,todayilearned,t3_6kgfog,t5_2qqjc
3,1498867556,10,AskHistorians,t3_6kj7r0,t5_2ssp3
4,1498868170,5,churning,t3_6kf8kx,t5_2vrf0
...,...,...,...,...,...
96877,1612137329,9,AskReddit,t3_l9o83n,t5_2qh1i
96878,1612137375,1,AskReddit,t3_l9fk3v,t5_2qh1i
96879,1612137451,2,poland,t3_l9n944,t5_2qkmn
96880,1612137510,1,Libertarian,t3_l9do1i,t5_2qh63


In [74]:
df = pd.concat([cooc_df, meta_df.reindex(cooc_df.index)], axis=1)

In [80]:
#remove columns that were already merged 
drop = set()
for i in synonyms:
    drop = drop.union(i)

cols = [i for i in df.columns if i not in drop]
df = df[cols]

## Merge this dataframe with the other months

In [83]:
df2 = pd.read_csv('cooc_df.csv')

  df2 = pd.read_csv('cooc_df.csv')


In [92]:
col1 = list(df.columns)
col2 = list(df2.columns)
for i in range(len(df2.columns)):
    if col1[i] != col2[i]:
        print(col1[i], col2[i])

In [97]:
frames = [df, df2]

result = pd.concat(frames)


In [107]:
drop = ['R', 'C' , '_', '.']
result['doc'] = result['doc'].apply(lambda x: x.replace(drop[0], '').replace(drop[1], '').replace(drop[2], '').replace(drop[3], ''))

In [108]:
result

Unnamed: 0,doc,line,Amsterdam,Antwerp,Barcelona,Berlin,Bilbao,Birmingham,Brussels,Budapest,...,Prague_merged,Sevilla_merged,Oberhausen_merged,"Herne,_North_Rhine-Westphalia_merged",Nuremberg_merged,created_utc,score,subreddit,link_id,subreddit_id
0,2017-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1498867214,4,footballmanagergames,t3_6khdia,t5_2s0w5
1,2017-07,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1498867370,-32,Ice_Poseidon,t3_6kjrfq,t5_3aelr
2,2017-07,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1498867522,1,todayilearned,t3_6kgfog,t5_2qqjc
3,2017-07,19,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1498867556,10,AskHistorians,t3_6kj7r0,t5_2ssp3
4,2017-07,48,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1498868170,5,churning,t3_6kf8kx,t5_2vrf0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133322,2020-11,70318,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1604906161,1,realmadrid,t3_jqkjat,t5_2rr0e
1133323,2020-11,70331,0,0,1,1,1,0,0,0,...,0,1,0,0,0,1604906319,19,soccer,t3_jqqr3w,t5_2qi58
1133324,2020-11,70336,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1604906391,5,IWantOut,t3_jqlpll,t5_2r5hw
1133325,2020-11,70337,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1604906395,1,soccer,t3_jqqr3w,t5_2qi58


In [109]:
result = result.sort_values('doc').reset_index(drop = True)

In [115]:
result.to_csv('cooccurrance_df.csv')