In [175]:
import numpy as np
import json
import pandas as pd
import time 
import os

#decompressing
import bz2
import lzma
import zstandard as zstd

#NLP
from tqdm import tqdm_notebook
import spacy
from collections import Counter
nlp = spacy.load("en_core_web_sm")

In [627]:
#files to still download
to_download = ['2017-07', '2020-12','2020-01', '2021-1'] #missing entirely from data
too_small = ['RC_2020-11', '2020-04'] #both too small
files = to_download + too_small
files

['2017-07', '2020-12', '2020-01', '2021-1', 'RC_2020-11', '2020-04']

In [626]:
city_df = pd.read_csv('city_variations.csv')

#filter to top 50. This should fix the 'Nice' occurrance issue
#add lowercase to set
top50_df = city_df.head(50)

cities = set(list(top50_df['eng'])+list(top50_df['wiki'])+list(top50_df['local']))

In [176]:
#find synonyms
synonyms = []
for i, row in top50_df.iterrows():
    names = set([row['local'], row['eng'], row['wiki']])
    if len(names) > 1:
        synonyms.append(names)
    else:
        synonyms.append(np.nan)
top50_df['synonyms'] = synonyms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top50_df['synonyms'] = synonyms


In [None]:
path = 'citynet occurrances'
files = os.listdir('citynet occurrances')

In [320]:
features = [ 'created_utc', 'score', 'subreddit', 'link_id', 'subreddit_id', 'body']

### Create Co-Occurrance Matrix

In [None]:
# For every month in the corpus, create a df of cooccurrances within comments. This should be a manageable size. 
# instad of copying all metadata, keep a record of index to go back and retrieve. 
# Later on group by link_id to find cooccurrances within comment threads
#make a point to replace synonyms in cooccurrances so that Koln and Cologne don't appear, for example

In [146]:
comment_cooc = {}
path = 'citynet occurrances'
files = os.listdir('citynet occurrances')

total = len(files)
counter = 0 

for index, file in enumerate(files):
    counter += 1
    fp = os.path.join(path, file)
    occurrances = {}
    
    with open(fp, 'r', encoding = 'utf-8') as f:
        comments = f.read().split('\n--------\n')[:-1] #the last comment ofter the split is an empty string
        
        for idx, comment in enumerate(comments):
            city_list = []
            comment = comment.lower()
            counter +=1
            parts = comment.split(',')
            body = parts[-2]
            #split the comment body and search for toponym occurrances
            words = body.split(' ')
            for city in cities:
                if city.lower() in words:
                    city_list.append(city)
            if len(city_list)>1:            
                occurrances[idx] = city_list    
    comment_cooc[fp] = occurrances
            
    print(str(100*index//total) + '% files complete')


0% files complete
0% files complete
1% files complete
1% files complete
2% files complete
2% files complete
3% files complete
3% files complete
4% files complete
5% files complete
5% files complete
6% files complete
6% files complete
7% files complete
7% files complete
8% files complete
8% files complete
9% files complete
10% files complete
10% files complete
11% files complete
11% files complete
12% files complete
12% files complete
13% files complete
14% files complete
14% files complete
15% files complete
15% files complete
16% files complete
16% files complete
17% files complete
17% files complete
18% files complete
19% files complete
19% files complete
20% files complete
20% files complete
21% files complete
21% files complete
22% files complete
23% files complete
23% files complete
24% files complete
24% files complete
25% files complete
25% files complete
26% files complete
26% files complete
27% files complete
28% files complete
28% files complete
29% files complete
29% files c

In [261]:
#create column labels for frequency matrix
columns = ['doc', 'line']
cities = list(cities)
cities.sort()
cols = columns + cities

In [258]:
# create dummy variables and cooccurrance matrix
cooc_matrix = []
for i in comment_cooc.keys():
    doc = i[-14:-4]
    dummies = []
    for line in comment_cooc[i]:
        vector = [doc]
        vector.append(line)
        tops = comment_cooc[i][line]
        for city in cities:
            if city in tops:
                vector.append(1)
            else:
                vector.append(0)
        dummies.append(vector)
    cooc_matrix = cooc_matrix + dummies

In [262]:
cooc_df = pd.DataFrame(cooc_matrix, columns=cols)

In [319]:
#merge count columns referring to the same city

tot = len(synonyms)
counter = 1
for pair in synonyms:
    synonym = list(pair)
    df = cooc_df[synonym]
    merged = []
    for idx, row in df.iterrows():
        merged.append(row[synonym[0]]+row[synonym[1]])
    new_col = synonym[0]+'_merged'
    cooc_df[new_col] = merged
    print(str(100*counter//tot)+'% complete')
    counter+=1

5% complete
11% complete
17% complete
23% complete
29% complete
35% complete
41% complete
47% complete
52% complete
58% complete
64% complete
70% complete
76% complete
82% complete
88% complete
94% complete
100% complete


In [344]:
#check that synonym columns sum to '_merged'
for i in synonyms:
    a = list(synonyms[0])
    a.append(a[0]+'_merged')
    print(cooc_df[a].sum()[0]+cooc_df[a].sum()[1] == cooc_df[a].sum()[2])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [354]:
#remove columns that were already merged 
drop = set()
for i in synonyms:
    drop = drop.union(i)

cols = [i for i in df.columns if i not in drop]
df = cooc_df[cols]

#### Save/load checkpoint

In [388]:
df.to_csv('cooc_matrix_checkpoint.csv')

In [447]:
df = pd.read_csv('cooc_matrix_checkpoint.csv')

### Add other metadata to cooccurrance matrix

In [647]:
len(list(df.columns))

52

In [572]:
#create and save cooccurrance index
cooc_index = {}
for i in df['doc'].unique():
    temp_df = df[df['doc']==i]
    indices = list(temp_df['line'])
    cooc_index[i] = indices

#save cooc_index checkpoint to file
with open(r'cooc_index.txt', 'w') as fp:
    for item in cooc_index:
        # write each item on a new line
        fp.write("%s" % item)
        fp.write(": %s\n" % cooc_index[item])
    print('Done')

Done


In [466]:
#create df with all metadata except for text body
counter = 0
path = r"C:\Users\kodri\Desktop\CITYNET Europe\citynet occurrances"
meta_df = pd.DataFrame(columns=features[:5])
counter = 0
tracker = 0
total = len(cooc_matrix)

for month in cooc_index:
    counter+=1
    file = month + '.txt'
    indices = cooc_index[month]
    fp = os.path.join(path, file)
    
    with open(fp, 'r', encoding = 'utf-8') as f:
        comments = f.read().split('\n--------\n')
        for idx in indices:
            chunks = comments[idx].split(',')
            meta_df. loc[counter]=chunks[:5]
            counter+=1
            tracker+=1
    if tracker > 100000:
        tracker = 0
        print(str(100*counter//total) + '% done')

8% done
18% done
28% done
36% done
46% done
55% done
64% done


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\kodri\\Desktop\\CITYNET Europe\\citynet occurrances\\C_2018-11..txt'

In [633]:
features[:]

['created_utc', 'score', 'subreddit', 'link_id', 'subreddit_id', 'body']

In [603]:
#create df with all metadata except for text body
counter = 0
path = r"C:\Users\kodri\Desktop\CITYNET Europe\citynet occurrances"
counter = 0
tracker = 0
total = len(cooc_matrix)

with open ('metadata.csv', 'w', encoding = 'utf-8') as w: 
    w.write(str(features[:5]).replace('[','').replace(']','').replace('\'','')+'\n')
    for file in files:
        month = file[:10]
        counter+=1
        if month in list(cooc_index.keys()):
            indices = cooc_index[month]
            fp = os.path.join(path, file)

            with open(fp, 'r', encoding = 'utf-8') as f:
                comments = f.read().split('\n--------\n')

                for idx in indices:
                    chunks = comments[idx].split(',')[:5]
                    w.write(str(chunks).replace('[','').replace(']','').replace('\'','')+'\n')
                    counter+=1
                    tracker+=1
            if tracker > 100000:
                tracker = 0
                print(str(100*counter//total) + '% done')
        else:
            print('No co-occurrances in ' + month)

No co-occurrances in RC_2005-12
No co-occurrances in RC_2006-01
8% done
18% done
28% done
36% done
46% done
55% done
64% done
73% done
83% done
92% done


In [611]:
meta_df = pd.read_csv(r"C:\Users\kodri\Desktop\CITYNET Europe\metadata.csv")
meta_df.head()

  meta_df = pd.read_csv(r"C:\Users\kodri\Desktop\CITYNET Europe\metadata.csv")


Unnamed: 0,created_utc,score,subreddit,link_id,subreddit_id
0,1139003115,8,reddit.com,t3_18gy,t5_6
1,1139517217,4,reddit.com,t3_1dp6,t5_6
2,1143810425,16,reddit.com,t3_3qiz,t5_6
3,1144908432,1,reddit.com,t3_4b3w,t5_6
4,1146177212,2,reddit.com,t3_5y81a,t5_6


In [637]:
len(cooc_df) == len(meta_df)

True

In [475]:
meta_df.to_csv('meta_df_checkpoint.csv')

In [648]:
df = pd.concat([cooc_df, meta_df.reindex(cooc_df.index)], axis=1)

In [652]:
#remove columns that were already merged 
drop = set()
for i in synonyms:
    drop = drop.union(i)

cols = [i for i in df.columns if i not in drop]
df = df[cols]

In [657]:
len(df.columns)

57

In [656]:
df.to_csv('cooc_df.csv')