In [1]:
import os
import csv
import spacy
import numpy as np
import pandas as pd
from collections import defaultdict

# Load the English pipeline exactly once and keep the result. The original
# cell loaded 'en_core_web_sm', threw the object away, and then loaded a
# second pipeline through the 'en' shortcut, paying the model start-up cost
# twice. Naming the model package explicitly also makes it clear which model
# produced the POS tags.
# NOTE(review): assumes the 'en' shortcut pointed at en_core_web_sm on the
# original machine -- confirm before comparing against earlier results.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Read the raw comment dump and derive two parsed time columns:
#   new_timestamp  - 'timestamp' interpreted as epoch milliseconds
#   new_post_time  - 'post_time' parsed as a date; unparseable values become NaT
file = 'SOCC/raw/gnm_comments.csv'
df = pd.read_csv(file, low_memory=False).assign(
    new_timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms'),
    new_post_time=lambda frame: pd.to_datetime(frame['post_time'], errors='coerce'),
)

In [3]:
# Keep each comment for which at least one of the two parsed time columns
# holds a value; rows where both are missing are left out.
has_any_time = df[['new_timestamp', 'new_post_time']].notna().any(axis=1)
df_timestamped = df[has_any_time]

In [4]:
# Report how many comments were kept and how many had no usable time value.
n_comments = len(df.index)
n_timestamped = len(df_timestamped.index)

print('Total number of comments:', n_comments)
print('Total number of timestamped comments:', n_timestamped)
print('Missing timestamps:', n_comments - n_timestamped)

Total number of comments: 663173
Total number of timestamped comments: 662933
Missing timestamps: 240


In [5]:
# Show the first few rows in which both parsed time columns are missing.
both_missing = df[['new_timestamp', 'new_post_time']].isna().all(axis=1)
df[both_missing].head()

Unnamed: 0,article_id,comment_counter,comment_id,author,post_time,timestamp,comment_text,reactions,replies,TotalVotes,negVotes,posVotes,new_timestamp,new_post_time
644186,13702405,source2_13702405_21,160056_0,PABLO53,Feb 2,,The the number of comments on this column (or ...,{},No,,,,NaT,NaT
644200,13827141,source2_13827141_10,212029_0,DarrenMacKay,8 days ago,,"Unfortunately, Citizens cannot hold the Govern...",{},No,,,,NaT,NaT
645257,19071121,source2_19071121_156,909071_0,Slightly Concerned,Feb 7,,"There can't be Catholic schools in Canada, if ...","{u'reaction_list': [], u'reaction_counts': [u'...",No,,,,NaT,NaT
645339,19704906,source2_19704906_33,722020_0,PWup,Jan 2,,The 'real world'? Why would I want to live in ...,{},No,,,,NaT,NaT
645340,19704906,source2_19704906_51,434069_0,JDS363,Jan 1,,"Wynne, Notley and Trudeau should all be voted ...",{},No,,,,NaT,NaT


In [6]:
# One time value per comment, aligned row-for-row with df_timestamped.
#
# The previous pd.concat([...]).dropna() stacked the two columns end to end:
# all non-null 'new_timestamp' values first, then all non-null
# 'new_post_time' values. Later cells index the result by comment position,
# which only lines up if no row has both values and every 'new_timestamp' row
# precedes every 'new_post_time' row. Coalescing row-wise removes that
# dependency: use 'new_timestamp' when present, else 'new_post_time'.
timestamps = df_timestamped['new_timestamp'].fillna(df_timestamped['new_post_time'])

# df_timestamped only contains rows with at least one of the two values, so
# nothing should be missing after coalescing.
assert timestamps.notna().all(), "comment without any usable timestamp"

In [7]:
# 'YYYY-MM' label for every entry of `timestamps` (zero-padded month).
timestamp_list = timestamps.dt.strftime('%Y-%m').values

# Raw comment texts that are fed to the spaCy pipeline.
flat_list = df_timestamped['comment_text'].values

In [8]:
# Per-comment token texts and POS tags; tokens[i] and pos[i] describe the
# i-th entry of flat_list.
tokens = []
pos = []
# Running totals that the adverb/adjective counting cell increments.
freq = 0
adj_count = 0

# NOTE(review): `n_threads` and `doc.is_parsed` look like spaCy 2.x API --
# confirm against the spaCy version this notebook is pinned to.
for doc in nlp.pipe(flat_list, batch_size=50, n_threads=80):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # Keep a placeholder so the indices still line up with flat_list.
        # An empty list (rather than None) is used because later cells call
        # len() on, and iterate over, every entry; None would raise TypeError.
        tokens.append([])
        pos.append([])

In [9]:
# Write the corpus-wide token count (one POS tag per token) to disk.
with open('socc_wordcount.csv', 'w+') as f:
    total_tags = 0
    for tags in pos:
        total_tags += len(tags)
    f.write(str(total_tags))

In [10]:
# Counts of "-ly adverb + adjective" bigrams:
#   adv_adj_dict            - per lower-cased 'adverb adjective' pair
#   adv_adj_timestamp_dict  - per 'YYYY-MM' label
#   adj_timestamp_dict      - adjectives per 'YYYY-MM' label
adv_adj_dict = defaultdict(int)
adv_adj_timestamp_dict = defaultdict(int)
adj_timestamp_dict = defaultdict(int)

# Reset the running totals here as well: they were only initialised in an
# earlier cell, so re-running this cell on its own used to add a second full
# pass on top of the previous totals.
freq = 0
adj_count = 0

for comment_index, comment in enumerate(pos):
    # Placeholder entries (empty or None) carry no tags; skip them instead of
    # failing on len(None).
    if not comment:
        continue

    words = tokens[comment_index]
    timestamp = timestamp_list[comment_index]
    last_index = len(comment) - 1

    for word_index, part in enumerate(comment):
        if part == 'ADJ':
            adj_count += 1
            adj_timestamp_dict[timestamp] += 1

        # An adverb ending in -ly that is immediately followed by an adjective.
        if (part == 'ADV'
                and words[word_index].endswith('ly')
                and word_index < last_index
                and comment[word_index + 1] == 'ADJ'):
            freq += 1
            # dict key is 'word1 word2' in lowercase
            key = words[word_index].lower() + " " + words[word_index + 1].lower()
            adv_adj_dict[key] += 1
            # count for the timestamp as well
            adv_adj_timestamp_dict[timestamp] += 1

In [11]:
# Write the headline numbers of the analysis to a small text report.
with open("socc_overview.txt", "w+") as f:
    # Total number of tokens across all comments; placeholder entries
    # (empty or None) contribute nothing.
    token_total = sum(len(sublist) for sublist in pos if sublist)
    # Distinct token texts (case-sensitive, as produced by the tagger cell).
    types = set(i for sublist in tokens if sublist for i in sublist)
    # Type/token ratio. The previous version divided by len(tokens), which is
    # the number of comments rather than the number of tokens, so the reported
    # figure was "types per comment". Guard the empty-corpus case as well.
    lexical_diversity = len(types) / token_total if token_total else 0.0

    f.write("Frequency: " + str(freq) + "\n")
    f.write("Adjective count: " + str(adj_count) + "\n")
    f.write("Number of tokens: " + str(token_total) + "\n")
    f.write("Overall lexical diversity: " + str(lexical_diversity) + "\n")

In [14]:
# Flatten the per-comment tags into [comment number, token text, POS tag] rows.
pos_list = [
    [doc_idx, tokens[doc_idx][tok_idx], tag]
    for doc_idx, tags in enumerate(pos)
    for tok_idx, tag in enumerate(tags)
]

In [19]:
# Label the three positional columns and write the token-level table to CSV.
pos_column_names = {0: 'text_num', 1: 'token', 2: 'pos'}
pos_frame = pd.DataFrame(pos_list).rename(columns=pos_column_names)
pos_frame.to_csv('SOCC_pos.csv')