In [1]:
import ast
import re
import time
from collections import Counter

import liwc
import pandas as pd
from nltk.stem import PorterStemmer
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23;
# on newer versions `import joblib` is the replacement.
from sklearn.externals import joblib
# Single PorterStemmer instance, created once and used by stem_words.
ps = PorterStemmer()

In [2]:
def stem_words(word_list):
    """Stem every token of a stringified token list and join them with spaces.

    Parameters
    ----------
    word_list : str
        Text form of a token list, e.g. "['running', 'cats']". Single quotes,
        square brackets and commas are removed, then the remainder is split
        on whitespace. ``None`` and float NaN (what pandas yields for an empty
        CSV cell) are treated as "no tokens".

    Returns
    -------
    str
        The stemmed tokens separated by single spaces; "" when there are none.

    Raises
    ------
    TypeError
        If ``word_list`` is any other non-string value (raised by ``re.sub``).
    """
    # NaN is the only float that is not equal to itself.
    if word_list is None or (isinstance(word_list, float) and word_list != word_list):
        return ""
    # Raw string: '\]' and '\[' are invalid escape sequences in a plain string
    # literal and trigger a warning on modern Python versions.
    tokens = re.sub(r"['\]\[,]", '', word_list).split()
    return " ".join(ps.stem(token) for token in tokens)

In [3]:
# Load the tweet table, keeping only the raw text, the stop-word-filtered
# tokens and the tweet id.
df = pd.read_csv(
    'datasets/50M_250_1000_2_final.csv',
    index_col=None,
    header=0,
    usecols=['text', 'text_token_rem_stop', 'tweet_id'],
    lineterminator='\n',
)

In [4]:
# One space-separated string of stemmed tokens per tweet, produced by stem_words.
df['text_token_stem_str'] = df['text_token_rem_stop'].apply(stem_words)

In [5]:
# Load the four pickled classifiers (CCAT, GCAT, ECAT, MCAT), in that order.
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
clf1, clf2, clf3, clf4 = [
    joblib.load(model_path)
    for model_path in ('misc/CCAT.pkl', 'misc/GCAT.pkl', 'misc/ECAT.pkl', 'misc/MCAT.pkl')
]

# Add columns to final dataframe: one prediction column per classifier,
# each predicted from the stemmed token strings.
stemmed_text = df.text_token_stem_str
for column, classifier in (('CCAT', clf1), ('GCAT', clf2), ('ECAT', clf3), ('MCAT', clf4)):
    df[column] = classifier.predict(stemmed_text)

In [6]:
# Flag tweets that contain "interesting" punctuation:
#   * pattern_same_punctuation: one of  - / \ ( ) ! " + , & ' .  repeated
#     at least twice in a row (e.g. "!!", "...")
#   * pattern_inter1: "?" followed by one or more "!"  (e.g. "?!")
#   * pattern_inter2: "!" followed by one or more "?"  (e.g. "!?")
pattern_same_punctuation = re.compile('(([-/\\\\()!"+,&\'.])\\2+)')
# Raw strings: '\?' is an invalid escape sequence in a plain string literal.
pattern_inter1 = re.compile(r'\?!+')
pattern_inter2 = re.compile(r'!\?+')


def _has_interesting_punct(tweet):
    """Return 1 if ``tweet`` matches any of the three punctuation patterns, else 0.

    Non-string values (e.g. the float NaN pandas uses for an empty CSV cell)
    cannot be searched and are reported as 0.
    """
    if not isinstance(tweet, str):
        return 0
    # All three patterns are checked; previously pattern_inter1 was tested
    # twice and pattern_inter2 ("!?") was never used.
    patterns = (pattern_same_punctuation, pattern_inter1, pattern_inter2)
    return int(any(pattern.search(tweet) for pattern in patterns))


interesting_punct = [_has_interesting_punct(tweet) for tweet in df.text]

# Add column to final dataframe
df['interesting_punct'] = interesting_punct

# LIWC

In [None]:
import liwc
from collections import Counter

In [None]:
def liwc_features(tokens: [str]) -> {str: int}:
    """Count how often each selected LIWC category is hit by the given tokens.

    Every token is passed to ``liwc_parse``; each category it yields is
    counted if it is one of 'social', 'affect', 'cogmech', 'percept', 'bio'
    or 'relativ'. Categories with no hits are absent from the result.

    Returns a plain dict mapping category name to hit count.
    """
    wanted = ['social', 'affect', 'cogmech', 'percept', 'bio', 'relativ']
    counts = Counter()
    for token in tokens:
        for category in liwc_parse(token):
            if category in wanted:
                counts[category] += 1
    return dict(counts)

In [None]:
# Build the token parser from the LIWC dictionary file. liwc_parse is called
# per token by liwc_features; liwc_categories is the second value returned
# by the loader (presumably the category names — confirm against the liwc docs).
liwc_dictionary_path = 'LIWC2007_English100131.dic'
liwc_parse, liwc_categories = liwc.load_token_parser(liwc_dictionary_path)

# Show how many entries liwc_categories holds.
len(liwc_categories)

In [None]:
# Read the CSV 1000 rows at a time (chunksize=1000), compute the LIWC category
# counts for every row that has tokens, and collect one frame per chunk.
# The frames are concatenated once after the loop; concatenating inside the
# loop re-copies all earlier rows on every iteration.

numTweets = 1000  # running row bound, kept only for optional progress reporting
liwc_columns = ['social', 'affect', 'cogmech', 'percept', 'bio', 'relativ']
liwc_frames = []
for df_temp in pd.read_csv('datasets/50M_250_1000_2_final.csv', index_col=None, header=0,
                           usecols=['text', 'text_token', 'tweet_id'], lineterminator='\n',
                           chunksize=1000):
    # Rows without tokens are skipped. The token column stores the text form
    # of a Python list; ast.literal_eval parses it without executing
    # arbitrary code from the file (the previous eval would).
    tokens = df_temp.loc[df_temp.text_token.notna(), 'text_token'].map(ast.literal_eval)

    # Keep the CSV row labels on the result so it can later be aligned with
    # the main dataframe; building the frame without index= would restart the
    # labels at 0 for every chunk and produce duplicates after concat.
    liwc_df_temp = (
        pd.DataFrame(tokens.map(liwc_features).tolist(), index=tokens.index)
        # Always emit all six category columns, even if a chunk never hit one,
        # so no NaN/float columns appear when the chunks are concatenated.
        .reindex(columns=liwc_columns)
        .fillna(0)
        .astype(int)
    )
    liwc_frames.append(liwc_df_temp)
    numTweets += 1000

# pd.concat raises on an empty list, so fall back to an empty frame.
liwc_df_final = pd.concat(liwc_frames) if liwc_frames else pd.DataFrame(columns=liwc_columns)

In [None]:
# Rename columns in dataframe from 'dimension_name' to 'liwc_dimension_name'.
# Columns that already carry the prefix are left alone so that re-running this
# cell does not produce 'liwc_liwc_...' names.
liwc_df_final.columns = [
    c if c.startswith('liwc_') else 'liwc_' + c for c in liwc_df_final.columns
]

# Add columns to final dataframe. pandas aligns each assigned Series to df by
# index label, so liwc_df_final must carry df's row labels.
for liwc_column in ['liwc_social', 'liwc_affect', 'liwc_cogmech',
                    'liwc_percept', 'liwc_bio', 'liwc_relativ']:
    df[liwc_column] = liwc_df_final[liwc_column]