In [1]:
import pandas as pd
import liwc
from collections import Counter
import re
import time
from sklearn.externals import joblib
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance; stem_words() calls ps.stem() on every token.
ps = PorterStemmer()

In [2]:
def stem_words(word_list):
    """Stem every token of a stringified token list and re-join the stems.

    Parameters
    ----------
    word_list : str
        Text such as ``"['running', 'dogs']"``. Single quotes, square
        brackets and commas are stripped, then the remainder is split on
        whitespace.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces. An empty string is
        returned for non-string input (for example the NaN pandas produces
        for an empty CSV cell) instead of raising ``TypeError``.
    """
    # Missing cells arrive as float NaN; there is nothing to stem.
    if not isinstance(word_list, str):
        return ""
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (\] and \[), which newer Python versions warn about.
    tokens = re.sub(r"['\]\[,]", '', word_list).split()
    return " ".join(ps.stem(w) for w in tokens)

In [None]:
# Load only the columns this notebook uses from the tweet dump.
df = pd.read_csv(
    'datasets/twitterFinal_5mil.csv',
    index_col=None,
    header=0,
    usecols=['text','text_token_rem_stop','tweet_id'],
    lineterminator='\n',
)

In [None]:
# Stem the stop-word-filtered tokens of every tweet into one space-separated string.
df['text_token_stem_str'] = df['text_token_rem_stop'].apply(stem_words)

In [None]:
# Load the four pickled classifiers (one per category column added below).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load .pkl files from a trusted source.
clf1 = joblib.load('misc/CCAT.pkl')
clf2 = joblib.load('misc/GCAT.pkl')
clf3 = joblib.load('misc/ECAT.pkl')
clf4 = joblib.load('misc/MCAT.pkl')

# Add columns to final dataframe: one prediction column per classifier,
# each computed from the stemmed tweet text.
for category_column, category_clf in (
    ('CCAT', clf1),
    ('GCAT', clf2),
    ('ECAT', clf3),
    ('MCAT', clf4),
):
    df[category_column] = category_clf.predict(df.text_token_stem_str)

# Interesting Punctuation

In [None]:
# Flag each tweet with 1 if it contains "interesting" punctuation, else 0:
#   * one of the listed punctuation characters repeated back-to-back
#     (e.g. "!!", "..", "--"), or
#   * a '?' followed by one or more '!' (e.g. "?!"), or
#   * a '!' followed by one or more '?' (e.g. "!?").
pattern_same_punctuation = re.compile('(([-/\\\\()!"+,&\'.])\\2+)')
# Raw strings avoid the invalid "\?" escape sequence of the original literals.
pattern_inter1 = re.compile(r'\?!+')
pattern_inter2 = re.compile(r'!\?+')

interesting_punct = []
for tweet in df.text:
    # Missing text is read as float NaN; it cannot contain punctuation and
    # would make the regex calls raise TypeError.
    if not isinstance(tweet, str):
        interesting_punct.append(0)
        continue
    match1 = pattern_same_punctuation.search(tweet)
    match2 = pattern_inter1.search(tweet)
    # Bug fix: the original searched pattern_inter1 a second time here, so
    # the "!?" form (pattern_inter2) was never detected.
    match3 = pattern_inter2.search(tweet)
    interesting_punct.append(1 if (match1 or match2 or match3) else 0)

# Add column to final dataframe
df['interesting_punct'] = interesting_punct

# LIWC

In [3]:
import liwc
from collections import Counter

In [4]:
def liwc_features(tokens: [str]) -> {str: int}:
    """Count how many LIWC category hits the given tokens produce.

    Each token is passed to ``liwc_parse``; only hits in the categories
    'affect', 'bio', 'cogmech', 'percept', 'relativ' and 'social' are
    counted. Returns a plain dict mapping category name to its count;
    categories with no hits are absent from the dict.
    """
    wanted = ['affect', 'bio', 'cogmech', 'percept', 'relativ', 'social']
    tally = Counter()
    for word in tokens:
        for label in liwc_parse(word):
            if label in wanted:
                tally[label] += 1
    return dict(tally)

In [5]:
# Load the LIWC dictionary file. liwc_parse is the callable liwc_features()
# uses to look up the categories of a token; liwc_categories is the second
# value returned by the loader.
# NOTE(review): the .dic path is relative to the working directory -- confirm
# the file is present before running.
liwc_parse, liwc_categories = liwc.load_token_parser('LIWC2007_English100131.dic')

In [None]:
# Stream the tweet CSV CHUNK_SIZE rows at a time, compute the LIWC category
# counts of every row, and collect the per-chunk results. The chunks are
# concatenated once after the loop (concatenating inside the loop re-copies
# all previous rows on every iteration).
#
# The result has exactly one row per CSV row, in file order, and always has
# all six category columns:
#   * rows whose text_token is missing get all-zero counts (the original
#     dropped them, leaving liwc_df_final shorter than the tweet dataframe);
#   * a category that never occurs in a chunk is filled with 0 instead of
#     becoming NaN after concatenation.
# Columns are left un-prefixed here ('cogmech', not 'liwc_cogmech'); the
# original in-loop rename discarded its result and therefore did nothing.
LIWC_CATEGORIES = ['affect', 'bio', 'cogmech', 'percept', 'relativ', 'social']
CHUNK_SIZE = 1000

numTweets = 0
liwc_chunks = []
for df_temp in pd.read_csv('datasets/twitterFinal_5mil.csv', index_col=None, header=0,
                           usecols=['text','text_token','tweet_id'], lineterminator='\n',
                           chunksize=CHUNK_SIZE):
    have_tokens = df_temp.text_token.notna()

    # NOTE(review): eval() executes whatever expression is stored in the CSV
    # cell. If the file is not fully trusted, switch to ast.literal_eval.
    token_lists = df_temp.loc[have_tokens, 'text_token'].map(eval)

    # Build the counts for the rows that have tokens, then re-expand to the
    # full chunk index / full category list so every row and column exists.
    liwc_df_temp = pd.DataFrame(token_lists.map(liwc_features).tolist(),
                                index=token_lists.index)
    liwc_df_temp = liwc_df_temp.reindex(index=df_temp.index, columns=LIWC_CATEGORIES)\
        .fillna(0)\
        .astype(int)
    liwc_chunks.append(liwc_df_temp)

    # Count the rows actually read so the last (partial) chunk is reported accurately.
    numTweets += len(df_temp)
    print('Processed tweets through', numTweets)

if liwc_chunks:
    liwc_df_final = pd.concat(liwc_chunks, ignore_index=True)
else:
    # Empty input file: keep the expected schema.
    liwc_df_final = pd.DataFrame(columns=LIWC_CATEGORIES)
del liwc_chunks

print(liwc_df_final['cogmech'].head())
# Sanity check: after the zero-fill above no count may be missing.
assert not liwc_df_final.isnull().values.any(), 'unexpected missing LIWC counts'

Processed tweets through 1000
Processed tweets through 2000
Processed tweets through 3000
Processed tweets through 4000
Processed tweets through 5000
Processed tweets through 6000
Processed tweets through 7000
Processed tweets through 8000
Processed tweets through 9000
Processed tweets through 10000
Processed tweets through 11000
Processed tweets through 12000
Processed tweets through 13000
Processed tweets through 14000
Processed tweets through 15000
Processed tweets through 16000
Processed tweets through 17000
Processed tweets through 18000
Processed tweets through 19000
Processed tweets through 20000
Processed tweets through 21000
Processed tweets through 22000
Processed tweets through 23000
Processed tweets through 24000
Processed tweets through 25000
Processed tweets through 26000
Processed tweets through 27000
Processed tweets through 28000
Processed tweets through 29000
Processed tweets through 30000
Processed tweets through 31000
Processed tweets through 32000
Processed tweets 

Processed tweets through 261000
Processed tweets through 262000
Processed tweets through 263000
Processed tweets through 264000
Processed tweets through 265000
Processed tweets through 266000
Processed tweets through 267000
Processed tweets through 268000
Processed tweets through 269000
Processed tweets through 270000
Processed tweets through 271000
Processed tweets through 272000
Processed tweets through 273000
Processed tweets through 274000
Processed tweets through 275000
Processed tweets through 276000
Processed tweets through 277000
Processed tweets through 278000
Processed tweets through 279000
Processed tweets through 280000
Processed tweets through 281000
Processed tweets through 282000
Processed tweets through 283000
Processed tweets through 284000
Processed tweets through 285000
Processed tweets through 286000
Processed tweets through 287000
Processed tweets through 288000
Processed tweets through 289000
Processed tweets through 290000
Processed tweets through 291000
Processe

Processed tweets through 519000
Processed tweets through 520000
Processed tweets through 521000
Processed tweets through 522000
Processed tweets through 523000
Processed tweets through 524000
Processed tweets through 525000
Processed tweets through 526000
Processed tweets through 527000
Processed tweets through 528000
Processed tweets through 529000
Processed tweets through 530000
Processed tweets through 531000
Processed tweets through 532000
Processed tweets through 533000
Processed tweets through 534000
Processed tweets through 535000
Processed tweets through 536000
Processed tweets through 537000
Processed tweets through 538000
Processed tweets through 539000
Processed tweets through 540000
Processed tweets through 541000
Processed tweets through 542000
Processed tweets through 543000
Processed tweets through 544000
Processed tweets through 545000
Processed tweets through 546000
Processed tweets through 547000
Processed tweets through 548000
Processed tweets through 549000
Processe

Processed tweets through 777000
Processed tweets through 778000
Processed tweets through 779000
Processed tweets through 780000
Processed tweets through 781000
Processed tweets through 782000
Processed tweets through 783000
Processed tweets through 784000
Processed tweets through 785000
Processed tweets through 786000
Processed tweets through 787000
Processed tweets through 788000
Processed tweets through 789000
Processed tweets through 790000
Processed tweets through 791000
Processed tweets through 792000
Processed tweets through 793000
Processed tweets through 794000
Processed tweets through 795000
Processed tweets through 796000
Processed tweets through 797000
Processed tweets through 798000
Processed tweets through 799000
Processed tweets through 800000
Processed tweets through 801000
Processed tweets through 802000
Processed tweets through 803000
Processed tweets through 804000
Processed tweets through 805000
Processed tweets through 806000
Processed tweets through 807000
Processe

Processed tweets through 1034000
Processed tweets through 1035000
Processed tweets through 1036000
Processed tweets through 1037000
Processed tweets through 1038000
Processed tweets through 1039000
Processed tweets through 1040000
Processed tweets through 1041000
Processed tweets through 1042000
Processed tweets through 1043000
Processed tweets through 1044000
Processed tweets through 1045000
Processed tweets through 1046000
Processed tweets through 1047000
Processed tweets through 1048000
Processed tweets through 1049000
Processed tweets through 1050000
Processed tweets through 1051000
Processed tweets through 1052000
Processed tweets through 1053000
Processed tweets through 1054000
Processed tweets through 1055000
Processed tweets through 1056000
Processed tweets through 1057000
Processed tweets through 1058000
Processed tweets through 1059000
Processed tweets through 1060000
Processed tweets through 1061000
Processed tweets through 1062000
Processed tweets through 1063000
Processed 

Processed tweets through 1283000
Processed tweets through 1284000
Processed tweets through 1285000
Processed tweets through 1286000
Processed tweets through 1287000
Processed tweets through 1288000
Processed tweets through 1289000
Processed tweets through 1290000
Processed tweets through 1291000
Processed tweets through 1292000
Processed tweets through 1293000
Processed tweets through 1294000
Processed tweets through 1295000
Processed tweets through 1296000
Processed tweets through 1297000
Processed tweets through 1298000
Processed tweets through 1299000
Processed tweets through 1300000
Processed tweets through 1301000
Processed tweets through 1302000
Processed tweets through 1303000
Processed tweets through 1304000
Processed tweets through 1305000
Processed tweets through 1306000
Processed tweets through 1307000
Processed tweets through 1308000
Processed tweets through 1309000
Processed tweets through 1310000
Processed tweets through 1311000
Processed tweets through 1312000
Processed 

In [None]:
# Tab-joined column names, used only to test whether the prefix is already there.
combined = '\t'.join(liwc_df_final.columns)

# rename columns in dataframe from 'dimension_name' to 'liwc_dimension_name'
# (skipped when some column name already contains 'liwc_', so re-running the
# cell does not prefix twice)
if 'liwc_' not in combined:
    liwc_df_final.columns = ['liwc_' + name for name in liwc_df_final.columns]

# Add columns to final dataframe. Values are copied positionally as plain
# Python ints, in the same column order as before.
for liwc_column in ('liwc_social', 'liwc_affect', 'liwc_cogmech',
                    'liwc_percept', 'liwc_bio', 'liwc_relativ'):
    df[liwc_column] = [int(count) for count in liwc_df_final[liwc_column]]

print(df.head())