In [1]:
import pandas as pd
import liwc
from collections import Counter
import re
import time
from sklearn.externals import joblib
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [2]:
def stem_words(word_list):
    word_list = re.sub('[\'\]\[,]', '', word_list).split()
    output = [ps.stem(w) for w in word_list]
    output = " ".join(output)
    return output

In [3]:
df = pd.read_csv('datasets/50M_250_1000_3_final.csv', index_col=None, header=0, usecols=['text','text_token_rem_stop','tweet_id'], lineterminator='\n')

In [4]:
df['text_token_stem_str'] = df.text_token_rem_stop.apply(lambda t: stem_words(t)) 

In [5]:
clf1 = joblib.load('misc/CCAT.pkl')
clf2 = joblib.load('misc/GCAT.pkl')
clf3 = joblib.load('misc/ECAT.pkl')
clf4 = joblib.load('misc/MCAT.pkl')

# Add columns to final dataframe
df['CCAT'] = clf1.predict(df.text_token_stem_str)
df['GCAT'] = clf2.predict(df.text_token_stem_str)
df['ECAT'] = clf3.predict(df.text_token_stem_str)
df['MCAT'] = clf4.predict(df.text_token_stem_str)

# Interesting Punctuation

In [6]:
interesting_punct = []
pattern_same_punctuation = re.compile('(([-/\\\\()!"+,&\'.])\\2+)')
pattern_inter1 = re.compile('\?!+')
pattern_inter2 = re.compile('!\?+')
for tweet in df.text:
    match1 = pattern_same_punctuation.search(tweet)
    match2 = pattern_inter1.findall(tweet)
    match3 = pattern_inter1.findall(tweet)
    if match1 or match2.__len__() > 0 or match3.__len__() > 0:
        interesting_punct.append(1)
    else:
        interesting_punct.append(0)
        
# Add column to final dataframe
df['interesting_punct'] = interesting_punct

# LIWC

In [7]:
import liwc
from collections import Counter

In [8]:
def liwc_features(tokens: [str]) -> {str: int}:
    c = Counter(category for token in tokens for category in liwc_parse(token) if category in ['affect', 'bio', 'cogmech', 'percept', 'relativ', 'social'])
    return dict(c)

In [9]:
liwc_parse, liwc_categories = liwc.load_token_parser('LIWC2007_English100131.dic')

In [13]:
# Iterate through each .csv file 1000 rows at a time (chunksize=1000), map the liwc features to a liwc_df_temp and concatenate
# onto liwc_df_final

numTweets = 1000
liwc_df_final = pd.DataFrame()
for df_temp in pd.read_csv('datasets/50M_250_1000_3_final.csv', index_col=None, header=0, usecols=['text','text_token','tweet_id'], lineterminator='\n', chunksize=1000):
    have_tokens = ~pd.isna(df_temp.text_token)

    df_temp.loc[have_tokens, 'text_token'] = df_temp[have_tokens].text_token.map(eval)

    liwc_df_temp = pd.DataFrame(df_temp[have_tokens].text_token.map(liwc_features).tolist())\
        .fillna(0)\
        .astype(int)
    liwc_df_temp.rename({c: 'liwc_' + c for c in liwc_df_temp}, axis='columns')

    for c in liwc_df_temp:
        df_temp.loc[have_tokens, c] = liwc_df_temp[c].values
#     liwc_df_temp = liwc_df_temp[['affect', 'bio', 'cogmech', 'percept', 'relativ', 'social']]
#     frames = [liwc_df_final, liwc_df_temp]
    
    liwc_df_final = pd.concat([liwc_df_final, liwc_df_temp])
    liwc_df_final.reset_index(drop=True, inplace=True)
    del liwc_df_temp
    print('Processed tweets through', numTweets)
    numTweets+=1000
print(liwc_df_final['cogmech'].head())
print(liwc_df_final[liwc_df_final['cogmech'].isnull()])

Processed tweets through 1000
Processed tweets through 2000
Processed tweets through 3000
Processed tweets through 4000
Processed tweets through 5000
Processed tweets through 6000
Processed tweets through 7000
Processed tweets through 8000
Processed tweets through 9000
Processed tweets through 10000
Processed tweets through 11000
Processed tweets through 12000
Processed tweets through 13000
Processed tweets through 14000
Processed tweets through 15000
Processed tweets through 16000
Processed tweets through 17000
Processed tweets through 18000
Processed tweets through 19000
Processed tweets through 20000
Processed tweets through 21000
Processed tweets through 22000
Processed tweets through 23000
Processed tweets through 24000
Processed tweets through 25000
Processed tweets through 26000
Processed tweets through 27000
Processed tweets through 28000
Processed tweets through 29000
Processed tweets through 30000
Processed tweets through 31000
Processed tweets through 32000
Processed tweets 

Processed tweets through 262000
Processed tweets through 263000
Processed tweets through 264000
Processed tweets through 265000
Processed tweets through 266000
Processed tweets through 267000
Processed tweets through 268000
Processed tweets through 269000
Processed tweets through 270000
Processed tweets through 271000
Processed tweets through 272000
Processed tweets through 273000
Processed tweets through 274000
Processed tweets through 275000
Processed tweets through 276000
Processed tweets through 277000
Processed tweets through 278000
Processed tweets through 279000
Processed tweets through 280000
Processed tweets through 281000
Processed tweets through 282000
Processed tweets through 283000
Processed tweets through 284000
Processed tweets through 285000
Processed tweets through 286000
Processed tweets through 287000
Processed tweets through 288000
Processed tweets through 289000
Processed tweets through 290000
Processed tweets through 291000
Processed tweets through 292000
Processe

Processed tweets through 519000
Processed tweets through 520000
Processed tweets through 521000
Processed tweets through 522000
Processed tweets through 523000
Processed tweets through 524000
Processed tweets through 525000
Processed tweets through 526000
Processed tweets through 527000
Processed tweets through 528000
Processed tweets through 529000
Processed tweets through 530000
Processed tweets through 531000
Processed tweets through 532000
Processed tweets through 533000
Processed tweets through 534000
Processed tweets through 535000
Processed tweets through 536000
Processed tweets through 537000
Processed tweets through 538000
Processed tweets through 539000
Processed tweets through 540000
Processed tweets through 541000
Processed tweets through 542000
Processed tweets through 543000
Processed tweets through 544000
Processed tweets through 545000
Processed tweets through 546000
Processed tweets through 547000
Processed tweets through 548000
Processed tweets through 549000
Processe

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Processed tweets through 661000
Processed tweets through 662000
Processed tweets through 663000
Processed tweets through 664000
Processed tweets through 665000
Processed tweets through 666000
Processed tweets through 667000
Processed tweets through 668000
Processed tweets through 669000
Processed tweets through 670000
Processed tweets through 671000
Processed tweets through 672000
Processed tweets through 673000
Processed tweets through 674000
Processed tweets through 675000
Processed tweets through 676000
Processed tweets through 677000
Processed tweets through 678000
Processed tweets through 679000
Processed tweets through 680000
Processed tweets through 681000
Processed tweets through 682000
Processed tweets through 683000
Processed tweets through 684000
Processed tweets through 685000
Processed tweets through 686000
Processed tweets through 687000
Processed tweets through 688000
Processed tweets through 689000
Processed tweets through 690000
Processed tweets through 691000
Processe

Processed tweets through 918000
Processed tweets through 919000
Processed tweets through 920000
Processed tweets through 921000
Processed tweets through 922000
Processed tweets through 923000
Processed tweets through 924000
Processed tweets through 925000
Processed tweets through 926000
Processed tweets through 927000
Processed tweets through 928000
Processed tweets through 929000
Processed tweets through 930000
Processed tweets through 931000
Processed tweets through 932000
Processed tweets through 933000
Processed tweets through 934000
Processed tweets through 935000
Processed tweets through 936000
Processed tweets through 937000
Processed tweets through 938000
Processed tweets through 939000
Processed tweets through 940000
Processed tweets through 941000
Processed tweets through 942000
Processed tweets through 943000
Processed tweets through 944000
Processed tweets through 945000
Processed tweets through 946000
Processed tweets through 947000
Processed tweets through 948000
Processe

Processed tweets through 1170000
Processed tweets through 1171000
Processed tweets through 1172000
Processed tweets through 1173000
Processed tweets through 1174000
Processed tweets through 1175000
Processed tweets through 1176000
Processed tweets through 1177000
Processed tweets through 1178000
Processed tweets through 1179000
Processed tweets through 1180000
Processed tweets through 1181000
Processed tweets through 1182000
Processed tweets through 1183000
Processed tweets through 1184000
Processed tweets through 1185000
Processed tweets through 1186000
Processed tweets through 1187000
Processed tweets through 1188000
Processed tweets through 1189000
Processed tweets through 1190000
Processed tweets through 1191000
Processed tweets through 1192000
Processed tweets through 1193000
Processed tweets through 1194000
Processed tweets through 1195000
Processed tweets through 1196000
Processed tweets through 1197000
Processed tweets through 1198000
Processed tweets through 1199000
Processed 

Processed tweets through 1419000
Processed tweets through 1420000
Processed tweets through 1421000
Processed tweets through 1422000
Processed tweets through 1423000
Processed tweets through 1424000
Processed tweets through 1425000
Processed tweets through 1426000
Processed tweets through 1427000
Processed tweets through 1428000
Processed tweets through 1429000
Processed tweets through 1430000
Processed tweets through 1431000
Processed tweets through 1432000
Processed tweets through 1433000
Processed tweets through 1434000
Processed tweets through 1435000
Processed tweets through 1436000
Processed tweets through 1437000
Processed tweets through 1438000
Processed tweets through 1439000
Processed tweets through 1440000
Processed tweets through 1441000
Processed tweets through 1442000
Processed tweets through 1443000
Processed tweets through 1444000
Processed tweets through 1445000
Processed tweets through 1446000
Processed tweets through 1447000
Processed tweets through 1448000
Processed 

Processed tweets through 1668000
Processed tweets through 1669000
Processed tweets through 1670000
Processed tweets through 1671000
Processed tweets through 1672000
Processed tweets through 1673000
Processed tweets through 1674000
Processed tweets through 1675000
Processed tweets through 1676000
Processed tweets through 1677000
Processed tweets through 1678000
Processed tweets through 1679000
Processed tweets through 1680000
Processed tweets through 1681000
Processed tweets through 1682000
Processed tweets through 1683000
Processed tweets through 1684000
Processed tweets through 1685000
Processed tweets through 1686000
Processed tweets through 1687000
Processed tweets through 1688000
Processed tweets through 1689000
Processed tweets through 1690000
Processed tweets through 1691000
Processed tweets through 1692000
Processed tweets through 1693000
Processed tweets through 1694000
Processed tweets through 1695000
Processed tweets through 1696000
Processed tweets through 1697000
Processed 

In [19]:
combined = '\t'.join(list(liwc_df_final))

# rename columns in dataframe from 'dimension_name' to 'liwc_dimension_name'
if not 'liwc_' in combined:
    liwc_df_final.columns = ['liwc_' + c for c in liwc_df_final.columns]
    
# Add columns to final dataframe
df['liwc_social'] = [int(i) for i in list(liwc_df_final['liwc_social'])]
df['liwc_affect'] = [int(i) for i in list(liwc_df_final['liwc_affect'])]
df['liwc_cogmech'] = [int(i) for i in list(liwc_df_final['liwc_cogmech'])]
df['liwc_percept'] = [int(i) for i in list(liwc_df_final['liwc_percept'])]
df['liwc_bio'] = [int(i) for i in list(liwc_df_final['liwc_bio'])]
df['liwc_relativ'] = [int(i) for i in list(liwc_df_final['liwc_relativ'])]

print(df.head())

0    0
1    1
2    0
3    0
4    0
Name: liwc_cogmech, dtype: int32
Empty DataFrame
Columns: [liwc_affect, liwc_bio, liwc_cogmech, liwc_percept, liwc_relativ, liwc_social]
Index: []
1719971
        liwc_affect  liwc_bio  liwc_cogmech  liwc_percept  liwc_relativ  \
660000            0       0.0             0           NaN             1   
660001            0       0.0             0           NaN             1   
660002            0       0.0             0           NaN             0   
660003            0       0.0             0           NaN             1   
660004            1       1.0             0           NaN             3   
660005            0       0.0             0           NaN             0   
660006            0       0.0             0           NaN             0   
660007            0       0.0             0           NaN             0   
660008            0       0.0             0           NaN             0   
660009            0       0.0             0           NaN   