# Data Preprocessing:

### Features (as in paper):
1. 11.140 ngram features: tf and tf-idf weighted word and character ngrams stemmed with Porter's stemmer
2. type-token ratio
3. ratio of comments in English
4. ratio of British english vs. American English words
5. 93 features from LIWC 
6. 26 PSYCH features (Preotiuc: Paraphrase Database and NRC Psycholinguistics Database)

### Columns (from the description of the dataset):
1. 'global':[7,10], #subreddits_commented, subreddits_commented_mbti, num_comments
2. 'liwc':[10,103], #liwc
3. 'word':[103,3938], #top1000 word ngram (1,2,3) per dimension based on chi2
4. 'char':[3938,7243], #top1000 char ngrams (2,3) per dimension based on chi2
5. 'sub':[7243,12228], #number of comments in each subreddit
6. 'ent':[12228,12229], #entropy
7. 'subtf':[12229,17214], #tf-idf on subreddits
8. 'subcat':[17214,17249], #manually crafted subreddit categories
9. 'lda50':[17249,17299], #50 LDA topics
10. 'posts':[17299,17319], #posts statistics
11. 'lda100':[17319,17419], #100 LDA topics
12. 'psy':[17419,17443], #psycholinguistic features
13. 'en':[17443,17444], #ratio of english comments
14. 'ttr':[17444,17445], #type token ratio
15. 'meaning':[17445,17447], #additional pyscholinguistic features
16. 'time_diffs':[17447,17453], #commenting time diffs
17. 'month':[17453,17465], #monthly distribution
18. 'hour':[17465,17489], #hourly distribution
19. 'day_of_week':[17489,17496], #daily distribution
20. 'word_an':[17496,21496], #word ngrams selected by F-score
21. 'word_an_tf':[21496,25496], #tf-idf ngrams selected by F-score
22. 'char_an':[25496,29496], #char ngrams selected by F-score
23. 'char_an_tf':[29496,33496], #tf-idf char ngrams selected by F-score
24. 'brit_amer':[33496,33499], #british vs american english ratio


## Import packages

In [67]:
import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import bigrams, ngrams
import string
from string import punctuation
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from num2words import num2words 
import pandas as pd
import numpy as np
from empath import Empath
import random
random.seed(32)

# close nltk download window to continue

[nltk_data] Downloading package punkt to /home/sophia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sophia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import data

In [68]:
pandora = pd.read_csv('/home/sophia/ma_py/pandora_bigfive1000.csv')

authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')

bigfive = authors[['author', 'agreeableness','openness','conscientiousness','extraversion','neuroticism']]
bigfive = bigfive.dropna()

pandoradf = pd.merge(pandora, bigfive, on='author', how='outer')
pandoradf = pandoradf.dropna()
pandoradf = pandoradf.reset_index()
pandoradf.tail()

Unnamed: 0,index,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,...,subreddit,ups,word_count,word_count_quoteless,lang,agreeableness,openness,conscientiousness,extraversion,neuroticism
82,932,Xaielao,Pharah,It seems to me that the least played character...,0.0,1463691000.0,t5_2u5kl,t3_4k2vck,t3_4k2vck,2.0,...,Overwatch,2.0,98.0,96.0,en,78.0,57.0,38.0,31.0,10.0
83,944,BadgerKid96,19,Close.,0.0,1469684000.0,t5_2rjli,t3_4ur4p7,t1_d5sbnrs,1.0,...,teenagers,1.0,1.0,1.0,en,77.0,73.0,73.0,1.0,98.0
84,962,Ambedo_1,INTJ,ahh gotcha. thanks for replying. i think i do ...,0.0,1479464000.0,t5_2qowo,t3_54j3ww,t1_da5gu9x,1.0,...,intj,0.0,120.0,120.0,en,11.0,6.0,61.0,1.0,45.0
85,968,WhatINeverSaid,[ISFJ],No it isn't too much information. I would say ...,0.0,1429376000.0,t5_2s90r,t3_32vycz,t1_cqfnjda,1.0,...,mbti,1.0,42.0,42.0,en,34.0,10.0,54.0,33.0,46.0
86,988,mdhh99,http://smile.amazon.com/gp/registry/wishlist/2...,What type of skate? The trick to ice skate is ...,0.0,1436839000.0,t5_2tx47,t3_3d6q1c,t1_ct2e6qm,2.0,...,Random_Acts_Of_Amazon,2.0,27.0,27.0,en,8.0,9.0,14.0,14.0,29.0


## Feature extraction

In [69]:
def choose_stopwordlist(df, mode):
    if mode == 'NLTK':
        stopwordList = stopwords.words('english')
    if mode == 'NLTK-neg':
        stopwordList = stopwords.words('english')
        stopwordList.remove('no')
        stopwordList.remove('nor')
        stopwordList.remove('not')
    return stopwordList

stopwordList = choose_stopwordlist(pandoradf, mode='NLTK-neg')

print(stopwordList)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

In [70]:
def create_features(workdata):

    # Total number of characters (including space)
    workdata['char_count'] = workdata['body'].str.len()

    # Total number of stopwords
    workdata['stopwords'] = workdata['body'].apply(lambda x: len([x for x in x.split() if x in stopwordList]))

    # Total number of punctuation or special characters
    workdata['total_punc'] = workdata['body'].apply(lambda x: len([x for x in x.split() for j in x if j in string.punctuation]))

    # Total number of numerics
    workdata['total_num'] = workdata['body'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

    # Total number of uppercase words
    workdata['total_uppercase'] = workdata['body'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
    
    return workdata

featuredf = create_features(pandoradf)
featuredf.head()

Unnamed: 0,index,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,...,agreeableness,openness,conscientiousness,extraversion,neuroticism,char_count,stopwords,total_punc,total_num,total_uppercase
0,1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,...,8.0,11.0,74.0,1.0,25.0,53,3,2,0,0
1,21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,...,76.0,47.0,1.0,4.0,75.0,95,0,8,0,0
2,34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,...,26.0,93.0,49.0,70.0,16.0,207,16,16,0,4
3,37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,...,70.0,64.0,5.0,5.0,95.0,28,2,2,0,1
4,62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,...,26.0,98.0,75.0,93.0,29.0,108,5,2,0,2


### Create ngrams

1. lower 
2. tokenize
3. numbers to words
4. delete special tokens

In [71]:
def preprocessing(workdf):
    # lower, remove special characters, remove stopwords
    workdf['probody'] = workdf['body'].apply(lambda x: ' '.join([x.lower() for x in x.split() if x.isalnum()]))
    workdf['probody'] = workdf['probody'].apply(lambda x: ' '.join([x for x in x.split() if (x not in stopwordList)]))
    newbody = []
    # num2words
    for sentence in workdf['probody']:
        # string to list
        inputtext = sentence.split()
        numlist = []
        for i in range(len(inputtext)):
            if inputtext[i].isnumeric():
                numlist.append(i)
        for number in numlist:
            inputtext[number] = num2words(inputtext[number])
        
        # list to string
        celltext = ' '.join(inputtext)
        # tokenize
        celltext = word_tokenize(celltext)
        newbody.append(celltext)   
    workdf['tokens'] = newbody
    return workdf

preprocesseddf = preprocessing(featuredf)
print(preprocesseddf.iloc[2]['tokens'])
preprocesseddf.head()
preprocesseddf.info()

['man', 'call', 'man', 'guess', 'mouse', 'agree', 'not', 'safe', 'space', 'per', 'everything', 'want', 'not', 'mess']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 87 non-null     int64  
 1   author                87 non-null     object 
 2   author_flair_text     87 non-null     object 
 3   body                  87 non-null     object 
 4   downs                 87 non-null     float64
 5   created_utc           87 non-null     float64
 6   subreddit_id          87 non-null     object 
 7   link_id               87 non-null     object 
 8   parent_id             87 non-null     object 
 9   score                 87 non-null     float64
 10  controversiality      87 non-null     float64
 11  gilded                87 non-null     float64
 12  id                    87 non-null     object 
 13  subreddit

In [72]:
# Porter Stemmer
def stemming(df):
    ps = PorterStemmer()
    df['tokens'] = df['tokens'].apply(lambda x:([ps.stem(word) for word in x]))
    return df

stemmeddf = stemming(preprocesseddf)
print(stemmeddf.iloc[1]['tokens'])
stemmeddf.head()

['downturn', 'dirti', 'small', 'small', 'obviou', 'would', 'not']


Unnamed: 0,index,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,...,conscientiousness,extraversion,neuroticism,char_count,stopwords,total_punc,total_num,total_uppercase,probody,tokens
0,1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,...,74.0,1.0,25.0,53,3,2,0,0,subtle enough look like,"[subtl, enough, look, like]"
1,21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,...,1.0,4.0,75.0,95,0,8,0,0,downturned dirty small small obvious would not,"[downturn, dirti, small, small, obviou, would,..."
2,34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,...,49.0,70.0,16.0,207,16,16,0,4,man call man guess mouse agree not safe space ...,"[man, call, man, guess, mous, agre, not, safe,..."
3,37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,...,5.0,5.0,95.0,28,2,2,0,1,added thank,"[ad, thank]"
4,62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,...,75.0,93.0,29.0,108,5,2,0,2,squatted 225x14 couple weeks ago made sad card...,"[squat, 225x14, coupl, week, ago, made, sad, c..."


In [73]:
# def apply_empath(df):

#     empath = Empath()
# #     df['empath'] = df['tokens'].apply(lambda x:([empath.analyze(sentence, normalize=True) for sentence in x]))
#     empathlist = df['tokens'].apply(lambda x:([empath.analyze(sentence, normalize=True) for sentence in x]))
#     empathdf = pd.DataFrame(empathlist)  
#     print(empathlist)
# #     for row in df:
# #         empathdict = empath.analyze(row, normalize=True)
# #     empathdf = pd.DataFrame.from_dict(data)
#     return df

# empathdf = apply_empath(stemmeddf)
# empathdf.head()



def apply_empath(df):
    empath = Empath()
    empathvalues = []
    for sentence in df['body']:
        empathvalues.append(empath.analyze(sentence, normalize=True))
    empathdf = pd.DataFrame(empathvalues)
    empathdf['author'] = df['author']

    newdf = pd.merge(df, empathdf, on='author', how='outer')
    return newdf

empdf = apply_empath(stemmeddf)
print(empdf.isnull().any().any())
empdf.head()



# empath = Empath()
# empathvalues = []
# for sentence in stemmeddf['tokens']:
#     empathvalues.append(empath.analyze(sentence, normalize=True))
# print(type(empathvalues))
# print(type(empathvalues[10]))
# empathdf = pd.DataFrame(empathvalues)
# empathdf.head()
# # newdf = stemmeddf.append(empathdf)
# # newdf.head()

False


Unnamed: 0,index,author,author_flair_text,body_x,downs,created_utc,subreddit_id,link_id,parent_id,score,...,weapon,children,monster,ocean,giving,contentment,writing,rural,positive_emotion,musical
0,1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,...,0.0,0.0,0.0,0.0,0.022222,0.0,0.0,0.0,0.0,0.0
3,37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
def ngrams(df, n_min, n_max, ngramtype):
    # convert input from list to string
    ngrams = []
    inputtext = []
    for sentence in df['tokens']:
        text = ' '.join(sentence)
        inputtext.append(text)
    vectorizer = TfidfVectorizer(ngram_range=(n_min,n_max), analyzer=ngramtype) 
    vectors = vectorizer.fit_transform(inputtext)
    dense = vectors.todense()
    denselist = dense.tolist()
    names = vectorizer.get_feature_names()
    ngramdf = pd.DataFrame(denselist, columns=names)
    ngramdf['author'] = df['author']
#     newdf = pd.merge(df, ngramdf, on='author', how='outer')
#     ngramdict = ngramdf.to_dict('index')
#     dict_items = list(ngramdict.items())    
    return ngramdf

# stemmeddf['wordngrams'] = ngrams(stemmeddf, 1, 3, 'word')
# stemmeddf['charngrams'] = ngrams(stemmeddf, 2, 3, 'char')
# stemmeddf.head()

wordngramsdf = ngrams(empdf, 1, 3, 'word')
print(wordngramsdf.isnull().any().any())
charngramsdf = ngrams(empdf, 2, 3, 'char')
print(charngramsdf.isnull().any().any())
# testlength = int((len(wordngramsdf)/2))
# testdf = wordngramsdf.iloc[:][0:testlength]
cwngramsdf = pd.merge(wordngramsdf, charngramsdf, on='author', how='outer')
gramsdf = pd.merge(empdf, cwngramsdf, on='author', how='outer')
gramsdf.head()
# cwngramsdf = ngrams(wordngramsdf, 2, 3, 'char')
# print(cwngramsdf.isnull().any().any())
# cwngramsdf.head()

False
False


Unnamed: 0,index,author,author_flair_text,body_x,downs,created_utc,subreddit_id,link_id,parent_id,score,...,yv,yva,yz,yz.1,z,z g,z l,z p,ze,zer
0,1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
gramsdf.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84619 entries, 0 to 84618
Data columns (total 4879 columns):
 #     Column                           Dtype  
---    ------                           -----  
 0     index                            int64  
 1     author                           object 
 2     author_flair_text                object 
 3     body_x                           object 
 4     downs                            float64
 5     created_utc                      float64
 6     subreddit_id                     object 
 7     link_id                          object 
 8     parent_id                        object 
 9     score                            float64
 10    controversiality                 float64
 11    gilded                           float64
 12    id_x                             object 
 13    subreddit                        object 
 14    ups                              float64
 15    word_count                       float64
 16    word_count_quotele

## Empath

as a replacement for LIWC

## Word Lists

Still needed are lists that comprise pronouns and stuff like that (see LIWC vs Empath). These lists can be created via empath

## Sort dataframe

In [76]:
# change language to numeric representation
language = gramsdf['lang'].values.tolist()
language = set(language)
language
# gramsdf['language'] = gramsdf['lang'].apply(lambda x: 0 if x=='en' elif 1 x=='es' else 2)
gramsdf['language']= np.select([gramsdf.lang == 'en', gramsdf.lang == 'es', gramsdf.lang == 'nl'], 
                        [0, 1, 2], 
                        default=3)
print(gramsdf['language'])
gramsdf= gramsdf.drop(columns=['lang'])

# change big five to binary representation
gramsdf['agree'] = gramsdf['agreeableness'].apply(lambda x: 0 if x<50 else 1)
gramsdf['openn'] = gramsdf['openness'].apply(lambda x: 0 if x<50 else 1)
gramsdf['consc'] = gramsdf['conscientiousness'].apply(lambda x: 0 if x<50 else 1)
gramsdf['extra'] = gramsdf['extraversion'].apply(lambda x: 0 if x<50 else 1)
gramsdf['neuro'] = gramsdf['neuroticism'].apply(lambda x: 0 if x<50 else 1)


0        0
1        0
2        0
3        0
4        0
        ..
84614    0
84615    0
84616    0
84617    0
84618    0
Name: language, Length: 84619, dtype: int64


In [77]:
cols_tomove = ['index', 'author', 'body_x', 'probody', 'tokens', 'agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism', 'agree', 'openn', 'consc', 'extra', 'neuro', 'language']
orderdf  = gramsdf[cols_tomove + [col for col in gramsdf.columns if col not in cols_tomove]]
orderdf.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84619 entries, 0 to 84618
Data columns (total 4884 columns):
 #     Column                           Dtype  
---    ------                           -----  
 0     index                            int64  
 1     author                           object 
 2     body_x                           object 
 3     probody                          object 
 4     tokens                           object 
 5     agreeableness                    float64
 6     openness                         float64
 7     conscientiousness                float64
 8     extraversion                     float64
 9     neuroticism                      float64
 10    agree                            int64  
 11    openn                            int64  
 12    consc                            int64  
 13    extra                            int64  
 14    neuro                            int64  
 15    language                         int64  
 16    author_flair_text 

## Export dataframe

In [78]:
orderdf.to_pickle("features.pkl")