# Data Preprocessing:

TODO: 
- change pipeline such that three columns emerge: raw_body, pro_body and tokens
- character ngrams 

Steps: 
1. Import data
2. Extract features
3. Preprocessing
4. EMPATH
5. Paraphrase database with personality scores 

Features:
1. 11.140 ngram features: tf and tf-idf weighted word and character ngrams stemmed with Porter's stemmer
2. type-token ratio
3. ratio of comments in English
4. ratio of British english vs. American English words
5. 93 features from LIWC 
6. 26 PSYCH features (Preotiuc: Paraphrase Database and NRC Psycholinguistics Database)




## Import packages

In [1]:
import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import bigrams, ngrams
import string
from string import punctuation
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from num2words import num2words 
import pandas as pd
from empath import Empath
import random
random.seed(32)

# close nltk download window to continue

[nltk_data] Downloading package punkt to /home/sophia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sophia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import data

In [2]:
pandora = pd.read_csv('/home/sophia/ma_py/pandora_bigfive1000.csv')

# # Total number of characters (including space)
# data['char_count'] = data['body'].str.len()

# # Total number of stopwords
# data['stopwords'] = data['body'].apply(lambda x: len([x for x in x.split() if x in stopwordList]))

# # Total number of punctuation or special characters
# data['total_punc'] = data['body'].apply(lambda x: len([x for x in x.split() for j in x if j in string.punctuation]))

# # Total number of numerics
# data['total_num'] = data['body'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

# # Total number of uppercase words
# data['total_uppercase'] = data['body'].apply(lambda x: len([x for x in x.split() if x.isupper()]))


authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')

bigfive = authors[['author', 'agreeableness','openness','conscientiousness','extraversion','neuroticism']]
bigfive = bigfive.dropna()

pandoradf = pd.merge(pandora, bigfive, on='author', how='outer')
pandoradf = pandoradf.dropna()
pandoradf.tail()

# pandoradf['probody'] = pandoradf['body']
# pandoradf['tokens'] = pandoradf['body']

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,...,subreddit,ups,word_count,word_count_quoteless,lang,agreeableness,openness,conscientiousness,extraversion,neuroticism
932,Xaielao,Pharah,It seems to me that the least played character...,0.0,1463691000.0,t5_2u5kl,t3_4k2vck,t3_4k2vck,2.0,0.0,...,Overwatch,2.0,98.0,96.0,en,78.0,57.0,38.0,31.0,10.0
944,BadgerKid96,19,Close.,0.0,1469684000.0,t5_2rjli,t3_4ur4p7,t1_d5sbnrs,1.0,0.0,...,teenagers,1.0,1.0,1.0,en,77.0,73.0,73.0,1.0,98.0
962,Ambedo_1,INTJ,ahh gotcha. thanks for replying. i think i do ...,0.0,1479464000.0,t5_2qowo,t3_54j3ww,t1_da5gu9x,1.0,0.0,...,intj,0.0,120.0,120.0,en,11.0,6.0,61.0,1.0,45.0
968,WhatINeverSaid,[ISFJ],No it isn't too much information. I would say ...,0.0,1429376000.0,t5_2s90r,t3_32vycz,t1_cqfnjda,1.0,0.0,...,mbti,1.0,42.0,42.0,en,34.0,10.0,54.0,33.0,46.0
988,mdhh99,http://smile.amazon.com/gp/registry/wishlist/2...,What type of skate? The trick to ice skate is ...,0.0,1436839000.0,t5_2tx47,t3_3d6q1c,t1_ct2e6qm,2.0,0.0,...,Random_Acts_Of_Amazon,2.0,27.0,27.0,en,8.0,9.0,14.0,14.0,29.0


## Feature extraction

In [3]:
def choose_stopwordlist(df, mode):
    if mode == 'NLTK':
        stopwordList = stopwords.words('english')
    if mode == 'NLTK-neg':
        stopwordList = stopwords.words('english')
        stopwordList.remove('no')
        stopwordList.remove('nor')
        stopwordList.remove('not')
    return stopwordList

stopwordList = choose_stopwordlist(pandoradf, mode='NLTK-neg')

print(stopwordList)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

In [4]:
def create_features(workdata):

    # Total number of characters (including space)
    workdata['char_count'] = workdata['body'].str.len()

    # Total number of stopwords
    workdata['stopwords'] = workdata['body'].apply(lambda x: len([x for x in x.split() if x in stopwordList]))

    # Total number of punctuation or special characters
    workdata['total_punc'] = workdata['body'].apply(lambda x: len([x for x in x.split() for j in x if j in string.punctuation]))

    # Total number of numerics
    workdata['total_num'] = workdata['body'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

    # Total number of uppercase words
    workdata['total_uppercase'] = workdata['body'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
    
    return workdata

featuredf = create_features(pandoradf)
featuredf.head()

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,...,agreeableness,openness,conscientiousness,extraversion,neuroticism,char_count,stopwords,total_punc,total_num,total_uppercase
1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,0.0,...,8.0,11.0,74.0,1.0,25.0,53,3,2,0,0
21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,0.0,...,76.0,47.0,1.0,4.0,75.0,95,0,8,0,0
34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,0.0,...,26.0,93.0,49.0,70.0,16.0,207,16,16,0,4
37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,0.0,...,70.0,64.0,5.0,5.0,95.0,28,2,2,0,1
62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,0.0,...,26.0,98.0,75.0,93.0,29.0,108,5,2,0,2


## Preprocessing

1. lower 
2. tokenize
3. numbers to words

In [54]:
def preprocessing(workdf):
    # lower, remove special characters, remove stopwords
    workdf['probody'] = workdf['body'].apply(lambda x: ' '.join([x.lower() for x in x.split() if x.isalnum()]))
    workdf['probody'] = workdf['probody'].apply(lambda x: ' '.join([x for x in x.split() if (x not in stopwordList)]))
    newbody = []
    # num2words
    for sentence in workdf['probody']:
        # string to list
        inputtext = sentence.split()
        numlist = []
        for i in range(len(inputtext)):
            if inputtext[i].isnumeric():
                numlist.append(i)
        for number in numlist:
            inputtext[number] = num2words(inputtext[number])
        
        # list to string
        celltext = ' '.join(inputtext)
        # tokenize
        celltext = word_tokenize(celltext)
        newbody.append(celltext)   
    workdf['tokens'] = newbody
    return workdf

preprocesseddf = preprocessing(featuredf)
print(preprocesseddf.iloc[2]['tokens'])
preprocesseddf.head()
preprocesseddf.info()

['man', 'call', 'man', 'guess', 'mouse', 'agree', 'not', 'safe', 'space', 'per', 'everything', 'want', 'not', 'mess']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 87 entries, 1 to 988
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   author                87 non-null     object 
 1   author_flair_text     87 non-null     object 
 2   body                  87 non-null     object 
 3   downs                 87 non-null     float64
 4   created_utc           87 non-null     float64
 5   subreddit_id          87 non-null     object 
 6   link_id               87 non-null     object 
 7   parent_id             87 non-null     object 
 8   score                 87 non-null     float64
 9   controversiality      87 non-null     float64
 10  gilded                87 non-null     float64
 11  id                    87 non-null     object 
 12  subreddit             87 non-null     object 
 13  ups     

In [32]:
# Porter Stemmer

def stemming(df):
    ps = PorterStemmer()
    df['tokens'] = df['tokens'].apply(lambda x:([ps.stem(word) for word in x]))
    return df

stemmeddf = stemming(preprocesseddf)
print(stemmeddf.iloc[1]['tokens'])
stemmeddf.head()

['downturn', 'dirti', 'small', 'small', 'obviou', 'would', 'not']


Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,...,conscientiousness,extraversion,neuroticism,char_count,stopwords,total_punc,total_num,total_uppercase,probody,tokens
1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,0.0,...,74.0,1.0,25.0,53,3,2,0,0,subtle enough look like,"[subtl, enough, look, like]"
21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,0.0,...,1.0,4.0,75.0,95,0,8,0,0,downturned dirty small small obvious would not,"[downturn, dirti, small, small, obviou, would,..."
34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,0.0,...,49.0,70.0,16.0,207,16,16,0,4,man call man guess mouse agree not safe space ...,"[man, call, man, guess, mou, agr, not, safe, s..."
37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,0.0,...,5.0,5.0,95.0,28,2,2,0,1,added thank,"[ad, thank]"
62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,0.0,...,75.0,93.0,29.0,108,5,2,0,2,squatted 225x14 couple weeks ago made sad card...,"[squat, 225x14, coupl, week, ago, made, sad, c..."


In [61]:
def ngrams(df, n_min, n_max, ngramtype):
    # convert input from list to string
    ngrams = []
    inputtext = []
    for sentence in df['tokens']:
        text = ' '.join(sentence)
        inputtext.append(text)
    vectorizer = TfidfVectorizer(ngram_range=(n_min,n_max), analyzer=ngramtype) 
    vectors = vectorizer.fit_transform(inputtext)
    dense = vectors.todense()
    denselist = dense.tolist()
    names = vectorizer.get_feature_names()
    ngramdf = pd.DataFrame(denselist, columns=names)
    ngramdict = ngramdf.to_dict('index')
    dict_items = list(ngramdict.items())    
    return dict_items

stemmeddf['wordngrams'] = ngrams(stemmeddf, 1, 3, 'word')
stemmeddf['charngrams'] = ngrams(stemmeddf, 2, 3, 'char')
stemmeddf.head()

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,...,char_count,stopwords,total_punc,total_num,total_uppercase,probody,tokens,empath,wordngrams,charngrams
1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,0.0,...,53,3,2,0,0,subtle enough look like,"[subtle, enough, look, like]","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(0, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(0, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,0.0,...,95,0,8,0,0,downturned dirty small small obvious would not,"[downturned, dirty, small, small, obvious, wou...","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(1, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(1, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,0.0,...,207,16,16,0,4,man call man guess mouse agree not safe space ...,"[man, call, man, guess, mouse, agree, not, saf...","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(2, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(2, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,0.0,...,28,2,2,0,1,added thank,"[added, thank]","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(3, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(3, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,0.0,...,108,5,2,0,2,squatted 225x14 couple weeks ago made sad card...,"[squatted, 225x14, couple, weeks, ago, made, s...","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(4, {'225x14': 0.18609151074643016, '225x14 co...","(4, {' 2': 0.1099928376810328, ' 22': 0.120178..."


## Empath

In [62]:
def apply_empath(df):

    empath = Empath()
    df['empath'] = df['tokens'].apply(lambda x:([empath.analyze(sentence, normalize=True) for sentence in x]))
    
#     empathvalues = []

#     for sentence in comments['body']:
#         empathvalues.append(empath.analyze(sentence, normalize=True))
#     comments['empath'] = empathvalues
    
    return df

empathdf = apply_empath(stemmeddf)
empathdf.head()

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,...,char_count,stopwords,total_punc,total_num,total_uppercase,probody,tokens,empath,wordngrams,charngrams
1,Sabata11792,Twilight Sparkle,That's subtle enough to just look like a coinc...,0.0,1447260000.0,t5_2s8bl,t3_3sdrrj,t1_cwwel0w,4.0,0.0,...,53,3,2,0,0,subtle enough look like,"[subtle, enough, look, like]","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(0, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(0, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
21,Shadow_Of_,Communist fag,"Downturned nose, dirty skin, tattoos, small ch...",0.0,1493736000.0,t5_35j1r,t3_68roag,t3_68roag,1.0,0.0,...,95,0,8,0,0,downturned dirty small small obvious would not,"[downturned, dirty, small, small, obvious, wou...","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(1, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(1, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
34,xenomouse,INFJ/atelerix,"Yes, if I was a man they'd call it a man cave....",0.0,1506875000.0,t5_2r39a,t3_73lfuz,t3_73lfuz,2.0,0.0,...,207,16,16,0,4,man call man guess mouse agree not safe space ...,"[man, call, man, guess, mouse, agree, not, saf...","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(2, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(2, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
37,eiznekk,"Eiznek 1736-2104-7526 [Fighting:Machoke,Throh,...",Added you back! Thank you :D,0.0,1440447000.0,t5_2yt52,t3_3i8rel,t1_cue9ixy,1.0,0.0,...,28,2,2,0,1,added thank,"[added, thank]","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(3, {'225x14': 0.0, '225x14 couple': 0.0, '225...","(3, {' 2': 0.0, ' 22': 0.0, ' 2s': 0.0, ' a': ..."
62,vitrael2,135x5,I squatted 225x14 a couple weeks ago and I mad...,0.0,1424723000.0,t5_34op9,t3_2wwmlc,t3_2wwmlc,8.0,0.0,...,108,5,2,0,2,squatted 225x14 couple weeks ago made sad card...,"[squatted, 225x14, couple, weeks, ago, made, s...","[{'help': 0.0, 'office': 0.0, 'dance': 0.0, 'm...","(4, {'225x14': 0.18609151074643016, '225x14 co...","(4, {' 2': 0.1099928376810328, ' 22': 0.120178..."


In [66]:
# https://github.com/Idilismiguzel/NLP-with-Python/blob/master/Text-Classification.ipynb

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(empathdf.wordngrams, empathdf.extraversion, test_size=0.25, random_state=0)


In [67]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)

ValueError: setting an array element with a sequence.

In [None]:
predictions = logisticRegr.predict(x_test)

score = logisticRegr.score(x_test, y_test)
print(score)