# All functions for personality prediction

## Prep

In [1]:
# Import packages

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import bigrams, ngrams

import re
import string
from string import punctuation

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve

import gensim
from gensim import corpora, models

from empath import Empath

from collections import Counter
from num2words import num2words
from lexicalrichness import LexicalRichness
import textblob


import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
tqdm.pandas()
import datetime
import random
random.seed(32)


[nltk_data] Downloading package punkt to /home/sophia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sophia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sophia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /home/sophia/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


## Prepare dataset

In [2]:
# Import dataset with comments
df = pd.read_csv('/home/sophia/ma_py/pandora_bigfive.csv')

# Import dataset authors and delete not needed columns
authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')
bigfive = authors[['author','agreeableness','openness','conscientiousness','extraversion','neuroticism']]
bigfive = bigfive[bigfive['agreeableness'].notna()]
del authors

In [3]:
# Functions

# change language to numeric representation
def numeric_lang(df):
    # change lang to numerical representation
    language = df['lang'].values.tolist()
    language = set(language)
    df['language']= np.select([df.lang == 'en', df.lang == 'es', df.lang == 'nl'], 
                            [0, 1, 2], 
                            default=3)
    # print(gramsdf['language'])
    df = df.drop(columns=['lang'])

    return df

# create time columns from UTC
def create_timecolumns(df):
    readable = []
    weekday = []
    month = []
    year = []
    for row in tqdm(df['created_utc']):
        item = datetime.datetime.fromtimestamp(row)
        weekday_item = item.strftime('%A')
        readable_item = datetime.datetime.fromtimestamp(row).isoformat()
        month.append(str(readable_item[5:7]))
        year.append(str(readable_item[0:4]))
        readable.append(readable_item)
        weekday.append(weekday_item.lower())
    df['time'] = readable
    df['weekday'] = weekday
    df['month'] = month
    df['year'] = year
    return df

# count occurences in time columns to get time distribution
def timecounter(lst, vocablst):
    if vocablst == 'weekday':
        vocab = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    elif vocablst == 'month':
        vocab = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    elif vocablst == 'year':
        vocab = ['2015', '2016', '2017', '2018', '2019']
    else:
        print("No valid input: vocab list")
    vectorizer = CountVectorizer(analyzer="word", vocabulary=vocab)
    vectors = vectorizer.fit_transform(lst)
    v = vectors.toarray()
    return v

def timeinterval(lst):
    max_lst = []
    median_lst = []
    mean_lst = []
    for sublst in lst:
        intlst = [int(item) for item in sublst]
        if len(intlst) > 1:
            intlst.sort()
            arr = np.array(intlst)
            diff_lst = np.diff(arr)
            max_lst.append(max(diff_lst))
            median_lst.append(np.median(diff_lst))
            mean_lst.append(np.mean(diff_lst))
        else:
            max_lst.append(-1)
            median_lst.append(-1)
            mean_lst.append(-1)
    return mean_lst, median_lst, max_lst

# create a list of all subreddits in the dataset
lst = df['subreddit'].tolist()
lst = [item.lower() for item in lst]
subredditset = set(lst)
subredditlist = list(subredditset)

# count occurences of subreddits 
def subredditcounter(lst, subredditlst):
    vectorizer = CountVectorizer(analyzer="word", vocabulary=subredditlist)
    vectors = vectorizer.fit_transform(lst)
    v = vectors.toarray()
    return v

# aggregate dataset to get one row per author and create new columns for time and subreddit
def create_groupdf(df): 
    print("\tCreate numeric language representation...")
    df = numeric_lang(df)
    print("\tCreate time columns...")
    df = create_timecolumns(df)
    # create dictionary for aggregation function
    d = {'lang': ['nunique'] , 'controversiality': ['mean'], 'gilded': ['mean'], 'score':['mean'],
         'body': (' '. join), 'doc_body': (lambda x : list(x)),
         'utc': (lambda x : list(x)), 'subreddit': (' '. join), 'num_subreddit': ['nunique'],
         'weekday': (' '. join), 'month': (' '. join), 'year': (' '. join)}
    # '§'. join(x)
 
    # new ungrouped columns
    print("\tCreate new ungrouped columns...")
    df['body'] = df['body'].apply(lambda x: str(x))
    df['doc_body'] = df['body']
    df['num_subreddit'] = df['subreddit']
    df['lang'] = df['language'].apply(lambda x: str(x))
    df['utc'] = df['created_utc'].apply(lambda x: str(x))
#     df['subreddit'] = df['subreddit'].apply(lambda x: [x.lower()])
    df['subreddit'] = df['subreddit'].apply(lambda x: ''.join(x.lower()))
    
    # create df groupd by author + transform
    print("\tGroup df by author...")
    groupdf = df.groupby(['author']).agg(d)
    groupdf = groupdf.reset_index()
    groupdf.columns = groupdf.columns.droplevel(1)
    return groupdf
    
def create_new_columns(df):    
    # body
#     print("\tCreate doc_body...")
# #     df['doc_body'] =  df['doc_body'].apply(lambda x: [x.split("§") for x in x])
#     # created_utc
#     print("\tCreate utc list...")
#     df['all_utc'] = df['utc_lst'].apply(lambda x: x.split())
    # controversiality
    print("\tCreate controversiality column...")
    df['controversiality'] = df['controversiality'].fillna(0)
    # gilded
    print("\tCreate mean_gilded...")
    df['gilded'] = df['gilded'].fillna(0)
    # number of comments per subreddit
    print("\tCreate subreddit_dist...")
    subreddit_predist = subredditcounter(df['subreddit'], subredditlist)
    subreddit_predist = subreddit_predist.tolist()
    df['subreddit_dist'] = subreddit_predist
    # time
    print("\tCompute time intervals...")
    df['mean_time'], df['median_time'], df['max_time'] = timeinterval(df['utc'])
    print("\tCreate weekday_dist...")
    weekday = timecounter(df['weekday'], 'weekday')
    weekday = weekday.tolist()
    df['weekday_dist'] = weekday
    print("\tCreate month_dist...")
    month = timecounter(df['month'], 'month')
    month = month.tolist()
    df['month_dist'] = month
    print("\tCreate year_dist...")
    year = timecounter(df['year'], 'year')
    year = year.tolist()
    df['year_dist'] = year
    
    print("\tCreate new aggregated df...")
    newdf = df[['author', 'body', 'doc_body', 'utc', 'score', 'controversiality', 
                'gilded', 'num_subreddit', 'subreddit_dist', 'mean_time', 'median_time', 'max_time', 'weekday_dist', 
                'month_dist', 'year_dist', 'lang']]
    print("\tSort new aggregated df...")
    newdf = newdf.sort_values(by='author')
    print("\tDrop duplicates in new aggregated df...")
    newdf = newdf.drop_duplicates(subset=['author'])
    return newdf

# get one column for each feature in the distributions of time and subreddit
weekday = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
month = ['january', 'february', 'march', 'april', 'may', 'june', 'juli', 'august', 'september', 'october', 'november', 'december']
year = ['2015', '2016', '2017', '2018', '2019']

def onecolumnperdatapoint(df, column, namelist):
    for i in tqdm(range(len(namelist))):
        df[namelist[i]] = df[column].apply(lambda x:[x[i]])
        df[namelist[i]] = [item[0] for item in df[namelist[i]]]
    return df

In [4]:
# Wrapper for commentdf
def create_commentdf(df):
    print("Create new df grouped by author...")
    groupdf = create_groupdf(df)
    print("Create new columns with features...")
    pandora = create_new_columns(groupdf)
    print("Distribute the weekday_dist to several columns...")
    pandora = onecolumnperdatapoint(pandora, 'weekday_dist', weekday)
    print("Distribute the month_dist to several columns")
    pandora = onecolumnperdatapoint(pandora, 'month_dist', month)
    print("Distribute the year_dist to several columns...")
    pandora = onecolumnperdatapoint(pandora, 'year_dist', year)
    print("Distribute the subreddit_dist to several columns...")
    pandora = onecolumnperdatapoint(pandora, 'subreddit_dist', subredditlist)
    print("Drop dist columns...")
    pandora.drop(['weekday_dist', 'month_dist', 'year_dist', 'subreddit_dist'], axis=1, inplace=True)
    return pandora

# create commentdf
# print("Create comment df (name: pandora)...")
# pandora = create_commentdf(df)
# print("Done...")

Create comment df (name: pandora)...
Create new df grouped by author...
	Create numeric language representation...
	Create time columns...


  0%|          | 0/3103208 [00:00<?, ?it/s]

	Create new ungrouped columns...
	Group df by author...
Create new columns with features...
	Create controversiality column...
	Create mean_gilded...
	Create subreddit_dist...
	Compute time intervals...
	Create weekday_dist...
	Create month_dist...
	Create year_dist...
	Create new aggregated df...
	Sort new aggregated df...
	Drop duplicates in new aggregated df...
Distribute the weekday_dist to several columns...


  0%|          | 0/7 [00:00<?, ?it/s]

Distribute the month_dist to several columns


  0%|          | 0/12 [00:00<?, ?it/s]

Distribute the year_dist to several columns...


  0%|          | 0/5 [00:00<?, ?it/s]

Distribute the subreddit_dist to several columns...


  0%|          | 0/16063 [00:00<?, ?it/s]

Drop dist columns...
Done...


In [6]:
# # merge commentdf and authordf
# print("Sort pandora df...")
# pandora= pandora.sort_values(by='author')
# print("Sort big five df...")
# bigfive= bigfive.sort_values(by='author')
# if pandora.index.name != 'author':
#     print("Set pandora index...")
#     pandora = pandora.set_index('author')
# if bigfive.index.name != 'author':
#     print("Set bigfive index...")
#     bigfive = bigfive.set_index('author')
# print("Join commentdf and authordf")
# pandoradf = pandora.join(bigfive)

# # create multiindex
# headers = 2*['text'] + 1*['data'] + 3*['post'] + 1*['subreddit'] + 3*['time'] + 1*['post'] + 24*['time'] + 16059*['subreddit'] + 5*['trait']
# columns = pandoradf.columns.values
# print(len(headers))
# print(len(columns))
# arrays = [headers] + [columns]
# pandoradf.columns=pd.MultiIndex.from_arrays(arrays)
# pandoradf.info(verbose=True)

Sort pandora df...
Sort big five df...
Set pandora index...
Set bigfive index...
Join commentdf and authordf
16099
16099
<class 'pandas.core.frame.DataFrame'>
Index: 1606 entries, -Areopagan- to zyzee
Data columns (total 16099 columns):
 #      Column                               Dtype  
---     ------                               -----  
 0      (text, body)                         object 
 1      (text, doc_body)                     object 
 2      (data, utc)                          object 
 3      (post, score)                        float64
 4      (post, controversiality)             float64
 5      (post, gilded)                       float64
 6      (subreddit, num_subreddit)           int64  
 7      (time, mean_time)                    float64
 8      (time, median_time)                  float64
 9      (time, max_time)                     int64  
 10     (post, lang)                         int64  
 11     (time, monday)                       int64  
 12     (time, tuesda

In [7]:
# del pandora
# del bigfive

## Preprocessing

In [8]:
# Functions
# create binary representation of personality traits
def bigfive_cat(df):
    # change big five to binary representation
    df['trait', 'big5_a'] = df['trait', 'agreeableness'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_o'] = df['trait', 'openness'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_c'] = df['trait', 'conscientiousness'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_e'] = df['trait', 'extraversion'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_n'] = df['trait', 'neuroticism'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_a_multi'] = df['trait', 'agreeableness'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_o_multi'] = df['trait', 'openness'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_c_multi'] = df['trait', 'conscientiousness'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_e_multi'] = df['trait', 'extraversion'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_n_multi'] = df['trait', 'neuroticism'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    return df

# define stopwordlist to use
def choose_stopwordlist(df, mode):
    if mode == 'NLTK':
        stopwordList = stopwords.words('english')
    if mode == 'NLTK-neg':
        stopwordList = stopwords.words('english')
        stopwordList.remove('no')
        stopwordList.remove('nor')
        stopwordList.remove('not')
    return stopwordList

# remove decontractions
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# create sentence tokens
def senttokenize(df):
    sentbody = []
    for row in tqdm(df['text', 'doc_body']):
        sentitem = []
        for item in row:
            sentences = sent_tokenize(item)
            sentitem.append(sentences)
        sentbody.append(sentitem)
    df['text', 'senttokens'] = sentbody
    return df

# lower words and remove special characters
def lower_special(df):
    newrow = []
    for row in tqdm(df['text', 'decon_body']):
        newcomment = []
        for comment in row:
            text_pre = ""
            for character in comment:
                if character.isalnum() or character.isspace():
                    character = character.lower()
                    text_pre += character
                else:
                    text_pre += " "
            newcomment.append(text_pre)
        newrow.append(newcomment)   
    df['text', 'probody'] = newrow
    return df

# remove stopwords
def remove_stopwords(df, stopwordList):
    newprobody = []
    for row in tqdm(df['text', 'probody']):
        newrowprobody = []
        for comment in row:
            words = [word for word in comment.split() if (word not in stopwordList)]
            newcomment = ' '.join(words)
            newrowprobody.append(newcomment)
        newprobody.append(newrowprobody)
    df['text', 'probody'] = newprobody
    return df

# change numbers to words and tokenize words

import decimal
def num_tokenize(df):    
    newbody_complete = []
    newprobody_complete = []
    # num2words
    for row in tqdm(df['text', 'probody']):
        newbody = []
        newprobody = []
        for sentence in row:
            # string to list
            inputtext = sentence.split()
            numlist = []
            for i in range(len(inputtext)):
                if inputtext[i].isnumeric():
                    numlist.append(i)
            for number in numlist:
                # deleted: fractions, superscripts, extremely large numbers, 卌卌, 一
                try:
                    inputtext[number] = num2words(inputtext[number])
                except decimal.InvalidOperation:
                    inputtext[number] = " "
                except OverflowError:
                    inputtext[number] = " "

            # list to string
            inputtext = [word for word in inputtext if word.isalpha()]
            celltext = ' '.join(inputtext)
            newprobody.append(celltext)
            # tokenize
            words = word_tokenize(celltext)
            newbody.append(words)
        newbody_complete.append(newbody)
        newprobody_complete.append(newprobody)
    df['text', 'probody'] = newprobody_complete
    df['text', 'tokens'] = newbody_complete
    return df

# Porter Stemmer
def stemming(df):
    ps = PorterStemmer()
    newtokens = []
    for row in tqdm(df['text', 'tokens']):
        newcomment = []
        for comment in row:
            words = [ps.stem(word) for word in comment]
            newcomment.append(words)
        newtokens.append(newcomment)
    df['text', 'tokens'] = newtokens
    return df

# bring columns of dataframe in correct order
def ordering(df):
    cols_tomove = ['trait', 'text', 'data', 'post', 'time', 'subreddit']
#     cols_tomove = ['body', 'doc_body', 'decon_body', 'probody', 'tokens', 'senttokens', 'agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism', 'agree', 'openn', 'consc', 'extra', 'neuro']
    orderdf  = df[cols_tomove + [col for col in df.columns if col not in cols_tomove]]
    return orderdf

In [9]:
# Wrapper

def preprocess(df):
    # adjust some column representations
    df = bigfive_cat(df)
    # choose stopwordlist with or without negation
    stopwordList = choose_stopwordlist(df, mode='NLTK-neg')
    # decontract abbreviations (e.g., n't to not)
    print("Decontract...")
    df['text', 'decon_body'] = df['text', 'doc_body'].apply(lambda x:([decontracted(x) for x in x]))
    # create sentence tokens
    print("Tokenize Sentences...")
    df = senttokenize(df)
    # lower, remove stopwords, num2words, tokenize
    print("Lower words and remove special characters...")
    df = lower_special(df)
    print("Remove stopwords...")
    df = remove_stopwords(df, stopwordList)
    print("Change numbers to words and tokenize words...")
    df = num_tokenize(df)
    # porters stemmer
    print("Porters Stemmer...")
    df = stemming(df)
#     print("Order df...")
#     df = ordering(df)
    print("Done!")
    return df

# apply preprocessing
# predf = preprocess(pandoradf)
# predf.to_pickle("preprocessed_df_allcomments.pkl")
# predf

Decontract...
Tokenize Sentences...


  0%|          | 0/1606 [00:00<?, ?it/s]

Lower words and remove special characters...


  0%|          | 0/1606 [00:00<?, ?it/s]

Remove stopwords...


  0%|          | 0/1606 [00:00<?, ?it/s]

Change numbers to words and tokenize words...


  0%|          | 0/1606 [00:00<?, ?it/s]

Porters Stemmer...


  0%|          | 0/1606 [00:00<?, ?it/s]

Done!


Unnamed: 0_level_0,text,text,data,post,post,post,subreddit,time,time,time,...,trait,trait,trait,trait,trait,trait,text,text,text,text
Unnamed: 0_level_1,body,doc_body,utc,score,controversiality,gilded,num_subreddit,mean_time,median_time,max_time,...,big5_n,big5_a_multi,big5_o_multi,big5_c_multi,big5_e_multi,big5_n_multi,decon_body,senttokens,probody,tokens
author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
-Areopagan-,Your first and second question is the same que...,[Your first and second question is the same qu...,"[1513882848, 1513744846, 1522253427, 151370438...",2.000000,0.000000,0.000000,1,2.137261e+06,893447.0,6721687,...,0,0,4,4,3,0,[Your first and second question is the same qu...,[[Your first and second question is the same q...,[first second question question try make incis...,"[[first, second, question, question, tri, make..."
-BigSexy-,I've been asked to cum everywhere with my ex j...,[I've been asked to cum everywhere with my ex ...,"[1507650565, 1516397088, 1502590403, 151682490...",4.266714,0.020737,0.000000,147,1.003843e+04,760.0,1292061,...,0,1,4,0,0,0,[I have been asked to cum everywhere with my e...,[[I've been asked to cum everywhere with my ex...,[asked cum everywhere ex experiment preferred ...,"[[ask, cum, everywher, ex, experi, prefer, cum..."
-BlitzN9ne,I'm currently in the middle of making a Payday...,[I'm currently in the middle of making a Payda...,"[1422166355, 1423504286, 1449881503, 145521567...",9.644956,0.014159,0.000000,116,4.830648e+04,793.5,3039780,...,0,2,4,0,2,1,[I am currently in the middle of making a Payd...,[[I'm currently in the middle of making a Payd...,[currently middle making payday two inspired m...,"[[current, middl, make, payday, two, inspir, m..."
-CrestiaBell,First and foremost I extend my condolences to ...,[First and foremost I extend my condolences to...,"[1462304635, 1528773104, 1513663029, 148131600...",24.890662,0.017687,0.000866,149,1.220542e+04,1365.0,594290,...,1,2,4,2,4,2,[First and foremost I extend my condolences to...,[[First and foremost I extend my condolences t...,[first foremost extend condolences family espe...,"[[first, foremost, extend, condol, famili, esp..."
-dyad-,I failed both...I'm great at reading people ir...,[I failed both...I'm great at reading people i...,"[1475875524, 1473096864, 1505168466, 150318014...",7.234043,0.000000,0.000000,5,3.799737e+05,57538.0,6062289,...,0,3,3,2,0,2,[I failed both...I am great at reading people ...,[[I failed both...I'm great at reading people ...,"[failed great reading people irl swear haha, i...","[[fail, great, read, peopl, irl, swear, haha],..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zugzwang_03,You know that giggly group of women going to t...,[You know that giggly group of women going to ...,"[1466099531, 1469625145, 1455352713, 150837533...",9.599348,0.011709,0.000291,146,6.396812e+03,785.0,5209353,...,0,0,2,4,4,0,[You know that giggly group of women going to ...,[[You know that giggly group of women going to...,[know giggly group women going bar go dance sa...,"[[know, giggli, group, women, go, bar, go, dan..."
zuluthrone,"I saw some speculate that the ""download"" would...","[I saw some speculate that the ""download"" woul...","[1438979642, 1451195516, 1468833505, 151331960...",12.150923,0.018458,0.000000,46,1.478935e+05,56420.5,2341861,...,0,0,4,1,4,1,"[I saw some speculate that the ""download"" woul...","[[I saw some speculate that the ""download"" wou...",[saw speculate download would backup rather st...,"[[saw, specul, download, would, backup, rather..."
zwelg,I am actually pretty pleased about my score:Ag...,[I am actually pretty pleased about my score:A...,[1508185843],1.000000,0.000000,0.000000,1,-1.000000e+00,-1.0,-1,...,0,1,4,4,4,0,[I am actually pretty pleased about my score:A...,[[I am actually pretty pleased about my score:...,[actually pretty pleased score agreeableness m...,"[[actual, pretti, pleas, score, agreeabl, mode..."
zymmaster,Respectfully disagree. Offense had plenty of i...,[Respectfully disagree. Offense had plenty of ...,"[1455228093, 1476665332, 1468599441, 146004506...",5.640209,0.010444,0.000000,99,6.970164e+04,679.5,8125664,...,0,1,2,3,1,2,[Respectfully disagree. Offense had plenty of ...,"[[Respectfully disagree., Offense had plenty o...",[respectfully disagree offense plenty issues s...,"[[respect, disagre, offens, plenti, issu, stat..."


In [10]:
# del pandoradf

## Extract features

In [13]:
# User features

# Preprocessing for LDA
def preprocess_lda(df):
    neglst = ["no", "not", "none", "nobody", "nothing", "neither", "nowhere", "never", "nay"]
    inputlst = []
    for row in tqdm(df['text', 'tokens']):
        rowlst = []
        for comment in row:
            rowlst.append([word for word in comment if (word not in neglst)])
        inputlst.append(rowlst)
    return inputlst


def apply_lda(df, inputlst, number, name):
    print("Start LDA...")
    lst = []
    for row in tqdm(inputlst):
        if len(row) < 2:
            lst.append(-1)
            print("\t-1 appended...")
        else:
            print("\tBuild dictionary for row...")
            dictionary = corpora.Dictionary(row)
            print("\tBuild corpus for row...")
            corpus = [dictionary.doc2bow(text) for text in row]
            print("\tBuild model for row...")
            ldamodel = gensim.models.LdaMulticore(corpus, num_topics=number, id2word = dictionary, passes=20, workers=10)
            print("\tCalculate result for row...")
            result = ldamodel.print_topics(num_topics=1, num_words=1)
            res = list(result)
            print("\tDelete unnecessary information...")
            topic = [item[0] for item in res]
            lst.append(topic[0])
            print("\tDone with this row...")
    df['lda', name] = lst
    return df

# Wrapper
def extract_userfeatures(df):
    print("Preprocessing for LDA...")
    inputlst = preprocess_lda(df)
    print("LDA with fifty topics: ")
    df = apply_lda(df, inputlst, 50, "ldafifty")
    print("LDA with onehundred topics: ")
    df = apply_lda(df, inputlst, 100, "ldahundred")
    return df

# create df with user features
# user_feat_df = extract_userfeatures(predf)
# user_feat_df.to_pickle("user_feat_df_allcomments.pkl")

In [16]:
# Linguistic features (functions)

# other features that are not mentioned in the paper
def create_features(df):
#     df['char_count'] = df['body']
#     df['stopwords'] = df['body']
#     df['total_punc'] = df['body']
#     df['total_num'] = df['body']
#     df['total_uppercase'] = df['body']
#     d = {'char_count': (lambda x : str.len()) , 
#          'stopwords': (lambda x: len([x for x in x.split() if x in stopwordList])), 
#          'total_punc': (lambda x: len([x for x in x.split() for j in x if j in string.punctuation])), 
#          'total_num': (lambda x: len([x for x in x.split() if x.isdigit()])), 
#          'total_uppercase': (lambda x: len([x for x in x.split() if x.isupper()]))}

#     groupdf = df.groupby(['author']).agg(d)
#     groupdf = groupdf.reset_index()
#     groupdf.columns = groupdf.columns.droplevel(1)
    
    # Total number of characters (including space)
    print("\tCharacter count per author...")
    df['x_feat', 'char_count'] = df['text', 'body'].str.len()
    # Total number of stopwords
    print("\tNumber of stopwords per author...")
    stopwordList = stopwords.words('english')
    df['x_feat', 'stopwords'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() if x in stopwordList]))
    # Total number of punctuation or special characters
    print("\tTotal number of punctuation per author...")
    df['x_feat', 'total_punc'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() for j in x if j in string.punctuation]))
    # Total number of numerics
    print("\tTotal number of numerics per author...")
    df['x_feat', 'total_num'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    # Total number of uppercase words
    print("\tTotal number of upper case words per author...")
    df['x_feat', 'total_uppercase'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() if x.isupper()]))    
    return df

# type token ratio
def typetokenratio(df):
    ratiolst = []
    for comment in tqdm(df['text', 'body']):
            lex = LexicalRichness(comment)
            if lex.words == 0:
                ratiolst.append(0)
            else:
                ratio = lex.ttr
                ratiolst.append(ratio)
    df['lin_feat', 'ttr'] = ratiolst
    return df

# words per sentence
def wordcounter(df):
    lengthscore = []
    for row in tqdm(df['text', 'senttokens']):
        rowscore = []
        for comment in row:
            sentencescore = 0
            for senttoken in comment:
                length = len(senttoken.split())
                sentencescore += length
            if len(comment) > 1:
                sentencescore = sentencescore/len(comment)
        lengthscore.append(sentencescore)
        arr = np.array(lengthscore)
    df['lin_feat', 'words_per_sent'] = lengthscore
    return df

# words longer than six characters
def charcounter(df):
    charscore = []
    for row in tqdm(df['text', 'tokens']):
        for comment in row:
            rowcharscore = 0
            lencomment = len(comment)
            if lencomment == 0:
                score = 0
            else:
                number = 0
                for token in comment:
                    length = len(token)
                    if length > 5:
                        number+=1
                score = number/lencomment
            rowcharscore += score
        rowcharscore = rowcharscore/len(row)
        charscore.append(rowcharscore)
    df['lin_feat', 'wordslongersix'] = charscore
    return df

# POS tagger
def tagging(df):
    past = [] #VPA
    presence = [] #VPR
    adverbs = [] #RB
    prepositions = [] #PREP
    pronouns = [] #PR
    for comment in tqdm(df['body']):
            text = comment.split()
            tags = nltk.pos_tag(text)
            counts = Counter(tag for word,tag in tags)
            total = sum(counts.values())
            pron = counts['PRP'] + counts['PRP$']
            verbspr = counts['VB'] + counts['VBG'] + counts['VBP'] + counts['VBZ'] + counts['MD']
            verbspa = counts['VBD'] + counts['VBN']
            preps = counts['IN'] + counts['TO']
            counts['PR'] = pron
            counts['PREP'] = preps
            counts['VPR'] = verbspr #present tense
            counts['VPA'] = verbspa #past tense
            if total == 0:
                allcounts = dict((word, float(count)/1) for word,count in counts.items())
            else:
                allcounts = dict((word, float(count)/total) for word,count in counts.items())
            try:
                past.append(allcounts['VPA'])
            except KeyError:
                past.append(0)
            try:
                presence.append(allcounts['VPR'])
            except KeyError:
                presence.append(0)
            try:
                adverbs.append(allcounts['RB'])
            except KeyError:
                adverbs.append(0)
            try:
                prepositions.append(allcounts['PREP'])
            except KeyError:
                prepositions.append(0)
            try:
                pronouns.append(allcounts['PR'])
            except KeyError:
                pronouns.append(0)
    df['lin_feat', 'pasttense'] = past
    df['lin_feat', 'presencetense'] = presence
    df['lin_feat', 'adverbs'] = adverbs
    df['lin_feat', 'prepositions'] = prepositions
    df['lin_feat', 'pronouns'] = pronouns
    return df

def ngram_preprocessing(df):
    # convert input from list to string
    ngrams = []
    inputtext = []
    valid = True
    notvalid_lst =[]
    for row in tqdm(df['text', 'tokens']):
        valid_string = ""
        for comment in row:
            valid = True
            text = ' '.join(comment)
            i=0
            for char in text:
                if not(char in string.printable):
                    valid = False
                    notvalid_lst += [char]
                    i+=1
            if valid == True:
                textspace = text + " "
                valid_string += textspace
#         print(valid_string, "\n")
        inputtext.append(valid_string)
    if len(notvalid_lst) > 0:
        print("Not valid chars: ", set(notvalid_lst), "\nNumber of dismissed comments: ", i)
    print("Length of inputtext: ", len(inputtext))
    return inputtext
    
def ngrams(df, inputtext, n_min, n_max, ngramtype):    
    vectorizer = TfidfVectorizer(ngram_range=(n_min,n_max), analyzer=ngramtype, max_features=1000)
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    dense = vectors.todense()
    denselist = dense.tolist()
    print("Get feature names...")
    names = vectorizer.get_feature_names()
    print("Length of feature names: ", len(names))
    print("Create df...")
    ngramdf = pd.DataFrame(denselist, columns=names)
#     ngramdf['author'] = df['author']
    return ngramdf

def merge_dfs(df1, df2, df3):
#     cwngramsdf = pd.merge(df1, df2, on='author', how='inner', suffixes= (None, "_charngram"))
#     gramsdf = pd.merge(df3, cwngramsdf, on='author', how='inner', suffixes= (None, "_ngram"))
    ngramsdf = df1.join(df2, rsuffix="_char")
    ngramsdf['author'] = df3.index
    ngramsdf = ngramsdf.set_index('author')
    headers = (len(ngramsdf.columns))*['ngram']
    columns = ngramsdf.columns.values
    print(len(headers))
    print(len(columns))
    arrays = [headers] + [columns]
    ngramsdf.columns = pd.MultiIndex.from_arrays(arrays)
    newdf = df3.join(ngramsdf, rsuffix="_ngram")
    return newdf


In [17]:
# Wrapper for linguistic features

def extract_lin_features(df, create_ngrams):
    print("Create additional features...")
    df = create_features(df)
    print("Create ttr...")
    df = typetokenratio(df)
    print("Count words per sentence...")
    df = wordcounter(df)
    print("Count words with more than six letters...")
    df = charcounter(df)
    print("POS-Tagger...")
    df = tagging(df)
    print("number of rows df", len(df))
    df.to_pickle("linfeat_df_nongrams_nolda_allcomments.pkl")
    
    if create_ngrams == "none":
        return df
    
    elif create_ngrams == "all":
        print("Ngrams...")
        print("Preprocessing for ngrams: ")
        inputtext = ngram_preprocessing(df)
        print("Create word ngrams...")
        wordngramsdf = ngrams(df, inputtext, 1, 3, "word")
        print("Create char ngrams...")
        charngramsdf = ngrams(df, inputtext, 2, 3, "char")
        print("Merge df...")
        gramsdf = merge_dfs(wordngramsdf, charngramsdf, df)
        return gramsdf
    
    elif create_ngrams == "word":
        wordngrams = ngrams(df, 1, 3, 'word')
        wordngramsdf = pd.DataFrame(wordngrams)
#         gramsdf = pd.merge(df, wordngramsdf, on='author', how='inner', suffixes=(None, "_ngram"))
        gramsdf = df.join(wordngramsdf)
        return gramsdf
    
# create dataframe with linguistic features
# lin_ngrams_df = extract_lin_features(user_feat_df, "all")
# predf = pd.read_pickle("linfeat_df_nongrams_nolda_allcomments.pkl")
# lin_ngrams_df = extract_lin_features(predf, "all")
# lin_ngrams_df.to_pickle("lin_feat_df_withoutuserfeat_allcomments.pkl")

Ngrams...
Preprocessing for ngrams: 


  0%|          | 0/1606 [00:00<?, ?it/s]

Not valid chars:  {'ọ', 'ᴗ', '笙', 'ƽ', '워', 'ਊ', 'ǵ', '訓', '卖', '午', 'ध', 'ド', 'ਪ', '堆', '味', '喝', '奈', '奔', 'ᴜ', 'し', 'ō', '己', 'ὶ', 'ᴷ', '炸', '个', '境', '金', 'ה', '毵', '鉄', '美', 'ヮ', 'ϵ', '말', 'ʧ', '雕', '进', '术', '军', '點', '揭', '贴', '여', '继', '里', '队', '色', '辦', '莎', '釈', '孩', '絕', 'ٹ', '侏', '憑', '늦', '酱', '洩', '博', '遍', '分', 'ɾ', '者', '哈', 'ᐤ', '谢', 'ž', 'ã', '狂', 'ᕥ', 'ĵ', '러', '효', '혜', '들', '麼', 'อ', '班', 'न', 'अ', 'ฬ', '浅', '럼', 'এ', 'औ', '拾', 'ό', '艸', '姬', 'î', '晚', '翻', 'ṣ', '와', 'व', '것', 'ท', 'د', '姑', 'ό', '啡', '鷹', 'ǒ', '儒', '世', '呢', 'ਧ', 'т', '粤', '稍', '訪', 'ㄝ', '橋', 'ブ', 'º', 'р', 'ς', '休', '業', 'œ', 'ṇ', '荷', '于', '重', '히', 'ʔ', 'α', 'г', 'ב', '教', 'ط', '用', '庭', '标', 'ɻ', 'ѡ', 'ť', '路', 'ὢ', 'ɢ', '泥', '倩', 'к', '꾸', '小', '清', '滚', '讀', '会', '从', '罠', 'ɡ', '啲', 'π', '소', '嗑', '수', '繁', 'ş', '帽', 'な', '监', '遥', '丿', 'ಠ', 'ы', '找', '插', 'チ', '深', '药', '과', '光', 'ᵑ', '浜', '干', '躁', 'う', 'ا', '真', '城', '猴', '傷', '豪', '내', 'ᴍ', '모', 'ᛁ', '綾', 'ω', '刻', '码', '漫', '機', '림', '

  0%|          | 0/1606 [00:00<?, ?it/s]

Get feature names...
Length of feature names:  1000
Create df...
Create char ngrams...
Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

Get feature names...
Length of feature names:  1000
Create df...
Merge df...
1999
1999


In [18]:
# del predf

In [69]:
# Wordlists (functions)

# Empath
# create new categories with empath
def new_cat():
    empath = Empath()
    social = empath.create_category("social",["mate","talk","they"])
    humans = empath.create_category("humans",["adult","baby","boy"])
    cognitive = empath.create_category("cognitive",["cause","know","ought"])
    insight = empath.create_category("insight",["think","know","consider"])
    causation = empath.create_category("causation",["because","effect","hence"])
    discrepancy = empath.create_category("discrepancy",["should","would","could"])
    tentative = empath.create_category("tentative",["maybe","perhaps","guess"])
    certainty = empath.create_category("certainty",["always","never", "proof"])
    inhibition = empath.create_category("inhibition",["block","constrain","stop"])
    inclusive = empath.create_category("inclusive",["and","with","include"])
    exclusive = empath.create_category("exclusive",["but","without","exclude"])
    perceptual = empath.create_category("perceptual",["observing","hear","feeling"])
    see = empath.create_category("see",["view","saw","seen"])
    feel = empath.create_category("feel",["feels","touch","feeling"])
    biological = empath.create_category("biological",["eat","blood","pain"])
    relativity = empath.create_category("relativity",["area","bend","go"])
    space = empath.create_category("space",["down","in","thin"])
    time = empath.create_category("time",["end","until","season"])
    agreement = empath.create_category("agreement", ["agree", "ok", "yes"])
    fillers = empath.create_category("fillers", ["like", "Imean", "yaknow"])
    nonfluencies = empath.create_category("nonfluencies", ["umm", "hm", "er"])
    conjunctions = empath.create_category("conjunctions", ["and", "but", "whereas"])
    quantifiers = empath.create_category("quantifiers", ["few", "many", "much"])
    numbers = empath.create_category("numbers", ["two", "fourteen", "thousand"])

def apply_empath(df):
    empath = Empath()
    print("Create new empath categories...")
    new_cat()
    print("Apply empath...")
    empathvalues = []
    empathcategories = ["swearing_terms", "social", "family", "friends", "humans", "emotional", "positive_emotion", 
                        "negative_emotion", "fear", "anger", "sadness", "cognitive", "insight", "causation", 
                        "discrepancy", "tentative", "certainty", "inhibition", "inclusive", "exclusive", 
                        "perceptual", "see", "hear", "feel", "biological", "body", "health", "sexual", "eat", 
                        "relativity", "space", "time", "work", "achievement", "leisure", "home", "money", 
                        "religion", "death" ,"agreement", "fillers", "nonfluencies", "conjunctions", "quantifiers", 
                        "numbers"]
    for sentence in tqdm(df['text', 'decon_body']):
        empathvalues.append(empath.analyze(sentence, categories=empathcategories, normalize=True))
    empathdf = pd.DataFrame(empathvalues)
    empathdf['author'] = df.index
    empathdf = empathdf.set_index('author')
    headers = 40*['empath'] + 5*['lin_feat']
    columns = empathdf.columns.values
    print(len(headers))
    print(len(columns))
    arrays = [headers] + [columns]
    empathdf.columns=pd.MultiIndex.from_arrays(arrays)
    newdf = df.join(empathdf, rsuffix="_empath")
    return newdf

In [70]:
# Import data for other wordlists
concretenessdf = pd.read_csv('/home/sophia/ma_py/psych_lists/concreteness.csv')
cdf = concretenessdf[['Conc.M']]
cmatrix = cdf.to_numpy()
concrete = concretenessdf['Word'].values.tolist()
del concretenessdf

happinessdf = pd.read_csv('/home/sophia/ma_py/psych_lists/happiness_ratings.csv')
hdf = happinessdf[['happiness_average']]
hmatrix = hdf.to_numpy()
happiness = happinessdf['word'].values.tolist()
del happinessdf

cursedf = pd.read_csv('/home/sophia/ma_py/psych_lists/mean_good_curse.csv')
cudf = cursedf[['mean_good_curse']]
cumatrix = cudf.to_numpy()
curse = cursedf['word'].values.tolist()
del cursedf

sensorydf = pd.read_csv('/home/sophia/ma_py/psych_lists/sensory_experience_ratings.csv')
serdf = sensorydf[['Average SER']]
sermatrix = serdf.to_numpy()
ser = sensorydf['Word'].values.tolist()
del sensorydf

alldf = pd.read_csv('/home/sophia/ma_py/psych_lists/sensory_ratings_all.csv')
newalldf = alldf[['Emotion', 'Polarity', 'Social', 'Moral', 'MotionSelf', 'Thought', 'Color', 'TasteSmell', 'Tactile', 'VisualForm', 'Auditory', 'Space', 'Quantity', 'Time', 'CNC', 'IMG', 'FAM']]
newalldf = newalldf.fillna(0)
allmatrix = newalldf.to_numpy()
allsens = alldf['Word'].values.tolist()
del alldf

valarodomdf = pd.read_csv('/home/sophia/ma_py/psych_lists/valence_arousal_dominence.csv')
vaddf = valarodomdf[['V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum']]
vadmatrix = vaddf.to_numpy()
vad = valarodomdf['Word'].values.tolist()
del valarodomdf

mrcdf = pd.read_csv('/home/sophia/ma_py/psych_lists/mrclists_c_p.csv', sep='\t', names=['word', 'cmean', 'pmean'])
cpdf = mrcdf[['cmean', 'pmean']]
cpmatrix = cpdf.to_numpy()
mrc = mrcdf['word'].values.tolist()
del mrcdf

# function for other wordlists

def preprocess_counting(df):
    inputtext = []
    for row in tqdm(df['text', 'decon_body']):
        text = ' '.join(row)
        inputtext.append(text) 
    return inputtext

def counter(inputtext, vocab):  
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocab)
    print("\tVectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    v = vectors.toarray()
    return v

def multiply(matrix, ratings):
    # matrix multiplication 
    result = np.matmul(matrix, ratings)
    # divide each score with the number of words in the list to normalize
    if len(ratings) > 0:
        result = result/(len(ratings))
    return result

def aggregator(df, inputtext, vocab, ratings, name):
    print("\tCount...")
    count = counter(inputtext, vocab)
    print("\tMultiply...")
    result = multiply(count, ratings)
    num_rows, num_cols = result.shape
    
    if num_cols ==1:
        df['psych', name] = result
        return df
    else:
        resultdf = pd.DataFrame(result, columns=name)
        resultdf['author'] = df.index
        resultdf = resultdf.set_index('author')
        headers = (len(name))*['psych']
        columns = resultdf.columns.values
        print(len(headers))
        print(len(columns))
        arrays = [headers] + [columns]
        resultdf.columns=pd.MultiIndex.from_arrays(arrays)
        df = df.join(resultdf, rsuffix="_wordlist")
        return df   

In [73]:
# wordlists created manually
negations = ["no", "not", "none", "nobody", "nothing", "neither", "nowhere", "never", "nay"]
articles = ["a", "an", "the"]
future = ["will", "gonna"]

def list_counter(df, inputtext, vocab, name):
    total = []
    for row in tqdm(df['text', 'decon_body']):
        total.append(len(row))
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocab)
    print("\tVectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    v = vectors.toarray()
    averagev = v.sum(axis=1)
    totalvector =  np.array(total)
    score = np.divide(averagev, totalvector)
    df['lin_feat', name] = score
    return df

In [74]:
# Wrapper for wordlists
def extract_wordlist_features(df):
    print("Empath...")
    empdf = apply_empath(df)
    # create scores for each word list and add them to df
    print("Preprocessing for wordlists...")
    inputtext = preprocess_counting(df)
    print("\nWordlist Concreteness: \n")
    psychdf = aggregator(empdf, inputtext, concrete, cmatrix, "concreteness")
    print("\nWordlist Happiness: \n")
    psychdf = aggregator(psychdf, inputtext, happiness, hmatrix, "happiness")
    print("\nWordlist Good_Curse: \n")
    psychdf = aggregator(psychdf, inputtext, curse, cumatrix, "good_curse")
    print("\n17 further wordlists: \n")
    psychdf = aggregator(psychdf, inputtext, allsens, allmatrix, ['emotion', 'polarity', 'social', 'moral', 'motionself', 'thought', 'color', 'tastesmell', 'tactile', 'visualform', 'auditory', 'space', 'quantity', 'time', 'CNC', 'IMG', 'FAM'])
    print("\nWordlist SER: \n")
    psychdf = aggregator(psychdf, inputtext, ser, sermatrix, "SER")
    print("\nWordlists Valence, Arousal, Dominance: \n")
    psychdf = aggregator(psychdf, inputtext, vad, vadmatrix, ['valence', 'arousal', 'dominance'])
    print("\nWordlist Negation: \n")
    psychdf = list_counter(psychdf, inputtext, negations, "negations")
    print("\nWordlist Articles: \n")
    psychdf = list_counter(psychdf, inputtext, articles, "articles")
    print("\nWordlist Future: \n")
    psychdf = list_counter(psychdf, inputtext, future, "future")
    print("\nWordlists from MRC (2): \n")
    psychdf = aggregator(psychdf, inputtext, mrc, cpmatrix, ["mrc_cmean", "mrc_pmean"])
    
    return psychdf

# predf = pd.read_pickle("linfeat_df_nongrams_nolda_allcomments.pkl")
# predf['decon_body'] = predf['doc_body'].apply(lambda x:([decontracted(x) for x in x]))
psychdf = extract_wordlist_features(lin_ngrams_df)
# psychdf.to_pickle("wordlists_lin_feat_df_withoutuserfeat_allcomments.pkl")

Empath...
Create new empath categories...
["talk", "mates", "mate", "Because", "friends", "anyone", "anything", "mean", "though", "anyway", "guess", "anymore", "should", "why", "knew", "someone", "trust", "wanted", "actually", "family", "anybody", "Well", "care", "parents", "knowing", "understand", "Now", "Maybe", "else", "probably", "happen", "yet", "honestly", "maybe", "either", "If", "always", "thought", "leave", "suppose", "talk", "own_friends", "telling", "nt", "right", "either", "cause", "talking", "cause", "anyways"]
["child", "kid", "girl", "baby", "adult", "teenager", "boy", "little_girl", "little_boy", "young", "age", "baby_girl", "teen", "woman", "princess", "toddler", "grown_man", "baby_sister", "daughter", "six_year_old", "sister", "teenage_girl", "newborn", "guy", "baby_boy", "brother", "three_year_old", "sixteen_year_old", "four_year_old", "6_year_old", "ten_year_old", "new_man", "one", "seven_year_old", "person", "babies", "12_year_old", "twelve_year_old", "4_year_old",

["noticed", "seen", "view", "seeing", "spotted", "sight", "saw", "found", "realized", "spied", "veiw", "appeared", "realised", "showed", "recognized", "glimpsed", "glimpse", "faced", "notice", "noticing", "spot", "disappeared", "stopped", "standing", "shown", "remembered", "front", "caught", "watched", "recognised", "figure", "spotting", "observed", "silhouette", "clear_view", "guessed", "near", "met", "corner", "Seeing", "witnessed", "pictured", "passed", "approached", "entered", "first_glimpse", "emerged", "familiar_face", "imagined", "stood", "notice", "dissapeared", "before"]
["feel", "feels", "feeling", "feeling", "touch", "felt", "touching", "numb", "touch", "touched", "Feeling", "hurt", "feel", "sensation", "hurting", "hurts", "felling", "touches", "burn", "own_skin", "aching", "tingly", "weak", "body", "makes", "kiss", "pain", "tingling", "whole_body", "warm", "knowing", "cold", "breathe", "tingle", "heat", "own_body", "lie", "someone", "yet", "tingling", "burning", "though", "

  0%|          | 0/1606 [00:00<?, ?it/s]

45
45
Preprocessing for wordlists...


  0%|          | 0/1606 [00:00<?, ?it/s]


Wordlist Concreteness: 

	Count...
	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

	Multiply...

Wordlist Happiness: 

	Count...
	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

	Multiply...

Wordlist Good_Curse: 

	Count...
	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

	Multiply...

17 further wordlists: 

	Count...
	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

	Multiply...
17
17

Wordlist SER: 

	Count...
	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

	Multiply...

Wordlists Valence, Arousal, Dominance: 

	Count...
	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

	Multiply...
3
3

Wordlist Negation: 



  0%|          | 0/1606 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]


Wordlist Articles: 



  0%|          | 0/1606 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]


Wordlist Future: 



  0%|          | 0/1606 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]


Wordlists from MRC (2): 

	Count...
	Vectorize...


  0%|          | 0/1606 [00:00<?, ?it/s]

	Multiply...
2
2


In [None]:
def all_in_one(df, bigfive):
    print("Create comment df (name: pandora)...")
    pandora = create_commentdf(df)
    # merge commentdf and authordf
    print("Sort pandora df...")
    pandora= pandora.sort_values(by='author')
    print("Sort big five df...")
    bigfive= bigfive.sort_values(by='author')
    if pandora.index.name != 'author':
        print("Set pandora index...")
        pandora = pandora.set_index('author')
    if bigfive.index.name != 'author':
        print("Set bigfive index...")
        bigfive = bigfive.set_index('author')
    print("Join commentdf and authordf")
    pandoradf = pandora.join(bigfive)
    del pandora
    del bigfive

    # create multiindex
    headers = 2*['text'] + 1*['data'] + 3*['post'] + 1*['subreddit'] + 3*['time'] + 1*['post'] + 24*['time'] + 16059*['subreddit'] + 5*['trait']
    columns = pandoradf.columns.values
    print(len(headers))
    print(len(columns))
    arrays = [headers] + [columns]
    pandoradf.columns=pd.MultiIndex.from_arrays(arrays)
    pandoradf.info(verbose=True)
    
    # preprocessing
    predf = preprocess(pandoradf)
    predf.to_pickle("preprocessed_df_allcomments.pkl")
    predf
    del pandoradf
    
    # create linguistic features
    lin_ngrams_df = extract_lin_features(predf, "all")
    lin_ngrams_df.to_pickle("lin_feat_df_withoutuserfeat_allcomments.pkl")
    del predf
    
    # create features with empath and wordlists
    psychdf = extract_wordlist_features(lin_ngrams_df)
    psychdf.to_pickle("wordlists_lin_feat_df_withoutuserfeat_allcomments.pkl")
    del lin_ngrams_df
        
    # create user features
    user_feat_df = extract_userfeatures(psychdf)
    user_feat_df.to_pickle("allfeat_df_allcomments.pkl")
    
    return user_feat_df
    
featuredf = all_in_one(df, bigfive)
print(featuredf.info())
featuredf.head()

In [75]:
psychdf.iloc[:, 18173:]

Unnamed: 0_level_0,psych,psych,psych,psych,psych,psych,psych,psych,lin_feat,lin_feat,lin_feat,psych,psych
Unnamed: 0_level_1,time,CNC,IMG,FAM,SER,valence,arousal,dominance,negations,articles,future,mrc_cmean,mrc_pmean
author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
-Areopagan-,0.131824,9.069241,10.629827,15.519308,0.112809,0.067701,0.047306,0.066281,3.200000,3.600000,0.400000,0.0,0.0
-BigSexy-,18.580812,1988.205060,2096.214381,2681.537949,18.201356,11.312668,7.874620,11.121217,0.716124,1.496246,0.133000,0.0,0.0
-BlitzN9ne,7.319627,779.291611,849.531292,1117.416778,8.693842,5.300178,3.868446,5.229659,0.245310,0.523186,0.060885,0.0,0.0
-CrestiaBell,45.649601,4955.151798,5401.050599,6924.038615,48.954284,29.105901,20.767584,28.330990,0.479159,1.213482,0.076933,0.0,0.0
-dyad-,1.754554,184.026631,201.512650,258.794940,2.258841,1.376648,0.948765,1.345483,1.051064,1.765957,0.076596,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
zugzwang_03,234.471092,25894.154461,27882.106525,35946.894807,264.864815,151.690760,107.190280,149.514373,2.038737,2.109629,0.269179,0.0,0.0
zuluthrone,5.260399,558.239680,596.608522,756.619174,4.985040,3.232700,2.342442,3.184699,0.326819,1.220413,0.106406,0.0,0.0
zwelg,0.005593,0.611185,0.620506,0.804261,0.008894,0.011598,0.007766,0.011432,0.000000,1.000000,0.000000,0.0,0.0
zymmaster,23.811758,2421.539281,2596.780293,3376.957390,22.829524,14.005260,9.923248,13.896333,1.053264,2.889295,0.194256,0.0,0.0


In [76]:
psychdf.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1606 entries, -Areopagan- to zyzee
Data columns (total 18186 columns):
 #      Column                               Dtype  
---     ------                               -----  
 0      (text, body)                         object 
 1      (text, doc_body)                     object 
 2      (data, utc)                          object 
 3      (post, score)                        float64
 4      (post, controversiality)             float64
 5      (post, gilded)                       float64
 6      (subreddit, num_subreddit)           int64  
 7      (time, mean_time)                    float64
 8      (time, median_time)                  float64
 9      (time, max_time)                     int64  
 10     (post, lang)                         int64  
 11     (time, monday)                       int64  
 12     (time, tuesday)                      int64  
 13     (time, wednesday)                    int64  
 14     (time, thursday)            

In [80]:
for value in psychdf['psych', 'mrc_pmean']:
    if value >0:
        print(value)
print("the end")

the end


In [78]:
psychdf['psych', 'FAM']

author
-Areopagan-        15.519308
-BigSexy-        2681.537949
-BlitzN9ne       1117.416778
-CrestiaBell     6924.038615
-dyad-            258.794940
                    ...     
zugzwang_03     35946.894807
zuluthrone        756.619174
zwelg               0.804261
zymmaster        3376.957390
zyzee              91.403462
Name: (psych, FAM), Length: 1606, dtype: float64