# Preprocessing and feature engineering with memory management

## Prep

In [1]:
# Import packages

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import bigrams, ngrams

import re
import string
from string import punctuation


from gensim import corpora, models

from empath import Empath

from collections import Counter
from num2words import num2words
from lexicalrichness import LexicalRichness
import textblob

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from scipy.stats import entropy
import decimal

from tqdm.notebook import tqdm
tqdm.pandas()
import time
import datetime
import random
random.seed(32)

import gc
import sys 
import itertools
from pympler import tracker
import pickle

[nltk_data] Downloading package punkt to /home/sophia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sophia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sophia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /home/sophia/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [2]:
def reduce_df_size():
    print("Reduce floats...")
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].astype('float32')
    print("Reduce ints...")
    ints = df.select_dtypes(include=['int64']).columns.tolist()
    df[ints] = df[ints].astype('int16')
    print("Size of the df in MB: ")
    print(df.memory_usage(index=True,deep=True).sum() / (1024**2))

## Prepare dataset

In [3]:
# Import dataset with comments (big five labels)
# comments = pd.read_csv('/home/sophia/ma_py/pandora_bigfive.csv')

# # Import augmented data with B5 labels
# comments = pd.read_pickle("pandora_b5_deter.pkl")

# # Import dataset authors and delete not needed columns (big five labels)
# authors = pd.read_csv('/home/sophia/ma_py/author_profiles.csv')
# bigfive = authors[['author','agreeableness','openness','conscientiousness','extraversion','neuroticism']]
# bigfive = bigfive[bigfive['openness'].notna()]
# bigfive = bigfive[bigfive['conscientiousness'].notna()]
# bigfive = bigfive[bigfive['extraversion'].notna()]
# bigfive = bigfive[bigfive['agreeableness'].notna()]
# bigfive = bigfive[bigfive['neuroticism'].notna()]
# del authors

# Datasets with mbti und big five labels
comments = pd.read_pickle("comments_uniondf.pkl")
bigfive = pd.read_pickle("uniondf.pkl")

# remember to change name of output as well!!!!

In [4]:
# Functions
traitlen = len(bigfive.columns.tolist())
# minus 1 because of author column
traitlen = traitlen-1 

# create time columns from UTC
def create_timecolumns(df):
    readable = []
    weekday = []
    month = []
    year = []
    hour = []
    for row in tqdm(df['created_utc']):
        item = datetime.datetime.fromtimestamp(row)
        weekday_item = item.strftime('%A')
        readable_item = datetime.datetime.fromtimestamp(row).isoformat()
        month.append(str(readable_item[5:7]))
        year.append(str(readable_item[0:4]))
        hour.append(str(readable_item[11:13]))
        readable.append(readable_item)
        weekday.append(weekday_item.lower())
    df['time'] = readable
    df['weekday'] = weekday
    df['month'] = month
    df['year'] = year
    df['daily'] = hour
    return df

# count occurences in time columns to get time distribution
def timecounter(lst, vocablst):
    if vocablst == 'weekday':
        vocab = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    elif vocablst == 'month':
        vocab = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    elif vocablst == 'hour':
        vocab = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', 
                 '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24']
    elif vocablst == 'year':
        vocab = ['2015', '2016', '2017', '2018', '2019']
    else:
        print("No valid input: vocab list")
    vectorizer = CountVectorizer(analyzer="word", vocabulary=vocab)
    vectors = vectorizer.fit_transform(lst)
    v = vectors.toarray()
    return v

def timeinterval(lst):
    max_lst = []
    median_lst = []
    mean_lst = []
    for sublst in lst:
        intlst = [int(item) for item in sublst]
        if len(intlst) > 1:
            intlst.sort()
            arr = np.array(intlst)
            diff_lst = np.diff(arr)
            max_lst.append(max(diff_lst))
            median_lst.append(np.median(diff_lst))
            mean_lst.append(np.mean(diff_lst))
        else:
            max_lst.append(-1)
            median_lst.append(-1)
            mean_lst.append(-1)
    return mean_lst, median_lst, max_lst

# create a list of all subreddits in the dataset
lst = comments['subreddit'].tolist()
lst = [item.lower() for item in lst]
subredditset = set(lst)
subredditlist = list(subredditset)
subredditlength = len(subredditlist)

# count occurences of subreddits 
def subredditcounter(lst, subredditlst):
    vectorizer = CountVectorizer(analyzer="word", vocabulary=subredditlist)
    vectors = vectorizer.fit_transform(lst)
    v = vectors.toarray()
    return v

# aggregate dataset to get one row per author and create new columns for time and subreddit
def create_groupdf(df): 
#     print("\tCreate numeric language representation...")
#     df = numeric_lang(df)
    print("\tCreate time columns...")
    df = create_timecolumns(df)
    # create dictionary for aggregation function
    d = {'lang': ['nunique'], 'ratio_en': (lambda x : list(x)), 'controversiality': ['mean'], 'gilded': ['mean'], 'score':['mean'],
         'body': (' '. join), 'doc_body': (lambda x : list(x)),
         'utc': (lambda x : list(x)), 'subreddit': (' '. join), 'num_subreddit': ['nunique'],
         'weekday': (' '. join), 'month': (' '. join), 'year': (' '. join), 'daily': (' '. join)}
 
    # new ungrouped columns
    print("\tCreate new ungrouped columns...")
    df['body'] = df['body'].apply(lambda x: str(x))
    df['doc_body'] = df['body']
    df['num_subreddit'] = df['subreddit']
#     df['lang'] = df['language'].apply(lambda x: str(x))
    df['ratio_en'] = df['lang']
    df['utc'] = df['created_utc'].apply(lambda x: str(x))
#     df['subreddit'] = df['subreddit'].apply(lambda x: [x.lower()])
    df['subreddit'] = df['subreddit'].apply(lambda x: ''.join(x.lower()))
    counts = df['author'].value_counts()
    ndf = pd.DataFrame(counts)
    ndf.reset_index(inplace=True)
    ndf.rename(columns = {'index':'author', 'author': 'n_comments'}, inplace = True)
    
    # create df groupd by author + transform
    print("\tGroup df by author...")
    groupdf = df.groupby(['author']).agg(d)
    groupdf = groupdf.reset_index()
    groupdf.columns = groupdf.columns.droplevel(1)
    groupdf.merge(ndf, left_on='author', right_on='author')
    
    return groupdf
    
def create_new_columns(df):    
    # controversiality
    print("\tCreate controversiality column...")
    df['controversiality'] = df['controversiality'].fillna(0)
    # gilded
    print("\tCreate mean_gilded...")
    df['gilded'] = df['gilded'].fillna(0)
    # ratio of english comments
    newcolumn = []
    for row in df['ratio_en']:
        other = [value for value in row if value != 'en']
        english = row.count('en')
        if len(other) == 0:
            newcolumn.append(1)
        else: 
            newcolumn.append(english/len(other))
    df['ratio_en'] = newcolumn
    # number of comments per subreddit
    print("\tCreate subreddit_dist...")
    subreddit_predist = subredditcounter(df['subreddit'], subredditlist)
    subreddit_predist = subreddit_predist.tolist()
    df['subreddit_dist'] = subreddit_predist
    # entropy
    df['entropy'] = df['subreddit_dist'].apply(lambda x: entropy(x, base=2))
    # time
    print("\tCompute time intervals...")
    df['mean_time'], df['median_time'], df['max_time'] = timeinterval(df['utc'])
    print("\tCreate weekday_dist...")
    weekday = timecounter(df['weekday'], 'weekday')
    weekday = weekday.tolist()
    df['weekday_dist'] = weekday
    print("\tCreate month_dist...")
    month = timecounter(df['month'], 'month')
    month = month.tolist()
    df['month_dist'] = month
    print("\tCreate year_dist...")
    year = timecounter(df['year'], 'year')
    year = year.tolist()
    df['year_dist'] = year
    print("\tCreate day_dist...")
    day = timecounter(df['daily'], 'hour')
    day = day.tolist()
    df['daily_dist'] = day
    
    print("\tCreate new aggregated df...")
    newdf = df[['author', 'body', 'doc_body', 'utc', 'score', 'controversiality', 
                'gilded', 'ratio_en', 'num_subreddit', 'subreddit_dist', 'entropy', 'mean_time', 'median_time', 'max_time', 'weekday_dist', 
                'month_dist', 'year_dist', 'daily_dist', 'lang']]
    print("\tSort new aggregated df...")
    newdf = newdf.sort_values(by='author')
    print("\tDrop duplicates in new aggregated df...")
    newdf = newdf.drop_duplicates(subset=['author'])
    return newdf

# get one column for each feature in the distributions of time and subreddit
weekday = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
month = ['january', 'february', 'march', 'april', 'may', 'june', 'juli', 'august', 'september', 'october', 'november', 'december']
year = ['2015', '2016', '2017', '2018', '2019']
hour = ['hour01', 'hour02', 'hour03', 'hour04', 'hour05', 'hour06', 'hour07', 'hour08', 'hour09', 'hour10', 
        'hour11', 'hour12', 'hour13', 'hour14', 'hour15', 'hour16', 'hour17', 'hour18', 'hour19', 'hour20', 
        'hour21', 'hour22', 'hour23', 'hour24']
timelen = len(weekday+month+year+hour)

def onecolumnperdatapoint(df, column, namelist):
    for i in tqdm(range(len(namelist))):
        df[namelist[i]] = df[column].apply(lambda x:[x[i]])
        df[namelist[i]] = [item[0] for item in df[namelist[i]]]
    return df

In [5]:
# Wrapper for commentdf
def create_commentdf(df):
    print("Create new df grouped by author...")
    groupdf = create_groupdf(df)
    print("Create new columns with features...")
    pandora = create_new_columns(groupdf)
    print("Distribute the weekday_dist to several columns...")
    pandora = onecolumnperdatapoint(pandora, 'weekday_dist', weekday)
    print("Distribute the month_dist to several columns")
    pandora = onecolumnperdatapoint(pandora, 'month_dist', month)
    print("Distribute the year_dist to several columns...")
    pandora = onecolumnperdatapoint(pandora, 'year_dist', year)
    print("Distribute the daily_dist to several columns...")
    pandora = onecolumnperdatapoint(pandora, 'daily_dist', hour)
    print("Distribute the subreddit_dist to several columns...")
    pandora = onecolumnperdatapoint(pandora, 'subreddit_dist', subredditlist)
    print("Drop dist columns...")
    pandora.drop(['weekday_dist', 'month_dist', 'year_dist', 'daily_dist', 'subreddit_dist'], axis=1, inplace=True)
    return pandora

# Create basis df with one row per author

In [6]:
start = datetime.datetime.now()
print("Start time:", str(start))
print("Create comment df (name: pandora)...")
pandora = create_commentdf(comments)
print("Pandora: ")
print(pandora.info())
# merge commentdf and authordf
print("Sort pandora df...")
pandora= pandora.sort_values(by='author')
print("Sort big five df...")
bigfive= bigfive.sort_values(by='author')
if pandora.index.name != 'author':
    print("Set pandora index...")
    pandora = pandora.set_index('author')
if bigfive.index.name != 'author':
    print("Set bigfive index...")
    bigfive = bigfive.set_index('author')
print("Join commentdf and authordf")
global df
df = pandora.join(bigfive)
del pandora
del bigfive
gc.collect()
print("Df before multiindex: ")
print(df.memory_usage(index=True,deep=True).sum() / (1024**2))

# create multiindex
print("Create multiindex...\n")
headers = 2*['text'] + 1*['data'] + 4*['post'] + 5*['subtf'] + 1*['post']
headers = headers + (timelen + subredditlength -1)*['subtf'] + traitlen*['trait']

# check multiindex
columns = df.columns.values
predictorsfile=open('columns.txt','w')
for index in range(len(columns)):
    predictorsfile.write(columns[index])
    predictorsfile.write('\n')
predictorsfile.close()

print("Length headers", len(headers))
print("Length columns", len(columns))
arrays = [headers] + [columns]
df.columns=pd.MultiIndex.from_arrays(arrays)

# reduce size of dataset
print("Df with multiindex before reduction of dtypes (MB): ")
print(df.memory_usage(index=True,deep=True).sum() / (1024**2))
print("Reduce size of df...")
reduce_df_size()

print("Df with multiindex (MB): ")
print(df.memory_usage(index=True,deep=True).sum() / (1024**2))
df.info(verbose=True)
del headers

Start time: 2021-05-16 15:52:09.660810
Create comment df (name: pandora)...
Create new df grouped by author...
	Create time columns...


  0%|          | 0/1057787 [00:00<?, ?it/s]

	Create new ungrouped columns...
	Group df by author...
Create new columns with features...
	Create controversiality column...
	Create mean_gilded...
	Create subreddit_dist...
	Compute time intervals...
	Create weekday_dist...
	Create month_dist...
	Create year_dist...
	Create day_dist...
	Create new aggregated df...
	Sort new aggregated df...
	Drop duplicates in new aggregated df...
Distribute the weekday_dist to several columns...


  0%|          | 0/7 [00:00<?, ?it/s]

Distribute the month_dist to several columns


  0%|          | 0/12 [00:00<?, ?it/s]

Distribute the year_dist to several columns...


  0%|          | 0/5 [00:00<?, ?it/s]

Distribute the daily_dist to several columns...


  0%|          | 0/24 [00:00<?, ?it/s]

Distribute the subreddit_dist to several columns...


  0%|          | 0/7820 [00:00<?, ?it/s]

Drop dist columns...
Pandora: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 386 entries, 0 to 385
Columns: 7881 entries, author to stolenvalor
dtypes: float64(7), int64(7870), object(4)
memory usage: 23.2+ MB
None
Sort pandora df...
Sort big five df...
Set pandora index...
Set bigfive index...
Join commentdf and authordf
Df before multiindex: 
519.07164478302
Create multiindex...

Length headers 7889
Length columns 7889
Df with multiindex before reduction of dtypes (MB): 
519.07164478302
Reduce size of df...
Reduce floats...
Reduce ints...
Size of the df in MB: 
501.66259479522705
Df with multiindex (MB): 
501.66259479522705
<class 'pandas.core.frame.DataFrame'>
Index: 386 entries, -BlitzN9ne to zymmaster
Data columns (total 7889 columns):
 #     Column                           Dtype  
---    ------                           -----  
 0     (text, body)                     object 
 1     (text, doc_body)                 object 
 2     (data, utc)                      object 
 3   

In [7]:
df.drop(('data', 'utc'), axis = 1, inplace = True)

del create_timecolumns
del timecounter
del timeinterval
del subredditcounter
del create_groupdf
del create_new_columns
del onecolumnperdatapoint
del traitlen
del lst
del subredditset
del subredditlist
del subredditlength
del weekday
del month
del year
del hour
del timelen
del create_commentdf

## Create big five binary categories

In [8]:
# Functions
# create binary representation of personality traits
def bigfive_cat():
    # change big five to binary representation
    df['trait', 'big5_a'] = df['trait', 'agreeableness'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_o'] = df['trait', 'openness'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_c'] = df['trait', 'conscientiousness'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_e'] = df['trait', 'extraversion'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_n'] = df['trait', 'neuroticism'].apply(lambda x: 0 if x<50 else 1)
    df['trait', 'big5_a_multi'] = df['trait', 'agreeableness'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_o_multi'] = df['trait', 'openness'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_c_multi'] = df['trait', 'conscientiousness'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_e_multi'] = df['trait', 'extraversion'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    df['trait', 'big5_n_multi'] = df['trait', 'neuroticism'].apply(lambda x: 0 if x<20 else(1 if x>19 and x<40 else(2 if x>39 and x<60 else(3 if x>59 and x<80 else 4))))
    
bigfive_cat()    

In [9]:
reduce_df_size()

# # write optimized pickle
# print("Create pickle")
# filepath = "aug_commentdf.pkl"
# with open(filepath, "wb") as f:
#     pickled = pickle.dumps(df, protocol=-1)
#     f.write(pickled)
# print("Done")
# del f
# del filepath
# del pickled

Reduce floats...
Reduce ints...
Size of the df in MB: 
493.582013130188


In [10]:
del comments
del bigfive_cat
del predictorsfile

del columns
del arrays

NameError: name 'f' is not defined

In [None]:
# # read optimized pickle
# filepath = "aug_commentdf.pkl"
# with open(filepath, 'rb') as f:
#     tokenlist = pickle.load(f)

# del filepath
# del f

## Feature Engineering 1

In [11]:
# other features that are not mentioned in the paper
def create_features():
    # Total number of characters (including space)
    print("\tCharacter count per author...")
    df['x_feat', 'char_count'] = df['text', 'body'].str.len()
    # Total number of stopwords
    print("\tNumber of stopwords per author...")
    stopwordList = stopwords.words('english')
    df['x_feat', 'stopwords'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() if x in stopwordList]))
    # Total number of punctuation or special characters
    print("\tTotal number of punctuation per author...")
    df['x_feat', 'total_punc'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() for j in x if j in string.punctuation]))
    # Total number of numerics
    print("\tTotal number of numerics per author...")
    df['x_feat', 'total_num'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
    # Total number of uppercase words
    print("\tTotal number of upper case words per author...")
    df['x_feat', 'total_uppercase'] = df['text', 'body'].apply(lambda x: len([x for x in x.split() if x.isupper()]))    

# type token ratio
def typetokenratio():
    print("Compute ttr..")
    ratiolst = []
    for comment in tqdm(df['text', 'body']):
            lex = LexicalRichness(comment)
            if lex.words == 0:
                ratiolst.append(0)
            else:
                ratio = lex.ttr
                ratiolst.append(ratio)
    df['lin_feat', 'ttr'] = ratiolst
    
# POS tagger
def tagging():
    print("POS-Tagger...")
    past = [] #VPA
    presence = [] #VPR
    adverbs = [] #RB
    prepositions = [] #PREP
    pronouns = [] #PR
    for comment in tqdm(df['text', 'body']):
            text = comment.split()
            tags = nltk.pos_tag(text)
            counts = Counter(tag for word,tag in tags)
            total = sum(counts.values())
            pron = counts['PRP'] + counts['PRP$']
            verbspr = counts['VB'] + counts['VBG'] + counts['VBP'] + counts['VBZ'] + counts['MD']
            verbspa = counts['VBD'] + counts['VBN']
            preps = counts['IN'] + counts['TO']
            counts['PR'] = pron
            counts['PREP'] = preps
            counts['VPR'] = verbspr #present tense
            counts['VPA'] = verbspa #past tense
            if total == 0:
                allcounts = dict((word, float(count)/1) for word,count in counts.items())
            else:
                allcounts = dict((word, float(count)/total) for word,count in counts.items())
            try:
                past.append(allcounts['VPA'])
            except KeyError:
                past.append(0)
            try:
                presence.append(allcounts['VPR'])
            except KeyError:
                presence.append(0)
            try:
                adverbs.append(allcounts['RB'])
            except KeyError:
                adverbs.append(0)
            try:
                prepositions.append(allcounts['PREP'])
            except KeyError:
                prepositions.append(0)
            try:
                pronouns.append(allcounts['PR'])
            except KeyError:
                pronouns.append(0)
    df['lin_feat', 'pasttense'] = past
    df['lin_feat', 'presencetense'] = presence
    df['lin_feat', 'adverbs'] = adverbs
    df['lin_feat', 'prepositions'] = prepositions
    df['lin_feat', 'pronouns'] = pronouns

def charcounter():
    print("Calculate number of words with more than six letters...")
    charscore = []
    for row in tqdm(df['text', 'body']):
        for comment in row:
            rowcharscore = 0
            lencomment = len(comment)
            if lencomment == 0:
                score = 0
            else:
                number = 0
                for word in comment:
                    length = len(word)
                    if length > 5:
                        number+=1
                score = number/lencomment
            rowcharscore += score
        rowcharscore = rowcharscore/len(row)
        charscore.append(rowcharscore)
    df['lin_feat', 'wordslongersix'] = charscore

create_features()
typetokenratio()
tagging()
charcounter()

	Character count per author...
	Number of stopwords per author...
	Total number of punctuation per author...
	Total number of numerics per author...
	Total number of upper case words per author...
Compute ttr..


  0%|          | 0/386 [00:00<?, ?it/s]

POS-Tagger...


  0%|          | 0/386 [00:00<?, ?it/s]

Calculate number of words with more than six letters...


  0%|          | 0/386 [00:00<?, ?it/s]

In [12]:
reduce_df_size()

# # write optimized pickle
# print("Create pickle")
# filepath = "aug_commentdf_FE1.pkl"
# with open(filepath, "wb") as f:
#     pickled = pickle.dumps(df, protocol=-1)
#     f.write(pickled)

# del f
# del filepath
# del pickled

Reduce floats...
Reduce ints...
Size of the df in MB: 
493.59600162506104


## Preprocessing 1

In [13]:
df.drop(('text', 'body'), axis = 1, inplace = True)

# create sentence tokens
def senttokenize():
    sentbody = []
    for row in tqdm(df['text', 'doc_body']):
        sentitem = []
        for item in row:
            sentences = sent_tokenize(item)
            sentitem.append(sentences)
        sentbody.append(sentitem)
    df['text', 'senttokens'] = sentbody

senttokenize()

  0%|          | 0/386 [00:00<?, ?it/s]

## Feature Engineering 2:

In [14]:
# words per sentence
def wordcounter():
    lengthscore = []
    for row in tqdm(df['text', 'senttokens']):
        rowscore = []
        for comment in row:
            sentencescore = 0
            for senttoken in comment:
                length = len(senttoken.split())
                sentencescore += length
            if len(comment) > 1:
                sentencescore = sentencescore/len(comment)
        lengthscore.append(sentencescore)
        arr = np.array(lengthscore)
    df['lin_feat', 'words_per_sent'] = lengthscore

wordcounter()

  0%|          | 0/386 [00:00<?, ?it/s]

In [15]:
df.drop(('text', 'senttokens'), axis = 1, inplace = True)
reduce_df_size()

# # write optimized pickle
# print("Create pickle")
# filepath = "aug_commentdf_FE2.pkl"
# with open(filepath, "wb") as f:
#     pickled = pickle.dumps(df, protocol=-1)
#     f.write(pickled)

# del f
# del filepath
# del pickled

Reduce floats...
Reduce ints...
Size of the df in MB: 
13.958335876464844


## Preprocessing 2:

In [16]:
# remove decontractions
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

df['text', 'decon_body'] = df['text', 'doc_body'].apply(lambda x:([decontracted(phrase) for phrase in x]))
df.drop(('text', 'doc_body'), axis = 1, inplace = True)

## Feature Engineering 3

In [17]:
# Empath
# create new categories with empath
def new_cat():
    empath = Empath()
    social = empath.create_category("social",["mate","talk","they"])
    humans = empath.create_category("humans",["adult","baby","boy"])
    cognitive = empath.create_category("cognitive",["cause","know","ought"])
    insight = empath.create_category("insight",["think","know","consider"])
    causation = empath.create_category("causation",["because","effect","hence"])
    discrepancy = empath.create_category("discrepancy",["should","would","could"])
    tentative = empath.create_category("tentative",["maybe","perhaps","guess"])
    certainty = empath.create_category("certainty",["always","never", "proof"])
    inhibition = empath.create_category("inhibition",["block","constrain","stop"])
    inclusive = empath.create_category("inclusive",["and","with","include"])
    exclusive = empath.create_category("exclusive",["but","without","exclude"])
    perceptual = empath.create_category("perceptual",["observing","hear","feeling"])
    see = empath.create_category("see",["view","saw","seen"])
    feel = empath.create_category("feel",["feels","touch","feeling"])
    biological = empath.create_category("biological",["eat","blood","pain"])
    relativity = empath.create_category("relativity",["area","bend","go"])
    motion = empath.create_category("motion",["arrive","car","go", "walk", "fly", "move", "run", "leave"])
    space = empath.create_category("space",["down","in","thin"])
    time = empath.create_category("time",["end","until","season"])
    agreement = empath.create_category("agreement", ["agree", "ok", "yes"])
    fillers = empath.create_category("fillers", ["like", "Imean", "yaknow"])
    nonfluencies = empath.create_category("nonfluencies", ["umm", "hm", "er"])
    conjunctions = empath.create_category("conjunctions", ["and", "but", "whereas"])
    quantifiers = empath.create_category("quantifiers", ["few", "many", "much"])
    numbers = empath.create_category("numbers", ["two", "fourteen", "thousand"])

def apply_empath():
    global df
    empath = Empath()
    print("Create new empath categories...")
    new_cat()
    print("Apply empath...")
    empathvalues = []
    empathcategories = ["swearing_terms", "social", "family", "friends", "humans", "emotional", "positive_emotion", 
                        "negative_emotion", "fear", "anger", "sadness", "cognitive", "insight", "causation", 
                        "discrepancy", "tentative", "certainty", "inhibition", "inclusive", "exclusive", 
                        "perceptual", "see", "hear", "feel", "biological", "body", "health", "sexual", "eat", 
                        "relativity", "space", "time", "work", "achievement", "leisure", "home", "money", 
                        "religion", "death" ,"agreement", "fillers", "nonfluencies", "conjunctions", "quantifiers", 
                        "numbers"]
    for sentence in tqdm(df['text', 'decon_body']):
        empathvalues.append(empath.analyze(sentence, categories=empathcategories, normalize=True))
    empathdf = pd.DataFrame(empathvalues)
    empathdf['author'] = df.index
    empathdf = empathdf.set_index('author')
    headers = 40*['empath'] + 5*['lin_feat']
    columns = empathdf.columns.values
    print(len(headers))
    print(len(columns))
    arrays = [headers] + [columns]
    empathdf.columns=pd.MultiIndex.from_arrays(arrays)
    df = df.join(empathdf, rsuffix="_empath")
    del empathdf

In [18]:
# functions for other wordlists

def preprocess_counting():
    global df
    inputtext = []
    for row in tqdm(df['text', 'decon_body']):
        text = ' '.join(row)
        inputtext.append(text) 
    return inputtext

def counter(inputtext, vocab):  
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocab)
    print("\tVectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    v = vectors.toarray()
    return v

def multiply(matrix, ratings):
    # matrix multiplication 
    result = np.matmul(matrix, ratings)
    # divide each score with the number of words in the list to normalize
    if len(ratings) > 0:
        result = result/(len(ratings))
    return result

def aggregator(inputtext, vocab, ratings, name):
    global df
    print("\tCount...")
    count = counter(inputtext, vocab)
    print("\tMultiply...")
    result = multiply(count, ratings)
    num_rows, num_cols = result.shape
    
    if num_cols ==1:
        df['psych', name] = result
    else:
        resultdf = pd.DataFrame(result, columns=name)
        resultdf['author'] = df.index
        resultdf = resultdf.set_index('author')
        headers = (len(name))*['psych']
        columns = resultdf.columns.values
        print(len(headers))
        print(len(columns))
        arrays = [headers] + [columns]
        resultdf.columns=pd.MultiIndex.from_arrays(arrays)
        df = df.join(resultdf, rsuffix="_wordlist")
        del resultdf

def list_counter(inputtext, vocab, name):
    global df
    total = []
    for row in tqdm(df['text', 'decon_body']):
        total.append(len(row))
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocab)
    print("\tVectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    v = vectors.toarray()
    averagev = v.sum(axis=1)
    totalvector =  np.array(total)
    score = np.divide(averagev, totalvector)
    df['lin_feat', name] = score

In [19]:
def extract_wordlist_features():
    global df
    print("Empath...")
    apply_empath()
    
    # Import data for other wordlists
    print("Import data for other wordlists")
    concretenessdf = pd.read_csv('/home/sophia/ma_py/psych_lists/concreteness.csv')
    cdf = concretenessdf[['Conc.M']]
    cmatrix = cdf.to_numpy()
    concrete = concretenessdf['Word'].values.tolist()
    del concretenessdf

    happinessdf = pd.read_csv('/home/sophia/ma_py/psych_lists/happiness_ratings.csv')
    hdf = happinessdf[['happiness_average']]
    hmatrix = hdf.to_numpy()
    happiness = happinessdf['word'].values.tolist()
    del happinessdf

    cursedf = pd.read_csv('/home/sophia/ma_py/psych_lists/mean_good_curse.csv')
    cudf = cursedf[['mean_good_curse']]
    cumatrix = cudf.to_numpy()
    curse = cursedf['word'].values.tolist()
    del cursedf

    sensorydf = pd.read_csv('/home/sophia/ma_py/psych_lists/sensory_experience_ratings.csv')
    serdf = sensorydf[['Average SER']]
    sermatrix = serdf.to_numpy()
    ser = sensorydf['Word'].values.tolist()
    del sensorydf

    alldf = pd.read_csv('/home/sophia/ma_py/psych_lists/sensory_ratings_all.csv')
    newalldf = alldf[['Emotion', 'Polarity', 'Social', 'Moral', 'MotionSelf', 'Thought', 'Color', 'TasteSmell', 'Tactile', 'VisualForm', 'Auditory', 'Space', 'Quantity', 'Time', 'CNC', 'IMG', 'FAM']]
    newalldf = newalldf.fillna(0)
    allmatrix = newalldf.to_numpy()
    allsens = alldf['Word'].values.tolist()
    del alldf

    valarodomdf = pd.read_csv('/home/sophia/ma_py/psych_lists/valence_arousal_dominence.csv')
    vaddf = valarodomdf[['V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum']]
    vadmatrix = vaddf.to_numpy()
    vad = valarodomdf['Word'].values.tolist()
    del valarodomdf

    mrcdf = pd.read_csv('/home/sophia/ma_py/psych_lists/mrclists_c_p.csv', sep='\t', names=['word', 'cmean', 'pmean'])
    cpdf = mrcdf[['cmean', 'pmean']]
    cpmatrix = cpdf.to_numpy()
    mrc = mrcdf['word'].values.tolist()
    del mrcdf
    
    # wordlists created manually
    print("Create manual wordlists...")
    negations = ["no", "not", "none", "nobody", "nothing", "neither", "nowhere", "never", "nay"]
    articles = ["a", "an", "the"]
    future = ["will", "gonna"]
    pers_pronouns = ["i", "me", "my", "mine", "myself", "you", "your", "yours", "yourself", "he", "him", "his",
                     "himself", "she", "her", "hers", "herself", "it", "its", "itself", "themself", "we", "us",
                     "our", "ours", "ourselves", "they", "them", "their", "theirs", "themselves"]
    fp_sing = ["i", "me", "my", "mine", "myself"]
    fp_plural = ["we", "us", "our", "ours", "ourselves"]
    secondp = ["you", "your", "yours", "yourself"]
    tp_sing = ["he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "themself"]
    tp_plural = ["they", "them", "their", "theirs", "themselves"]
    indef_pronouns = ["another", "anybody", "anyone", "anything", "each", "either", "enough", "everybody", 
                            "everyone", "everything", "little", "much", "neither", "nobody", "no one", "nothing", 
                            "one", "other", "somebody", "someone", "something", "both", "few", "fewer", "many", 
                            "others", "several", "all", "any", "more", "most", "none", "some", "such"]
    aux_verbs = ["be", "am", "are", "is", "was", "were", "being", "can", "could", "do", "did", "does", "doing", 
                 "have", "had", "has", "having", "may", "might", "must", "shall", "should", "will", "would"]


    # create scores for each word list and add them to df
    print("Preprocessing for wordlists...")
    inputtext = preprocess_counting()
    print("\nWordlist Concreteness: \n")
    aggregator(inputtext, concrete, cmatrix, "concreteness")
    print("\nWordlist Happiness: \n")
    aggregator(inputtext, happiness, hmatrix, "happiness")
    print("\nWordlist Good_Curse: \n")
    aggregator(inputtext, curse, cumatrix, "good_curse")
    print("\n17 further wordlists: \n")
    aggregator(inputtext, allsens, allmatrix, ['emotion', 'polarity', 'social', 'moral', 'motionself', 'thought', 'color', 'tastesmell', 'tactile', 'visualform', 'auditory', 'space', 'quantity', 'time', 'CNC', 'IMG', 'FAM'])
    print("\nWordlist SER: \n")
    aggregator(inputtext, ser, sermatrix, "SER")
    print("\nWordlists Valence, Arousal, Dominance: \n")
    aggregator(inputtext, vad, vadmatrix, ['valence', 'arousal', 'dominance'])
    print("\nWordlist Negation: \n")
    list_counter(inputtext, negations, "negations")
    print("\nWordlist Articles: \n")
    list_counter(inputtext, articles, "articles")
    print("\nWordlist Future: \n")
    list_counter(inputtext, future, "future")
    print("\nWordlist personal pronouns: \n")
    list_counter(inputtext, pers_pronouns, "pers_pronouns")
    print("\nWordlist first person singular pronouns: \n")
    list_counter(inputtext, fp_sing, "fp_sing")
    print("\nWordlist first person plural pronouns: \n")
    list_counter(inputtext, fp_plural, "fp_plural")
    print("\nWordlist second person pronouns: \n")
    list_counter(inputtext, secondp, "secondp")
    print("\nWordlist third person singular pronouns: \n")
    list_counter(inputtext, tp_sing, "tp_sing")
    print("\nWordlist third person plural pronouns: \n")
    list_counter(inputtext, tp_plural, "tp_plural")
    print("\nWordlist indefinite pronouns: \n")
    list_counter(inputtext, indef_pronouns, "indef_pronouns")
    print("\nWordlist auxiliary verbs: \n")
    list_counter(inputtext, aux_verbs, "aux_verbs")
    print("\nWordlists from MRC (2): \n")
    aggregator(inputtext, mrc, cpmatrix, ["mrc_cmean", "mrc_pmean"])

extract_wordlist_features()

Empath...
Create new empath categories...
["talk", "mates", "mate", "Because", "friends", "anyone", "anything", "mean", "though", "anyway", "guess", "anymore", "should", "why", "knew", "someone", "trust", "wanted", "actually", "family", "anybody", "Well", "care", "parents", "knowing", "understand", "Now", "Maybe", "else", "probably", "happen", "yet", "honestly", "maybe", "either", "If", "always", "thought", "leave", "suppose", "talk", "own_friends", "telling", "nt", "right", "either", "cause", "talking", "cause", "anyways"]
["child", "kid", "girl", "baby", "adult", "teenager", "boy", "little_girl", "little_boy", "young", "age", "baby_girl", "teen", "woman", "princess", "toddler", "grown_man", "baby_sister", "daughter", "six_year_old", "sister", "teenage_girl", "newborn", "guy", "baby_boy", "brother", "three_year_old", "sixteen_year_old", "four_year_old", "6_year_old", "ten_year_old", "new_man", "one", "seven_year_old", "person", "babies", "12_year_old", "twelve_year_old", "4_year_old",

["noticed", "seen", "view", "seeing", "spotted", "sight", "saw", "found", "realized", "spied", "veiw", "appeared", "realised", "showed", "recognized", "glimpsed", "glimpse", "faced", "notice", "noticing", "spot", "disappeared", "stopped", "standing", "shown", "remembered", "front", "caught", "watched", "recognised", "figure", "spotting", "observed", "silhouette", "clear_view", "guessed", "near", "met", "corner", "Seeing", "witnessed", "pictured", "passed", "approached", "entered", "first_glimpse", "emerged", "familiar_face", "imagined", "stood", "notice", "dissapeared", "before"]
["feel", "feels", "feeling", "feeling", "touch", "felt", "touching", "numb", "touch", "touched", "Feeling", "hurt", "feel", "sensation", "hurting", "hurts", "felling", "touches", "burn", "own_skin", "aching", "tingly", "weak", "body", "makes", "kiss", "pain", "tingling", "whole_body", "warm", "knowing", "cold", "breathe", "tingle", "heat", "own_body", "lie", "someone", "yet", "tingling", "burning", "though", "

["thousand", "gazillion", "fourteen", "Twenty-one", "1,000", "more_than_100", "Twenty-three", "4,000", "700", "more_than_10", "twenty-eight", "about_300", "five_million", "2.5", "big_number", "1,000,000", "thirty-one", "20", "1-2", "twenty_two", "30,000", "zillion", "400", "Twenty-two", ".3", "One_hundred", "thirty-four", "600", "500,000", "at_least_2", "twenty_three", "twenty-nine", "one_hundred_and_fifty", "2k", "44", "49", "8", "32", "one_million", "only_17", "3.5", "seventy-five", "42"]
Apply empath...


  0%|          | 0/386 [00:00<?, ?it/s]

45
45
Import data for other wordlists
Create manual wordlists...
Preprocessing for wordlists...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist Concreteness: 

	Count...
	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

	Multiply...

Wordlist Happiness: 

	Count...
	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

	Multiply...

Wordlist Good_Curse: 

	Count...
	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

	Multiply...

17 further wordlists: 

	Count...
	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

	Multiply...
17
17

Wordlist SER: 

	Count...
	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

	Multiply...

Wordlists Valence, Arousal, Dominance: 

	Count...
	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

	Multiply...
3
3

Wordlist Negation: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist Articles: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist Future: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist personal pronouns: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist first person singular pronouns: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist first person plural pronouns: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist second person pronouns: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist third person singular pronouns: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist third person plural pronouns: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist indefinite pronouns: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlist auxiliary verbs: 



  0%|          | 0/386 [00:00<?, ?it/s]

	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]


Wordlists from MRC (2): 

	Count...
	Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

	Multiply...
2
2


In [20]:
reduce_df_size()

# # write optimized pickle
# print("Create pickle")
# filepath = "aug_commentdf_FE3.pkl"
# with open(filepath, "wb") as f:
#     pickled = pickle.dumps(df, protocol=-1)
#     f.write(pickled)
# del f
# del filepath
# del pickled

Reduce floats...
Reduce ints...
Size of the df in MB: 
14.582183837890625


In [21]:
# read optimized pickle
# filepath = "aug_commentdf_FE3.pkl"
# with open(filepath, 'rb') as f:
#     df = pickle.load(f)

## Preprocessing 3

In [22]:
# define stopwordlist to use
def choose_stopwordlist(mode):
    if mode == 'NLTK':
        stopwordList = stopwords.words('english')
    if mode == 'NLTK-neg':
        stopwordList = stopwords.words('english')
        stopwordList.remove('no')
        stopwordList.remove('nor')
        stopwordList.remove('not')
    return stopwordList

def preprocess_tokenize():
    ps = PorterStemmer()
    print("Lower words and remove special characters...")
    newrow = []
    for row in tqdm(df['text', 'decon_body']):
        newcomment = []
        for comment in row:
            text_pre = ""
            for character in comment:
                if character.isalnum() or character.isspace():
                    character = character.lower()
                    text_pre += character
                else:
                    text_pre += " "
            newcomment.append(text_pre)
        newrow.append(newcomment)
    del newcomment    
    print("Remove stopwords...")
    stopwordList = choose_stopwordlist(mode='NLTK-neg')
    newprobody = []
    for row in tqdm(newrow):
        newrowprobody = []
        for comment in row:
            words = [word for word in comment.split() if (word not in stopwordList)]
            newcomment = ' '.join(words)
            newrowprobody.append(newcomment)
        newprobody.append(newrowprobody)
    del newrow
    del newcomment 
    del newrowprobody
# change numbers to words and tokenize words 
    print("Change numbers to words and tokenize words and stem tokens")
    newbody_complete = []
    # num2words
    for row in tqdm(newprobody):
        newbody = []
        for sentence in row:
            # string to list
            inputtext = sentence.split()
            numlist = []
            for i in range(len(inputtext)):
                if inputtext[i].isnumeric():
                    numlist.append(i)
            for number in numlist:
                # deleted: fractions, superscripts, extremely large numbers, 卌卌, 一
                try:
                    inputtext[number] = num2words(inputtext[number])
                except decimal.InvalidOperation:
                    inputtext[number] = " "
                except OverflowError:
                    inputtext[number] = " "

            # list to string
            inputtext = [ps.stem(word) for word in inputtext if word.isalpha()]
            celltext = ' '.join(inputtext)
            newbody.append(celltext)
        newbody_complete.append(newbody)
    return newbody_complete


tokens = preprocess_tokenize()

df.drop(('text', 'decon_body'), axis = 1, inplace = True)

Lower words and remove special characters...


  0%|          | 0/386 [00:00<?, ?it/s]

Remove stopwords...


  0%|          | 0/386 [00:00<?, ?it/s]

Change numbers to words and tokenize words and stem tokens


  0%|          | 0/386 [00:00<?, ?it/s]

In [23]:
# print("Create pickle")
# filepath = "aug_tokens.pkl"
# with open(filepath, "wb") as f:
#     pickled = pickle.dumps(tokens, protocol=-1)
#     f.write(pickled)
    
# del f
# del filepath
# del pickled

In [24]:
# # read optimized pickle
# filepath = "aug_tokens.pkl"
# with open(filepath, 'rb') as f:
#     tokens = pickle.load(f)

## Feature Engineering 4:

In [25]:
# for item in dir():
#     print(item, item.__sizeof__())

In [26]:
def ngram_preprocessing():
    # convert input from list to string
    global tokens
    ngrams = []
    inputtext = []
    valid = True
    notvalid_lst =[]
    for authortext in tqdm(tokens):
        valid_string = ""
        for comment in authortext:
            valid = True
            i=0
            for char in comment:
                if not(char in string.printable):
                    valid = False
                    notvalid_lst += [char]
                    i+=1
            if valid == True:
                textspace = comment + " "
                valid_string += textspace
#         print(valid_string, "\n")
        inputtext.append(valid_string)
    if len(notvalid_lst) > 0:
        print("\nNumber of dismissed comments: ", i)
    print("Length of inputtext: ", len(inputtext))
    return inputtext

    
def ngrams(inputtext, n_min, n_max, ngramtype):
    vectorizer = TfidfVectorizer(ngram_range=(n_min,n_max), analyzer=ngramtype, max_features=(n_max-(n_min-1))*1000)
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    dense = vectors.todense()
    denselist = dense.tolist()
    print("Get feature names...")
    names = vectorizer.get_feature_names()
    print("Length of feature names: ", len(names))
    print("Create df...")
    ngramdf = pd.DataFrame(denselist, columns=names)
    return ngramdf

def merge_dfs(df1, df2):
    global df
    ngramsdf = df1.join(df2, rsuffix="_char")
    ngramsdf['author_index'] = df.index
    ngramsdf = ngramsdf.set_index('author_index')
    headers = (len(df1.columns))*['wordngram'] + (len(df2.columns))*['charngram']
    columns = ngramsdf.columns.values
    print("Headers: ", len(headers))
    print("Columns: ", len(columns))
    arrays = [headers] + [columns]
    ngramsdf.columns = pd.MultiIndex.from_arrays(arrays)
    df = df.join(ngramsdf, rsuffix="_ngram")

In [27]:
# Preprocessing for LDA
def preprocess_lda():
    global tokens
    neglst = ["no", "not", "none", "nobody", "nothing", "neither", "nowhere", "never", "nay"]
    inputlst = []
    for comment in tokens:
        smalllist = []
        for string in comment: 
            tokens = [token for token in string.split() if (token not in neglst)]
            smalllist.append(' '.join(tokens))
        inputlst.append(smalllist)
    return inputlst

def apply_lda(inputlst, number, name):
    print("Start LDA...")
    dictionary = corpora.Dictionary(inputlst)
    corpus = [dictionary.doc2bow(text) for text in inputlst]
    ldamodel = models.LdaMulticore(corpus, num_topics=number, id2word = dictionary, chunksize=100, dtype = np.float32, workers=14)
#     for idx, topic in ldamodel.print_topics(-1):
#     print("Topic: {} \nWords: {}".format(idx, topic))
#     print("\n")  
    topics_list = []
    for document in corpus:
        topics = ldamodel.get_document_topics(document, minimum_probability=0.0)
        onlytopics = [x[1] for x in topics]
        topics_list.append(onlytopics)
    ldadf = pd.DataFrame(topics_list)
    ldadf['author'] = df.index
    ldadf = ldadf.set_index('author')
    columnname = 'lda' + str(number)
    headers = number*[columnname]
    columns = ldadf.columns.values
    arrays = [headers] + [columns]
    ldadf.columns=pd.MultiIndex.from_arrays(arrays)
    return ldadf

In [28]:
print("Ngrams...")
print("Preprocessing for ngrams: ")
inputtext = ngram_preprocessing()
print("Create word ngrams...")
wordngramsdf = ngrams(inputtext, 1, 3, "word")
print("Create char ngrams...")
charngramsdf = ngrams(inputtext, 2, 3, "char")

Ngrams...
Preprocessing for ngrams: 


  0%|          | 0/386 [00:00<?, ?it/s]


Number of dismissed comments:  0
Length of inputtext:  386
Create word ngrams...
Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

Get feature names...
Length of feature names:  3000
Create df...
Create char ngrams...
Vectorize...


  0%|          | 0/386 [00:00<?, ?it/s]

Get feature names...
Length of feature names:  2000
Create df...


In [29]:
print("Merge df...")
merge_dfs(wordngramsdf, charngramsdf)
del inputtext

Merge df...
Headers:  5000
Columns:  5000


In [30]:
# print("Create pickle")
# filepath = "aug_commentdf_FE4_notext.pkl"
# with open(filepath, "wb") as f:
#     pickled = pickle.dumps(df, protocol=-1)
#     f.write(pickled)

# del f
# del filepath
# del pickled

In [31]:
print("\n\nCreate user features (LDA)...\n")
print("Preprocessing for LDA...")
inputlst = preprocess_lda()
print("LDA with fifty topics: ")
lda50df = apply_lda(inputlst, 50, "ldafifty")
print("LDA with onehundred topics: ")
lda100df = apply_lda(inputlst, 100, "ldahundred")
df = df.join(lda50df, rsuffix="_lda50")
df = df.join(lda100df, rsuffix="_lda100")
df.info(verbose=True)
del inputlst



Create user features (LDA)...

Preprocessing for LDA...
LDA with fifty topics: 
Start LDA...
LDA with onehundred topics: 
Start LDA...
<class 'pandas.core.frame.DataFrame'>
Index: 386 entries, -BlitzN9ne to zymmaster
Data columns (total 13141 columns):
 #      Column                                Dtype  
---     ------                                -----  
 0      (post, score)                         float32
 1      (post, controversiality)              float32
 2      (post, gilded)                        float32
 3      (post, ratio_en)                      float32
 4      (subtf, num_subreddit)                int16  
 5      (subtf, entropy)                      float32
 6      (subtf, mean_time)                    float32
 7      (subtf, median_time)                  float32
 8      (subtf, max_time)                     int16  
 9      (post, lang)                          int16  
 10     (subtf, monday)                       int16  
 11     (subtf, tuesday)                   

In [32]:
# write optimized pickle
print("Create pickle")
filepath = "mbtib5feat.pkl"
with open(filepath, "wb") as f:
    pickled = pickle.dumps(df, protocol=-1)
    f.write(pickled)

del f
del filepath
del pickled

Create pickle


### Naming
Big Five Labels: b5feat

Big Five Labels + Augmentation: b5feat_aug

Big Five + MBTI Labels: b5mbtifeat

In [33]:
df

Unnamed: 0_level_0,post,post,post,post,subtf,subtf,subtf,subtf,subtf,post,...,lda100,lda100,lda100,lda100,lda100,lda100,lda100,lda100,lda100,lda100
Unnamed: 0_level_1,score,controversiality,gilded,ratio_en,num_subreddit,entropy,mean_time,median_time,max_time,lang,...,90,91,92,93,94,95,96,97,98,99
author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
-BlitzN9ne,9.644956,0.014159,0.000000,7.883648,116,4.865813,4.830648e+04,793.5,25124,38,...,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042
-dyad-,7.234043,0.000000,0.000000,32.571430,5,1.484707,3.799737e+05,57538.0,-32559,2,...,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051,0.000051
12345jk12345,2.054545,0.018182,0.000000,17.333334,20,3.474704,3.744704e+05,50695.0,-31569,3,...,0.000092,0.000092,0.000092,0.000092,0.000092,0.000092,0.000092,0.000092,0.000092,0.000092
64BitCoffee,10.816901,0.007042,0.000000,1.581818,14,1.537101,2.699926e+05,133974.0,30165,19,...,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071
ACE_C0ND0R,6.359871,0.039165,0.000321,14.812182,301,5.604323,2.190034e+04,196.0,-28316,37,...,0.000016,0.000016,0.011908,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016,0.000016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yrelav_dnomyar,4.155844,0.012987,0.000000,18.250000,16,3.436423,1.001564e+06,73632.0,24853,3,...,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129,0.000129
zEaK47,2.673674,0.006006,0.000000,13.745387,84,2.522998,2.453335e+04,62.0,14216,30,...,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017,0.000017
zimzat,6.870656,0.027027,0.000000,85.333336,81,5.264661,1.318084e+05,12787.0,16583,5,...,0.000132,0.000132,0.000132,0.000132,0.000132,0.000132,0.000132,0.000132,0.000132,0.000132
zookatron,6.419580,0.000000,0.000000,70.500000,22,2.846639,8.043239e+05,91049.0,-28552,2,...,0.000070,0.000070,0.000070,0.000070,0.000070,0.000070,0.000070,0.000070,0.000070,0.000070


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 386 entries, -BlitzN9ne to zymmaster
Columns: 13141 entries, ('post', 'score') to ('lda100', 99)
dtypes: float32(102), float64(5150), int16(7889)
memory usage: 21.1+ MB
