# Wordlists

## Import packages

In [1]:
import pandas as pd
import numpy as np
from empath import Empath
from sklearn.feature_extraction.text import CountVectorizer 
from tqdm.notebook import tqdm
import random
random.seed(32)

## Import data

In [2]:
df = pd.read_pickle("preprocessed.pkl")
df.head()
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 87 non-null     int64  
 1   author                87 non-null     object 
 2   body                  87 non-null     object 
 3   probody               87 non-null     object 
 4   tokens                87 non-null     object 
 5   agreeableness         87 non-null     float64
 6   openness              87 non-null     float64
 7   conscientiousness     87 non-null     float64
 8   extraversion          87 non-null     float64
 9   neuroticism           87 non-null     float64
 10  agree                 87 non-null     int64  
 11  openn                 87 non-null     int64  
 12  consc                 87 non-null     int64  
 13  extra                 87 non-null     int64  
 14  neuro                 87 non-null     int64  
 15  language              87 

### Empath

as a replacement for LIWC

In [3]:
# create new categories with empath
def new_cat():
    empath = Empath()
    social = empath.create_category("social",["mate","talk","they"])
    humans = empath.create_category("humans",["adult","baby","boy"])
    cognitive = empath.create_category("cognitive",["cause","know","ought"])
    insight = empath.create_category("insight",["think","know","consider"])
    causation = empath.create_category("causation",["because","effect","hence"])
    discrepancy = empath.create_category("discrepancy",["should","would","could"])
    tentative = empath.create_category("tentative",["maybe","perhaps","guess"])
    certainty = empath.create_category("certainty",["always","never", "proof"])
    inhibition = empath.create_category("inhibition",["block","constrain","stop"])
    inclusive = empath.create_category("inclusive",["and","with","include"])
    exclusive = empath.create_category("exclusive",["but","without","exclude"])
    perceptual = empath.create_category("perceptual",["observing","hear","feeling"])
    see = empath.create_category("see",["view","saw","seen"])
    feel = empath.create_category("feel",["feels","touch","feeling"])
    biological = empath.create_category("biological",["eat","blood","pain"])
    relativity = empath.create_category("relativity",["area","bend","go"])
    space = empath.create_category("space",["down","in","thin"])
    time = empath.create_category("time",["end","until","season"])
    agreement = empath.create_category("agreement", ["agree", "ok", "yes"])
    fillers = empath.create_category("fillers", ["like", "Imean", "yaknow"])
    nonfluencies = empath.create_category("nonfluencies", ["umm", "hm", "er"])
    conjunctions = empath.create_category("conjunctions", ["and", "but", "whereas"])
    quantifiers = empath.create_category("quantifiers", ["few", "many", "much"])
    numbers = empath.create_category("numbers", ["two", "fourteen", "thousand"])
    z = empath.analyze("I am not thinking", categories=["negations"], normalize=True)

In [4]:
def apply_empath(df):
    empath = Empath()
    new_cat()
    empathvalues = []
    empathcategories = ["swearing_terms", "social", "family", "friends", "humans", "emotional", "positive_emotion", "negative_emotion", "fear", "anger", "sadness", "cognitive", "insight", "causation", "discrepancy", "tentative", "certainty", "inhibition", "inclusive", "exclusive", "perceptual", "see", "hear", "feel", "biological", "body", "health", "sexual", "eat", "relativity", "space", "time", "work", "achievement", "leisure", "home", "money", "religion", "death" ,"agreement", "fillers", "nonfluencies"]
    for sentence in tqdm(df['body']):
        empathvalues.append(empath.analyze(sentence, categories=empathcategories, normalize=True))
    empathdf = pd.DataFrame(empathvalues)
    empathdf['author'] = df['author']

    newdf = pd.merge(df, empathdf, on='author', how='outer')
    return newdf

empdf = apply_empath(df)
print("NaN in new df: ", empdf.isnull().any().any())
empdf.head()

["talk", "mates", "mate", "Because", "friends", "anyone", "anything", "mean", "though", "anyway", "guess", "anymore", "should", "why", "knew", "someone", "trust", "wanted", "actually", "family", "anybody", "Well", "care", "parents", "knowing", "understand", "Now", "Maybe", "else", "probably", "happen", "yet", "honestly", "maybe", "either", "If", "always", "thought", "leave", "suppose", "talk", "own_friends", "telling", "nt", "right", "either", "cause", "talking", "cause", "anyways"]
["child", "kid", "girl", "baby", "adult", "teenager", "boy", "little_girl", "little_boy", "young", "age", "baby_girl", "teen", "woman", "princess", "toddler", "grown_man", "baby_sister", "daughter", "six_year_old", "sister", "teenage_girl", "newborn", "guy", "baby_boy", "brother", "three_year_old", "sixteen_year_old", "four_year_old", "6_year_old", "ten_year_old", "new_man", "one", "seven_year_old", "person", "babies", "12_year_old", "twelve_year_old", "4_year_old", "10_year_old", "nine_year_old", "teenage_

["noticed", "seen", "view", "seeing", "spotted", "sight", "saw", "found", "realized", "spied", "veiw", "appeared", "realised", "showed", "recognized", "glimpsed", "glimpse", "faced", "notice", "noticing", "spot", "disappeared", "stopped", "standing", "shown", "remembered", "front", "caught", "watched", "recognised", "figure", "spotting", "observed", "silhouette", "clear_view", "guessed", "near", "met", "corner", "Seeing", "witnessed", "pictured", "passed", "approached", "entered", "first_glimpse", "emerged", "familiar_face", "imagined", "stood", "notice", "dissapeared", "before"]
["feel", "feels", "feeling", "feeling", "touch", "felt", "touching", "numb", "touch", "touched", "Feeling", "hurt", "feel", "sensation", "hurting", "hurts", "felling", "touches", "burn", "own_skin", "aching", "tingly", "weak", "body", "makes", "kiss", "pain", "tingling", "whole_body", "warm", "knowing", "cold", "breathe", "tingle", "heat", "own_body", "lie", "someone", "yet", "tingling", "burning", "though", "

  0%|          | 0/87 [00:00<?, ?it/s]

NaN in new df:  False


Unnamed: 0,index,author,body_x,probody,tokens,agreeableness,openness,conscientiousness,extraversion,neuroticism,...,work,achievement,leisure,home,money,religion,death,agreement,fillers,nonfluencies
0,1,Sabata11792,That's subtle enough to just look like a coinc...,subtle enough look like,"[subtl, enough, look, like]",8.0,11.0,74.0,1.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0
1,21,Shadow_Of_,"Downturned nose, dirty skin, tattoos, small ch...",downturned dirty small small obvious would not,"[downturn, dirti, small, small, obviou, would,...",76.0,47.0,1.0,4.0,75.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34,xenomouse,"Yes, if I was a man they'd call it a man cave....",man would call man guess mouse agree not safe ...,"[man, would, call, man, guess, mous, agre, not...",26.0,93.0,49.0,70.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222,0.0,0.0
3,37,eiznekk,Added you back! Thank you :D,added thank,"[ad, thank]",70.0,64.0,5.0,5.0,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,62,vitrael2,I squatted 225x14 a couple weeks ago and I mad...,squatted 225x14 couple weeks ago made sad card...,"[squat, 225x14, coupl, week, ago, made, sad, c...",26.0,98.0,75.0,93.0,29.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0


## PSYCH Wordlists

In [5]:
concretenessdf = pd.read_csv('/home/sophia/ma_py/psych_lists/concreteness.csv')
cdf = concretenessdf[['Conc.M']]
cmatrix = cdf.to_numpy()
concrete = concretenessdf['Word'].values.tolist()

happinessdf = pd.read_csv('/home/sophia/ma_py/psych_lists/happiness_ratings.csv')
hdf = happinessdf[['happiness_average']]
hmatrix = hdf.to_numpy()
happiness = happinessdf['word'].values.tolist()

cursedf = pd.read_csv('/home/sophia/ma_py/psych_lists/mean_good_curse.csv')
cudf = cursedf[['mean_good_curse']]
cumatrix = cudf.to_numpy()
curse = cursedf['word'].values.tolist()

sensorydf = pd.read_csv('/home/sophia/ma_py/psych_lists/sensory_experience_ratings.csv')
serdf = sensorydf[['Average SER']]
sermatrix = serdf.to_numpy()
ser = sensorydf['Word'].values.tolist()

alldf = pd.read_csv('/home/sophia/ma_py/psych_lists/sensory_ratings_all.csv')
newalldf = alldf[['Emotion', 'Polarity', 'Social', 'Moral', 'MotionSelf', 'Thought', 'Color', 'TasteSmell', 'Tactile', 'VisualForm', 'Auditory', 'Space', 'Quantity', 'Time', 'CNC', 'IMG', 'FAM']]
allmatrix = newalldf.to_numpy()
allsens = alldf['Word'].values.tolist()

valarodomdf = pd.read_csv('/home/sophia/ma_py/psych_lists/valence_arousal_dominence.csv')
vaddf = valarodomdf[['V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum']]
vadmatrix = vaddf.to_numpy()
vad = valarodomdf['Word'].values.tolist()

mrcdf = pd.read_csv('/home/sophia/ma_py/psych_lists/mrclists_c_p.csv', sep='\t', names=['word', 'cmean', 'pmean'])
cpdf = mrcdf[['cmean', 'pmean']]
cpmatrix = cpdf.to_numpy()
mrc = mrcdf['word'].values.tolist()


# num_rows, num_cols = matrix.shape
# print (num_rows, num_cols)

In [6]:
def counter(df, vocab):
    inputtext = []
    for row in df['body_x']:
        text = ' '.join(row)
        inputtext.append(text)
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocab)
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    v = vectors.toarray()
    return v

# hmatrix = counter(empdf, happiness)
# print(type(hmatrix))
# print("Number of non zero elements: ", np.count_nonzero(hmatrix))
# print(type(hmatrix))
# num_rows, num_cols = hmatrix.shape
# print (num_rows, num_cols)

In [7]:
def multiply(matrix, ratings):
    # matrix multiplication 
    result = np.matmul(matrix, ratings)
    # divide each score with the number of words in the list to normalize
    result = result/(len(ratings))
    return result

# test = multiply(hdf, hmatrix)
# num_rows, num_cols = test.shape
# print (num_rows, num_cols)
# print("Number of non zero elements: ", np.count_nonzero(test))

In [8]:
def aggregator(df, vocab, ratings, name):
    count = counter(df, vocab)
    result = multiply(count, ratings)
    num_rows, num_cols = result.shape
    
    if num_cols ==1:
        df[name] = result
    else:
        resultdf = pd.DataFrame(result)
        for i in range(len(name)):
            # first i is zero
            column = name[i]
            df[column] = resultdf[i]
    return df

# psychdf = aggregator(empdf, concrete, "concreteness")
# psychdf

In [14]:
negations = ["No", "Not", "None", "Nobody", "Nothing", "Neither", "Nowhere", "Never"]
articles = ["a", "an", "the"]
future = ["will", "gonna"]

def list_counter(df, vocab, name):
    inputtext = []
    total = []
    for row in empdf['body_x']:
        total.append(len(row))
        text = ' '.join(row)
        inputtext.append(text)
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocab)
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    v = vectors.toarray()
    averagev = v.sum(axis=1)
    totalvector =  np.array(total)
    score = np.divide(averagev, totalvector)
    df[name] = score
    return df

## Wrapper

In [15]:
def extract_features(df):
    # create scores for each word list and add them to df
    psychdf = aggregator(df, concrete, cmatrix, "concreteness")
    psychdf = aggregator(df, happiness, hmatrix, "happiness")
    psychdf = aggregator(df, curse, cumatrix, "good_curse")
    psychdf = aggregator(df, allsens, allmatrix, ['emotion', 'polarity', 'social', 'moral', 'motionself', 'thought', 'color', 'tastesmell', 'tactile', 'visualform', 'auditory', 'space', 'quantity', 'time', 'CNC', 'IMG', 'FAM'])
    psychdf = aggregator(df, ser, sermatrix, "SER")
    psychdf = aggregator(df, vad, vadmatrix, ['valence', 'arousal', 'dominance'])
    psychdf = list_counter(df, negations, "negations")
    psychdf = list_counter(df, articles, "articles")
    psychdf = list_counter(df, future, "future")
    psychdf = aggregator(df, mrc, cpmatrix, ["mrc_cmean", "mrc_pmean"])
    
    return psychdf

psychdf = extract_features(empdf)
psychdf.info(verbose=True)

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

Vectorize...


  0%|          | 0/199 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 198
Data columns (total 103 columns):
 #    Column                Dtype  
---   ------                -----  
 0    index                 int64  
 1    author                object 
 2    body_x                object 
 3    probody               object 
 4    tokens                object 
 5    agreeableness         float64
 6    openness              float64
 7    conscientiousness     float64
 8    extraversion          float64
 9    neuroticism           float64
 10   agree                 int64  
 11   openn                 int64  
 12   consc                 int64  
 13   extra                 int64  
 14   neuro                 int64  
 15   language              int64  
 16   author_flair_text     object 
 17   downs                 float64
 18   created_utc           float64
 19   subreddit_id          object 
 20   link_id               object 
 21   parent_id             object 
 22   score                 fl

## Export dataframe

In [11]:
# psychdf.to_pickle("wordlists.pkl")