# Wordlists with one row per author

## Import packages

In [1]:
import pandas as pd
import numpy as np
from empath import Empath
from sklearn.feature_extraction.text import CountVectorizer 
from tqdm.notebook import tqdm
import random
random.seed(32)

## Import data

In [2]:
# Load the per-author dataframe (one row per author, see the column listing below).
# NOTE(review): unpickling executes arbitrary code embedded in the file -- only
# load pickles that were produced by a trusted step of this project.
df = pd.read_pickle("linguistic_ngrams_author.pkl")
# A bare `df.head()` in the middle of a cell is never displayed, so it was dropped;
# the verbose info listing is the output of this cell.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 422 entries, 0 to 421
Data columns (total 24198 columns):
 #      Column                                                       Dtype  
---     ------                                                       -----  
 0      index                                                        int64  
 1      author                                                       object 
 2      complete_body                                                object 
 3      doc_body                                                     object 
 4      probody                                                      object 
 5      tokens                                                       object 
 6      senttokens                                                   object 
 7      agreeableness                                                float64
 8      openness                                                     float64
 9      conscientiousness                    

### Empath

Empath is used here as a replacement for LIWC.

In [3]:
# create new categories with empath
def new_cat():
    """Create the custom Empath categories that stand in for LIWC categories.

    Each category is generated by Empath from three seed words via
    ``Empath.create_category``.

    NOTE(review): the term lists shown in the cell output appear to be printed
    by ``create_category`` itself, and its return value was never used by this
    notebook, so the results are no longer bound to local names.

    The previous trailing ``analyze(..., categories=["negations"])`` call was
    removed: its result was discarded and no "negations" category is created
    here.

    Returns
    -------
    Empath
        The instance on which the categories were created, so a caller can
        reuse it for analysis. Callers that ignore the return value keep
        working as before.
    """
    # category name -> seed words (insertion order == creation order)
    seed_words = {
        "social": ["mate", "talk", "they"],
        "humans": ["adult", "baby", "boy"],
        "cognitive": ["cause", "know", "ought"],
        "insight": ["think", "know", "consider"],
        "causation": ["because", "effect", "hence"],
        "discrepancy": ["should", "would", "could"],
        "tentative": ["maybe", "perhaps", "guess"],
        "certainty": ["always", "never", "proof"],
        "inhibition": ["block", "constrain", "stop"],
        "inclusive": ["and", "with", "include"],
        "exclusive": ["but", "without", "exclude"],
        "perceptual": ["observing", "hear", "feeling"],
        "see": ["view", "saw", "seen"],
        "feel": ["feels", "touch", "feeling"],
        "biological": ["eat", "blood", "pain"],
        "relativity": ["area", "bend", "go"],
        "space": ["down", "in", "thin"],
        "time": ["end", "until", "season"],
        "agreement": ["agree", "ok", "yes"],
        "fillers": ["like", "Imean", "yaknow"],
        "nonfluencies": ["umm", "hm", "er"],
        "conjunctions": ["and", "but", "whereas"],
        "quantifiers": ["few", "many", "much"],
        "numbers": ["two", "fourteen", "thousand"],
    }
    empath = Empath()
    for category, seeds in seed_words.items():
        empath.create_category(category, seeds)
    return empath

In [4]:
def apply_empath(df):
    """Score every text in ``df['complete_body']`` with Empath.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``complete_body`` column holding the text of each row.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` followed by one normalised
        score column per Empath category. A category whose name already
        exists in ``df`` is stored as ``<category>_wordlist``.
    """
    # Create the custom categories *before* obtaining the analyser. Previously
    # the analyser was instantiated first, so it only knew the custom
    # categories if they were already present from an earlier run.
    # `new_cat()` may return the Empath instance it populated; if it returns
    # None a fresh instance is created afterwards.
    # NOTE(review): the fallback assumes created categories are persisted and
    # picked up by a new Empath() -- confirm against the Empath client.
    empath = new_cat() or Empath()

    # NOTE(review): "hear" and "eat" are not created by new_cat(); confirm they
    # exist as Empath categories, otherwise their scores may always be 0.
    empathcategories = ["swearing_terms", "social", "family", "friends", "humans", "emotional", "positive_emotion", "negative_emotion", "fear", "anger", "sadness", "cognitive", "insight", "causation", "discrepancy", "tentative", "certainty", "inhibition", "inclusive", "exclusive", "perceptual", "see", "hear", "feel", "biological", "body", "health", "sexual", "eat", "relativity", "space", "time", "work", "achievement", "leisure", "home", "money", "religion", "death" ,"agreement", "fillers", "nonfluencies"]

    empathvalues = []
    for sentence in tqdm(df['complete_body']):
        scores = empath.analyze(sentence, categories=empathcategories, normalize=True)
        if scores is None:
            # NOTE(review): normalised analysis appears to yield None for a text
            # without tokens; score such rows as all-zero instead of failing.
            scores = dict.fromkeys(empathcategories, 0.0)
        empathvalues.append(scores)

    # Re-use df's index so the scores are attached row-by-row. Merging on
    # 'author' would multiply rows if an author ever occurred twice.
    empathdf = pd.DataFrame(empathvalues, index=df.index)
    newdf = df.join(empathdf, rsuffix="_wordlist")
    return newdf

# Attach the Empath category scores and check that no missing values were introduced.
empdf = apply_empath(df)
contains_nan = empdf.isnull().any().any()
print("NaN in new df: ", contains_nan)
empdf.head()

["talk", "mates", "mate", "Because", "friends", "anyone", "anything", "mean", "though", "anyway", "guess", "anymore", "should", "why", "knew", "someone", "trust", "wanted", "actually", "family", "anybody", "Well", "care", "parents", "knowing", "understand", "Now", "Maybe", "else", "probably", "happen", "yet", "honestly", "maybe", "either", "If", "always", "thought", "leave", "suppose", "talk", "own_friends", "telling", "nt", "right", "either", "cause", "talking", "cause", "anyways"]
["child", "kid", "girl", "baby", "adult", "teenager", "boy", "little_girl", "little_boy", "young", "age", "baby_girl", "teen", "woman", "princess", "toddler", "grown_man", "baby_sister", "daughter", "six_year_old", "sister", "teenage_girl", "newborn", "guy", "baby_boy", "brother", "three_year_old", "sixteen_year_old", "four_year_old", "6_year_old", "ten_year_old", "new_man", "one", "seven_year_old", "person", "babies", "12_year_old", "twelve_year_old", "4_year_old", "10_year_old", "nine_year_old", "teenage_

["noticed", "seen", "view", "seeing", "spotted", "sight", "saw", "found", "realized", "spied", "veiw", "appeared", "realised", "showed", "recognized", "glimpsed", "glimpse", "faced", "notice", "noticing", "spot", "disappeared", "stopped", "standing", "shown", "remembered", "front", "caught", "watched", "recognised", "figure", "spotting", "observed", "silhouette", "clear_view", "guessed", "near", "met", "corner", "Seeing", "witnessed", "pictured", "passed", "approached", "entered", "first_glimpse", "emerged", "familiar_face", "imagined", "stood", "notice", "dissapeared", "before"]
["feel", "feels", "feeling", "feeling", "touch", "felt", "touching", "numb", "touch", "touched", "Feeling", "hurt", "feel", "sensation", "hurting", "hurts", "felling", "touches", "burn", "own_skin", "aching", "tingly", "weak", "body", "makes", "kiss", "pain", "tingling", "whole_body", "warm", "knowing", "cold", "breathe", "tingle", "heat", "own_body", "lie", "someone", "yet", "tingling", "burning", "though", "

  0%|          | 0/422 [00:00<?, ?it/s]

NaN in new df:  False


Unnamed: 0,index,author,complete_body,doc_body,probody,tokens,senttokens,agreeableness,openness,conscientiousness,...,work_wordlist,achievement_wordlist,leisure,home,money_wordlist,religion_wordlist,death_wordlist,agreement,fillers,nonfluencies
0,0,-BigSexy-,Oooh i see,[Oooh i see],[oooh see],"[[oooh, see]]",[[Oooh i see]],39.0,92.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,-BlitzN9ne,**Quality** material right here,[**Quality** material right here],[quality material right],"[[quality, material, right]]",[[**Quality** material right here]],50.0,85.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
2,2,-CrestiaBell,A slidewhistle or a meow-meow board That's bec...,"[A slidewhistle or a meow-meow board, That's b...","[slidewhistle meow meow board, watch cartoon s...","[[slidewhistle, meow, meow, board], [watch, ca...","[[A slidewhistle or a meow-meow board], [That'...",50.0,85.0,50.0,...,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0
3,3,-tactical-throw-away,Sorry for your feelings. Kek &lt;------- This ...,"[Sorry for your feelings., Kek &lt;------- Thi...","[sorry feelings, kek lt onekek kek kek kek, no...","[[sorry, feelings], [kek, lt, onekek, kek, kek...","[[Sorry for your feelings.], [Kek &lt;------- ...",2.0,92.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,137288,Carly's so glad to get your .0000003 cents Exc...,"[Carly's so glad to get your .0000003 cents, E...","[carly glad get three cents, except uk debuted...","[[carly, glad, get, three, cents], [except, uk...","[[Carly's so glad to get your .0000003 cents],...",10.0,87.0,49.0,...,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0


## PSYCH Wordlists

In [5]:
# Directory holding the psycholinguistic rating lists. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
PSYCH_LIST_DIR = '/home/sophia/ma_py/psych_lists'

# For every list two aligned objects are built from the same CSV:
#   * a plain Python list of words (used as the vectorizer vocabulary)
#   * a 2-D numpy array of ratings, one row per word

# concreteness ratings
concretenessdf = pd.read_csv(f'{PSYCH_LIST_DIR}/concreteness.csv')
cdf = concretenessdf[['Conc.M']]
cmatrix = cdf.to_numpy()
concrete = concretenessdf['Word'].values.tolist()

# happiness ratings
happinessdf = pd.read_csv(f'{PSYCH_LIST_DIR}/happiness_ratings.csv')
hdf = happinessdf[['happiness_average']]
hmatrix = hdf.to_numpy()
happiness = happinessdf['word'].values.tolist()

# "good curse" ratings
cursedf = pd.read_csv(f'{PSYCH_LIST_DIR}/mean_good_curse.csv')
cudf = cursedf[['mean_good_curse']]
cumatrix = cudf.to_numpy()
curse = cursedf['word'].values.tolist()

# sensory experience ratings (SER)
sensorydf = pd.read_csv(f'{PSYCH_LIST_DIR}/sensory_experience_ratings.csv')
serdf = sensorydf[['Average SER']]
sermatrix = serdf.to_numpy()
ser = sensorydf['Word'].values.tolist()

# 17 further rating dimensions stored in one file
alldf = pd.read_csv(f'{PSYCH_LIST_DIR}/sensory_ratings_all.csv')
newalldf = alldf[['Emotion', 'Polarity', 'Social', 'Moral', 'MotionSelf', 'Thought', 'Color', 'TasteSmell', 'Tactile', 'VisualForm', 'Auditory', 'Space', 'Quantity', 'Time', 'CNC', 'IMG', 'FAM']]
allmatrix = newalldf.to_numpy()
allsens = alldf['Word'].values.tolist()

# valence / arousal / dominance
valarodomdf = pd.read_csv(f'{PSYCH_LIST_DIR}/valence_arousal_dominence.csv')
vaddf = valarodomdf[['V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum']]
vadmatrix = vaddf.to_numpy()
vad = valarodomdf['Word'].values.tolist()

# MRC lists: tab-separated, column names supplied here
mrcdf = pd.read_csv(f'{PSYCH_LIST_DIR}/mrclists_c_p.csv', sep='\t', names=['word', 'cmean', 'pmean'])
cpdf = mrcdf[['cmean', 'pmean']]
cpmatrix = cpdf.to_numpy()
mrc = mrcdf['word'].values.tolist()


# num_rows, num_cols = matrix.shape
# print (num_rows, num_cols)

In [6]:
def counter(df, vocab):
    """Count occurrences of each vocabulary word in every row's text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``complete_body`` column. Each entry may be a single
        string or an iterable of strings (which is joined with spaces).
    vocab : list of str
        Fixed vocabulary; the column order of the result follows this list.

    Returns
    -------
    numpy.ndarray
        Dense count matrix of shape ``(len(df), len(vocab))``.

    Notes
    -----
    The vectorizer's default tokenisation is used.
    """
    # A string row must be used as-is: ' '.join("some text") would put a space
    # between every *character*, so no word could ever match the vocabulary.
    inputtext = [
        row if isinstance(row, str) else ' '.join(row)
        for row in df['complete_body']
    ]
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocab)
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    return vectors.toarray()

# hmatrix = counter(empdf, happiness)
# print(type(hmatrix))
# print("Number of non zero elements: ", np.count_nonzero(hmatrix))
# print(type(hmatrix))
# num_rows, num_cols = hmatrix.shape
# print (num_rows, num_cols)

In [7]:
def multiply(matrix, ratings):
    """Turn word counts into rating scores.

    Computes the matrix product ``matrix x ratings`` and divides it by the
    number of entries (rows) in ``ratings``, i.e. by the length of the word
    list, as normalisation.
    """
    list_length = len(ratings)
    weighted_sums = np.matmul(matrix, ratings)
    return weighted_sums / list_length

# test = multiply(hdf, hmatrix)
# num_rows, num_cols = test.shape
# print (num_rows, num_cols)
# print("Number of non zero elements: ", np.count_nonzero(test))

In [8]:
def aggregator(df, vocab, ratings, name):
    """Score the texts in ``df`` against a rated word list and store the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``complete_body`` column. It is modified IN PLACE (new
        columns are added) and also returned.
    vocab : list of str
        Words of the list, passed to ``counter``.
    ratings : numpy.ndarray
        Ratings aligned with ``vocab``; one column per rating dimension.
    name : str or list of str
        Column name for a single rating dimension, or one name per rating
        column when ``ratings`` has several columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the score column(s) added.
    """
    count = counter(df, vocab)
    result = np.asarray(multiply(count, ratings))
    if result.ndim == 1:
        # 1-D ratings give a 1-D result; treat it as a single score column.
        result = result.reshape(-1, 1)
    num_rows, num_cols = result.shape

    if num_cols == 1:
        df[name] = result[:, 0] if isinstance(name, str) else result
    else:
        # Assign raw array columns so values are placed by position. Going
        # through an intermediate DataFrame would align on its 0..n-1 index and
        # produce NaN / shuffled rows whenever df has a different index.
        for i, column in enumerate(name):
            df[column] = result[:, i]
    return df

# psychdf = aggregator(empdf, concrete, "concreteness")
# psychdf

In [9]:
# Small closed-class word lists scored by list_counter().
# They are written in lower case because list_counter() builds its
# CountVectorizer with the default lowercase=True: the text is lower-cased
# before matching, so capitalised entries such as "Not" could never be counted.
negations = ["no", "not", "none", "nobody", "nothing", "neither", "nowhere", "never"]
articles = ["a", "an", "the"]
future = ["will", "gonna"]

def list_counter(df, vocab, name):
    """Add the relative frequency of a small word list as a new column.

    For every row the number of tokens matching ``vocab`` is divided by the
    length of the text: the number of whitespace-separated words for string
    rows, or the number of items for list rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``complete_body`` column. Modified IN PLACE and returned.
    vocab : list of str
        Words to count; matching is case-insensitive.
    name : str
        Name of the column that receives the score.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with column ``name`` added.
    """
    inputtext = []
    total = []
    # Use the frame that was passed in (previously the global `empdf` was read).
    for row in df['complete_body']:
        if isinstance(row, str):
            # Joining a string would separate its characters, and len() would
            # count characters instead of words.
            text = row
            length = len(row.split())
        else:
            text = ' '.join(row)
            length = len(row)
        inputtext.append(text)
        total.append(length)

    # The vectorizer lower-cases the text, so the vocabulary must be lower-case
    # too; dict.fromkeys removes duplicates that lower-casing may create while
    # keeping the original order.
    vocabulary = list(dict.fromkeys(word.lower() for word in vocab))
    # The default token pattern ignores one-letter tokens, which would make the
    # article "a" impossible to count; accept tokens of any length instead.
    vectorizer = CountVectorizer(analyzer="word", ngram_range=(1,1), vocabulary = vocabulary, token_pattern=r"(?u)\b\w+\b")
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    hits = vectors.toarray().sum(axis=1)
    totalvector = np.array(total, dtype=float)
    # Empty texts get a score of 0 instead of a division-by-zero NaN/inf.
    score = np.divide(hits, totalvector, out=np.zeros(len(total), dtype=float), where=totalvector > 0)
    df[name] = score
    return df

## Wrapper

In [10]:
def extract_features(df):
    """Compute all word-list scores and return them attached to a copy of ``df``.

    The steps run in a fixed order (which also fixes the order of the new
    columns): concreteness, happiness, good_curse, the 17 further rating
    dimensions, SER, valence/arousal/dominance, negations, articles, future
    and the two MRC scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``complete_body`` column. It is NOT modified; the work is
        done on a copy so that re-running the cell starts from the same input.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one additional column per score.
    """
    # (progress message, scoring function, arguments after the dataframe)
    steps = [
        ("Count Wordlist Concreteness: \n", aggregator, (concrete, cmatrix, "concreteness")),
        ("Count Wordlist Happiness: \n", aggregator, (happiness, hmatrix, "happiness")),
        ("Count Wordlist Good_Curse: \n", aggregator, (curse, cumatrix, "good_curse")),
        ("Count 17 further wordlists: \n", aggregator, (allsens, allmatrix, ['emotion', 'polarity', 'social', 'moral', 'motionself', 'thought', 'color', 'tastesmell', 'tactile', 'visualform', 'auditory', 'space', 'quantity', 'time', 'CNC', 'IMG', 'FAM'])),
        ("Count Wordlist SER: \n", aggregator, (ser, sermatrix, "SER")),
        ("Count Wordlists Valence, Arousal, Dominance: \n", aggregator, (vad, vadmatrix, ['valence', 'arousal', 'dominance'])),
        ("Count Wordlist Negation: \n", list_counter, (negations, "negations")),
        ("Count Wordlist Articles: \n", list_counter, (articles, "articles")),
        ("Count Wordlist Future: \n", list_counter, (future, "future")),
        ("Count Wordlists from MRC (2): \n", aggregator, (mrc, cpmatrix, ["mrc_cmean", "mrc_pmean"])),
    ]

    # Work on a copy and thread the result through every step. The previous
    # version passed the caller's frame to each helper and only worked because
    # the helpers add their columns in place.
    psychdf = df.copy()
    for message, scorer, args in steps:
        print(message)
        psychdf = scorer(psychdf, *args)

    return psychdf

# Compute every word-list score for the Empath-scored frame and inspect the result.
psychdf = extract_features(empdf)
row_count = len(psychdf)
print("Length of dataframe: ", row_count)
psychdf.info(verbose=True)

Count Wordlist Concreteness: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlist Happiness: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlist Good_Curse: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count 17 further wordlists: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlist SER: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlists Valence, Arousal, Dominance: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlist Negation: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlist Articles: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlist Future: 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Count Wordlists from MRC (2): 

Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Length of dataframe:  422
<class 'pandas.core.frame.DataFrame'>
Int64Index: 422 entries, 0 to 421
Data columns (total 24259 columns):
 #      Column                                                       Dtype  
---     ------                                                       -----  
 0      index                                                        int64  
 1      author                                                       object 
 2      complete_body                                                object 
 3      doc_body                                                     object 
 4      probody                                                      object 
 5      tokens                                                       object 
 6      senttokens                                                   object 
 7      agreeableness                                                float64
 8      openness                                                     float64
 9      conscientio

## Export dataframe

In [11]:
# Persist the frame with all word-list scores for later use.
output_file = "wordlists_author.pkl"
psychdf.to_pickle(output_file)