# Linguistic features version with one row per author

## Import packages

In [30]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import random
random.seed(32)
from lexicalrichness import LexicalRichness
import textblob

import nltk
from nltk.corpus import stopwords
from nltk.util import bigrams, ngrams

import string
from string import punctuation

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from collections import Counter

## Import data (preprocessed)

In [31]:
df = pd.read_pickle("preprocessed_author.pkl")
df.head()
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  422 non-null    int64  
 1   author                 422 non-null    object 
 2   complete_body          422 non-null    object 
 3   doc_body               422 non-null    object 
 4   probody                422 non-null    object 
 5   tokens                 422 non-null    object 
 6   senttokens             422 non-null    object 
 7   agreeableness          422 non-null    float64
 8   openness               422 non-null    float64
 9   conscientiousness      422 non-null    float64
 10  extraversion           422 non-null    float64
 11  neuroticism            422 non-null    float64
 12  agree                  422 non-null    int64  
 13  openn                  422 non-null    int64  
 14  consc                  422 non-null    int64  
 15  extra 

## Features not mentioned in paper

In [32]:
df.head()

Unnamed: 0,index,author,complete_body,doc_body,probody,tokens,senttokens,agreeableness,openness,conscientiousness,...,neuro,all_utc,mean_controversiality,mean_gilded,num_subreddits,subreddit_dist,weekday_dist,month_dist,year_dist,all_lang
0,0,-BigSexy-,Oooh i see,[Oooh i see],[oooh see],"[[oooh, see]]",[[Oooh i see]],39.0,92.0,1.0,...,0,[1510236798],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]",0
1,1,-BlitzN9ne,**Quality** material right here,[**Quality** material right here],[quality material right],"[[quality, material, right]]",[[**Quality** material right here]],50.0,85.0,15.0,...,0,[1549708109],0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1]",0
2,2,-CrestiaBell,A slidewhistle or a meow-meow board That's bec...,"[A slidewhistle or a meow-meow board, That's b...","[slidewhistle meow meow board, watch cartoon s...","[[slidewhistle, meow, meow, board], [watch, ca...","[[A slidewhistle or a meow-meow board], [That'...",50.0,85.0,50.0,...,1,"[1538664591, 1475867279, 1505862626, 151267621...",0.0,0.0,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2]","[0, 2, 3, 1, 1]",0 0 0 0 0 0 0
3,3,-tactical-throw-away,Sorry for your feelings. Kek &lt;------- This ...,"[Sorry for your feelings., Kek &lt;------- Thi...","[sorry feelings, kek lt onekek kek kek kek, no...","[[sorry, feelings], [kek, lt, onekek, kek, kek...","[[Sorry for your feelings.], [Kek &lt;------- ...",2.0,92.0,31.0,...,1,"[1498536785, 1486701409, 1506834463]",0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]","[0, 0, 3, 0, 0]",0 0 0
4,4,137288,Carly's so glad to get your .0000003 cents Exc...,"[Carly's so glad to get your .0000003 cents, E...","[carly glad get three cents, except uk debuted...","[[carly, glad, get, three, cents], [except, uk...","[[Carly's so glad to get your .0000003 cents],...",10.0,87.0,49.0,...,1,"[1536611153, 1550537879, 1516548513, 1523299682]",0.0,0.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 3, 1]",0 0 0 0


In [33]:
def create_features(df):

    # Total number of characters (including space)
    df['char_count'] = df['complete_body'].str.len()

    # Total number of stopwords
    stopwordList = stopwords.words('english')
    df['stopwords'] = df['complete_body'].apply(lambda x: len([x for x in x.split() if x in stopwordList]))

    # Total number of punctuation or special characters
    df['total_punc'] = df['complete_body'].apply(lambda x: len([x for x in x.split() for j in x if j in string.punctuation]))

    # Total number of numerics
    df['total_num'] = df['complete_body'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

    # Total number of uppercase words
    df['total_uppercase'] = df['complete_body'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
    
    return df

## Type-Token Ratio (ttr)

In [34]:
def typetokenratio(df):
    ratiolst = []
    for comment in df['complete_body']:
            lex = LexicalRichness(comment)
            if lex.words == 0:
                ratiolst.append(0)
            else:
                ratio = lex.ttr
                ratiolst.append(ratio)
    df['ttr'] = ratiolst
    return df

## Linguistic processes

In [35]:
# words per sentence
def wordcounter(df):
    lengthscore = []
    for row in df['senttokens']:
        rowscore = []
        for comment in row:
            sentencescore = 0
            for senttoken in comment:
                length = len(senttoken.split())
                sentencescore += length
            sentencescore = sentencescore/len(comment)
        lengthscore.append(sentencescore)
        arr = np.array(lengthscore)
    df['words_per_sent'] = lengthscore
    return df

# words longer than six characters
def charcounter(df):
    charscore = []
    for row in df['tokens']:
        for comment in row:
            rowcharscore = 0
            lencomment = len(comment)
            if lencomment == 0:
                score = 0
            else:
                number = 0
                for token in comment:
                    length = len(token)
                    if length > 5:
                        number+=1
                score = number/lencomment
            rowcharscore += score
        rowcharscore = rowcharscore/len(row)
        charscore.append(rowcharscore)
    df['wordslongersix'] = charscore
    return df

In [36]:
# # POS tagger
# def tagging(df):
#     past = [] #VPA
#     presence = [] #VPR
#     adverbs = [] #RB
#     prepositions = [] #PREP
#     pronouns = [] #PR
#     for row in df['tokens']:
#         row_past = [] #VPA
#         row_presence = [] #VPR
#         row_adverbs = [] #RB
#         row_prepositions = [] #PREP
#         row_pronouns = [] #PR
#         for text in row:
#             tags = nltk.pos_tag(text)
#             counts = Counter(tag for word,tag in tags)
#             total = sum(counts.values())
#             pron = counts['PRP'] + counts['PRP$']
#             verbspr = counts['VB'] + counts['VBG'] + counts['VBP'] + counts['VBZ'] + counts['MD']
#             verbspa = counts['VBD'] + counts['VBN']
#             preps = counts['IN'] + counts['TO']
#             counts['PR'] = pron
#             counts['PREP'] = preps
#             counts['VPR'] = verbspr #present tense
#             counts['VPA'] = verbspa #past tense
#             if total == 0:
#                 allcounts = dict((word, float(count)/1) for word,count in counts.items())
#             else:
#                 allcounts = dict((word, float(count)/total) for word,count in counts.items())
#             try:
#                 row_past.append(allcounts['VPA'])
#             except KeyError:
#                 row_past.append(0)
#             try:
#                 row_presence.append(allcounts['VPR'])
#             except KeyError:
#                 row_presence.append(0)
#             try:
#                 row_adverbs.append(allcounts['RB'])
#             except KeyError:
#                 row_adverbs.append(0)
#             try:
#                 row_prepositions.append(allcounts['PREP'])
#             except KeyError:
#                 row_prepositions.append(0)
#             try:
#                 row_pronouns.append(allcounts['PR'])
#             except KeyError:
#                 row_pronouns.append(0)
#         past.append(row_past) #VPA
#         presence.append(row_presence) #VPR
#         adverbs.append(row_adverbs) #RB
#         prepositions.append(row_prepositions) #PREP
#         pronouns.append(row_pronouns) #PR
#         past_arr = [np.array(lst) for lst in past]
#         presence_arr = [np.array(lst) for lst in presence]
#         adverbs_arr = [np.array(lst) for lst in adverbs]
#         prepositions_arr = [np.array(lst) for lst in prepositions]
#         pronouns_arr = [np.array(lst) for lst in pronouns]
#     df['pasttense'] = past_arr
#     df['presencetense'] = presence_arr
#     df['adverbs'] = adverbs_arr
#     df['prepositions'] = prepositions_arr
#     df['pronouns'] = pronouns_arr
#     return df

# # nltk.help.upenn_tagset('RB')

In [37]:
# POS tagger
def tagging(df):
    past = [] #VPA
    presence = [] #VPR
    adverbs = [] #RB
    prepositions = [] #PREP
    pronouns = [] #PR
    for comment in df['complete_body']:
            text = comment.split()
            tags = nltk.pos_tag(text)
            counts = Counter(tag for word,tag in tags)
            total = sum(counts.values())
            pron = counts['PRP'] + counts['PRP$']
            verbspr = counts['VB'] + counts['VBG'] + counts['VBP'] + counts['VBZ'] + counts['MD']
            verbspa = counts['VBD'] + counts['VBN']
            preps = counts['IN'] + counts['TO']
            counts['PR'] = pron
            counts['PREP'] = preps
            counts['VPR'] = verbspr #present tense
            counts['VPA'] = verbspa #past tense
            if total == 0:
                allcounts = dict((word, float(count)/1) for word,count in counts.items())
            else:
                allcounts = dict((word, float(count)/total) for word,count in counts.items())
            try:
                past.append(allcounts['VPA'])
            except KeyError:
                past.append(0)
            try:
                presence.append(allcounts['VPR'])
            except KeyError:
                presence.append(0)
            try:
                adverbs.append(allcounts['RB'])
            except KeyError:
                adverbs.append(0)
            try:
                prepositions.append(allcounts['PREP'])
            except KeyError:
                prepositions.append(0)
            try:
                pronouns.append(allcounts['PR'])
            except KeyError:
                pronouns.append(0)
    df['pasttense'] = past
    df['presencetense'] = presence
    df['adverbs'] = adverbs
    df['prepositions'] = prepositions
    df['pronouns'] = pronouns
    return df

## Ngrams

In [38]:
def ngrams(df, n_min, n_max, ngramtype):
    # convert input from list to string
    ngrams = []
    inputtext = []
    for row in df['tokens']:
        for comment in row:
            text = ' '.join(comment)
        inputtext.append(text)
    print("Length of inputtext: ", len(inputtext))
    vectorizer = TfidfVectorizer(ngram_range=(n_min,n_max), analyzer=ngramtype)
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(inputtext))
    dense = vectors.todense()
    denselist = dense.tolist()
    print("Length denselist: ", len(denselist))
    print("Get feature names...")
    names = vectorizer.get_feature_names()
    print("Length of feature names: ", len(names))
    print("Create df...")
    ngramdf = pd.DataFrame(denselist, columns=names)
    ngramdf['author'] = df['author']
    return ngramdf

In [39]:
def merge_dfs(df1, df2, df3):
    cwngramsdf = pd.merge(df1, df2, on='author', how='inner', suffixes= (None, "_charngram"))
    gramsdf = pd.merge(df3, cwngramsdf, on='author', how='inner', suffixes= (None, "_ngram"))
    return gramsdf

## Wrapper

In [40]:
def extract_lin_features(df, create_ngrams):
    print("Create additional features...")
    df = create_features(df)
    print("Create ttr...")
    df = typetokenratio(df)
    print("Count words per sentence...")
    df = wordcounter(df)
    print("Count words with more than six letters...")
    df = charcounter(df)
    print("POS-Tagger...")
    df = tagging(df)
    print("number of rows df", len(df))
    
    if create_ngrams == "none":
        return df
    
    elif create_ngrams == "all":
        print("Create word ngrams...")
        wordngramsdf = ngrams(df, 1, 3, "word")
        print("number of rows in wordngramsdf", len(wordngramsdf))
        print("Create char ngrams...")
        charngramsdf = ngrams(df, 2, 3, "char")
        print("number of rows in charngramsdf", len(charngramsdf))
        print("Merge df...")
        gramsdf = merge_dfs(wordngramsdf, charngramsdf, df)
        return gramsdf
    
    elif create_ngrams == "word":
        wordngrams = ngrams(df, 1, 3, 'word')
        wordngramsdf = pd.DataFrame(wordngrams)
        gramsdf = pd.merge(df, wordngramsdf, on='author', how='inner', suffixes=(None, "_ngram"))
        return gramsdf

In [41]:
lin_feat_df = extract_lin_features(df, "none")
print("Length of dataframe: ", len(lin_feat_df))
lin_feat_df.info(verbose=True)

Create additional features...
Create ttr...
Count words per sentence...
Count words with more than six letters...
POS-Tagger...
number of rows df 422
Length of dataframe:  422
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  422 non-null    int64  
 1   author                 422 non-null    object 
 2   complete_body          422 non-null    object 
 3   doc_body               422 non-null    object 
 4   probody                422 non-null    object 
 5   tokens                 422 non-null    object 
 6   senttokens             422 non-null    object 
 7   agreeableness          422 non-null    float64
 8   openness               422 non-null    float64
 9   conscientiousness      422 non-null    float64
 10  extraversion           422 non-null    float64
 11  neuroticism            422 non-null   

In [42]:
lin_feat_df

Unnamed: 0,index,author,complete_body,doc_body,probody,tokens,senttokens,agreeableness,openness,conscientiousness,...,total_num,total_uppercase,ttr,words_per_sent,wordslongersix,pasttense,presencetense,adverbs,prepositions,pronouns
0,0,-BigSexy-,Oooh i see,[Oooh i see],[oooh see],"[[oooh, see]]",[[Oooh i see]],39.0,92.0,1.0,...,0,0,1.000000,3.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000
1,1,-BlitzN9ne,**Quality** material right here,[**Quality** material right here],[quality material right],"[[quality, material, right]]",[[**Quality** material right here]],50.0,85.0,15.0,...,0,0,1.000000,4.000000,0.666667,0.000000,0.000000,0.250000,0.000000,0.000000
2,2,-CrestiaBell,A slidewhistle or a meow-meow board That's bec...,"[A slidewhistle or a meow-meow board, That's b...","[slidewhistle meow meow board, watch cartoon s...","[[slidewhistle, meow, meow, board], [watch, ca...","[[A slidewhistle or a meow-meow board], [That'...",50.0,85.0,50.0,...,1,2,0.866667,9.000000,0.102041,0.080000,0.070000,0.050000,0.100000,0.040000
3,3,-tactical-throw-away,Sorry for your feelings. Kek &lt;------- This ...,"[Sorry for your feelings., Kek &lt;------- Thi...","[sorry feelings, kek lt onekek kek kek kek, no...","[[sorry, feelings], [kek, lt, onekek, kek, kek...","[[Sorry for your feelings.], [Kek &lt;------- ...",2.0,92.0,31.0,...,0,0,0.800000,4.000000,0.166667,0.000000,0.000000,0.000000,0.133333,0.066667
4,4,137288,Carly's so glad to get your .0000003 cents Exc...,"[Carly's so glad to get your .0000003 cents, E...","[carly glad get three cents, except uk debuted...","[[carly, glad, get, three, cents], [except, uk...","[[Carly's so glad to get your .0000003 cents],...",10.0,87.0,49.0,...,0,1,0.906250,9.000000,0.150000,0.062500,0.125000,0.062500,0.187500,0.093750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,424,xanthraxoid,I'd really like this video to include some inf...,[I'd really like this video to include some in...,[would really like video include information c...,"[[would, really, like, video, include, informa...",[[I'd really like this video to include some i...,86.0,45.0,8.0,...,0,8,0.688776,29.000000,0.102941,0.016129,0.198925,0.043011,0.145161,0.048387
418,425,xenomouse,"You're a guy, aren't you? I can definitely see...","[You're a guy, aren't you? I can definitely se...",[guy not definitely see would make boy scene f...,"[[guy, not, definitely, see, would, make, boy,...","[[You're a guy, aren't you?, I can definitely ...",26.0,93.0,49.0,...,0,12,0.617512,11.250000,0.022727,0.021277,0.196809,0.058511,0.138298,0.132979
419,426,xeroctr3,man even the thought of it makes me depressed....,[man even the thought of it makes me depressed...,[man even thought makes depressed loving someo...,"[[man, even, thought, makes, depressed, loving...",[[man even the thought of it makes me depresse...,3.0,75.0,27.0,...,0,4,0.723404,6.714286,0.346154,0.000000,0.340426,0.063830,0.127660,0.127660
420,427,xzack18,Not all of us are out to kill,[Not all of us are out to kill],[not us kill],"[[not, us, kill]]",[[Not all of us are out to kill]],4.0,19.0,11.0,...,0,0,1.000000,8.000000,0.000000,0.000000,0.250000,0.125000,0.375000,0.125000


In [43]:
lin_feat_df.to_pickle("linguistic_features_author.pkl")

In [44]:
lin_ngrams_df = extract_lin_features(df, "all")

print("Length of dataframe: ", len(lin_ngrams_df))
lin_ngrams_df.info(verbose=True)

Create additional features...
Create ttr...
Count words per sentence...
Count words with more than six letters...
POS-Tagger...
number of rows df 422
Create word ngrams...
Length of inputtext:  422
Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Length denselist:  422
Get feature names...
Length of feature names:  18987
Create df...
number of rows in wordngramsdf 422
Create char ngrams...
Length of inputtext:  422
Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Length denselist:  422
Get feature names...
Length of feature names:  5172
Create df...
number of rows in charngramsdf 422
Merge df...
Length of dataframe:  422
<class 'pandas.core.frame.DataFrame'>
Int64Index: 422 entries, 0 to 421
Data columns (total 24198 columns):
 #      Column                                                       Dtype  
---     ------                                                       -----  
 0      index                                                        int64  
 1      author                                                       object 
 2      complete_body                                                object 
 3      doc_body                                                     object 
 4      probody                                                      object 
 5      tokens                                                       object 
 6      senttokens                                                   object 
 7      agreeableness                 

In [45]:
lin_ngrams_df.to_pickle("linguistic_ngrams_author.pkl")

In [46]:
lin_wordngrams_df = extract_lin_features(df, "word")
print("Length of dataframe: ", len(lin_wordngrams_df))

Create additional features...
Create ttr...
Count words per sentence...
Count words with more than six letters...
POS-Tagger...
number of rows df 422
Length of inputtext:  422
Vectorize...


  0%|          | 0/422 [00:00<?, ?it/s]

Length denselist:  422
Get feature names...
Length of feature names:  18987
Create df...
Length of dataframe:  422


In [47]:
lin_wordngrams_df.to_pickle("linguistic_wordngrams_author.pkl")