# Linguistic features

## Import packages

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import random
random.seed(32)
from lexicalrichness import LexicalRichness
import textblob

import nltk
from nltk.corpus import stopwords
from nltk.util import bigrams, ngrams

import string
from string import punctuation

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from collections import Counter

## Import data (preprocessed)

In [2]:
# Load the preprocessed data frame from a pickle file in the working directory.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df = pd.read_pickle("preprocessed.pkl")
# NOTE(review): this is not the last expression of the cell, so the preview is not rendered.
df.head()
# Print column names, non-null counts and dtypes.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 975 entries, 0 to 974
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 975 non-null    int64  
 1   author                975 non-null    object 
 2   ident                 975 non-null    object 
 3   body                  975 non-null    object 
 4   probody               975 non-null    object 
 5   tokens                975 non-null    object 
 6   senttokens            975 non-null    object 
 7   agreeableness         975 non-null    float64
 8   openness              975 non-null    float64
 9   conscientiousness     975 non-null    float64
 10  extraversion          975 non-null    float64
 11  neuroticism           975 non-null    float64
 12  agree                 975 non-null    int64  
 13  openn                 975 non-null    int64  
 14  consc                 975 non-null    int64  
 15  extra                 9

## Features not mentioned in the paper

In [3]:
def create_features(workdata):
    """Add simple surface-level count features computed from the raw ``body`` text.

    The following columns are added to ``workdata`` in place:

    * ``char_count``      -- number of characters in ``body`` (spaces included)
    * ``stopwords``       -- number of whitespace-separated tokens found in the
      NLTK English stopword list
    * ``total_punc``      -- number of characters that occur in ``string.punctuation``
    * ``total_num``       -- number of whitespace-separated tokens for which
      ``str.isdigit()`` is true
    * ``total_uppercase`` -- number of whitespace-separated tokens for which
      ``str.isupper()`` is true

    Parameters
    ----------
    workdata : pandas.DataFrame
        Frame with a string column ``body``.

    Returns
    -------
    pandas.DataFrame
        The same ``workdata`` object, with the new columns added.
    """
    bodies = workdata['body']

    # Total number of characters (including space)
    workdata['char_count'] = bodies.str.len()

    # Total number of stopwords.
    # A set gives constant-time membership tests; the original scanned the
    # whole stopword list once per token.
    # NOTE(review): the comparison is case-sensitive; if the NLTK list is all
    # lowercase (TODO confirm), capitalised stopwords such as "The" are not counted.
    stopword_set = set(stopwords.words('english'))
    workdata['stopwords'] = bodies.apply(
        lambda text: sum(1 for token in text.split() if token in stopword_set)
    )

    # Total number of punctuation or special characters.
    # Whitespace is never part of string.punctuation, so scanning the text
    # directly gives the same count as scanning its whitespace-split tokens.
    punctuation_chars = set(string.punctuation)
    workdata['total_punc'] = bodies.apply(
        lambda text: sum(1 for char in text if char in punctuation_chars)
    )

    # Total number of numerics (tokens made up of digits only)
    workdata['total_num'] = bodies.apply(
        lambda text: sum(1 for token in text.split() if token.isdigit())
    )

    # Total number of uppercase words
    workdata['total_uppercase'] = bodies.apply(
        lambda text: sum(1 for token in text.split() if token.isupper())
    )

    return workdata

## Type-Token Ratio (ttr)

In [4]:
def typetokenratio(df):
    """Add a ``ttr`` column holding the type-token ratio of every ``body`` text.

    The ratio is taken from ``LexicalRichness(text).ttr``. Texts for which
    ``LexicalRichness`` reports zero words get a ratio of 0 instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``body``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``ttr`` column added.
    """
    def _ratio(text):
        richness = LexicalRichness(text)
        # texts without any words are scored 0 rather than queried for a TTR
        return 0 if richness.words == 0 else richness.ttr

    df['ttr'] = [_ratio(body) for body in df['body']]
    return df

## Linguistic processes

In [5]:
# words per sentence
def wordcounter(df):
    """Add a ``words_per_sent`` column: the mean number of words per sentence.

    Each entry of ``df['senttokens']`` is expected to be a sequence of sentence
    strings. Words are counted by splitting every sentence on whitespace, and
    the total is divided by the number of sentences. (The previous
    implementation stored the total word count without dividing, which did not
    match the column name.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``senttokens`` column of sentence lists.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``words_per_sent`` column added.
        Rows without any sentence get the value 0.
    """
    averages = []
    for sentences in df['senttokens']:
        n_sentences = len(sentences)
        if n_sentences == 0:
            # no sentences: avoid dividing by zero
            averages.append(0)
            continue
        n_words = sum(len(sentence.split()) for sentence in sentences)
        averages.append(n_words / n_sentences)
    df['words_per_sent'] = averages
    return df

# words longer than six characters
def charcounter(df, threshold=6):
    """Add a ``wordslongersix`` column: the share of tokens longer than ``threshold`` characters.

    The previous implementation used ``len(token) > 5``, which also counted
    six-character tokens and so contradicted "longer than six characters".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tokens`` column holding a sequence of token strings per row.
    threshold : int, optional
        A token counts as long when ``len(token) > threshold``. Defaults to 6.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``wordslongersix`` column added.
        Rows without tokens get the value 0.
    """
    shares = []
    for tokens in df['tokens']:
        n_tokens = len(tokens)
        if n_tokens == 0:
            # no tokens: avoid dividing by zero
            shares.append(0)
            continue
        n_long = sum(1 for token in tokens if len(token) > threshold)
        shares.append(n_long / n_tokens)
    df['wordslongersix'] = shares
    return df

In [6]:
# POS tagger
def tagging(df):
    """Add relative part-of-speech frequencies computed from ``df['tokens']``.

    Every token list is tagged with ``nltk.pos_tag``. The tag counts are
    divided by the number of tagged tokens (or by 1 when there are none), and
    five values are stored per row:

    * ``pasttense``     -- VBD + VBN
    * ``presencetense`` -- VB + VBG + VBP + VBZ + MD
    * ``adverbs``       -- RB
    * ``prepositions``  -- IN + TO
    * ``pronouns``      -- PRP + PRP$

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tokens`` column holding a list of token strings per row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the five columns above added.
    """
    # output column -> key looked up in the per-text frequency table
    column_to_key = {
        'pasttense': 'VPA',
        'presencetense': 'VPR',
        'adverbs': 'RB',
        'prepositions': 'PREP',
        'pronouns': 'PR',
    }
    collected = {column: [] for column in column_to_key}

    for tokens in df['tokens']:
        tag_counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
        # total is taken before the aggregate keys below are added
        n_tagged = sum(tag_counts.values())

        tag_counts['PR'] = tag_counts['PRP'] + tag_counts['PRP$']
        tag_counts['PREP'] = tag_counts['IN'] + tag_counts['TO']
        # present tense
        tag_counts['VPR'] = (tag_counts['VB'] + tag_counts['VBG'] + tag_counts['VBP']
                             + tag_counts['VBZ'] + tag_counts['MD'])
        # past tense
        tag_counts['VPA'] = tag_counts['VBD'] + tag_counts['VBN']

        denominator = n_tagged if n_tagged != 0 else 1
        frequencies = {tag: float(count) / denominator for tag, count in tag_counts.items()}

        for column, key in column_to_key.items():
            # a tag that never occurred in this text contributes 0
            collected[column].append(frequencies.get(key, 0))

    for column, values in collected.items():
        df[column] = values
    return df

# nltk.help.upenn_tagset('RB')

## Ngrams

In [7]:
def ngrams(df, n_min, n_max, ngramtype, prefix=''):
    """Build a TF-IDF n-gram feature frame from the ``tokens`` column.

    NOTE: this definition shadows ``ngrams`` imported from ``nltk.util`` at the
    top of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tokens`` column (list of token strings per row) and an
        ``ident`` column.
    n_min, n_max : int
        Lower and upper bound passed to ``TfidfVectorizer`` as ``ngram_range``.
    ngramtype : str
        Passed to ``TfidfVectorizer`` as ``analyzer`` (the notebook uses
        ``'word'`` and ``'char'``).
    prefix : str, optional
        Text prepended to every feature column name. With the default empty
        string the names are the raw n-grams, which can clash with metadata
        columns or with the columns of another n-gram frame; pandas then adds
        ``_x``/``_y`` suffixes when the frames are merged.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same index), one column per n-gram holding
        its TF-IDF weight, plus an ``ident`` column copied from ``df``.
        An n-gram column that is itself named ``ident`` is overwritten.
    """
    # TfidfVectorizer expects one string per document, so re-join the token lists
    documents = [' '.join(tokens) for tokens in df['tokens']]

    vectorizer = TfidfVectorizer(ngram_range=(n_min, n_max), analyzer=ngramtype)
    print("Vectorize...")
    vectors = vectorizer.fit_transform(tqdm(documents))

    print("Get feature names...")
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # fall back to it only on versions that lack the replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = list(vectorizer.get_feature_names_out())
    else:
        names = vectorizer.get_feature_names()
    if prefix:
        names = [prefix + name for name in names]

    print("Create df...")
    # Build the frame straight from the dense array (no intermediate list of
    # lists) and reuse df's index so rows stay aligned with the input frame.
    ngramdf = pd.DataFrame(vectors.toarray(), columns=names, index=df.index)
    # positional copy: does not depend on index alignment
    ngramdf['ident'] = df['ident'].values
    print("Done")
    return ngramdf

# stemmeddf['wordngrams'] = ngrams(stemmeddf, 1, 3, 'word')
# stemmeddf['charngrams'] = ngrams(stemmeddf, 2, 3, 'char')
# stemmeddf.head()

# Word-level TF-IDF n-grams (n = 1..3), followed by a NaN and row-count check
wordngramsdf = ngrams(df, n_min=1, n_max=3, ngramtype='word')
print("NaN in wordngramsdf: ", wordngramsdf.isna().any().any())
print("Number of rows in wordngramsdf: ", wordngramsdf.shape[0])

# Character-level TF-IDF n-grams (n = 2..3), with the same checks
charngramsdf = ngrams(df, n_min=2, n_max=3, ngramtype='char')
print("NaN in charngramsdf: ", charngramsdf.isna().any().any())
print("Number of rows in charngramsdf: ", charngramsdf.shape[0])

def merge_dfs(df1, df2, df3):
    """Inner-join three frames on their ``ident`` column.

    ``df1`` and ``df2`` are joined first; the result is then joined onto
    ``df3``, so the columns of ``df3`` come first in the output. Only ``ident``
    values present in all three frames are kept.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    combined_grams = df1.merge(df2, on='ident', how='inner')
    return df3.merge(combined_grams, on='ident', how='inner')

# newdf = merge_dfs(wordngramsdf, charngramsdf, df)
# print("number of rows in newdf: ", len(newdf))
# print("number of rows in newdf.author: ", len(newdf.author))
# print(newdf.author)
# print(newdf.index)

# df.index
# display(df)

# import collections

# list_1 = df.ident.tolist()
# list_2 = wordngramsdf.ident.tolist()
# list_3 = charngramsdf.ident.tolist()
# list_4 = newdf.ident.tolist()
# collections.Counter(list_1) == collections.Counter(list_2) == collections.Counter(list_3) == collections.Counter(list_4)
# print(list_4)
# gramsdf = merge_dfs(wordngramsdf, charngramsdf, stemmeddf)
# gramsdf.head()
# cwngramsdf = ngrams(wordngramsdf, 2, 3, 'char')
# print(cwngramsdf.isnull().any().any())
# cwngramsdf.head()

Vectorize...


  0%|          | 0/975 [00:00<?, ?it/s]

Get feature names...
Create df...
Done
NaN in wordngramsdf:  False
Number of rows in wordngramsdf:  975
Vectorize...


  0%|          | 0/975 [00:00<?, ?it/s]

Get feature names...
Create df...
Done
NaN in charngramsdf:  False
Number of rows in charngramsdf:  975


## Wrapper

In [8]:
def extract_lin_features(df):
    """Run all linguistic feature extractors on ``df`` and return the result.

    The steps are applied in this order: ``create_features``,
    ``typetokenratio``, ``wordcounter``, ``charcounter``, ``tagging``. Each
    step receives the frame returned by the previous one.
    """
    pipeline = (create_features, typetokenratio, wordcounter, charcounter, tagging)
    for step in pipeline:
        df = step(df)
    return df

In [9]:
# Compute the linguistic features and inspect the resulting frame
lin_feat_df = extract_lin_features(df)

print("Length of dataframe: ", lin_feat_df.shape[0])
lin_feat_df.info(verbose=True)


Length of dataframe:  975
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 975 entries, 0 to 974
Data columns (total 45 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 975 non-null    int64  
 1   author                975 non-null    object 
 2   ident                 975 non-null    object 
 3   body                  975 non-null    object 
 4   probody               975 non-null    object 
 5   tokens                975 non-null    object 
 6   senttokens            975 non-null    object 
 7   agreeableness         975 non-null    float64
 8   openness              975 non-null    float64
 9   conscientiousness     975 non-null    float64
 10  extraversion          975 non-null    float64
 11  neuroticism           975 non-null    float64
 12  agree                 975 non-null    int64  
 13  openn                 975 non-null    int64  
 14  consc                 975 non-null    int64  
 1

In [10]:
# Write the linguistic-feature frame to disk as a pickle file.
lin_feat_df.to_pickle("linguistic_features.pkl")

In [11]:
def extract_lin_ngrams(df):
    """Compute linguistic features plus word and character TF-IDF n-grams.

    The linguistic features are added to ``df`` first. Word n-grams (n = 1..3)
    and character n-grams (n = 2..3) are then built and inner-joined with the
    feature frame on ``ident``. Columns whose names occur in more than one of
    the joined frames come back with pandas' ``_x``/``_y`` suffixes.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the word and character n-gram columns.
    """
    features = extract_lin_features(df)
    word_grams = ngrams(features, 1, 3, 'word')
    char_grams = ngrams(features, 2, 3, 'char')
    return merge_dfs(word_grams, char_grams, features)

# Compute features plus word/char n-grams and inspect the resulting frame
lin_ngrams_df = extract_lin_ngrams(df)

print("Length of dataframe: ", lin_ngrams_df.shape[0])
lin_ngrams_df.info(verbose=True)

Vectorize...


  0%|          | 0/975 [00:00<?, ?it/s]

Get feature names...
Create df...
Done
Vectorize...


  0%|          | 0/975 [00:00<?, ?it/s]

Get feature names...
Create df...
Done
Length of dataframe:  975
<class 'pandas.core.frame.DataFrame'>
Int64Index: 975 entries, 0 to 974
Data columns (total 33444 columns):
 #      Column                                     Dtype  
---     ------                                     -----  
 0      index                                      int64  
 1      author_x                                   object 
 2      ident                                      object 
 3      body                                       object 
 4      probody                                    object 
 5      tokens                                     object 
 6      senttokens                                 object 
 7      agreeableness                              float64
 8      openness                                   float64
 9      conscientiousness                          float64
 10     extraversion                               float64
 11     neuroticism                                float64
 

In [12]:
# Write the combined feature + n-gram frame to disk as a pickle file.
lin_ngrams_df.to_pickle("linguistic_ngrams.pkl")

In [13]:
def extract_lin_wordngrams(df):
    """Compute linguistic features plus word-level TF-IDF n-grams (n = 1..3).

    The linguistic features are added to ``df`` first; the word n-gram frame is
    then outer-joined onto it on ``ident``.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the word n-gram columns.
    """
    features = extract_lin_features(df)
    # ngrams() already returns a DataFrame, so no further wrapping is needed
    word_grams = ngrams(features, 1, 3, 'word')
    return features.merge(word_grams, on='ident', how='outer')

# Compute features plus word n-grams and report the number of rows
lin_wordngrams_df = extract_lin_wordngrams(df)
print("Length of dataframe: ", lin_wordngrams_df.shape[0])

Vectorize...


  0%|          | 0/975 [00:00<?, ?it/s]

Get feature names...
Create df...
Done
Length of dataframe:  975


In [14]:
# Write the feature + word n-gram frame to disk as a pickle file.
lin_wordngrams_df.to_pickle("linguistic_wordngrams.pkl")