## Language Analysis of Alexithymic Discourse

<hr>

Alexithymic Language Project / raul@psicobotica.com / V2 release (sept 2020)

<hr>

### NLP Feature Engineering

- Pre-processed Dataset load (already tokenized, stemmed, POS, NER, Lex, Dep parsed). 
- Cuantitative variables: counts (chars, words, sentences, punctuation, etc.)
- Cuantitative variables: average lengths.
- Cuantitative variables: POS frequencies.
- Cuantitative variables: diversity scores (HTR, TTR).

### Pre-Processed Dataset Load

In [1]:
import pandas as pd 

In [2]:
processed_dataset_path = "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/data/Prolexitim_v2_processed_dep.csv"

alex_df = pd.read_csv(processed_dataset_path, header=0, delimiter=";")

In [3]:
alex_df.Code.count()

381

In [97]:
alex_df.columns

Index(['Code', 'TAS20', 'F1', 'F2', 'F3', 'Gender', 'Age', 'Card',
       'T_Metaphors', 'T_ToM', 'T_FP', 'T_Interpret', 'T_Desc', 'T_Confussion',
       'Text', 'Alex_A', 'Alex_B', 'Words', 'Sentences', 'Tokens',
       'Tokens_Stop', 'Tokens_Stem', 'POS', 'NER', 'DEP', 'Chars', 'avgWL',
       'avgSL'],
      dtype='object')

In [4]:
alex_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,Alex_A,Alex_B,Words,Sentences,Tokens,Tokens_Stop,Tokens_Stem,POS,NER,DEP
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0,0,16,2,"['es', 'un', 'niño', 'pensando', 'en', 'cual',...","['niño', 'pensando', 'respuesta', 'deberes', '...","['niño', 'pensando', 'respuesta', 'deber', 'sa...","[('es', 'AUX'), ('un', 'DET'), ('niño', 'NOUN'...","[('es', 'O'), ('un', 'O'), ('niño', 'O'), ('pe...","[[(('niño', 'NOUN'), 'cop', ('es', 'AUX')), ((..."
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0,0,7,2,"['hombre', 'llorando', 'porque', 'su', 'mujer'...","['hombre', 'llorando', 'mujer', 'muerto']","['hombr', 'llorando', 'mujer', 'muerto']","[('hombre', 'NOUN'), ('llorando', 'VERB'), ('p...","[('hombre', 'O'), ('llorando', 'O'), ('porque'...","[[(('llorando', 'VERB'), 'nsubj', ('hombre', '..."
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0,0,29,4,"['un', 'niño', 'cansado', 'de', 'estudiar', 'y...","['niño', 'cansado', 'estudiar', 'presionado', ...","['niño', 'cansado', 'estudiar', 'presionado', ...","[('un', 'DET'), ('niño', 'NOUN'), ('cansado', ...","[('un', 'O'), ('niño', 'O'), ('cansado', 'O'),...","[[(('niño', 'NOUN'), 'det', ('un', 'DET')), ((..."
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0,0,24,3,"['grupo', 'de', 'amigos', 'después', 'de', 'un...","['grupo', 'amigos', 'después', 'noche', 'diver...","['grupo', 'amigo', 'despué', 'noch', 'diversió...","[('grupo', 'NOUN'), ('de', 'ADP'), ('amigos', ...","[('grupo', 'O'), ('de', 'O'), ('amigos', 'O'),...","[[(('grupo', 'NOUN'), 'nmod', ('amigos', 'NOUN..."
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0,0,10,2,"['hombre', 'desolado', 'porque', 'se', 'ha', '...","['hombre', 'desolado', 'encontrado', 'mujer', ...","['hombr', 'desolado', 'encontrado', 'mujer', '...","[('hombre', 'NOUN'), ('desolado', 'ADJ'), ('po...","[('hombre', 'O'), ('desolado', 'O'), ('porque'...","[[(('hombre', 'NOUN'), 'amod', ('desolado', 'A..."


# Cuantitative NLP features

## Counts

### Char count (Chars)

In [5]:
alex_df['Chars'] = alex_df.Text.apply(lambda x: sum(len(word) for word in str(x).split(" ")))

In [6]:
alex_df.Chars.describe()

count    381.000000
mean     158.706037
std      118.689279
min       22.000000
25%       78.000000
50%      123.000000
75%      203.000000
max      744.000000
Name: Chars, dtype: float64

### Average word length (avgWL)

In [8]:
alex_df['avgWL'] = alex_df.Chars / alex_df.Words

In [9]:
alex_df.avgWL.describe()

count    381.000000
mean       4.555929
std        0.482036
min        3.388889
25%        4.225806
50%        4.545455
75%        4.831169
max        6.000000
Name: avgWL, dtype: float64

### Average sentence length (avgSL)

In [11]:
alex_df['avgSL'] = alex_df.Words / alex_df.Sentences

In [12]:
alex_df.avgSL.describe()

count    381.000000
mean      11.390465
std        5.762169
min        2.500000
25%        7.333333
50%       10.500000
75%       14.500000
max       51.333333
Name: avgSL, dtype: float64

### Puctuation Count (Pun_Count)

In [104]:
import string

In [110]:
count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))

In [115]:
alex_df['Pun_Count'] = alex_df.Text.apply(lambda text: count(text,set(string.punctuation))) 

In [116]:
alex_df.Pun_Count.describe()

count    381.000000
mean       3.761155
std        3.523330
min        0.000000
25%        1.000000
50%        3.000000
75%        5.000000
max       31.000000
Name: Pun_Count, dtype: float64

## PoS Counts

### Number of POS tags
VERB_Count<br>
NOUN_Count<br>
SYM_Count<br>
ADV_Count	<br>
PUNCT_Count<br>
INTJ_Count<br>
CCONJ_Count<br>
ADJ_Count<br>
AUX_Count<br>
DET_Count<br>
SCONJ_Count<br>
PRON_Count<br>
NUM_Count<br>
PROPN_Count<br>
ADP_Count<br>

In [27]:
import ast

# Get list of possible POS tags 
POS_tags = set()

for str_posList in alex_df.POS:
    posList = ast.literal_eval(str_posList)
    for posPair in posList:
        POS_tags.add(posPair[1])
# POS_tags

In [39]:
# Create df with POS tag counts
cNames = [] # Count column name
tNames = [] # Original tag name

for tag in POS_tags:
    cNames.append(tag+'_Count')
    tNames.append(tag)
    
POS_Counts_df = df = pd.DataFrame(columns = cNames)
POS_Counts_df

Unnamed: 0,VERB_Count,NOUN_Count,SYM_Count,ADV_Count,PUNCT_Count,INTJ_Count,CCONJ_Count,ADJ_Count,AUX_Count,DET_Count,SCONJ_Count,PRON_Count,NUM_Count,PROPN_Count,ADP_Count


In [43]:
# for each exemplar, for each POS tag list, for each tag: count

for str_posList in alex_df.POS:
    posList = ast.literal_eval(str_posList)
    counts = []
    for tag in tNames:
        tagCount = 0        
        for posPair in posList:
            if posPair[1] == tag:
                tagCount += 1
        counts.append(tagCount)
    row = pd.Series(counts, index=POS_Counts_df.columns)
    POS_Counts_df = POS_Counts_df.append(row, ignore_index=True)


In [47]:
POS_Counts_df.head()

Unnamed: 0,VERB_Count,NOUN_Count,SYM_Count,ADV_Count,PUNCT_Count,INTJ_Count,CCONJ_Count,ADJ_Count,AUX_Count,DET_Count,SCONJ_Count,PRON_Count,NUM_Count,PROPN_Count,ADP_Count
0,2,3,0,1,1,0,0,0,2,3,1,2,0,0,2
1,2,2,0,0,1,0,0,0,1,1,1,0,0,0,0
2,5,2,0,0,3,0,2,5,1,3,3,5,0,0,3
3,1,5,0,1,2,0,1,2,3,2,1,3,0,0,5
4,1,2,0,0,1,0,0,2,1,1,1,1,0,0,1


In [117]:
# Concat to the main df (append new columns)
POS_Counts_df = POS_Counts_df.reset_index(drop=True)
alex_df = alex_df.reset_index(drop=True)

feats_df = pd.concat([alex_df,POS_Counts_df],axis=1)

In [118]:
feats_df.columns

Index(['Code', 'TAS20', 'F1', 'F2', 'F3', 'Gender', 'Age', 'Card',
       'T_Metaphors', 'T_ToM', 'T_FP', 'T_Interpret', 'T_Desc', 'T_Confussion',
       'Text', 'Alex_A', 'Alex_B', 'Words', 'Sentences', 'Tokens',
       'Tokens_Stop', 'Tokens_Stem', 'POS', 'NER', 'DEP', 'Chars', 'avgWL',
       'avgSL', 'Pun_Count', 'VERB_Count', 'NOUN_Count', 'SYM_Count',
       'ADV_Count', 'PUNCT_Count', 'INTJ_Count', 'CCONJ_Count', 'ADJ_Count',
       'AUX_Count', 'DET_Count', 'SCONJ_Count', 'PRON_Count', 'NUM_Count',
       'PROPN_Count', 'ADP_Count'],
      dtype='object')

###  Relative frequency of POS 
Proportion of each POS relative to the total number of tokens

In [119]:
for tag in tNames:
    feats_df[tag+'_Ratio'] = feats_df.apply(lambda row: row[tag+'_Count'] / row['Words'], axis=1)

In [120]:
feats_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,INTJ_Ratio,CCONJ_Ratio,ADJ_Ratio,AUX_Ratio,DET_Ratio,SCONJ_Ratio,PRON_Ratio,NUM_Ratio,PROPN_Ratio,ADP_Ratio
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.0,0.0,0.0,0.125,0.1875,0.0625,0.125,0.0,0.0,0.125
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.0,0.068966,0.172414,0.034483,0.103448,0.103448,0.172414,0.0,0.0,0.103448
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.0,0.041667,0.083333,0.125,0.083333,0.041667,0.125,0.0,0.0,0.208333
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.0,0.0,0.2,0.1,0.1,0.1,0.1,0.0,0.0,0.1


## Lexical Diversity Scores
https://en.wikipedia.org/wiki/Lexical_diversity

### Type Token Ratio (TTR)
Calculates the overall complexity of the text based on the total number of unique word types used in the text.<br>
The total number of unique words divided by the total number of words in order to give the text a score from 0 to 1.<br>
This is a very text length sensible score. Not really significant for comparison if texts are not of the same length. 


In [121]:
def numberOfTypes( row ):
    num = 0
    for typeToken in cNames: 
        if row[typeToken] > 0: 
            num += 1
    return num

In [122]:
feats_df['TTR'] = feats_df.apply( lambda row: numberOfTypes(row) / len(ast.literal_eval(row.Tokens)), axis=1)

In [123]:
feats_df.TTR.describe()

count    381.000000
mean       0.377069
std        0.193790
min        0.071429
25%        0.229167
50%        0.344828
75%        0.478261
max        1.000000
Name: TTR, dtype: float64

### Hapax legomena/Token ratio (HTR)
The number of words that occur only once divided by the number of total words.

In [124]:
feats_df['HTR'] = feats_df.apply(
    lambda row: len(set(ast.literal_eval(row.Tokens))) / len(ast.literal_eval(row.Tokens)), axis=1)

In [125]:
feats_df.HTR.describe()

count    381.000000
mean       0.855941
std        0.102061
min        0.512000
25%        0.787234
50%        0.864865
75%        0.933333
max        1.000000
Name: HTR, dtype: float64

In [126]:
feats_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,ADJ_Ratio,AUX_Ratio,DET_Ratio,SCONJ_Ratio,PRON_Ratio,NUM_Ratio,PROPN_Ratio,ADP_Ratio,TTR,HTR
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.0,0.125,0.1875,0.0625,0.125,0.0,0.0,0.125,0.5625,0.875
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.857143,1.0
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.172414,0.034483,0.103448,0.103448,0.172414,0.0,0.0,0.103448,0.344828,0.793103
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.083333,0.125,0.083333,0.041667,0.125,0.0,0.0,0.208333,0.458333,0.875
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.2,0.1,0.1,0.1,0.1,0.0,0.0,0.1,0.9,1.0


## Save features df

In [127]:
features_dataset_path = "D:\\Dropbox-Array2001\\Dropbox\\DataSets\\Prolexitim-Dataset\\Prolexitim_v2_features.csv"
feats_df.to_csv(features_dataset_path, sep=';', encoding='utf-8', index=False)