## Language Analysis of Alexithymic Discourse

<hr>

Alexithymic Language Project / raul@psicobotica.com / V2 release (sept 2020)

<hr>

### Bag of Words Model

- Compute word frequencies

## Load features dataset
- Data is already pre-processed (1-Preprocessing). 
- Basic NLP features are already calculated (2-Features).  

In [1]:
import pandas as pd 

In [2]:
feats_dataset_path = "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/data/Prolexitim_v2_features.csv"

alex_df = pd.read_csv(feats_dataset_path, header=0, delimiter=";")

In [3]:
alex_df.columns

Index(['Code', 'TAS20', 'F1', 'F2', 'F3', 'Gender', 'Age', 'Card',
       'T_Metaphors', 'T_ToM', 'T_FP', 'T_Interpret', 'T_Desc', 'T_Confussion',
       'Text', 'Alex_A', 'Alex_B', 'Words', 'Sentences', 'Tokens',
       'Tokens_Stop', 'Tokens_Stem_P', 'Tokens_Stem_S', 'POS', 'NER', 'DEP',
       'Lemmas_CNLP', 'Lemmas_Spacy', 'Chars', 'avgWL', 'avgSL', 'Pun_Count',
       'Stop_Count', 'RawTokens', 'Title_Count', 'Upper_Count', 'PRON_Count',
       'DET_Count', 'ADV_Count', 'VERB_Count', 'PROPN_Count', 'NOUN_Count',
       'NUM_Count', 'PUNCT_Count', 'SYM_Count', 'SCONJ_Count', 'CCONJ_Count',
       'INTJ_Count', 'AUX_Count', 'ADP_Count', 'ADJ_Count', 'PRON_Ratio',
       'DET_Ratio', 'ADV_Ratio', 'VERB_Ratio', 'PROPN_Ratio', 'NOUN_Ratio',
       'NUM_Ratio', 'PUNCT_Ratio', 'SYM_Ratio', 'SCONJ_Ratio', 'CCONJ_Ratio',
       'INTJ_Ratio', 'AUX_Ratio', 'ADP_Ratio', 'ADJ_Ratio', 'TTR', 'HTR'],
      dtype='object')

In [4]:
alex_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,PUNCT_Ratio,SYM_Ratio,SCONJ_Ratio,CCONJ_Ratio,INTJ_Ratio,AUX_Ratio,ADP_Ratio,ADJ_Ratio,TTR,HTR
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.0625,0.0,0.0625,0.0,0.0,0.125,0.125,0.0,0.5625,0.875
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.142857,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.857143,1.0
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.103448,0.0,0.103448,0.068966,0.0,0.034483,0.103448,0.172414,0.344828,0.793103
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.083333,0.0,0.041667,0.041667,0.0,0.125,0.208333,0.083333,0.458333,0.875
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.1,0.0,0.1,0.0,0.0,0.1,0.1,0.2,0.9,1.0


## Create standard BoW Model

In [8]:
import ast

In [15]:
# Word Frequency
word_Freq = {}

# for each sentence in corpus, update word frequency dictionary
for tokens in alex_df.Tokens_Stop:
    token_list = ast.literal_eval(tokens)
    for token in token_list:
        if token not in word_Freq.keys():
            word_Freq[token] = 1
        else:
            word_Freq[token] += 1

In [27]:
print("Frequency of word 'niño' is %d." % word_Freq.get("niño"))
print("Frequency of word 'niña' is %d." % word_Freq.get("niña"))

Frequency of word 'niño' is 85.
Frequency of word 'niña' is 2.


In [28]:
print("We have %d different words in our corpus." % (len(word_Freq)))

We have 2710 different words in our corpus.


In [29]:
#Let's get the 400 most frequent words to build document vectors
import heapq

most_Freq_Words = heapq.nlargest(400, word_Freq, key=word_Freq.get)

In [59]:
print(heapq.nlargest(20, word_Freq, key=word_Freq.get))

['niño', 'violín', 'hombre', 'día', 'tocar', 'mujer', 'ser', 'padres', 'tras', 'casa', 'después', 'trabajo', 'grupo', 'hacer', 'cascada', 'vida', 'si', 'mientras', 'quería', 'música']


In [39]:
# Create a binary vector representation for each document
alex_df['BoW-Vector'] = ""

In [40]:
for i in range(0, len(alex_df)):
    BoW_Vec = []
    token_list = ast.literal_eval(alex_df['Tokens_Stop'].iloc[i])
    for token in most_Freq_Words:
        if token in token_list:
            BoW_Vec.append(1)
        else: 
            BoW_Vec.append(0)
    # print(BoW_Vec)
    alex_df['BoW-Vector'].iloc[i] = BoW_Vec
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [41]:
alex_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,SYM_Ratio,SCONJ_Ratio,CCONJ_Ratio,INTJ_Ratio,AUX_Ratio,ADP_Ratio,ADJ_Ratio,TTR,HTR,BoW-Vector
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.0,0.0625,0.0,0.0,0.125,0.125,0.0,0.5625,0.875,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.857143,1.0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.0,0.103448,0.068966,0.0,0.034483,0.103448,0.172414,0.344828,0.793103,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.0,0.041667,0.041667,0.0,0.125,0.208333,0.083333,0.458333,0.875,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.0,0.1,0.0,0.0,0.1,0.1,0.2,0.9,1.0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Create a Stem BoW Model

In [42]:
# Stem (Snowball) Frequency
stem_Freq = {}

# for each sentence in corpus, update stem frequency dictionary
for tokens in alex_df.Tokens_Stem_S:
    token_list = ast.literal_eval(tokens)
    for token in token_list:
        if token not in stem_Freq.keys():
            stem_Freq[token] = 1
        else:
            stem_Freq[token] += 1

In [47]:
print("We have %d words and %d stems." % (len(word_Freq),len(stem_Freq)))

We have 2710 words and 1779 stems.


In [45]:
print("Frequency of word 'niñ' is %d." % stem_Freq.get("niñ"))
print("Frequency of word 'hombr' is %d." % stem_Freq.get("hombr"))

Frequency of word 'niñ' is 92.
Frequency of word 'hombr' is 81.


In [48]:
most_Freq_Stems = heapq.nlargest(400, stem_Freq, key=stem_Freq.get)

In [58]:
# 20 most common stems
print(heapq.nlargest(20, stem_Freq, key=stem_Freq.get))

['violin', 'niñ', 'hombr', 'dia', 'toc', 'trabaj', 'hac', 'descans', 'muj', 'sol', 'padr', 'com', 'lleg', 'sab', 'pas', 'quer', 'pod', 'cas', 'gust', 'ser']


In [49]:
# Create a binary vector representation for each document
alex_df['BoW-Stem-Vector'] = ""

In [50]:
for i in range(0, len(alex_df)):
    Stem_Vec = []
    token_list = ast.literal_eval(alex_df['Tokens_Stem_S'].iloc[i])
    for token in most_Freq_Stems:
        if token in token_list:
            Stem_Vec.append(1)
        else: 
            Stem_Vec.append(0)
    alex_df['BoW-Stem-Vector'].iloc[i] = Stem_Vec

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [60]:
alex_df.columns

Index(['Code', 'TAS20', 'F1', 'F2', 'F3', 'Gender', 'Age', 'Card',
       'T_Metaphors', 'T_ToM', 'T_FP', 'T_Interpret', 'T_Desc', 'T_Confussion',
       'Text', 'Alex_A', 'Alex_B', 'Words', 'Sentences', 'Tokens',
       'Tokens_Stop', 'Tokens_Stem_P', 'Tokens_Stem_S', 'POS', 'NER', 'DEP',
       'Lemmas_CNLP', 'Lemmas_Spacy', 'Chars', 'avgWL', 'avgSL', 'Pun_Count',
       'Stop_Count', 'RawTokens', 'Title_Count', 'Upper_Count', 'PRON_Count',
       'DET_Count', 'ADV_Count', 'VERB_Count', 'PROPN_Count', 'NOUN_Count',
       'NUM_Count', 'PUNCT_Count', 'SYM_Count', 'SCONJ_Count', 'CCONJ_Count',
       'INTJ_Count', 'AUX_Count', 'ADP_Count', 'ADJ_Count', 'PRON_Ratio',
       'DET_Ratio', 'ADV_Ratio', 'VERB_Ratio', 'PROPN_Ratio', 'NOUN_Ratio',
       'NUM_Ratio', 'PUNCT_Ratio', 'SYM_Ratio', 'SCONJ_Ratio', 'CCONJ_Ratio',
       'INTJ_Ratio', 'AUX_Ratio', 'ADP_Ratio', 'ADJ_Ratio', 'TTR', 'HTR',
       'BoW-Vector', 'BoW-Stem-Vector'],
      dtype='object')

## Save BoW models

In [64]:
# Create a df with BoW models and original variables. 
# So we can use it later for classification tasks.
BoWs_df = alex_df[['Code', 'TAS20', 'F1', 'F2', 'F3','Card','Alex_A', 'Alex_B', 'BoW-Vector', 'BoW-Stem-Vector']]

In [65]:
BoWs_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Card,Alex_A,Alex_B,BoW-Vector,BoW-Stem-Vector
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,1,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,13HM,0,0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,1,0,0,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,9VH,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,13HM,0,0,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [67]:
BoW_Models_path = "D:\\Dropbox-Array2001\\Dropbox\\DataSets\\Prolexitim-Dataset\\Prolexitim_v2_BoWs.csv"
BoWs_df.to_csv(BoW_Models_path, sep=';', encoding='utf-8', index=False)