## Language Analysis of Alexithymic Discourse

<hr>

Alexithymic Language Project / raul@psicobotica.com / V2 release (sept 2020)

<hr>

### TF-IDF

- TF = (Frequency of the word in the sentence) / (Total number of words in the sentence)
- IDF: (Total number of sentences (documents))/(Number of sentences (documents) containing the word)
- log(IDF): log((Total number of sentences (documents))/(Number of sentences (documents) containing the word))

## Load features dataset
- Data is already pre-processed (1-Preprocessing). 
- Basic NLP features are already calculated (2-Features). 

In [11]:
import pandas as pd 
import numpy as np
import ast
import heapq

In [2]:
feats_dataset_path = "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/data/Prolexitim_v2_features.csv"

alex_df = pd.read_csv(feats_dataset_path, header=0, delimiter=";")

In [3]:
alex_df.columns

Index(['Code', 'TAS20', 'F1', 'F2', 'F3', 'Gender', 'Age', 'Card',
       'T_Metaphors', 'T_ToM', 'T_FP', 'T_Interpret', 'T_Desc', 'T_Confussion',
       'Text', 'Alex_A', 'Alex_B', 'Words', 'Sentences', 'Tokens',
       'Tokens_Stop', 'Tokens_Stem_P', 'Tokens_Stem_S', 'POS', 'NER', 'DEP',
       'Lemmas_CNLP', 'Lemmas_Spacy', 'Chars', 'avgWL', 'avgSL', 'Pun_Count',
       'Stop_Count', 'RawTokens', 'Title_Count', 'Upper_Count', 'PRON_Count',
       'DET_Count', 'ADV_Count', 'VERB_Count', 'PROPN_Count', 'NOUN_Count',
       'NUM_Count', 'PUNCT_Count', 'SYM_Count', 'SCONJ_Count', 'CCONJ_Count',
       'INTJ_Count', 'AUX_Count', 'ADP_Count', 'ADJ_Count', 'PRON_Ratio',
       'DET_Ratio', 'ADV_Ratio', 'VERB_Ratio', 'PROPN_Ratio', 'NOUN_Ratio',
       'NUM_Ratio', 'PUNCT_Ratio', 'SYM_Ratio', 'SCONJ_Ratio', 'CCONJ_Ratio',
       'INTJ_Ratio', 'AUX_Ratio', 'ADP_Ratio', 'ADJ_Ratio', 'TTR', 'HTR'],
      dtype='object')

In [4]:
alex_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,PUNCT_Ratio,SYM_Ratio,SCONJ_Ratio,CCONJ_Ratio,INTJ_Ratio,AUX_Ratio,ADP_Ratio,ADJ_Ratio,TTR,HTR
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.0625,0.0,0.0625,0.0,0.0,0.125,0.125,0.0,0.5625,0.875
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.142857,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.857143,1.0
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.103448,0.0,0.103448,0.068966,0.0,0.034483,0.103448,0.172414,0.344828,0.793103
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.083333,0.0,0.041667,0.041667,0.0,0.125,0.208333,0.083333,0.458333,0.875
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.1,0.0,0.1,0.0,0.0,0.1,0.1,0.2,0.9,1.0


## Create TF/IDF Model

In [6]:
# Word Frequency
word_Freq = {}

# for each sentence in corpus, update word frequency dictionary
for tokens in alex_df.Tokens_Stop:
    token_list = ast.literal_eval(tokens)
    for token in token_list:
        if token not in word_Freq.keys():
            word_Freq[token] = 1
        else:
            word_Freq[token] += 1

In [8]:
most_Freq_Words = heapq.nlargest(200, word_Freq, key=word_Freq.get)

In [10]:
most_Freq_Words[0:10]

['niño',
 'violín',
 'hombre',
 'día',
 'tocar',
 'mujer',
 'ser',
 'padres',
 'tras',
 'casa']

In [14]:
# Calculate IDF values for each word

word_IDF = {}

for token in most_Freq_Words: # For each most frequent token
    docs_with_word = 0
    for tokens in alex_df.Tokens_Stop: # For each document
        token_list = ast.literal_eval(tokens)
        if token in token_list: # if token is in this document, count it. 
            docs_with_word += 1
    # print("Token: %s in %d docs." % (token, docs_with_word))
    word_IDF[token] = np.log(len(alex_df)/(1 + docs_with_word))

In [16]:
word_IDF.get("niño")

1.6253112615903906

In [20]:
# TF Dictionary for each word

word_TFs = {}

for token in most_Freq_Words: # For each most frequent token
    TF_vector = []
    for tokens in alex_df.Tokens_Stop: # For each document
        token_list = ast.literal_eval(tokens)
        doc_freq = 0
        for word in token_list: # For each word in this document
            if token == word: # if equals to current most frequent token, count
                doc_freq += 1
        word_TF = doc_freq / len(token_list)
        # print("Word: %s - TF: %f" % (token,word_TF))
        TF_vector.append(word_TF)
    # print("Word TF Vector: %s" % TF_vector)
    word_TFs[token] = TF_vector

In [22]:
word_TFs.get("niño")

[0.2,
 0.0,
 0.09090909090909091,
 0.0,
 0.0,
 0.2,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.07142857142857142,
 0.0,
 0.0,
 0.0,
 0.16666666666666666,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.047619047619047616,
 0.125,
 0.0,
 0.0,
 0.08333333333333333,
 0.0,
 0.0,
 0.1,
 0.0,
 0.0,
 0.07142857142857142,
 0.0,
 0.0,
 0.0,
 0.0,
 0.06666666666666667,
 0.0,
 0.041666666666666664,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.2,
 0.0,
 0.0,
 0.0,
 0.08333333333333333,
 0.0,
 0.0,
 0.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.03333333333333333,
 0.0,
 0.0,
 0.0,
 0.03571428571428571,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.09090909090909091,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.07692307692307693,
 0.09090909090909091,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.07142857142857142,
 0.0,
 0.125,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.08333333333333333,
 0.0,
 0.0,
 0.0,
 0.

In [24]:
len(word_TFs.get("niño")) == len(alex_df)

True

In [26]:
# Compute TF/IDF

TF_IDF = []
for token in word_TFs.keys(): # for each TF value of most frequent words
    TF_IDF_vectors = []
    for TF_vector in word_TFs[token]:
        TF_IDF_score = TF_vector * word_IDF[token]
        TF_IDF_vectors.append(TF_IDF_score)
    TF_IDF.append(TF_IDF_vectors)

In [33]:
print("We have %d vectors of dimension %d." % 
      (len(TF_IDF),len(TF_IDF[0])))

print("Because there are %d documents and we selected %d most frequent words." % 
      (len(alex_df),len(most_Freq_Words)))

We have 200 vectors of dimension 381.
Because there are 381 documents and we selected 200 most frequent words.


In [38]:
# IF/IDF Model
TF_IDF_model = np.asarray(TF_IDF)

# Let rows represent TF/IDF vectors
TF_IDF_model = np.transpose(TF_IDF_model)

In [40]:
TF_IDF_model.shape

(381, 200)

In [46]:
# TF/IDF vector for document 2
TF_IDF_model[2]

array([0.14775557, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.24063296, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.27257822, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.30034019, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [47]:
# Create a TF/IDF vector representation for each document
alex_df['TFIDF_Vector'] = ""

In [48]:
for i in range(0, len(alex_df)):
    alex_df['TFIDF_Vector'].iloc[i] = TF_IDF_model[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [49]:
alex_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,SYM_Ratio,SCONJ_Ratio,CCONJ_Ratio,INTJ_Ratio,AUX_Ratio,ADP_Ratio,ADJ_Ratio,TTR,HTR,TFIDF_Vector
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.0,0.0625,0.0,0.0,0.125,0.125,0.0,0.5625,0.875,"[0.32506225231807817, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.857143,1.0,"[0.0, 0.0, 0.47936192109788794, 0.0, 0.0, 0.54..."
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.0,0.103448,0.068966,0.0,0.034483,0.103448,0.172414,0.344828,0.793103,"[0.14775556923549008, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.0,0.041667,0.041667,0.0,0.125,0.208333,0.083333,0.458333,0.875,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.0,0.1,0.0,0.0,0.1,0.1,0.2,0.9,1.0,"[0.0, 0.0, 0.3834895368783104, 0.0, 0.0, 0.436..."


## Save TF/IDF model

In [50]:
# Create a df with BoW models and original variables. 
# So we can use it later for classification tasks.
TFIDF_df = alex_df[['Code', 'TAS20', 'F1', 'F2', 'F3','Card','Alex_A', 'Alex_B', 'TFIDF_Vector']]

In [51]:
TFIDF_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Card,Alex_A,Alex_B,TFIDF_Vector
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,1,0,0,"[0.32506225231807817, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,13HM,0,0,"[0.0, 0.0, 0.47936192109788794, 0.0, 0.0, 0.54..."
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,1,0,0,"[0.14775556923549008, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,9VH,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,13HM,0,0,"[0.0, 0.0, 0.3834895368783104, 0.0, 0.0, 0.436..."


In [52]:
TFIDF_df_path = "D:\\Dropbox-Array2001\\Dropbox\\DataSets\\Prolexitim-Dataset\\Prolexitim_v2_TFIDF.csv"
TFIDF_df.to_csv(TFIDF_df_path, sep=';', encoding='utf-8', index=False)