## Language Analysis of Alexithymic Discourse

<hr>

Alexithymic Language Project / raul@psicobotica.com / V2 release (sept 2020)

<hr>

## Latent Space Feature Engineering (embeddings)
### Using Spanish 3 billion Word2Vex

- Pre-trained model: https://github.com/aitoralmeida/spanish_word2vec 

## Data set load 

In [2]:
import pandas as pd 

In [3]:
embeds_dataset_path = "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/data/Prolexitim_v2_features-embed-usem.csv"

alex_df = pd.read_csv(embeds_dataset_path, header=0, delimiter=";")

In [4]:
alex_df.columns

Index(['Code', 'TAS20', 'F1', 'F2', 'F3', 'Gender', 'Age', 'Card',
       'T_Metaphors', 'T_ToM', 'T_FP', 'T_Interpret', 'T_Desc', 'T_Confussion',
       'Text', 'Alex_A', 'Alex_B', 'Words', 'Sentences', 'Tokens',
       'Tokens_Stop', 'Tokens_Stem', 'POS', 'NER', 'DEP', 'Chars', 'avgWL',
       'avgSL', 'Pun_Count', 'Stop_Count', 'RawTokens', 'Title_Count',
       'Upper_Count', 'VERB_Count', 'NOUN_Count', 'SYM_Count', 'ADV_Count',
       'PUNCT_Count', 'INTJ_Count', 'CCONJ_Count', 'ADJ_Count', 'AUX_Count',
       'DET_Count', 'SCONJ_Count', 'PRON_Count', 'NUM_Count', 'PROPN_Count',
       'ADP_Count', 'VERB_Ratio', 'NOUN_Ratio', 'SYM_Ratio', 'ADV_Ratio',
       'PUNCT_Ratio', 'INTJ_Ratio', 'CCONJ_Ratio', 'ADJ_Ratio', 'AUX_Ratio',
       'DET_Ratio', 'SCONJ_Ratio', 'PRON_Ratio', 'NUM_Ratio', 'PROPN_Ratio',
       'ADP_Ratio', 'TTR', 'HTR', 'Embed_USEM'],
      dtype='object')

In [5]:
alex_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,AUX_Ratio,DET_Ratio,SCONJ_Ratio,PRON_Ratio,NUM_Ratio,PROPN_Ratio,ADP_Ratio,TTR,HTR,Embed_USEM
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.125,0.1875,0.0625,0.125,0.0,0.0,0.125,0.5625,0.875,[-7.31239393e-02 8.52924492e-03 5.43712601e-...
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.857143,1.0,[-4.18877453e-02 5.60541414e-02 9.09477472e-...
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.034483,0.103448,0.103448,0.172414,0.0,0.0,0.103448,0.344828,0.793103,[-0.04114395 -0.01856564 0.04856339 0.032358...
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.125,0.083333,0.041667,0.125,0.0,0.0,0.208333,0.458333,0.875,[-0.03877169 -0.00012742 0.06037965 0.035413...
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.1,0.1,0.1,0.1,0.0,0.0,0.1,0.9,1.0,[-0.04557237 -0.01713993 0.04308525 -0.027140...


## Load the word vectors

In [6]:
from gensim.models import KeyedVectors

In [7]:
# My copy of Spanish 3 Billion Word2Vec
word_vectors = KeyedVectors.load('D:\Dropbox-Array2001\Dropbox\DataSets\Spanish Word2Vec\complete.kv', mmap='r')

In [8]:
# Testing Word Vectors 
word_vectors.most_similar("niño")

[('chiquillo', 0.863490104675293),
 ('bebé', 0.85056471824646),
 ('crío', 0.842483401298523),
 ('muchacho', 0.7765985727310181),
 ('niñito', 0.7588649988174438),
 ('chico', 0.75837242603302),
 ('cachorro', 0.7369438409805298),
 ('muchachito', 0.7227176427841187),
 ('gatito', 0.6718686819076538),
 ('hombre', 0.6658662557601929)]

In [49]:
# Example of one 400 dim vector
word_vectors.get_vector("niño")[0:10]

memmap([-0.98019737,  0.37916845, -1.6284896 ,  0.74026036,  0.67275375,
        -1.6113833 , -1.4873409 ,  1.6933979 ,  0.30024448,  0.6112603 ],
       dtype=float32)

In [10]:
word_vectors.similar_by_vector(
    word_vectors.get_vector("rey") - 
    word_vectors.get_vector("hombre") + 
    word_vectors.get_vector("mujer"))

[('reina', 0.7074883580207825),
 ('princesa', 0.6571577787399292),
 ('emperatriz', 0.6183611750602722),
 ('esposa', 0.5992065072059631),
 ('doncella', 0.5958696603775024),
 ('dama', 0.5885563492774963),
 ('infanta', 0.5775144696235657),
 ('mujer', 0.5741763114929199),
 ('concubina', 0.5553327202796936),
 ('hija', 0.5514962673187256)]

In [11]:
word_vectors.similar_by_vector(
    word_vectors.get_vector("enfermera") - 
    word_vectors.get_vector("mujer") + 
    word_vectors.get_vector("hombre"))

[('enfermero', 0.7018126845359802),
 ('hombre', 0.6676745414733887),
 ('individuo', 0.6419544219970703),
 ('celador', 0.6333192586898804),
 ('cirujano', 0.6183052062988281),
 ('médico', 0.6180358529090881),
 ('camillero', 0.6082327365875244),
 ('muchacho', 0.6047707200050354),
 ('chico', 0.6040195226669312),
 ('hombrecillo', 0.5817158222198486)]

In [50]:
'''
word_vectors.similar_by_vector(
    word_vectors.get_vector("euros") - 
    word_vectors.get_vector("españa") + 
    word_vectors.get_vector("canadá"))
    '''

'\nword_vectors.similar_by_vector(\n    word_vectors.get_vector("euros") - \n    word_vectors.get_vector("españa") + \n    word_vectors.get_vector("canadá"))\n    '

In [51]:
'''
word_vectors.similar_by_vector(
    word_vectors.get_vector("vehículo") - 
    word_vectors.get_vector("ruedas"))
    '''

'\nword_vectors.similar_by_vector(\n    word_vectors.get_vector("vehículo") - \n    word_vectors.get_vector("ruedas"))\n    '

In [53]:
'''
word_vectors.similar_by_vector(
    word_vectors.get_vector("persona") + 
    word_vectors.get_vector("amor"))
    '''

'\nword_vectors.similar_by_vector(\n    word_vectors.get_vector("persona") + \n    word_vectors.get_vector("amor"))\n    '

In [54]:
'''
word_vectors.similar_by_vector(
    word_vectors.get_vector("hombre") - 
    word_vectors.get_vector("amor") -
    word_vectors.get_vector("feliz"))
    '''

'\nword_vectors.similar_by_vector(\n    word_vectors.get_vector("hombre") - \n    word_vectors.get_vector("amor") -\n    word_vectors.get_vector("feliz"))\n    '

## Build sentence vector from word vectors

In [55]:
import numpy as np

### Sentence vector as the mean of word vectors

In [56]:
# Calculates the mean word vector of a list of words using the indicated word2vec model
def get_mean_vector(word2vec_model, words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in word2vec_model.vocab]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []

In [57]:
# Test item
test_words = alex_df.Tokens.iloc[4]
test_words

"['hombre', 'desolado', 'porque', 'se', 'ha', 'encontrado', 'a', 'su', 'mujer', 'fallecida']"

In [61]:
test_vec = get_mean_vector(word_vectors, test_words)
test_vec[0:10]

array([ 0.2878785 ,  0.2457171 ,  0.2132683 , -0.6135208 ,  1.3157121 ,
       -0.7953963 ,  0.28311563, -0.8507003 ,  0.7568729 , -0.8901831 ],
      dtype=float32)

In [62]:
alex_df['Embed_3B_Mean'] = alex_df['Tokens'].apply(lambda x: get_mean_vector(word_vectors, x))

In [63]:
alex_df.head()

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,...,DET_Ratio,SCONJ_Ratio,PRON_Ratio,NUM_Ratio,PROPN_Ratio,ADP_Ratio,TTR,HTR,Embed_USEM,Embed_3B_Mean
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,...,0.1875,0.0625,0.125,0.0,0.0,0.125,0.5625,0.875,[-7.31239393e-02 8.52924492e-03 5.43712601e-...,"[0.7490059, 0.43533283, 0.38320902, -0.7258389..."
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,...,0.142857,0.142857,0.0,0.0,0.0,0.0,0.857143,1.0,[-4.18877453e-02 5.60541414e-02 9.09477472e-...,"[0.24148928, 0.1480174, 0.47298616, -0.5232566..."
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,...,0.103448,0.103448,0.172414,0.0,0.0,0.103448,0.344828,0.793103,[-0.04114395 -0.01856564 0.04856339 0.032358...,"[0.6218276, 0.28139102, 0.2586725, -0.7076428,..."
3,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,9VH,0,1,...,0.083333,0.041667,0.125,0.0,0.0,0.208333,0.458333,0.875,[-0.03877169 -0.00012742 0.06037965 0.035413...,"[0.46048477, 0.19402103, 0.18872687, -0.681791..."
4,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,13HM,0,1,...,0.1,0.1,0.1,0.0,0.0,0.1,0.9,1.0,[-0.04557237 -0.01713993 0.04308525 -0.027140...,"[0.2878785, 0.2457171, 0.2132683, -0.6135208, ..."


## Save embeddings

In [65]:
embed_dataset_path = "D:\\Dropbox-Array2001\\Dropbox\\DataSets\\Prolexitim-Dataset\\Prolexitim_v2_features-embed-3B.csv"
alex_df.to_csv(embed_dataset_path, sep=';', encoding='utf-8', index=False)