## Language Analysis of Alexithymic Discourse

<hr>

Alexithymic Language Project / raul@psicobotica.com / V2 release (sept 2020)

<hr>

## Latent Space Feature Engineering (embeddings)
### Using Google Universal Sentence Encoder - Multilingual Large 3

- TF Hub Pre-trained model: https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3 

### Load my dataset 
(After former feature engineering)

In [4]:
import pandas as pd 

In [5]:
feats_dataset_path = "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/data/Prolexitim_v2_features.csv"

alex_df = pd.read_csv(feats_dataset_path, header=0, delimiter=";")

In [6]:
alex_df.columns

Index(['Code', 'TAS20', 'F1', 'F2', 'F3', 'Gender', 'Age', 'Card',
       'T_Metaphors', 'T_ToM', 'T_FP', 'T_Interpret', 'T_Desc', 'T_Confussion',
       'Text', 'Alex_A', 'Alex_B', 'Words', 'Sentences', 'Tokens',
       'Tokens_Stop', 'Tokens_Stem', 'POS', 'NER', 'DEP', 'Chars', 'avgWL',
       'avgSL', 'Pun_Count', 'Stop_Count', 'RawTokens', 'Title_Count',
       'Upper_Count', 'VERB_Count', 'NOUN_Count', 'SYM_Count', 'ADV_Count',
       'PUNCT_Count', 'INTJ_Count', 'CCONJ_Count', 'ADJ_Count', 'AUX_Count',
       'DET_Count', 'SCONJ_Count', 'PRON_Count', 'NUM_Count', 'PROPN_Count',
       'ADP_Count', 'VERB_Ratio', 'NOUN_Ratio', 'SYM_Ratio', 'ADV_Ratio',
       'PUNCT_Ratio', 'INTJ_Ratio', 'CCONJ_Ratio', 'ADJ_Ratio', 'AUX_Ratio',
       'DET_Ratio', 'SCONJ_Ratio', 'PRON_Ratio', 'NUM_Ratio', 'PROPN_Ratio',
       'ADP_Ratio', 'TTR', 'HTR'],
      dtype='object')

In [8]:
alex_df.head(2)

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,T_FP,T_Interpret,T_Desc,T_Confussion,Text,Alex_A,Alex_B,Words,Sentences,Tokens,Tokens_Stop,Tokens_Stem,POS,NER,DEP,Chars,avgWL,avgSL,Pun_Count,Stop_Count,RawTokens,Title_Count,Upper_Count,VERB_Count,NOUN_Count,SYM_Count,ADV_Count,PUNCT_Count,INTJ_Count,CCONJ_Count,ADJ_Count,AUX_Count,DET_Count,SCONJ_Count,PRON_Count,NUM_Count,PROPN_Count,ADP_Count,VERB_Ratio,NOUN_Ratio,SYM_Ratio,ADV_Ratio,PUNCT_Ratio,INTJ_Ratio,CCONJ_Ratio,ADJ_Ratio,AUX_Ratio,DET_Ratio,SCONJ_Ratio,PRON_Ratio,NUM_Ratio,PROPN_Ratio,ADP_Ratio,TTR,HTR
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,0,1,0,0,es un niño pensando en cual es la respuesta de...,0,0,16,2,"['es', 'un', 'niño', 'pensando', 'en', 'cual',...","['niño', 'pensando', 'respuesta', 'deberes', '...","['niño', 'pensando', 'respuesta', 'deber', 'sa...","[('es', 'AUX'), ('un', 'DET'), ('niño', 'NOUN'...","[('es', 'O'), ('un', 'O'), ('niño', 'O'), ('pe...","[[(('niño', 'NOUN'), 'cop', ('es', 'AUX')), ((...",62,3.875,8.0,1,11,"['es', 'un', 'niño', 'pensando', 'en', 'cual',...",0,0,2,3,0,1,1,0,0,0,2,3,1,2,0,0,2,0.125,0.1875,0.0,0.0625,0.0625,0.0,0.0,0.0,0.125,0.1875,0.0625,0.125,0.0,0.0,0.125,0.5625,0.875
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,0,0,0,0,hombre llorando porque su mujer ha muerto.,0,0,7,2,"['hombre', 'llorando', 'porque', 'su', 'mujer'...","['hombre', 'llorando', 'mujer', 'muerto']","['hombr', 'llorando', 'mujer', 'muerto']","[('hombre', 'NOUN'), ('llorando', 'VERB'), ('p...","[('hombre', 'O'), ('llorando', 'O'), ('porque'...","[[(('llorando', 'VERB'), 'nsubj', ('hombre', '...",36,5.142857,3.5,1,3,"['hombre', 'llorando', 'porque', 'su', 'mujer'...",0,0,2,2,0,0,1,0,0,0,1,1,1,0,0,0,0,0.285714,0.285714,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.857143,1.0


### Environment setup

In [9]:
%%capture
#@title Setup Environment
# Install the latest Tensorflow version.
!pip install tensorflow_text
!pip install bokeh
!pip install simpleneighbors[annoy]
!pip install tqdm

In [10]:
#@title Setup common imports and functions
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import os
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
import sklearn.metrics.pairwise

from simpleneighbors import SimpleNeighbors
from tqdm import tqdm
from tqdm import trange

def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         plot_width=1200, plot_height=600,
                         xaxis_font_size='12pt', yaxis_font_size='12pt'):

  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
  sim = 1 - np.arccos(
      sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
                                                 embeddings_2))/np.pi

  embeddings_1_col, embeddings_2_col, sim_col = [], [], []
  for i in range(len(embeddings_1)):
    for j in range(len(embeddings_2)):
      embeddings_1_col.append(labels_1[i])
      embeddings_2_col.append(labels_2[j])
      sim_col.append(sim[i][j])
  df = pd.DataFrame(zip(embeddings_1_col, embeddings_2_col, sim_col),
                    columns=['embeddings_1', 'embeddings_2', 'sim'])

  mapper = bokeh.models.LinearColorMapper(
      palette=[*reversed(bokeh.palettes.YlOrRd[9])], low=df.sim.min(),
      high=df.sim.max())

  p = bokeh.plotting.figure(title=plot_title, x_range=labels_1,
                            x_axis_location="above",
                            y_range=[*reversed(labels_2)],
                            plot_width=plot_width, plot_height=plot_height,
                            tools="save",toolbar_location='below', tooltips=[
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim')])
  p.rect(x="embeddings_1", y="embeddings_2", width=1, height=1, source=df,
         fill_color={'field': 'sim', 'transform': mapper}, line_color=None)

  p.title.text_font_size = '12pt'
  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  p.min_border_right = 300

  bokeh.io.output_notebook()
  bokeh.io.show(p)


In [11]:
# Using the Google Universal Sentence Enconder
# Multilingual version for Spanish support (large 3)
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3' #@param ['https://tfhub.dev/google/universal-sentence-encoder-multilingual/3', 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3']

model = hub.load(module_url)

def embed_text(input):
  return model(input)

### Computing Text Embeddings

In [12]:
one_test = alex_df.RawTokens.iloc[4]
one_test

"['Hombre', 'desolado', 'porque', 'se', 'ha', 'encontrado', 'a', 'su', 'mujer', 'fallecida']"

In [23]:
embedded_test = embed_text(one_test)
embedded_test.numpy()[0]

array([-0.04557237, -0.01713993,  0.04308525, -0.02714091,  0.01057335,
        0.02123043, -0.03652984, -0.04409388,  0.06297714,  0.04313858,
        0.04221709,  0.08276404, -0.07536884,  0.03595347, -0.06516098,
       -0.02230111, -0.01920966, -0.09134265,  0.02303783,  0.09225203,
       -0.02083435, -0.10699026, -0.04911768, -0.0338804 , -0.03447349,
       -0.01241336,  0.01377914,  0.01994555, -0.03919867, -0.05277168,
        0.06579806, -0.03068903,  0.01899753, -0.03936582, -0.03338377,
       -0.11383827,  0.03754874, -0.00722124, -0.0100561 , -0.06096003,
       -0.00041967,  0.02075296,  0.04782465,  0.01154795, -0.01575884,
       -0.00809672, -0.02580861,  0.0552176 ,  0.03783795, -0.05131494,
        0.03739861, -0.01955346, -0.00648472, -0.03254839,  0.01631508,
        0.04303116, -0.0462549 ,  0.04555619, -0.04746798,  0.0362647 ,
       -0.02612547,  0.05361738,  0.03525089, -0.02314913, -0.02770599,
        0.00874192, -0.00992853, -0.03441396,  0.01534355,  0.02

In [21]:
# Compute the embedding for each document in the dataset (as a one-dim numpy array)
alex_df['Embed_USEM'] = alex_df.RawTokens.apply(lambda tokens: embed_text(tokens).numpy()[0] ) 

In [22]:
alex_df.head(3)

Unnamed: 0,Code,TAS20,F1,F2,F3,Gender,Age,Card,T_Metaphors,T_ToM,T_FP,T_Interpret,T_Desc,T_Confussion,Text,Alex_A,Alex_B,Words,Sentences,Tokens,Tokens_Stop,Tokens_Stem,POS,NER,DEP,Chars,avgWL,avgSL,Pun_Count,Stop_Count,RawTokens,Title_Count,Upper_Count,VERB_Count,NOUN_Count,SYM_Count,ADV_Count,PUNCT_Count,INTJ_Count,CCONJ_Count,ADJ_Count,AUX_Count,DET_Count,SCONJ_Count,PRON_Count,NUM_Count,PROPN_Count,ADP_Count,VERB_Ratio,NOUN_Ratio,SYM_Ratio,ADV_Ratio,PUNCT_Ratio,INTJ_Ratio,CCONJ_Ratio,ADJ_Ratio,AUX_Ratio,DET_Ratio,SCONJ_Ratio,PRON_Ratio,NUM_Ratio,PROPN_Ratio,ADP_Ratio,TTR,HTR,Embed_USEM
0,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,1,0,1,0,1,0,0,es un niño pensando en cual es la respuesta de...,0,0,16,2,"['es', 'un', 'niño', 'pensando', 'en', 'cual',...","['niño', 'pensando', 'respuesta', 'deberes', '...","['niño', 'pensando', 'respuesta', 'deber', 'sa...","[('es', 'AUX'), ('un', 'DET'), ('niño', 'NOUN'...","[('es', 'O'), ('un', 'O'), ('niño', 'O'), ('pe...","[[(('niño', 'NOUN'), 'cop', ('es', 'AUX')), ((...",62,3.875,8.0,1,11,"['es', 'un', 'niño', 'pensando', 'en', 'cual',...",0,0,2,3,0,1,1,0,0,0,2,3,1,2,0,0,2,0.125,0.1875,0.0,0.0625,0.0625,0.0,0.0,0.0,0.125,0.1875,0.0625,0.125,0.0,0.0,0.125,0.5625,0.875,"[-0.07312394, 0.008529245, 0.05437126, 0.02084..."
1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,2,22,13HM,0,1,0,0,0,0,hombre llorando porque su mujer ha muerto.,0,0,7,2,"['hombre', 'llorando', 'porque', 'su', 'mujer'...","['hombre', 'llorando', 'mujer', 'muerto']","['hombr', 'llorando', 'mujer', 'muerto']","[('hombre', 'NOUN'), ('llorando', 'VERB'), ('p...","[('hombre', 'O'), ('llorando', 'O'), ('porque'...","[[(('llorando', 'VERB'), 'nsubj', ('hombre', '...",36,5.142857,3.5,1,3,"['hombre', 'llorando', 'porque', 'su', 'mujer'...",0,0,2,2,0,0,1,0,0,0,1,1,1,0,0,0,0,0.285714,0.285714,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.857143,1.0,"[-0.041887745, 0.05605414, 0.009094775, -0.015..."
2,20cd825cadb95a71763bad06e142c148,40,12,10,18,2,22,1,0,1,0,0,0,0,un Niño cansado de estudiar y presionado por s...,0,0,29,4,"['un', 'niño', 'cansado', 'de', 'estudiar', 'y...","['niño', 'cansado', 'estudiar', 'presionado', ...","['niño', 'cansado', 'estudiar', 'presionado', ...","[('un', 'DET'), ('niño', 'NOUN'), ('cansado', ...","[('un', 'O'), ('niño', 'O'), ('cansado', 'O'),...","[[(('niño', 'NOUN'), 'det', ('un', 'DET')), ((...",127,4.37931,7.25,3,18,"['un', 'Niño', 'cansado', 'de', 'estudiar', 'y...",3,3,5,2,0,0,3,0,2,5,1,3,3,5,0,0,3,0.172414,0.068966,0.0,0.0,0.103448,0.0,0.068966,0.172414,0.034483,0.103448,0.103448,0.172414,0.0,0.0,0.103448,0.344828,0.793103,"[-0.041143946, -0.018565636, 0.04856339, 0.032..."


In [25]:
# Export file
from google.colab import files

alex_df.to_csv('Prolexitim_v2_features-embed-usem.csv', sep=';', encoding='utf-8', index=False) 
files.download('Prolexitim_v2_features-embed-usem.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>