## Language Analysis of Alexithymic Discourse

<hr>

Alexithymic Language Project / raul@psicobotica.com / V2 release (sept 2020)

<hr>

## Sentiment Analysis Lexicons (Spanish)

- Multilingual Sentiment Project (Spanish) [link](https://sites.google.com/site/datascienceslab/projects/multilingualsentiment).
- AFINN-165 (English) [link](https://github.com/fnielsen/afinn). 
- AFINN-165-ES (my translation of AFINN-165). 


## Multilingual Sentiment Project (Spanish)
Lists of positive and negative words

In [100]:
import os
import urllib.request

import numpy as np
import pandas as pd

In [1]:
# Raw GitHub URLs of the Multilingual Sentiment Project word lists
# (Spanish): positive list first, negative list second.
msp_pos_words_path, msp_neg_words_path = (
    "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/lexicon/Multilingualsentiment_positive_words_es.txt",
    "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/lexicon/Multilingualsentiment_negative_words_es.txt",
)

In [62]:
# Load from URL instead of local disk. Local-file alternative, kept for reference:
# pos_text_file = open(msp_pos_words_path, "r")
# neg_text_file = open(msp_neg_words_path, "r")
# NOTE(review): the two HTTP responses are read by later cells and are never
# closed explicitly in this notebook.
pos_text_file, neg_text_file = (
    urllib.request.urlopen(lexicon_url)
    for lexicon_url in (msp_pos_words_path, msp_neg_words_path)
)

In [63]:
# Charset declared in the Content-Type header of the positive-list response.
# get_content_charset() returns None when the server declares no charset,
# which would make the later `.decode(charset)` raise TypeError, so fall
# back to UTF-8 in that case.
# The same charset is later used to decode the negative list as well.
charset = pos_text_file.info().get_content_charset(failobj='utf-8')
charset

'utf-8'

In [64]:
# Each lexicon file holds one word per line; read the raw bytes of both
# responses and turn them into text using the charset detected above.
pos_lines = str(pos_text_file.read(), charset)
neg_lines = str(neg_text_file.read(), charset)

In [69]:
# Turn each decoded text blob into a list of words (one entry per line).
# After this cell pos_lines / neg_lines are lists, so re-running it without
# re-running the decode cell first will fail.
pos_lines, neg_lines = pos_lines.splitlines(), neg_lines.splitlines()

In [81]:
# Lexicon sizes. Note the order: negative count first, then positive count.
print("%2d %2d" % (len(neg_lines), len(pos_lines)))

2720 1555


In [82]:
# Sanity check: show six entries (positions 34-39) of the negative word list
neg_lines[34:40]

['aire', 'restos', 'canal', 'pasado', 'falta', 'problema']

In [83]:
# Sanity check: show six entries (positions 34-39) of the positive word list
pos_lines[34:40]

['grande', 'realizar', 'firme', 'profesional', 'similar', 'libre']

In [84]:
# Build sets
pos_words_set = set(pos_lines)
neg_words_set = set(neg_lines)

In [85]:
# Words that appear in both lexicons (an empty result means the two lists are disjoint)
pos_words_set & neg_words_set

set()

In [86]:
# Combined vocabulary of both lexicons and its size
all_words_set = pos_words_set | neg_words_set
len(all_words_set)

4275

In [99]:
# Quick word-level check: count how many tokens of a phrase are in each lexicon.
# The phrase is lower-cased before tokenising because the lexicon entries
# shown above are lower-case; without this a capitalised token such as "Que"
# could never match, whatever the lexicon contains.
test_phrase = "Que asco de vida".lower().split()

n_pos = np.sum([word in pos_words_set for word in test_phrase])
n_neg = np.sum([word in neg_words_set for word in test_phrase])

print("N_POS:%2d; N_NEG:%2d" % (n_pos, n_neg))

N_POS: 0; N_NEG: 1


In [102]:
# Considering stems instead of full words

import nltk
from nltk.stem.porter import PorterStemmer

# Work with stems instead of full words, using NLTK's Porter stemmer.
# NOTE(review): the Porter algorithm was designed for English; for this
# Spanish lexicon nltk's SnowballStemmer("spanish") may be a better fit.
# Confirm before changing, because the stem files saved further down (and
# anything matched against them) depend on the stemmer used here.
p_stemmer = PorterStemmer()

# Stem every lexicon entry, keeping the original list order
pos_stems = list(map(p_stemmer.stem, pos_lines))
neg_stems = list(map(p_stemmer.stem, neg_lines))

# Distinct stems per polarity
pos_stems_set, neg_stems_set = set(pos_stems), set(neg_stems)


In [105]:
# Number of positive stems that are themselves entries of the positive word list
len(pos_stems_set & pos_words_set)

1150

In [107]:
# Same check as the word-level test, but matching stems against the stem sets
test_phrase2 = "Que asco de vida".split()
test_phrase2_s = list(map(p_stemmer.stem, test_phrase2))

n_pos = np.sum([stem in pos_stems_set for stem in test_phrase2_s])
n_neg = np.sum([stem in neg_stems_set for stem in test_phrase2_s])

print("N_POS:%2d; N_NEG:%2d" % (n_pos, n_neg))

N_POS: 0; N_NEG: 1


In [133]:
## Create dataframes
pos_words_df = pd.DataFrame(list(pos_words_set), columns=['Pos'])
neg_words_df = pd.DataFrame(list(neg_words_set), columns=['Neg'])
pos_stems_df = pd.DataFrame(list(neg_stems_set), columns=['Pos'])
neg_stems_df = pd.DataFrame(list(neg_stems_set), columns=['Neg'])

### Save word sets as df 

In [135]:
# Output directory for the lexicon CSV files. It can be overridden with the
# SENTIMENT_LEXICON_DIR environment variable so the notebook is not tied to
# one machine; the default is the location originally hard-coded here.
lexicon_out_dir = os.environ.get(
    "SENTIMENT_LEXICON_DIR",
    "D:\\Dropbox-Array2001\\Dropbox\\DataSets\\Sentiment_Lexicons",
)
# Create the directory if needed so that to_csv does not fail on a fresh setup
os.makedirs(lexicon_out_dir, exist_ok=True)

pos_words_df_path = os.path.join(lexicon_out_dir, "MSP_Pos_Words.csv")
neg_words_df_path = os.path.join(lexicon_out_dir, "MSP_Neg_Words.csv")
pos_stems_df_path = os.path.join(lexicon_out_dir, "MSP_Pos_Stems.csv")
neg_stems_df_path = os.path.join(lexicon_out_dir, "MSP_Neg_Stems.csv")

# Semicolon-separated, UTF-8, without the index column
pos_words_df.to_csv(pos_words_df_path, sep=';', encoding='utf-8', index=False)
neg_words_df.to_csv(neg_words_df_path, sep=';', encoding='utf-8', index=False)
pos_stems_df.to_csv(pos_stems_df_path, sep=';', encoding='utf-8', index=False)
neg_stems_df.to_csv(neg_stems_df_path, sep=';', encoding='utf-8', index=False)

## AFINN-165

### Getting AFINN sentiment analysis lexicon in English

In [148]:
# English AFINN-165 lexicon: a tab-separated file without a header row
# (column names are assigned in a later cell).
AFINN_path = "https://raw.githubusercontent.com/raul-arrabales/alexithymic-lang/master/lexicon/AFINN-en-165.txt"

AFINN_df = pd.read_csv(AFINN_path, sep="\t", header=None)

In [152]:
# The file was read without a header: name the two columns (word, sentiment score)
AFINN_df.columns = ['Word', 'Score']

In [153]:
# First rows of the English lexicon
AFINN_df.head()

Unnamed: 0,Word,Score
0,abandon,-2
1,abandoned,-2
2,abandons,-2
3,abducted,-2
4,abduction,-2


In [154]:
# Summary statistics of the numeric Score column
AFINN_df.describe()

Unnamed: 0,Score
count,3382.0
mean,-0.617386
std,2.124552
min,-5.0
25%,-2.0
50%,-2.0
75%,2.0
max,5.0


### Translate AFINN words into Spanish
Using Google Translate API

In [157]:
# ! pip install googletrans

In [160]:
# Translating with Google Translate API
from googletrans import Translator 
translator = Translator()

In [182]:
# Start the Spanish lexicon as a copy of the English one, with an empty text
# column that the translation loop fills in.
AFINN_es_df = AFINN_df.assign(Word_ES_Text="")

In [215]:
# Apply in batches 
for i in range(3000,3382):
    trlted = translator.translate(AFINN_df['Word'].iloc[i], src='en', dest='es').text
    print(trlted)
    AFINN_es_df['Word_ES_Text'].iloc[i] = trlted


celoso


In [217]:
# Last rows of the frame, to check the most recently translated batch
AFINN_es_df.tail()

Unnamed: 0,Word,Score,Word_ES,Word_ES_Text
3377,yucky,-2,"Translated(src=en, dest=es, text=yucky, pronun...",yucky
3378,yummy,3,"Translated(src=en, dest=es, text=yummy, pronun...",sabroso
3379,zealot,-2,"Translated(src=en, dest=es, text=zealot, pronu...",fanático
3380,zealots,-2,"Translated(src=en, dest=es, text=zealots, pron...",zealots
3381,zealous,2,"Translated(src=en, dest=es, text=zealous, pron...",celoso


In [161]:
# All at once (problems with Google API)
# AFINN_df['Word_ES'] = AFINN_df.apply(
#     lambda row: translator.translate(row.Word, src='en', dest='es').text, axis=1) 

In [238]:
# Remove duplicates (they appeared due to translation)
AFINN_es_df = AFINN_es_df.drop_duplicates(subset='Word_ES_Text', keep="first")

In [239]:
# How many rows the de-duplication removed relative to the English lexicon
AFINN_df.shape[0] - AFINN_es_df.shape[0]

494

In [235]:
# AFINN_es_df = pd.read_csv(AFINN_es_path, header=0, delimiter=";")

In [242]:
# All to lower
AFINN_es_df['Word_ES_Text_lower'] = AFINN_es_df.Word_ES_Text.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [240]:
# Drop the raw 'Word_ES' column left over from an earlier translation attempt.
# No active cell in this notebook creates that column (only the commented-out
# "all at once" cell does), so on a fresh kernel it is absent;
# errors='ignore' keeps this step from raising KeyError in that case and makes
# the cell safe to re-run.
AFINN_es_df = AFINN_es_df.drop(columns='Word_ES', errors='ignore')

In [244]:
# The mixed-case translation column is no longer needed once the lower-cased
# copy exists; remove it from the frame in place.
AFINN_es_df.drop(columns=['Word_ES_Text'], inplace=True)

In [249]:
# Give the lower-cased translation column its final name, 'Word_ES'
AFINN_es_df = AFINN_es_df.rename({'Word_ES_Text_lower': 'Word_ES'}, axis='columns')

In [252]:
# First rows of the final Spanish lexicon (Word, Score, Word_ES)
AFINN_es_df.head()

Unnamed: 0,Word,Score,Word_ES
0,abandon,-2,abandonar
1,abandoned,-2,abandonado
2,abandons,-2,abandona
3,abducted,-2,secuestrado
4,abduction,-2,secuestro


### Saving Spanish Translated AFINN df

In [251]:
# Output directory for the translated lexicon. It can be overridden with the
# SENTIMENT_LEXICON_DIR environment variable so the notebook is not tied to
# one machine; the default is the location originally hard-coded here.
lexicon_out_dir = os.environ.get(
    "SENTIMENT_LEXICON_DIR",
    "D:\\Dropbox-Array2001\\Dropbox\\DataSets\\Sentiment_Lexicons",
)
# Create the directory if needed so that to_csv does not fail on a fresh setup
os.makedirs(lexicon_out_dir, exist_ok=True)

AFINN_es_path = os.path.join(lexicon_out_dir, "AFINN-165-es.csv")
# Semicolon-separated, UTF-8, without the index column
AFINN_es_df.to_csv(AFINN_es_path, sep=';', encoding='utf-8', index=False)