# Prolexitim Analytics (NLP)
## NLP Dataset Preparation and Pre-processing
### Merging TAS-20 results with NLP data (text, sentiment, emotion)
<hr>
May 2019.<br> Prolexitim dataset version 1.1 (MPGS-TFM-Submission).<br> 
Raúl Arrabales Moreno (Psicobótica / Serendeepia Research)<br>
<a target="_blank" href="http://www.conscious-robots.com/">http://www.conscious-robots.com/</a> <br>
<hr>

### Load dataset from CSV files

In [60]:
# Loading the tables

import pandas as pd

# NOTE: these are absolute, machine-specific locations; adjust them before
# running the notebook on another computer.
nlp_dataset_path = "D:\\Dropbox-Array2001\\Dropbox\\UNI\\MPGS\\2_TFM\\Datos\\prolexitim-nlp-1.1.csv"
tas_dataset_path = "D:\\Dropbox-Array2001\\Dropbox\\UNI\\MPGS\\2_TFM\\Datos\\prolexitim-tas-1.1.csv"

# Both files are read as tab-separated text with the first row as header:
# the NLP table first, then the TAS-20 table.
nlp_df, tas_df = (
    pd.read_csv(csv_path, sep="\t", header=0)
    for csv_path in (nlp_dataset_path, tas_dataset_path)
)

In [65]:
# Sanity check: preview the first rows of the NLP table.
nlp_df.head()

Unnamed: 0.1,Unnamed: 0,RowId,code,card,hum,mode,time,G-score,G-magnitude,Azure-TA,...,Text-EN,nlu-sentiment,nlu-label,nlu-joy,nlu-anger,nlu-disgust,nlu-sadness,nlu-fear,es-len,en-len
0,0,1,b7adde8a9eec8ce92b5ee0507ce054a4,13V,1,T,200000,-0.2,0.2,0.62,...,It was a child sitting in the barn and thinkin...,-0.640157,negative,0.31792,0.143086,0.422023,0.173421,0.098997,115,124
1,1,2,b7adde8a9eec8ce92b5ee0507ce054a4,18NM,2,T,200000,-0.5,0.5,0.41,...,"A mother who is comforting her son, after givi...",0.0,neutral,0.2851,0.168727,0.057098,0.362623,0.109176,110,115
2,2,3,b7adde8a9eec8ce92b5ee0507ce054a4,12VN,0,T,200000,0.0,1.2,0.63,...,A swam with an abandoned boat. Let's see what ...,0.265769,positive,0.039779,0.205065,0.244164,0.164005,0.481812,93,96
3,3,4,76ef63369f7d5b6597a543017e1ef578,12VN,0,T,200000,0.0,0.1,0.89,...,"It was a beautiful place, with a boat, a littl...",-0.353556,negative,0.208997,0.007244,0.008434,0.698307,0.190991,255,244
4,4,5,76ef63369f7d5b6597a543017e1ef578,10,2,T,200000,0.3,0.1,0.24,...,"It was once a husband and wife, who loved each...",-0.552068,negative,0.367801,0.063256,0.095947,0.469062,0.103351,184,184


In [66]:
# Sanity check: preview the first rows of the TAS-20 table.
tas_df.head()

Unnamed: 0.1,Unnamed: 0,NLP,code,TAS20,F1,F2,F3,Tas20Time,Sex,Gender,...,Dhand,Studies,SClass,Siblings,SibPos,Origin,Resid,Rtime,Ethnic,Job
0,0,0,be8f0c722d0a0f4cd9d92c503e6f7583,42,16,10,16,254305,1,1,...,1,5,2,6,2,ES,ES,-1,Iberic,Psychology
1,1,1,608af5455da8c250a87f81a5ed5c1942,55,15,20,15,103425,1,1,...,2,7,2,5,5,ES,ES,-1,Iberic,Psychology
2,2,1,bc39e22ca5dba59fbd97c27987878f56,40,16,9,15,201637,2,2,...,1,5,2,2,2,ES,ES,-1,Iberic,Psychology
3,3,0,a2caa2eaccf99705bf39f6aeaee00ee3,40,13,10,17,242202,2,2,...,1,5,2,3,1,ES,ES,-1,Iberic,Psychology
4,4,1,20cd825cadb95a71763bad06e142c148,40,12,10,18,155945,2,2,...,1,5,2,1,1,ES,ES,-1,Iberic,Psychology


In [67]:
# Make sure the participant key is spelled `code` in the TAS table too, so it
# matches the NLP table. Columns not named 'Code' are left untouched.
tas_df = tas_df.rename(columns={'Code': 'code'})

In [68]:
# Inspect the column dtypes of the TAS-20 table.
tas_df.dtypes

Unnamed: 0     int64
NLP            int64
code          object
TAS20          int64
F1             int64
F2             int64
F3             int64
Tas20Time      int64
Sex            int64
Gender         int64
Age            int64
Dhand          int64
Studies        int64
SClass         int64
Siblings       int64
SibPos         int64
Origin        object
Resid         object
Rtime          int64
Ethnic        object
Job           object
dtype: object

In [69]:
# Fix wrong codes:
# Each key is a participant code as it appears in the NLP table and each value
# is the code it is rewritten to before the tables are joined on `code`.
# NOTE(review): the third replacement is one character shorter than the code it
# replaces (the trailing '0' is dropped) -- presumably this is how the code is
# stored in the TAS table; confirm.
code_corrections = {
    'ceaee8e1286ad5dbb921b38f529c1c6b': '2f052997edc3c005cc16cd263210f05f',
    '40cf09c4faba45471aa7e6f28f2a8b1b': '1384903a9d83e20ccd863fe23876e919',
    '116457052e0ddfbabafad6f14feb1ab0': '116457052e0ddfbabafad6f14feb1ab',
    '1057c3565f4c5b60e4765746d2e475ef': '8a17721b00de1ac4e9e771b3d06c3131',
    '272f153b22c299914f25010e8dab62f9': '107b920c3318629af25cd9fe09c2bd25',
    '1e79d86ac69decf2c5e34db8a3308106': '8455c12aed5a184b635e816f798b336f',
}

# The codes are literal identifiers, not patterns, so they are matched with
# regex=False. Corrections are applied one after another in the order above.
for wrong_code, right_code in code_corrections.items():
    nlp_df['code'] = nlp_df['code'].str.replace(wrong_code, right_code, regex=False)


### Joining NLP exemplars with TAS-20 scores

In [70]:
# Left join on `code`: every NLP row is kept, and the TAS-20 columns are
# attached where a matching code exists (otherwise they are left missing).
# NOTE(review): consider passing validate="many_to_one" to guard against
# duplicated codes in the TAS table -- confirm the data satisfies it first.
prolex_df = nlp_df.merge(tas_df, how='left', on='code')

In [73]:
# Check the column dtypes of the merged table.
# NOTE(review): the recorded output has no "Unnamed: 0_x"/"Unnamed: 0_y"
# columns and this cell's execution count is higher than the drop cell's, so it
# appears to have been run after the columns were removed; on a top-to-bottom
# run the output here will still list them.
prolex_df.dtypes

RowId              int64
code              object
card              object
hum                int64
mode              object
time               int64
G-score          float64
G-magnitude      float64
Azure-TA         float64
Text              object
Text-EN           object
nlu-sentiment    float64
nlu-label         object
nlu-joy          float64
nlu-anger        float64
nlu-disgust      float64
nlu-sadness      float64
nlu-fear         float64
es-len             int64
en-len             int64
NLP              float64
TAS20            float64
F1               float64
F2               float64
F3               float64
Tas20Time        float64
Sex              float64
Gender           float64
Age              float64
Dhand            float64
Studies          float64
SClass           float64
Siblings         float64
SibPos           float64
Origin            object
Resid             object
Rtime            float64
Ethnic            object
Job               object
dtype: object

In [72]:
# remove unused columns
# Both source tables carry an "Unnamed: 0" column, which the merge suffixes
# with _x / _y. They are dropped in a single call; errors="ignore" keeps this
# cell safe to re-run after the columns are already gone (the two separate
# drops raised KeyError on a second execution).
prolex_df = prolex_df.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y"], errors="ignore")

In [74]:
# Preview the first 30 rows of the merged table.
prolex_df.head(n=30)

Unnamed: 0,RowId,code,card,hum,mode,time,G-score,G-magnitude,Azure-TA,Text,...,Dhand,Studies,SClass,Siblings,SibPos,Origin,Resid,Rtime,Ethnic,Job
0,1,b7adde8a9eec8ce92b5ee0507ce054a4,13V,1,T,200000,-0.2,0.2,0.62,Era un niño pensando en el granero pensando a ...,...,1.0,4.0,2.0,5.0,3.0,ES,ES,-1.0,Iberic,Manager
1,2,b7adde8a9eec8ce92b5ee0507ce054a4,18NM,2,T,200000,-0.5,0.5,0.41,"Una madre que está consolando a su hijo, despu...",...,1.0,4.0,2.0,5.0,3.0,ES,ES,-1.0,Iberic,Manager
2,3,b7adde8a9eec8ce92b5ee0507ce054a4,12VN,0,T,200000,0.0,1.2,0.63,Un pantanal con una barca abandonada. A ver qu...,...,1.0,4.0,2.0,5.0,3.0,ES,ES,-1.0,Iberic,Manager
3,4,76ef63369f7d5b6597a543017e1ef578,12VN,0,T,200000,0.0,0.1,0.89,"Era un paraje muy bonito, con una barca, un po...",...,1.0,1.0,2.0,3.0,3.0,ES,ES,-1.0,Iberic,Retired
4,5,76ef63369f7d5b6597a543017e1ef578,10,2,T,200000,0.3,0.1,0.24,"Era una vez un matrimonio, que se quería muchí...",...,1.0,1.0,2.0,3.0,3.0,ES,ES,-1.0,Iberic,Retired
5,6,76ef63369f7d5b6597a543017e1ef578,1,1,T,200000,-0.3,0.9,0.34,Erase una vez un niño que se encontraba muy tr...,...,1.0,1.0,2.0,3.0,3.0,ES,ES,-1.0,Iberic,Retired
6,7,3a7bc6a0450eda9cc016324a2ee5b749,3VH,1,T,200000,-0.1,0.3,0.16,Alguien que está triste o cansado de algo y es...,...,1.0,5.0,2.0,2.0,1.0,ES,ES,-1.0,Iberic,Student
7,8,3a7bc6a0450eda9cc016324a2ee5b749,11,0,T,200000,0.2,0.2,0.82,Erase una vez un bosque encantado en el cual n...,...,1.0,5.0,2.0,2.0,1.0,ES,ES,-1.0,Iberic,Student
8,9,3a7bc6a0450eda9cc016324a2ee5b749,13N,1,T,200000,0.8,0.8,0.69,Erase una vez un niño que le gustaba descubrir...,...,1.0,5.0,2.0,2.0,1.0,ES,ES,-1.0,Iberic,Student
9,10,4509cf6e9d9a624a3a809bf96cfbdbd7,3VH,1,T,200000,-0.1,0.3,0.45,Pues no sé. Puede haber llegado por la noche d...,...,1.0,6.0,2.0,4.0,4.0,ES,ES,-1.0,Iberic,Engineer


In [75]:
# Preview the last 30 rows of the merged table.
prolex_df.tail(n=30)

Unnamed: 0,RowId,code,card,hum,mode,time,G-score,G-magnitude,Azure-TA,Text,...,Dhand,Studies,SClass,Siblings,SibPos,Origin,Resid,Rtime,Ethnic,Job
308,309,21f81d0e1462187617f7f93d3e34a90d,11,0,W,1156305,-0.2,0.2,0.51,"una ciudad perdida en mitad de la nada, donde ...",...,1.0,5.0,2.0,4.0,3.0,ES,ES,-1.0,Iberic,Marketing
309,310,21f81d0e1462187617f7f93d3e34a90d,13HM,2,W,1156305,0.1,0.5,0.73,"Érase una vez... un cuarto, un cuarto visitado...",...,1.0,5.0,2.0,4.0,3.0,ES,ES,-1.0,Iberic,Marketing
310,311,bd0f72617b5ea094ac4accb397070d8d,1,1,W,1393972,-0.4,2.9,0.39,Érase una vez un niño castaño de unos 7 años q...,...,1.0,6.0,2.0,1.0,1.0,ES,ES,-1.0,Iberic,Marketing
311,312,bd0f72617b5ea094ac4accb397070d8d,9VH,4,W,1393972,-0.7,0.7,0.72,"en la época de los 80, un grupo de ladrones qu...",...,1.0,6.0,2.0,1.0,1.0,ES,ES,-1.0,Iberic,Marketing
312,313,bd0f72617b5ea094ac4accb397070d8d,11,0,W,1393972,0.3,2.0,0.81,un lugar psicodelico al que nadie podía accede...,...,1.0,6.0,2.0,1.0,1.0,ES,ES,-1.0,Iberic,Marketing
313,314,bd0f72617b5ea094ac4accb397070d8d,13HM,2,W,1393972,-0.1,0.7,0.55,una pareja enamorada que tras hacer el amor ti...,...,1.0,6.0,2.0,1.0,1.0,ES,ES,-1.0,Iberic,Marketing
314,315,f0b6d248520d5d24a7553a41ee5faaa9,1,1,W,2036261,-0.1,0.3,0.39,Jorge es un chico de 10 años que no le gusta e...,...,1.0,5.0,2.0,2.0,2.0,MX,ES,1.0,Latin,Marketing
315,316,f0b6d248520d5d24a7553a41ee5faaa9,9VH,4,W,2036261,0.3,0.3,0.63,Eran 4 amigos que trabajaban en el campo y des...,...,1.0,5.0,2.0,2.0,2.0,MX,ES,1.0,Latin,Marketing
316,317,f0b6d248520d5d24a7553a41ee5faaa9,11,0,W,2036261,0.0,4.8,0.58,se escuchaba que en el pueblo existía un tesor...,...,1.0,5.0,2.0,2.0,2.0,MX,ES,1.0,Latin,Marketing
317,318,f0b6d248520d5d24a7553a41ee5faaa9,13HM,2,W,2036261,-0.1,2.5,0.41,Marco es un doctor conocido de la ciudad. \r\n...,...,1.0,5.0,2.0,2.0,2.0,MX,ES,1.0,Latin,Marketing


In [76]:
# Non-null count per column; a column whose count is below the others has
# missing values.
prolex_df.count()

RowId            338
code             338
card             338
hum              338
mode             338
time             338
G-score          338
G-magnitude      338
Azure-TA         338
Text             338
Text-EN          338
nlu-sentiment    338
nlu-label        336
nlu-joy          338
nlu-anger        338
nlu-disgust      338
nlu-sadness      338
nlu-fear         338
es-len           338
en-len           338
NLP              322
TAS20            322
F1               322
F2               322
F3               322
Tas20Time        322
Sex              322
Gender           322
Age              322
Dhand            322
Studies          322
SClass           322
Siblings         322
SibPos           322
Origin           322
Resid            322
Rtime            322
Ethnic           322
Job              322
dtype: int64

In [77]:
# List every row of the merged table that has a missing value in at least one
# column.
prolex_df.loc[prolex_df.isna().any(axis=1)]

Unnamed: 0,RowId,code,card,hum,mode,time,G-score,G-magnitude,Azure-TA,Text,...,Dhand,Studies,SClass,Siblings,SibPos,Origin,Resid,Rtime,Ethnic,Job
82,83,9d03e3a59f6ba89e93cb3db6f5ae54a7,1,1,W,341832,-0.7,1.4,0.29,A pepito le obligan a tocar el violín y el pre...,...,,,,,,,,,,
83,84,9d03e3a59f6ba89e93cb3db6f5ae54a7,9VH,4,W,341832,-0.6,0.6,0.67,Estos trabajadores después de una jornada inte...,...,,,,,,,,,,
84,85,9d03e3a59f6ba89e93cb3db6f5ae54a7,11,0,W,341832,0.7,0.7,0.44,Una cascada de agua fría que daba de beber a u...,...,,,,,,,,,,
85,86,9d03e3a59f6ba89e93cb3db6f5ae54a7,13HM,2,W,341832,-0.1,0.3,0.44,Una mujer desnuda que perdió la vida en mitad ...,...,,,,,,,,,,
126,127,71509121918c3bcb8eac7a59658bd644,1,1,W,750870,0.0,0.0,0.41,"Un niño que debía tener una clase, pero tenía ...",...,,,,,,,,,,
127,128,71509121918c3bcb8eac7a59658bd644,9VH,4,W,750870,0.9,0.9,0.86,Una fiesta muy alegre se realizó en un pueblit...,...,,,,,,,,,,
128,129,71509121918c3bcb8eac7a59658bd644,11,0,W,750870,0.0,0.2,0.76,en un lugar muy lejano en el inicio de los tie...,...,,,,,,,,,,
129,130,71509121918c3bcb8eac7a59658bd644,13HM,2,W,750870,0.3,1.1,0.45,"luego de un parto difícil, un médico se recupe...",...,,,,,,,,,,
138,139,d381cf2ee0ac9c9b5820936906d65472,1,1,W,563420,-0.6,0.6,0.35,Un niño q cada día le obligaban a tocar el vio...,...,,,,,,,,,,
139,140,d381cf2ee0ac9c9b5820936906d65472,9VH,4,W,563420,0.5,1.1,0.8,Después de un duro día de trabajo Javier se pu...,...,,,,,,,,,,


### Save merged tables

In [78]:
# Destination of the merged table (absolute, machine-specific path).
prolex_dataset_path = "D:\\Dropbox-Array2001\\Dropbox\\UNI\\MPGS\\2_TFM\\Datos\\prolexitim-merged-1.1.csv"

# Write a tab-separated, UTF-8 encoded file without the row index.
prolex_df.to_csv(
    prolex_dataset_path,
    index=False,
    sep="\t",
    encoding="utf-8",
)

### Generate specific tables for correlational analysis
TAS versus Text analytics scores (without text)

In [86]:
# Keep the participant code, the TAS-20 total and factor scores, the card id
# and the text-analytics scores; the raw text columns are left out.
tas_sentiment_df = prolex_df.loc[
    :,
    [
        "code",
        "TAS20", "F1", "F2", "F3",
        "card",
        "G-score", "G-magnitude",
        "Azure-TA",
        "nlu-sentiment", "nlu-label",
        "nlu-joy", "nlu-anger", "nlu-fear", "nlu-disgust", "nlu-sadness",
        "es-len",
    ],
]
tas_sentiment_df

Unnamed: 0,code,TAS20,F1,F2,F3,card,G-score,G-magnitude,Azure-TA,nlu-sentiment,nlu-label,nlu-joy,nlu-anger,nlu-fear,nlu-disgust,nlu-sadness,es-len
0,b7adde8a9eec8ce92b5ee0507ce054a4,39.0,12.0,14.0,13.0,13V,-0.2,0.2,0.62,-0.640157,negative,0.317920,0.143086,0.098997,0.422023,0.173421,115
1,b7adde8a9eec8ce92b5ee0507ce054a4,39.0,12.0,14.0,13.0,18NM,-0.5,0.5,0.41,0.000000,neutral,0.285100,0.168727,0.109176,0.057098,0.362623,110
2,b7adde8a9eec8ce92b5ee0507ce054a4,39.0,12.0,14.0,13.0,12VN,0.0,1.2,0.63,0.265769,positive,0.039779,0.205065,0.481812,0.244164,0.164005,93
3,76ef63369f7d5b6597a543017e1ef578,63.0,23.0,21.0,19.0,12VN,0.0,0.1,0.89,-0.353556,negative,0.208997,0.007244,0.190991,0.008434,0.698307,255
4,76ef63369f7d5b6597a543017e1ef578,63.0,23.0,21.0,19.0,10,0.3,0.1,0.24,-0.552068,negative,0.367801,0.063256,0.103351,0.095947,0.469062,184
5,76ef63369f7d5b6597a543017e1ef578,63.0,23.0,21.0,19.0,1,-0.3,0.9,0.34,-0.865733,negative,0.006273,0.065443,0.208772,0.113949,0.858103,284
6,3a7bc6a0450eda9cc016324a2ee5b749,56.0,18.0,20.0,18.0,3VH,-0.1,0.3,0.16,-0.952534,negative,0.004444,0.069648,0.118041,0.089484,0.938321,156
7,3a7bc6a0450eda9cc016324a2ee5b749,56.0,18.0,20.0,18.0,11,0.2,0.2,0.82,0.000000,neutral,0.561453,0.058646,0.172113,0.087763,0.215142,168
8,3a7bc6a0450eda9cc016324a2ee5b749,56.0,18.0,20.0,18.0,13N,0.8,0.8,0.69,0.000000,neutral,0.529544,0.084414,0.095722,0.044831,0.310398,223
9,4509cf6e9d9a624a3a809bf96cfbdbd7,38.0,12.0,14.0,12.0,3VH,-0.1,0.3,0.45,0.000000,neutral,0.263468,0.086924,0.295343,0.020744,0.158272,146


Check for and remove rows with missing values (NAs)

In [87]:
# Non-null count per column before removing incomplete rows.
tas_sentiment_df.count()

code             338
TAS20            322
F1               322
F2               322
F3               322
card             338
G-score          338
G-magnitude      338
Azure-TA         338
nlu-sentiment    338
nlu-label        336
nlu-joy          338
nlu-anger        338
nlu-fear         338
nlu-disgust      338
nlu-sadness      338
es-len           338
dtype: int64

In [89]:
# Discard every row that has a missing value in any of the selected columns.
tas_sentiment_df = tas_sentiment_df.dropna(axis=0, how="any")

In [90]:
# Non-null count per column after dropping incomplete rows; all columns should
# now report the same number.
tas_sentiment_df.count()

code             320
TAS20            320
F1               320
F2               320
F3               320
card             320
G-score          320
G-magnitude      320
Azure-TA         320
nlu-sentiment    320
nlu-label        320
nlu-joy          320
nlu-anger        320
nlu-fear         320
nlu-disgust      320
nlu-sadness      320
es-len           320
dtype: int64

In [None]:
### Export TAS Sentiment table

In [92]:
# Destination of the TAS/sentiment table (absolute, machine-specific path).
tas_sentiment_dataset_path = "D:\\Dropbox-Array2001\\Dropbox\\UNI\\MPGS\\2_TFM\\Datos\\prolexitim-sentiment-1.1.csv"

# Write a tab-separated, UTF-8 encoded file without the row index.
tas_sentiment_df.to_csv(
    tas_sentiment_dataset_path,
    index=False,
    sep="\t",
    encoding="utf-8",
)