In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
dataset_path = "/content/gdrive/My Drive/datasets/tas-nlp/"

In [3]:
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

print('Keras       ', keras.__version__)
print('TensorFlow  ', tf.__version__)

Keras        2.2.4-tf
TensorFlow   1.13.1


Using TensorFlow backend.


In [4]:

from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

['/device:GPU:0']

In [0]:
# drive.mount("/content/gdrive", force_remount=True)

In [6]:
! ls -la /content/gdrive/'My Drive'/datasets/tas-nlp/prolexitim-merged-1.3.csv
      

-rw------- 1 root root 209682 May 13 09:39 '/content/gdrive/My Drive/datasets/tas-nlp/prolexitim-merged-1.3.csv'


In [0]:
tasnlp_df = pd.read_csv(dataset_path + "prolexitim-merged-1.3.csv", header=0, delimiter="\t")

In [0]:
# tasnlp_df.head()

In [0]:
# We only need text and label
data = tasnlp_df[['Text','alex-a']]

In [10]:
data.head()

Unnamed: 0,Text,alex-a
0,Era un niño pensando en el granero pensando a ...,NoAlex
1,"Una madre que está consolando a su hijo, despu...",NoAlex
2,Un pantanal con una barca abandonada. A ver qu...,NoAlex
3,"Era un paraje muy bonito, con una barca, un po...",Alex
4,"Era una vez un matrimonio, que se quería muchí...",Alex


In [11]:
print(data.dtypes)
print(data.count())

Text      object
alex-a    object
dtype: object
Text      334
alex-a    318
dtype: int64


In [0]:
data = data.dropna()

In [13]:
data.count()

Text      318
alex-a    318
dtype: int64

In [14]:
data.groupby('alex-a').count()

Unnamed: 0_level_0,Text
alex-a,Unnamed: 1_level_1
Alex,31
NoAlex,242
PosAlex,45


In [0]:
# Join alexithymia and possible alexithymia as one positive class
#data['alex-a'] = data['alex-a'].apply(lambda x: x.replace('PosAlex', 'NoAlex'))
# data['alex-a'] = data['alex-a'].apply(lambda x: x.replace('PosAlex', 'Alex'))

In [16]:
data.groupby('alex-a').count()

Unnamed: 0_level_0,Text
alex-a,Unnamed: 1_level_1
Alex,31
NoAlex,242
PosAlex,45


In [0]:
# Cleanining text

import unicodedata
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
                    

In [0]:
# Cleanining text

data['Text'] = data['Text'].apply(lambda x: remove_accents(x))
data['Text'] = data['Text'].apply(lambda x: x.lower())
data['Text'] = data['Text'].apply((lambda x: re.sub('[^ña-zA-z0-9\s]', '', x)))

In [19]:
data.tail(n=14)

Unnamed: 0,Text,alex-a
320,una cascada,NoAlex
321,hombre llorando porque su mujer ha muerto,NoAlex
322,erase una vez un nino que debia acudir a clase...,NoAlex
323,varios hombres habian decidido descansar en me...,NoAlex
324,el fin del camino empedrado estaba cerca si te...,NoAlex
325,acababa de fallecer su mujer estaba desconsola...,NoAlex
326,erase una vez un nino al que no le gustaba la ...,Alex
327,erase una vez un campesino que durante las hor...,Alex
328,erase una ve un lugar maravilloso donde la man...,Alex
329,erase una vez un hombre que no amaba a su muje...,Alex


In [20]:
nb_alex = data[data['alex-a'] == 'Alex'].size
nb_no_alex = data[data['alex-a'] == 'NoAlex'].size
nb_pos_alex = data[data['alex-a'] == 'PosAlex'].size


print(f"There are {nb_alex} positive intances of alexithymia or possible alexithymia.")
print(f"There are {nb_no_alex} negative instances, no alexithymia detected")
print(f"There are {nb_pos_alex} possible instances, possible alexithymia detected")


There are 62 positive intances of alexithymia or possible alexithymia.
There are 484 negative instances, no alexithymia detected
There are 90 possible instances, possible alexithymia detected


In [0]:
# Keras vectorization of the text corpus 
max_features = 2000

tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['Text'].values)

X = tokenizer.texts_to_sequences(data['Text'].values)

In [22]:
print(f"Text: {data['Text'][1]}. Vector: {X[1]}") 
print(f"Text: {data['Text'][2]}. Vector: {X[2]}") 

Text: una madre que esta consolando a su hijo despues de darle las notas que ha sacado no se cuantos insuficientes. Vector: [8, 160, 1, 16, 870, 5, 11, 125, 45, 2, 871, 18, 397, 1, 38, 872, 12, 10, 539, 873]
Text: un pantanal con una barca abandonada a ver que estan haciendo los de la barca buscar setas. Vector: [4, 874, 14, 8, 161, 540, 5, 112, 1, 39, 126, 19, 2, 7, 161, 212, 875]


In [0]:
# Make sure we have the same length in all document vector
X = pad_sequences(X)

In [24]:
print(f"Vector Length: {len(X[1])}")
print(X[1])
print(f"Vector Length: {len(X[131])}")
print(X[131])
print(f"Vector Length: {len(X[3])}")
print(X[3])

Vector Length: 162
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   8 160
   1  16 870   5  11 125  45   2 871  18 397   1  38 872  12  10 539 873]
Vector Length: 162
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0

In [0]:
# One HOT encoding for alexithymia label
# and the values in Y. 

Y = pd.get_dummies(data['alex-a']).values

In [26]:
# Train / Test partitions (90/10)

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)

print(f"Train shape. Feature Vector:{X_train.shape}, Label:{Y_train.shape} (binary classification)")
print(f"Test shape. Feature Vector: {X_test.shape}, Label:{Y_test.shape} (binary classification)")

Train shape. Feature Vector:(254, 162), Label:(254, 3) (binary classification)
Test shape. Feature Vector: (64, 162), Label:(64, 3) (binary classification)


In [0]:
# define roc_callback, inspired by https://github.com/keras-team/keras/issues/6050#issuecomment-329996505
def auc_roc(y_true, y_pred):
    # any tensorflow metric
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

In [31]:
# Deep RNN Model Definition:
# Embedding -> Dropout -> LSTM -> Dense 

embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy', auc_roc])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 162, 128)          256000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 162, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
from keras.callbacks import Callback, EarlyStopping

my_callbacks = [EarlyStopping(monitor='auc_roc', patience=300, verbose=1, mode='max')]

In [33]:
# Train the model
batch_size = 32
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 2, callbacks=my_callbacks)

Instructions for updating:
Use tf.cast instead.
Epoch 1/15
 - 3s - loss: 1.0117 - acc: 0.6929 - auc_roc: 0.6313
Epoch 2/15
 - 2s - loss: 0.7328 - acc: 0.7835 - auc_roc: 0.8282
Epoch 3/15
 - 2s - loss: 0.6986 - acc: 0.7835 - auc_roc: 0.8370
Epoch 4/15
 - 2s - loss: 0.6685 - acc: 0.7835 - auc_roc: 0.8359
Epoch 5/15
 - 2s - loss: 0.6445 - acc: 0.7835 - auc_roc: 0.8442
Epoch 6/15
 - 2s - loss: 0.6238 - acc: 0.7835 - auc_roc: 0.8497
Epoch 7/15
 - 2s - loss: 0.5897 - acc: 0.7835 - auc_roc: 0.8566
Epoch 8/15
 - 2s - loss: 0.5470 - acc: 0.7835 - auc_roc: 0.8683
Epoch 9/15
 - 2s - loss: 0.4955 - acc: 0.7953 - auc_roc: 0.8771
Epoch 10/15
 - 2s - loss: 0.3750 - acc: 0.8386 - auc_roc: 0.8878
Epoch 11/15
 - 2s - loss: 0.2997 - acc: 0.9016 - auc_roc: 0.8982
Epoch 12/15
 - 2s - loss: 0.2284 - acc: 0.9331 - auc_roc: 0.9091
Epoch 13/15
 - 2s - loss: 0.2078 - acc: 0.9409 - auc_roc: 0.9183
Epoch 14/15
 - 2s - loss: 0.1640 - acc: 0.9488 - auc_roc: 0.9264
Epoch 15/15
 - 2s - loss: 0.1368 - acc: 0.9567 - au

<keras.callbacks.History at 0x7fcbe8db7c88>

In [0]:
print(X_test.shape)
print(Y_test.shape)
print(f"Test shape. Feature Vector: {X_test.shape}, Label:{Y_test.shape} (binary classification)")

In [34]:
# validation_size = 2

# X_validate = X_test[-validation_size:]
# Y_validate = Y_test[-validation_size:]
# X_test = X_test[:-validation_size]
# Y_test = Y_test[:-validation_size]

score,acc,auc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print(model.metrics_names)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))
print("auc: %.2f" % (auc))

['loss', 'acc', 'auc_roc']
score: 1.29
acc: 0.64
auc: 0.94


In [0]:
narratives = [' un niño al que sus padres de clase acomodada le dijeron que tenía que aprender a tocar el violín y el niño lo odiaba.', 
             'Hay unos toros que están enfadados y se preparan para atacar. Van a bajar todos corriendo por la cuesta y si pillan a alguien lo más seguro que es que lo maten.',
             'Varios hombres se encuentran echando una siesta en su pausa para el almuerzo. Bajo un sol y calor de justicia, la parada se agradece tremendamente. Las horas de trabajo físico en el campo se hacen largas y cansadas para ganar un jornal que a duras penas les permite llegar a final de mes. Sólo esperan coger fuerzas durante el descanso para terminar la jornada lo antes posible.',
             'un hombre que mata a su amante porque ella le amenazó con contárselo a su mujer si este no se iba con ella..',
             'tanto me han apretado con lo de las clases extraescolares, que he desarrollado el superpoder de tocar el violín con la mente, mientras los demás compañeros se dejan el cuello sujetando el violín, yo lo toco mandándole órdenes cerebrales al instrumento, el primer día que lo conseguí el profesor me dió una colleja pensando que dormía, cuando empezó a sonar se asustó tanto que ya ni se me acerca, ahora aprovecho las clases de violín para tocar con la mente y para dormir a intervalos con la certeza de que la mente del profesor no va a decirle a su mano que me ajusticie. Cada uno a lo suyo. Fin',
             'Se levantó y vio que nada podía ser más bello y nadie podría contemplarlo igual']
p_narratives =[]

for text in narratives:
  p_narratives.append(remove_accents(text).lower())
  

print(p_narratives)

In [0]:
# vectorizing the text by the pre-fitted tokenizer instance

pp_narratives = tokenizer.texts_to_sequences(p_narratives)

print(pp_narratives)

In [0]:
# Padding the tweet to have exactly the same shape as `embedding_2` input

pp_narratives = pad_sequences(pp_narratives, maxlen=162, dtype='int32', value=0)
print(pp_narratives)


In [0]:
# Testing the predictive model
alex_screening = model.predict(pp_narratives, batch_size=1, verbose = 2)
text_idx = 0 
for prediction in alex_screening:
  # print(prediction)
  print("NARRATIVE: " + narratives[text_idx])
  text_idx += 1
  if(np.argmax(prediction) == 0):
      print("PREDICTION: negative: No Alexithymia.")
  elif (np.argmax(prediction) == 1):
    print("PREDICTION: positive: Possible Alexithymia.")
  print("-----------------")