In [1]:
import os
import sys

# Put the directory two levels above the current working directory on the
# import path; the `src.…` import in the next cell presumably relies on it.
project_root = f"{os.getcwd()}/../../"
sys.path.append(project_root)

In [2]:
from collections import defaultdict
import random
import warnings
from itertools import chain
from multiprocessing import cpu_count
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import ray

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from gensim.models import Word2Vec, Doc2Vec

import tensorflow as tf
from keras.utils import to_categorical
from keras.initializers import Constant
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense, Dropout, Activation, Flatten, 
    Embedding, Bidirectional, LSTM, GRU, Attention, 
    BatchNormalization, Conv1D, MaxPooling1D, TimeDistributed,
    SpatialDropout1D, GlobalMaxPooling1D, GlobalAveragePooling1D
)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from src.data.processing.utils import CleanUp, RSLP_STEMMER, SNOWBALL_STEMMER, NLP_LEMMATIZER

%matplotlib inline

# Text cleaner applied to every comment of the dataset; the three switches
# below are explicitly turned off.
cleanup_options = {
    "remove_accentuation": False,
    "remove_4_comment": False,
    "remove_numbers": False,
}
clean_up = CleanUp(**cleanup_options)

# Report how many GPUs TensorFlow can see.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
print("Num GPUs Available: ", len(gpu_devices))

Using TensorFlow backend.


Num GPUs Available:  1


In [3]:
# Open a TF1-compat session configured with log_device_placement=True
# (the captured output below is the resulting device mapping).
device_logging_config = tf.compat.v1.ConfigProto(log_device_placement=True)
sess = tf.compat.v1.Session(config=device_logging_config)

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce GTX 1060 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 6.1



In [4]:
# Vocabulary cap handed to the Keras Tokenizer (num_words) further down.
max_words = 250_000
# Mini-batch size used by the model.fit calls.
batch_size = 128
# Number of columns of the embedding matrix built from the Word2Vec vectors.
w2v_size = 300

# Sentences used to fit the tokenizer; filled by a later cell.
corpus = []

In [5]:
# ray.shutdown()
# ray.init(num_cpus=cpu_count(), include_webui=False, lru_evict=True)

# filename = f"{os.getcwd()}/../../../data/embedding/corpus.txt"
# file_it = pd.read_csv(filename, header=None, iterator=True, names=["sentence"], chunksize=1000,)

# @ray.remote
# def carregar_frases(lines):
#     import os
#     import sys

#     sys.path.append(f"{os.getcwd()}/../../")
#     from processing.utils import CleanUp, SNOWBALL_STEMMER
#     clean_up = CleanUp(stemmer=SNOWBALL_STEMMER)
    
#     lines = lines["sentence"].tolist()
#     return [clean_up.fit(line) for line in lines]

# i, itera = 0, []
# for lines in file_it:
#     itera.append(lines)
#     if i == 10:
#         break
#     i += 1
# corpus = list(chain(*ray.get([carregar_frases.remote(lines) for lines in itera])))

# ray.shutdown()

In [5]:
# Load the pipe-separated dataset and derive the model inputs:
#   x -> cleaned comment text, y -> 0 for "negativo", 1 for anything else.
dataset_path = f"{os.getcwd()}/../../data/processed/dataset.csv"
df = pd.read_csv(dataset_path, sep="|")


def _clean_comment(comment):
    """Cast the raw comment to str and run it through the shared cleaner."""
    return clean_up.fit(str(comment))


def _to_label(clasf):
    """Map the textual class to a binary label."""
    if clasf == "negativo":
        return 0
    return 1


df["x"] = df["comentario"].apply(_clean_comment)
df["y"] = df["classificacao"].apply(_to_label)
textos = df[["x", "y"]].to_numpy()

In [6]:
# Show one random (text, label) row and the dataset size as a sanity check.
print(random.choice(textos))
print(f'Dataset size: {len(textos)}')
# print(df[df['x'] == ''].index)
# print(df.loc[250])

# Add every cleaned text to the shared corpus list.
# Note: this appends to the existing list, so re-running the cell adds the
# dataset a second time.
corpus.extend(textos[:, 0])
print(f'Corpus size: {len(corpus)}')

['cloroquina deve ser administrada aos pacientes logo início doença preferencialmente dia aparecimento dos primeiros sintomas como febre tosse coriza respiração superior vezes por minuto paolo zanoto virologista usp'
 0]
Dataset size: 2560
Corpus size: 2560


In [8]:
# Padding length: word count of the longest cleaned text, plus one.
longest_phrase = max((len(phrase.split()) for phrase in textos[:, 0]), default=0)
maxlen = longest_phrase + 1
print(f'Maxlen: {maxlen}')

Maxlen: 82


In [None]:
# word_counts = defaultdict(int)
# for phrases in textos[:, 0]:
#     for word in phrases.split():
#         word_counts[word] += 1
# v_count = len(word_counts.keys())
# words_list = list(word_counts.keys())
# word_index = dict((word, i) for i, word in enumerate(words_list))

# X = []
# for phrases in textos[:, 0]:
#     xp = []
#     for word in phrases.split():
#         xp.append(word_index[word])
#     X.append(xp)
# y = textos[:, 1].astype(np.int).ravel()
# y = to_categorical(y, classes)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# tokenizer = Tokenizer(num_words=max_words)
# X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
# X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')

In [None]:
# Baseline: logistic regression fitted directly on the padded token-id sequences.
X = textos[:, 0]
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.
y = textos[:, 1].astype(int).ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The tokenizer vocabulary is built from the whole corpus list.
tokenizer = Tokenizer(num_words=max_words, lower=False, oov_token='<OOV>', char_level=False)
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1

# Texts -> integer sequences, padded at the end up to `maxlen`.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

clf = LogisticRegression(
    random_state=0,
    n_jobs=-1,
    max_iter=3000,
    multi_class="ovr"
)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

print(classification_report(y_test, pred))

# confusion_matrix expects (y_true, y_pred); the previous (pred, y_test) order
# transposed the matrix relative to the other evaluation cells.
df_cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(6, 4))
sn.heatmap(df_cm, annot=True, fmt="d")
plt.tight_layout()
plt.show()

In [7]:
# Load the trained Word2Vec model and eyeball the nearest neighbours of a few probes.
w2v = Word2Vec.load(f"{os.getcwd()}/../../models/w2v.model")

# A probe is either a single word or a list of words queried together.
probe_words = [
    "masculino", "sexo", "montanha", "oceano", "lua", "amor", "senhor", "medico",
    "cimegripe", "passaro", "doenca", "coracao", "febre", "nimesulida", "rancor",
    "lobo", "mau", "odio", "dor", "coriza", "braco", ["lobo", "mau"], "maca", "coco",
    "espada", "cavaleiro", "arthur", ["rei", "arthur"]
]
for word in probe_words:
    # Query through the KeyedVectors (`.wv`): the model-level `most_similar`
    # is deprecated and no longer exists in gensim 4. Ask for just the top 3
    # instead of slicing the default top 10.
    print(word, "=>", w2v.wv.most_similar(word, topn=3))

masculino => [('feminino', 0.8773196935653687), ('sexo', 0.8331319689750671), ('sexos', 0.7289565801620483)]
sexo => [('masculino', 0.8331320285797119), ('feminino', 0.7899670600891113), ('sexos', 0.7375198602676392)]
montanha => [('cume', 0.8779435753822327), ('everest', 0.8501795530319214), ('íngreme', 0.8373156785964966)]
oceano => [('atlântico', 0.9057531356811523), ('antártida', 0.880797803401947), ('pacífico', 0.8795431852340698)]
lua => [('vênus', 0.7782222032546997), ('saturno', 0.764536440372467), ('cheia', 0.7553818225860596)]
amor => [('amar', 0.8543565273284912), ('felicidade', 0.8307334780693054), ('ama', 0.8102244734764099)]
senhor => [('dai', 0.7575034499168396), ('digo', 0.7542639970779419), ('comigo', 0.7504541277885437)]
medico => [('solicite', 0.9183236360549927), ('discuta', 0.8992379903793335), ('decidirá', 0.887068510055542)]
cimegripe => [('tylenol', 0.9584749341011047), ('tylalgin', 0.948271632194519), ('resfenol', 0.9426720142364502)]
passaro => [('porthos', 0.

In [None]:
# Prepare padded token-id sequences for the embedding-based network.
X = textos[:, 0]
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.
y = textos[:, 1].astype(int).ravel()
# y = to_categorical(y, 2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The tokenizer vocabulary is built from the whole corpus list.
tokenizer = Tokenizer(num_words=max_words, lower=False, oov_token='<OOV>', char_level=False)
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1

# Texts -> integer sequences, padded at the end up to `maxlen`.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

print(f"Vocab size: {vocab_size}")

In [None]:
# y_train

In [None]:
from sklearn.utils import class_weight

# 'balanced' weights for the labels that actually occur in the training split.
# `classes` and `y` are passed by keyword: newer scikit-learn releases reject
# them as positional arguments, and the keyword form works on older ones too.
labels = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight('balanced', classes=labels, y=y_train)

# Map each label to its weight instead of assuming exactly the labels 0 and 1.
class_weights = {int(label): float(weight) for label, weight in zip(labels, balanced_weights)}
class_weights

In [None]:
# Build the initial embedding weights: row i holds the Word2Vec vector of the
# token with tokenizer index i; tokens unknown to Word2Vec keep an all-zero row.
word_index = tokenizer.word_index

embedding_matrix = np.zeros((len(word_index) + 1, w2v_size))
for word, i in word_index.items():
    try:
        embedding_matrix[i] = w2v.wv[word]
    except KeyError:
        # Word missing from the Word2Vec vocabulary: leave the zero row.
        # Only KeyError is caught so that real failures (and Ctrl-C) are no
        # longer silently swallowed by a bare `except`.
        continue
print('Embedding Matrix size:', embedding_matrix.shape)

In [None]:
# X_train[0], X_test[0]

In [None]:
# Binary classifier: an embedding layer seeded with `embedding_matrix` (left
# trainable), a bidirectional LSTM, an L2-regularised ELU dense layer, dropout
# and a single sigmoid output unit.
# The commented-out experiments that lived in this cell (stacked
# Conv1D/MaxPooling1D, BatchNormalization, extra Dropout, global pooling,
# Flatten, a second BiLSTM and a tanh Dense layer) are not reproduced here.
model = Sequential(
    [
        Embedding(
            len(word_index) + 1,
            w2v_size,
            weights=[embedding_matrix],
            trainable=True,
        ),
        Bidirectional(LSTM(128)),
        Dense(
            128,
            activation='elu',
            kernel_regularizer=regularizers.l2(1e-4),
            activity_regularizer=regularizers.l2(1e-4),
        ),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ],
    name='WordEmbeddings',
)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the embedding model, keep the weights of the epoch with the best
# validation accuracy, then reload them and score the test split.
epochs = 25
checkpoint_filepath = './weights/weights.hdf5'
# Make sure the target directory exists before the first checkpoint is
# written; not every Keras version creates it, and a fresh checkout has no
# ./weights folder.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=0,
    save_best_only=True)

# NOTE(review): validation_data is the test split, so the checkpoint is
# selected on the same data that is reported below.
history = model.fit(
    X_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0,
    shuffle=True,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint_callback]
)

# Restore the best checkpoint before evaluating.
model.load_weights(checkpoint_filepath)
score = model.evaluate(X_test, y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', f'{round(score[1] * 100)}%')

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.
pred = (model.predict(X_test) >= .5).astype(int).ravel()
print(classification_report(y_test, pred))

# print(np.unique(y_test, return_counts=True))
# print(np.unique(pred, return_counts=True))

# Rows are true labels, columns are predicted labels.
df_cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(6, 4))
sn.heatmap(df_cm, annot=True, fmt="d")
plt.tight_layout()
plt.show()

In [None]:
# Training curves: accuracy on the left panel, loss on the right panel.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(15, 5))

ax_acc.plot(history.history["accuracy"], label="Acc", c="C0")
ax_acc.plot(history.history["val_accuracy"], label="Val. Acc", c="C2")
ax_acc.legend()

ax_loss.plot(history.history["loss"], label="Loss", c="C1")
ax_loss.plot(history.history["val_loss"], label="Val. Loss", c="C3")
# Only the loss panel gets explicit ticks (every second epoch).
ax_loss.set_xticks(range(0, epochs, 2))
ax_loss.legend()

fig.tight_layout()
plt.show()

### Doc2Vec

In [8]:
# Load the trained Doc2Vec model and infer one document vector per cleaned
# text (column 0 of `textos`), using 500 inference epochs each.
d2v = Doc2Vec.load(f"{os.getcwd()}/../../models/d2v.model")

n_textos = np.asarray(
    [d2v.infer_vector(texto[0].split(), epochs=500) for texto in textos]
)

In [None]:
# Features are the inferred document vectors; labels come from column 1 of `textos`.
X = n_textos
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.
y = textos[:, 1].astype(int).ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# # For Conv & LSTM
print(X_train.shape)
print(X_test.shape)
# Add a trailing axis so each vector becomes a sequence for the recurrent
# layers. The leading sizes are taken from the arrays themselves rather than
# the hard-coded 2048 / 512 / 300, so the cell keeps working when the dataset
# size or the vector width changes (and stays idempotent when re-run).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], -1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], -1)
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [None]:
# Binary classifier over the reshaped document vectors: two stacked
# bidirectional LSTMs (the first one returns the full sequence), a wide ReLU
# dense layer, dropout and a single sigmoid output unit.
# The commented-out Conv1D / MaxPooling1D / BatchNormalization / pooling /
# Flatten experiments that lived in this cell are not reproduced here.
model = Sequential(
    [
        Bidirectional(LSTM(128, return_sequences=True)),
        Bidirectional(LSTM(128)),
        Dense(1024, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ],
    name='Doc2Vec',
)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()

In [None]:
# Train the Doc2Vec model, keep the weights of the epoch with the best
# validation accuracy, then reload them and score the test split.
epochs = 10
checkpoint_filepath_d2v = './weights/weights_d2v.hdf5'
# Make sure the target directory exists before the first checkpoint is
# written; not every Keras version creates it, and a fresh checkout has no
# ./weights folder.
os.makedirs(os.path.dirname(checkpoint_filepath_d2v), exist_ok=True)

model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath_d2v,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=0,
    save_best_only=True)

# NOTE(review): validation_data is the test split, so the checkpoint is
# selected on the same data that is reported below.
history = model.fit(
    X_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0,
    shuffle=True,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint_callback]
)

# Restore the best checkpoint before evaluating.
model.load_weights(checkpoint_filepath_d2v)
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', round(score[1] * 100))

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.
pred = (model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, pred))

# Rows are true labels, columns are predicted labels.
df_cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(6, 4))
sn.heatmap(df_cm, annot=True, fmt="d")
plt.tight_layout()
plt.show()

In [None]:
# Training curves: accuracy on the left panel, loss on the right panel.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(15, 5))

ax_acc.plot(history.history["accuracy"], label="Acc", c="C0")
ax_acc.plot(history.history["val_accuracy"], label="Val. Acc", c="C2")
ax_acc.legend()

ax_loss.plot(history.history["loss"], label="Loss", c="C1")
ax_loss.plot(history.history["val_loss"], label="Val. Loss", c="C3")
# Only the loss panel gets explicit ticks (every second epoch).
ax_loss.set_xticks(range(0, epochs, 2))
ax_loss.legend()

fig.tight_layout()
plt.show()