# Análisis de sentimiento para comentarios de películas
#### Proyecto parte 3
Ruben Gonzalez 20003314

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.metrics import roc_auc_score

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.callbacks import EarlyStopping

from keras.optimizers import Adam

import keras.models
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, BatchNormalization, GlobalMaxPooling1D, Dropout
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-reviews-dataset/imdb_reviews_dataset.csv


In [2]:
import tensorflow as tf

## Carga y preprocesamiento de datos

In [3]:
# Load the IMDB reviews CSV into a DataFrame and preview its first rows.
df = pd.read_csv('../input/imdb-reviews-dataset/imdb_reviews_dataset.csv')
df.head()

Unnamed: 0,data_type,id,rating,text
0,train,pos_0,9,Bromwell High is a cartoon comedy. It ran at t...
1,train,pos_10000,8,Homelessness (or Houselessness as George Carli...
2,train,pos_10001,10,Brilliant over-acting by Lesley Ann Warren. Be...
3,train,pos_10002,7,This is easily the most underrated film inn th...
4,train,pos_10003,8,This is not the typical Mel Brooks film. It wa...


In [4]:
# Count how many reviews carry each rating value in the raw data.
df.rating.value_counts()

0     50000
1     10122
10     9731
8      5859
4      5331
3      4961
7      4803
9      4607
2      4586
Name: rating, dtype: int64

Se eliminan las reseñas sin rating (rating = 0), pues no sirven para el análisis.

In [5]:
# Keep only the rows whose rating is non-zero; copy so later column
# assignments do not touch the original frame.
df_etiquetado = df.loc[df['rating'].ne(0)].copy()
df_etiquetado.shape

(50000, 4)

In [6]:
# Rating distribution after dropping the rows with rating 0.
df_etiquetado.rating.value_counts()

1     10122
10     9731
8      5859
4      5331
3      4961
7      4803
9      4607
2      4586
Name: rating, dtype: int64

Se crea una nueva columna que identifica si el sentimiento es positivo o no: se consideran positivos los comentarios con rating mayor o igual a 7. También se calcula la longitud (en palabras) de cada comentario.

In [7]:
# Binary label: 1 when the rating is 7 or higher, 0 otherwise.
df_etiquetado['sentimiento'] = (df_etiquetado['rating'] >= 7).astype(int)
# Number of whitespace-separated words in each review.
df_etiquetado["Length"] = df_etiquetado['text'].str.split().str.len()

In [8]:
# Preview the labelled frame with the new `sentimiento` and `Length` columns.
df_etiquetado.head()

Unnamed: 0,data_type,id,rating,text,sentimiento,Length
0,train,pos_0,9,Bromwell High is a cartoon comedy. It ran at t...,1,140
1,train,pos_10000,8,Homelessness (or Houselessness as George Carli...,1,428
2,train,pos_10001,10,Brilliant over-acting by Lesley Ann Warren. Be...,1,147
3,train,pos_10002,7,This is easily the most underrated film inn th...,1,124
4,train,pos_10003,8,This is not the typical Mel Brooks film. It wa...,1,120


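Aunque el dataset ya viene segmentado en train y test (columna `data_type`), más adelante las particiones de entrenamiento, validación y prueba se generan de forma aleatoria con `train_test_split`.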

In [9]:
from sklearn.model_selection import train_test_split


## Pre-procesamiento del texto
Se tokeniza el texto con NLTK y se entrena word2vec sobre las reseñas para obtener los embeddings de las entradas X

In [10]:
# Build the tokenised corpus used to train the word embeddings.
# Note: this reads the full `df` (not `df_etiquetado`), so every review in the
# CSV contributes to the corpus.
lines = df['text'].values.tolist()

stop_words = set(stopwords.words('english'))
# The punctuation-stripping table does not depend on the review, so build it once
# instead of once per row.
table = str.maketrans('', '', string.punctuation)

review_lines = list()
for line in lines:
    # Tokenise, lower-case and strip punctuation characters from each token.
    stripped = [w.lower().translate(table) for w in word_tokenize(line)]
    # Keep purely alphabetic tokens that are not English stop words.
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    review_lines.append(words)

In [11]:
# Number of tokenised reviews in the corpus.
len(review_lines)

100000

In [12]:
import gensim

In [13]:
%%time
# Dimensionality of the word vectors; reused later to size the embedding matrix.
EMBEDDING_DIM = 256

# Train Word2Vec on the tokenised reviews.
# NOTE(review): `size=` and `model.wv.vocab` look like the gensim 3.x names; newer
# gensim releases use `vector_size=` and `wv.key_to_index` — confirm the installed version.
model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=5)
# List of the words kept in the trained vocabulary.
words = list(model.wv.vocab)
print('Tamaño del vocabulario: %d' % len(words))

Tamaño del vocabulario: 56449
CPU times: user 4min 45s, sys: 1.56 s, total: 4min 46s
Wall time: 2min 50s


Almacenamos el vocabulario en el formato requerido para su potencial uso posterior

In [14]:
# Write the trained word vectors to a text file (binary=False) in word2vec format.
filename = 'vocabulario.txt'
model.wv.save_word2vec_format(filename, binary=False)

Recuperacion del vocabulario creado usando word2vec

In [15]:
# Load the saved word vectors into a dict mapping word -> float32 vector.
embedding_index = {}
# Explicit encoding so the read does not depend on the platform default.
with open('./vocabulario.txt', encoding='utf-8') as fin:
    for line in fin:
        values = line.split()
        if len(values) == 2:
            # Header line of the word2vec text format: "<num_words> <dim>".
            print('Num words - ', values[0])
            print('EMBEDDING_DIM =', values[1])
            continue
        word = values[0]
        # Parse the coefficients as numbers; without a dtype they would be
        # stored as an array of strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

Num words -  56449
EMBEDDING_DIM = 256


In [18]:
# Summary statistics of the per-review word counts; used to choose the padding length.
len_text_info = df_etiquetado['Length'].describe()
len_text_info

count    50000.000000
mean       231.156940
std        171.343997
min          4.000000
25%        126.000000
50%        173.000000
75%        280.000000
max       2470.000000
Name: Length, dtype: float64

In [None]:
# Padding length: mean review length plus two standard deviations, truncated to int.
# This cell must run before the tokenizer cell, which reads `max_length`.
max_length = int(len_text_info['mean'] + 2 * len_text_info['std'])
print(max_length) # = 573 for the statistics shown above (mean ~231.16, std ~171.34)

In [16]:
# Fit a Keras Tokenizer on the labelled reviews and turn each review into a
# sequence of integer token ids.
tokenizer_obj = Tokenizer()
total_reviews = df_etiquetado['text'].values
tokenizer_obj.fit_on_texts(total_reviews)
sequences = tokenizer_obj.texts_to_sequences(total_reviews)

# Mapping from each word seen by the tokenizer to its integer id.
word_index = tokenizer_obj.word_index
print('Encontrados %s tokens.' % len(word_index))

# Bring every sequence to `max_length` ids, padding at the end (padding='post').
# Requires `max_length` to be defined by an earlier cell.
review_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
# Binary target array aligned row-by-row with `review_pad`.
sentiment = df_etiquetado['sentimiento'].values
print(review_pad.shape)
print(sentiment.shape)

Encontrados 124252 tokens.


NameError: name 'max_length' is not defined

In [None]:
# NOTE(review): with default notebook settings only the last bare expression of a
# cell is rendered, so only `sentiment` is shown here.

# Dictionary of unique words and their associated token id
word_index['the']

# Tokenised (padded) text records
review_pad

# Array indicating whether the sentiment is positive (1) or negative (0)
sentiment

In [None]:
# Build the embedding matrix: row i holds the Word2Vec vector of the word whose
# tokenizer id is i; words without a trained vector keep an all-zero row.
# Row 0 is never assigned by the loop below when ids start at 1 (hence the +1).
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

words_n = 0  # words visited in the tokenizer vocabulary
finde_n = 0  # words for which a trained vector was found
for word, i in word_index.items():
    words_n += 1
    # Valid rows are 0..num_words-1, so the bound check must be inclusive
    # (the previous `i > num_words` could never protect against an out-of-range row).
    if i >= num_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        finde_n += 1
        # Convert explicitly in case the stored vector holds numeric strings.
        embedding_matrix[i] = np.asarray(embedding_vector, dtype='float32')

# Report coverage so the counters above are actually used.
print('%d of %d tokens have a pretrained vector' % (finde_n, words_n))

## Obteniendo particiones para entrenamiento/validacion/pruebas

In [None]:
# Shape of the padded input matrix before splitting.
review_pad.shape

In [None]:
# Random split with a fixed seed: first hold out 2% of the samples as the test set,
# then hold out 5% of the remaining samples as the validation set.
# NOTE(review): the split is random; the dataset's `data_type` column is not used here.
X_train, X_test, y_train, y_test = train_test_split(review_pad, sentiment, test_size = 0.02, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.05, random_state = 0)

In [None]:
# Print the shape of each partition: train, test and validation (features, then labels).
for split in (X_train, y_train, X_test, y_test, X_val, y_val):
    print(split.shape)

## Creacion del modelo RNN usando GRU



In [None]:
# GRU-based binary classifier on top of the frozen Word2Vec embeddings.
modelo = Sequential()
# Embedding layer initialised from `embedding_matrix` and kept fixed (trainable=False).
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_length,
    trainable=False,
)

modelo.add(embedding_layer)
modelo.add(BatchNormalization())
modelo.add(Dropout(0.2))
modelo.add(GRU(units=32))
modelo.add(BatchNormalization())
modelo.add(Dropout(0.2))
# Single sigmoid unit: output is the probability of the positive class.
modelo.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=0.0005)
# Pass the configured optimizer object; the string 'adam' would build a fresh
# Adam with its default learning rate and ignore the one set above.
modelo.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model.
modelo.summary()

## Entrenamiento

In [None]:
print('Inicio entrenamiento...')

# Stop when the validation loss has not improved for 10 epochs and roll the model
# back to the best epoch's weights, so the model saved and evaluated afterwards is
# the best one seen rather than the last one trained.
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
]
# `bitacora` keeps the per-epoch training history used by the plots below.
bitacora = modelo.fit(X_train, y_train, batch_size=128, epochs=30, validation_data=(X_val, y_val), verbose=2, callbacks=my_callbacks)

## Graficando el error y la precision

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot training vs. validation curves: loss on the left, accuracy on the right.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 4))

axs[0].plot(bitacora.history['loss'], label='loss')
axs[0].plot(bitacora.history['val_loss'], label='val_loss')
axs[0].set(title='Loss', xlabel='epoch')

axs[1].plot(bitacora.history['accuracy'], label='acc')
axs[1].plot(bitacora.history['val_accuracy'], label='val_acc')
axs[1].set(title='Accuracy', xlabel='epoch')

# `plt.legend()` only affects the current (last) axes, so add a legend to each panel.
for ax in axs:
    ax.legend()
plt.show();

## Guardando el modelo / Recuperando el modelo
Dado que ya se superó el porcentaje mínimo de exactitud solicitado, se procederá a almacenar el modelo

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
modelo.save('RNN-GRUModelo.h5')

In [None]:
# Reload the saved model. Enable only if needed.
# The file name below matches the one written by `modelo.save` in this notebook.
#from keras.models import load_model
#modelo = load_model('RNN-GRUModelo.h5')

## Probando con datos de test y validando error/precision

In [None]:
# Evaluate on the held-out test split; with the compile settings used in this
# notebook the result holds the loss and the accuracy metric.
metricasTest=modelo.evaluate(X_test,y_test)

Tiene una pérdida (loss) de 0.24 y una exactitud (accuracy) de 91%

## Se procedera a realizar otro modelo esta vez usando BERT Model para ver si se obtienen mejores resultados

Instalando librerias necesarias

In [None]:
!pip3 install ktrain

In [None]:
import os.path
import numpy as np
import tensorflow as tf
import ktrain
from ktrain import text

## Creando datasets de training y de test

In [None]:
# Preview the labelled frame that feeds the BERT preprocessing below.
df_etiquetado.head()

In [None]:
# Preprocess the labelled reviews for BERT with ktrain: returns the training pair,
# the validation pair and the preprocessor object.
# NOTE(review): no validation frame is passed, so ktrain presumably holds out part
# of `df_etiquetado` itself — confirm the split it applies.
(X_train_bert, y_train_bert), (X_val_bert, y_val_bert), preproc = text.texts_from_df(
    df_etiquetado,
    'text',
    label_columns=['sentimiento'],
    preprocess_mode='bert',
    maxlen=500,
)

## Construyendo el modelo BERT

In [None]:
# Build the ktrain BERT text classifier from the preprocessed training data.
modeloBert = text.text_classifier(
    name='bert',
    train_data=(X_train_bert, y_train_bert),
    preproc=preproc,
)

## Entrenando el modelo BERT

In [None]:
# Wrap the model with its training and validation data in a ktrain learner.
learner = ktrain.get_learner(
    model=modeloBert,
    train_data=(X_train_bert, y_train_bert),
    val_data=(X_val_bert, y_val_bert),
    batch_size=32,
)

In [None]:
# Train for a single epoch with ktrain's one-cycle policy at learning rate 2e-5.
learner.fit_onecycle(lr=2e-5, epochs=1)