In [1]:
import os
import json
import gzip
import pandas as pd
import numpy as np
from urllib.request import urlopen
import pickle
! pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jellyfish
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 KB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Collecting pyDAWG
  Downloading pyDAWG-1.0.1.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuit

## 3. Entrenamiento/test
En esta etapa, se entrenan tres modelos de clasificación de sentimientos (regresión logística, árbol de decisión y una red neuronal convolucional) utilizando el conjunto de reseñas preprocesado.

### Preparación de los datos

En primer lugar, se divide el conjunto de datos en un conjunto de entrenamiento y un conjunto de test.

In [2]:
# Load the preprocessed review dataset from CSV.
data_lower = pd.read_csv('data_lower_preprocesado.csv')
# read_csv parses empty cells as NaN; a bare astype(str) would turn those into
# the literal string "nan", which downstream vectorizers would treat as a real
# word. Replace missing reviews with '' before casting.
data_lower['reviewText'] = data_lower['reviewText'].fillna('').astype(str)
# Display (rows, columns) as the cell output.
data_lower.shape

(26531, 13)

In [13]:
from sklearn.model_selection import train_test_split

# 80:20 train/test split of review text vs. sentiment label, with a fixed seed.
reviews = data_lower['reviewText']
labels = data_lower['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=0,
)

# Persist the test labels and the training texts for use in another notebook.
for path, values in (('y_test.npy', y_test), ('X_train.npy', X_train)):
    np.save(path, values)

### Vectorización del texto
A continuación, se vectoriza el texto utilizando el modelo de bolsa de palabras (BoW) con la clase CountVectorizer de scikit-learn.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer limited to the 3000 most frequent terms.
# The documents are plain strings (reviewText is cast to str after loading the
# CSV), so an identity tokenizer would make CountVectorizer iterate every
# document character by character and learn a vocabulary of single characters.
# The default word tokenizer is used instead (it keeps tokens of two or more
# word characters). lowercase=False leaves the stored text untouched, as the
# previous identity preprocessor did.
count_vectorizer = CountVectorizer(max_features=3000, lowercase=False)

# Learn the vocabulary on the training split only and build its count matrix.
X_train_vectors = count_vectorizer.fit_transform(X_train)

# The test split is only transformed with the vocabulary learned above; it is
# never used to fit the vectorizer.
X_test_vectors = count_vectorizer.transform(X_test)

# Persist the test matrix to disk.
with open('X_test_vectors.pkl', 'wb') as f:
    pickle.dump(X_test_vectors, f)



### Entrenamiento de los modelos

A continuación, se entrenarán tres modelos de clasificación de sentimientos: un modelo de regresión logística, un árbol de decisión y una red neuronal convolucional (CNN).

In [7]:
# Logistic regression model
from sklearn.linear_model import LogisticRegression
import pickle

# With max_iter=100 the solver stopped at the iteration limit before converging
# on these features (scikit-learn emitted a convergence warning), so the limit
# is raised. random_state is fixed for reproducibility.
lr_model = LogisticRegression(random_state=0, max_iter=1000)

# Fit on the bag-of-words training matrix.
lr_model.fit(X_train_vectors, y_train)

# Persist the fitted model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
filename = 'lr_model.sav'
with open(filename, 'wb') as f:
    pickle.dump(lr_model, f)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Decision tree

from sklearn.tree import DecisionTreeClassifier

# Decision tree capped at depth 5, with a fixed random_state.
dt_model = DecisionTreeClassifier(max_depth=5, random_state=0)

# Fit on the bag-of-words training matrix.
dt_model.fit(X_train_vectors, y_train)

# Persist the fitted model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
filename = 'dt_model.sav'
with open(filename, 'wb') as f:
    pickle.dump(dt_model, f)

In [9]:
# Neural network

from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import tensorflow as tf

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable, consistent with random_state=0 used by the other
# models in this notebook.
tf.keras.utils.set_random_seed(0)

# Train a Convolutional Neural Network (CNN) model
max_features = 20000  # Maximum number of words kept in the vocabulary
maxlen = 100  # Maximum length of a review (in words)

# The tokenizer vocabulary is learned from the training texts only; both splits
# are then mapped to integer sequences and padded/truncated to maxlen.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_seq = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_seq = pad_sequences(X_test_seq, maxlen=maxlen)
# Persist the padded test sequences to disk.
np.save('X_test_seq.npy', X_test_seq)
embedding_dim = 100  # Dimension of the embedding space

# Embedding -> 1D convolution -> global max pooling -> dense head with a single
# sigmoid output unit.
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=maxlen))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): the test split is passed as validation data, so the per-epoch
# validation metrics are test-set metrics. No callback in this cell acts on
# them, but they should not be used to tune the model.
# NOTE(review): binary_crossentropy presumably requires y_train to be 0/1
# labels; confirm the encoding of the 'sentiment' column.
model.fit(X_train_seq, y_train, epochs=5, batch_size=32, validation_data=(X_test_seq, y_test))
# Save the trained network so it can be loaded in an example
model.save('sentiment_model.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Write the padded test sequences to 'X_test_seq.npy'.
# NOTE(review): the CNN training cell already saves the same variable to the
# same path, so this cell looks redundant; confirm before removing it.
np.save('X_test_seq.npy', X_test_seq)