In [27]:
# One-time environment setup: uncomment and run if the packages are missing.
# NOTE(review): the spaCy model is not used by any cell visible in this notebook.
# !pip install datasets
# !python -m spacy download es_core_news_sm

## Entrenamos NN con capa de Embeddings

In [1]:
import tensorflow as tf
# Ask TensorFlow to grow GPU memory usage on demand.
# The setting is applied to every GPU that is detected, and the loop is simply
# a no-op on CPU-only machines (indexing gpus[0] there raised IndexError).
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [2]:
import pandas as pd
from datasets import load_dataset

In [3]:
# Training split of the "es" configuration of the amazon_reviews_multi dataset.
my_dataset = load_dataset(
    path="amazon_reviews_multi",
    name="es",
    split="train",
)

Reusing dataset amazon_reviews_multi (/home/alejandro/.cache/huggingface/datasets/amazon_reviews_multi/es/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


In [4]:
# Read the two needed columns directly from the dataset. Column access avoids
# walking the whole dataset twice and decoding every row into a dict just to
# keep a single field from it.
# list(...) keeps both variables plain Python lists whatever container type
# the installed `datasets` version returns for a column.
sentences = list(my_dataset['review_body'])
labels = list(my_dataset['stars'])

In [5]:
# Two-column frame: review text and its star rating.
df = pd.DataFrame({'sentences': sentences, 'labels': labels})

In [6]:
# Preview the frame; pandas abbreviates the middle rows of a large frame.
df

Unnamed: 0,sentences,labels
0,Nada bueno se me fue ka pantalla en menos de 8...,1
1,"Horrible, nos tuvimos que comprar otro porque ...",1
2,Te obligan a comprar dos unidades y te llega s...,1
3,"No entro en descalificar al vendedor, solo pue...",1
4,Llega tarde y co la talla equivocada,1
...,...,...
199995,Mando funciona perfectamente y cumple con toda...,5
199996,"Compré la batería con cierta reticencia, pero ...",5
199997,Buena calidad. Satisfecha con la compra.,5
199998,Perfecto para el cumple de mi hijo,5


In [7]:
# Class-balance check: number of reviews per star rating.
df['labels'].value_counts()

1    40000
2    40000
3    40000
4    40000
5    40000
Name: labels, dtype: int64

# ENTRENAMIENTO

In [8]:
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Turn every review into a fixed-length vector of word ids.
sentences = df['sentences'].tolist()

# Build the vocabulary from the full corpus.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(sentences)

# Encode, then force each sequence to exactly 20 ids: longer reviews lose
# their tail, shorter ones are padded at the end.
X = pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    maxlen=20,
    padding='post',
    truncating='post',
)

In [9]:
import numpy as np

# Star ratings as a NumPy vector, in the same row order as X.
labels = df['labels'].tolist()
y = np.asarray(labels)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# One embedding row per vocabulary word, plus one extra row for id 0.
vocab_size = len(tokenizer.word_counts) + 1

# Embedding -> flatten -> dropout -> 5-way softmax classifier.
model = Sequential([
    Embedding(vocab_size, 10, input_length=20),  # 20 tokens x 10 dims each
    Flatten(),                                   # -> 200 features per review
    Dropout(0.7),
    Dense(5, activation="softmax"),              # one output unit per class
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 10)            566410    
_________________________________________________________________
flatten (Flatten)            (None, 200)               0         
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 1005      
Total params: 567,415
Trainable params: 567,415
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Keras carves the validation_split fraction off the END of the arrays, before
# any shuffling. The frame appears to be ordered by rating (its head is all
# 1-star, its tail all 5-star), so an unshuffled split would hold out the
# highest ratings and the network would never train on them. Shuffle once with
# a fixed seed so both partitions mix all classes and the split is reproducible.
shuffle_order = np.random.default_rng(42).permutation(len(X))

# sparse_categorical_crossentropy needs class ids in [0, 5) for the 5-unit
# softmax, but y holds star ratings 1..5, so shift the targets to 0..4.
# A predicted class index k therefore corresponds to k + 1 stars.
X_train = X[shuffle_order]
y_train = y[shuffle_order] - 1

# Keep the History object so the per-epoch metrics can be inspected later.
history = model.fit(X_train, y_train, batch_size=16, epochs=2, validation_split=0.25, verbose=1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fc18ea62130>

# PREDICCIÓN

In [28]:
# Hand-written examples, encoded exactly like the training data.
# NOTE(review): this reuses the names `sentences` and `X`, so the training
# arrays are overwritten; re-run the tokenisation cell before training again.
sentences = [
    "Muy agradecido con el resultado",
    "Me parece horrible",
    "Ya te gustaría",
    "son las tres en punto",
    "Insatisfecho con el producto",
]
X = pad_sequences(
    tokenizer.texts_to_sequences(sentences),
    maxlen=20,
    padding='post',
    truncating='post',
)

In [29]:
# For each example, keep the index of the output unit with the highest score.
probabilities = model.predict(X, verbose=0)
values = [np.argmax(row) for row in probabilities]
values

[4, 1, 4, 3, 1]