In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import numpy as np
import tensorflow as tf
import pickle

In [2]:
# Carrega os dados
column_names = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(r"..\data\training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1", names=column_names)


In [3]:
# Pré-processamento dos dados
df = df[["target", "text"]]  # Mantém apenas as colunas necessárias
df["target"] = df["target"].replace({0: 0, 4: 1})


In [4]:
# Tokenização e Vetorização
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"])
padded_sequences = pad_sequences(sequences, maxlen=100)


In [5]:
# Divisão dos dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df["target"], test_size=0.2, random_state=42)


In [8]:
# Construção do modelo RNN
model = Sequential([
    Embedding(input_dim=10000, output_dim=16),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [9]:
# Compilação do modelo
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [10]:
# Treinamento do modelo
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))


Epoch 1/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m775s[0m 77ms/step - accuracy: 0.7770 - loss: 0.4697 - val_accuracy: 0.8156 - val_loss: 0.4037
Epoch 2/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m804s[0m 80ms/step - accuracy: 0.8162 - loss: 0.4066 - val_accuracy: 0.8195 - val_loss: 0.3964
Epoch 3/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m828s[0m 83ms/step - accuracy: 0.8223 - loss: 0.3954 - val_accuracy: 0.8232 - val_loss: 0.3884
Epoch 4/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m838s[0m 84ms/step - accuracy: 0.8260 - loss: 0.3875 - val_accuracy: 0.8244 - val_loss: 0.3873
Epoch 5/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m844s[0m 84ms/step - accuracy: 0.8291 - loss: 0.3826 - val_accuracy: 0.8254 - val_loss: 0.3859
Epoch 6/10
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m838s[0m 84ms/step - accuracy: 0.8310 - loss: 0.3783 - val_accuracy: 0.8267 - val

<keras.src.callbacks.history.History at 0x18ef6276cb0>

In [11]:
# Avaliação do modelo
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Acurácia do modelo nos dados de teste:", test_acc)

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 14ms/step - accuracy: 0.8285 - loss: 0.3805
Acurácia do modelo nos dados de teste: 0.8286499977111816


In [15]:
# Salvar o modelo e o tokenizer
model.save(r"..\modelos\modelo_rnn.keras")
with open(r"..\modelos\tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Modelo treinado e tokenizer salvos com sucesso.")

Modelo treinado e tokenizer salvos com sucesso.
