In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv('datasets/train.csv')
df['label'] = df['label'].astype(float)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [3]:
train_texts = train_data['text'].astype(str).tolist()
train_labels = train_data['label'].astype(float).values
test_texts = test_data['text'].astype(str).tolist()
test_labels = test_data['label'].astype(float).values

In [4]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

In [5]:
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [6]:
max_len = 280
train_data = pad_sequences(train_sequences, maxlen=max_len)
test_data = pad_sequences(test_sequences, maxlen=max_len)

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Construir o modelo
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=32))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

# Compilar o modelo
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Resumo do modelo
model.summary()

In [11]:
# Treinamento do modelo
history = model.fit(train_data, train_labels, epochs=5, batch_size=64, validation_split=0.3)

Epoch 1/5
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.9666 - loss: 0.0877 - val_accuracy: 0.7664 - val_loss: 1.0443
Epoch 2/5
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.9731 - loss: 0.0703 - val_accuracy: 0.7639 - val_loss: 0.9319
Epoch 3/5
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 47ms/step - accuracy: 0.9710 - loss: 0.0832 - val_accuracy: 0.7614 - val_loss: 1.0979
Epoch 4/5
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 47ms/step - accuracy: 0.9811 - loss: 0.0561 - val_accuracy: 0.7584 - val_loss: 1.1540
Epoch 5/5
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 47ms/step - accuracy: 0.9798 - loss: 0.0591 - val_accuracy: 0.7552 - val_loss: 1.3763


In [12]:
# Avaliação do modelo
loss, accuracy = model.evaluate(x=test_data, y=test_labels)
print("Test Accuracy:", accuracy)

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.6987 - loss: 1.7195
Test Accuracy: 0.6928571462631226


In [10]:
from tensorflow.keras.models import save_model

# Supondo que 'model' seja o nome do seu modelo
model.save("nlp_toxic.keras")