Load Data

In [None]:
import pandas as pd

def _combine_title_and_text(frame):
    """Build one text string per row from the 'title' and 'text' columns.

    Missing values are replaced by empty strings and a single space is
    inserted between title and text, so the last title word and the first
    body word stay separate tokens instead of being fused together.

    Args:
        frame: DataFrame containing 'title' and 'text' columns.

    Returns:
        numpy object array of strings, one entry per row of ``frame``.
    """
    title = frame['title'].fillna('').astype(str)
    body = frame['text'].fillna('').astype(str)
    return (title + ' ' + body).values


# The three splits are read from ';'-delimited CSV files in the working directory.
data_train = pd.read_csv('train.csv', delimiter=";")  # Train split
data_test = pd.read_csv('test.csv', delimiter=";")  # Test split
data_eval = pd.read_csv('evaluation.csv', delimiter=";")  # Evaluation split

# Model input: title and body text merged into one string per row.
X_train = _combine_title_and_text(data_train)
X_test = _combine_title_and_text(data_test)
X_eval = _combine_title_and_text(data_eval)

# Targets: the 'label' column of each split.
Y_train = data_train['label'].values
Y_test = data_test['label'].values
Y_eval = data_eval['label'].values

Preprocessing Data

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary is fitted on the training texts only; the same fitted tokenizer
# is then reused to encode the test and evaluation texts.
tokenizer = Tokenizer(num_words=5000, oov_token="Token")
tokenizer.fit_on_texts(X_train)


def _encode_texts(texts):
    """Encode texts with the fitted tokenizer and pad them to length 500.

    Returns a tuple ``(sequences, padded)``: the raw integer sequences and
    the array produced by ``pad_sequences`` with padding appended at the end.
    """
    # NOTE(review): `truncating` is left at the library default — confirm
    # which end of an over-long text is dropped.
    encoded = tokenizer.texts_to_sequences(texts)
    return encoded, pad_sequences(encoded, maxlen=500, padding="post")


# Train / test / evaluation splits all go through the identical encoding path.
train_sequences, padded_train_sequence = _encode_texts(X_train)
test_sequences, padded_test_sequence = _encode_texts(X_test)
eval_sequences, padded_eval_sequence = _encode_texts(X_eval)



Build and Train a Simple RNN Model

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.callbacks import EarlyStopping

# Binary text classifier: embedding -> one SimpleRNN layer -> dense head with
# a single sigmoid output.
# input_dim=5000 and input_length=500 mirror the tokenizer's num_words and the
# padding maxlen used in the preprocessing cell.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=500),
    SimpleRNN(128, activation="tanh"),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])

# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

# The test split is passed as validation data, so it is what early stopping
# monitors; the evaluation split is not used here.
model.fit(
    padded_train_sequence, Y_train,
    validation_data=(padded_test_sequence, Y_test),  # comma was missing here (SyntaxError)
    batch_size=32,
    epochs=10,
    callbacks=[early_stopping]
)

Evaluate Models

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# The plotting functions (xlabel, ylabel, show) live in matplotlib.pyplot,
# not in the top-level matplotlib package.
import matplotlib.pyplot as plt
import seaborn as sns

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class predictions.
predictions = (model.predict(padded_eval_sequence) > 0.5).astype("int32")

# Per-class precision / recall / F1 on the evaluation split.
print("Classification Report: ")
print(classification_report(Y_eval, predictions))

print("Confusion Matrix: ")

# Draw the confusion matrix as an annotated heatmap with integer cell counts.
cm = confusion_matrix(Y_eval, predictions)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")

plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")

plt.show()


Save Models And Tokenizer

In [None]:
import pickle

# Persist the trained network to an HDF5 file.
model.save("Simple_RNN.h5")

# Persist the fitted tokenizer next to the model so the same word-to-index
# mapping can be reused later. The file is opened for binary *writing* and
# the tokenizer is dumped into it; loading here would fail on a fresh run
# or overwrite the tokenizer fitted above.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)