In [3]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Load the datasets
training_file_path = 'sentiment_updated_sortedDate_Training.csv'
testing_file_path = 'sentiment_updated_sortedDate_Testing.csv'

train_data = pd.read_csv(training_file_path)
test_data = pd.read_csv(testing_file_path)

# Preprocess the data
# Convert categorical sentiment labels to integer ids. The encoder is fitted on
# the training labels only, so transform() raises ValueError if the test file
# contains a sentiment value that never appears in the training file.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['sentiment'])
test_labels = label_encoder.transform(test_data['sentiment'])
num_classes = len(label_encoder.classes_)

# NOTE: astype(str) turns missing summaries into the literal string "nan".
train_texts = train_data['summary'].astype(str)
test_texts = test_data['summary'].astype(str)

# Tokenizing: the vocabulary is learned from the training texts only, so the
# test set cannot leak into the word index.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding sequences
max_seq_length = 100  # Adjust as needed
X_train = pad_sequences(train_sequences, maxlen=max_seq_length)
X_test = pad_sequences(test_sequences, maxlen=max_seq_length)

# One-hot encode with an explicit width. Without num_classes, to_categorical
# sizes its output from the largest label it sees, so a test split missing the
# highest-indexed class would yield fewer columns than y_train and break
# evaluation against the model's softmax output.
y_train = to_categorical(train_labels, num_classes=num_classes)
y_test = to_categorical(test_labels, num_classes=num_classes)

def build_model(vocab_size, seq_length, num_classes):
    """Build and compile the Embedding -> LSTM -> softmax classifier.

    Args:
        vocab_size: Size of the embedding vocabulary (embedding input_dim).
        seq_length: Length of the padded input sequences.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        A compiled Sequential model with freshly initialised weights.
    """
    net = Sequential()
    net.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_length))
    # L2 regularization is applied to the LSTM input kernel.
    net.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01)))
    net.add(Dense(num_classes, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# Keep the embedding size tied to the tokenizer instead of repeating a literal.
embedding_vocab_size = tokenizer.num_words
n_classes = y_train.shape[1]

# K-Fold Cross Validation
# NOTE(review): the input file names suggest the rows are sorted by date; a
# shuffled KFold mixes earlier and later rows across folds -- confirm this is
# the intended validation scheme.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
acc_per_fold = []
loss_per_fold = []

for fold_no, (train, val) in enumerate(kfold.split(X_train, y_train), start=1):
    # A fresh model per fold so no weights carry over between folds.
    model = build_model(embedding_vocab_size, max_seq_length, n_classes)

    # Train the model
    print(f'Training for fold {fold_no} ...')
    model.fit(X_train[train], y_train[train], batch_size=32, epochs=10, verbose=1)

    # Evaluate the model on the held-out part of this fold
    scores = model.evaluate(X_train[val], y_train[val], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

# Testing the model on the separate testing dataset.
# The model left over from the loop was trained on only four of the five folds,
# so the model scored on the test set is retrained on the full training set.
print('Training final model on the full training set ...')
model = build_model(embedding_vocab_size, max_seq_length, n_classes)
model.fit(X_train, y_train, batch_size=32, epochs=10, verbose=1)

final_loss, final_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {final_accuracy*100}%')

# Average cross-validation scores
print('Average scores for all folds:')
print(f'Average Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'Average Loss: {np.mean(loss_per_fold)}')


Training for fold 1 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 1: loss of 1.3500229120254517; accuracy of 72.01834917068481%
Training for fold 2 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 2: loss of 1.4532197713851929; accuracy of 75.11520981788635%
Training for fold 3 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 3: loss of 0.8980908393859863; accuracy of 59.907835721969604%
Training for fold 4 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 4: loss of 1.806442379951477; accuracy of 71.42857313156128%
Training for fold 5 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 5: