# Modelling

In [None]:
import pandas as pd

# Load the dataset from a CSV file; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv('Data/clean_data.csv')

# Bare last expression: the notebook renders the frame as the cell output.
df

### Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. Stratifying on 'sentiment' keeps the
# class proportions of the full dataset in both partitions, and the fixed
# random_state makes the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['sentiment'], random_state=42)

# Class distribution of the training set before it is rebalanced. The
# rebalancing step of this notebook upsamples the smaller classes, so the
# label says "upsampling" (it previously said "downsampling").
print("Training set class distribution before upsampling:")
print(train_df['sentiment'].value_counts())

# Class distribution of the held-out test set
print("Test set class distribution:")
print(test_df['sentiment'].value_counts())

### Upsampling

In [None]:
# Every class is resampled up to the size of the largest class in the
# training set.
max_class_size = train_df['sentiment'].value_counts().max()

# Upsample each sentiment class: draw max_class_size rows with replacement, so
# rows of the smaller classes are repeated. Only the training set is
# rebalanced.
train_positive = train_df[train_df['sentiment'] == 'positive'].sample(max_class_size, replace=True, random_state=42)
train_negative = train_df[train_df['sentiment'] == 'negative'].sample(max_class_size, replace=True, random_state=42)
train_neutral = train_df[train_df['sentiment'] == 'neutral'].sample(max_class_size, replace=True, random_state=42)

# Combine the upsampled dataframes
train_df_upsampled = pd.concat([train_positive, train_negative, train_neutral])

# Shuffle the combined dataframe to mix the classes, and rebuild the index so
# the repeated rows do not carry duplicate index labels.
train_df_upsampled = train_df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Class distribution of the training set after upsampling (the label
# previously said "downsampling", which is not what this cell does).
print("Training set class distribution after upsampling:")
print(train_df_upsampled['sentiment'].value_counts())


### TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing comments with empty strings so TfidfVectorizer only receives
# str values. fillna() with a {column: value} mapping touches just the listed
# columns and returns a new frame, so the names are rebound instead of
# assigning columns into test_df: test_df is a slice produced by
# train_test_split, and column assignment on it triggers pandas'
# SettingWithCopyWarning.
text_fill_values = {'nouns_comm': '', 'raw_comm': ''}
train_df_upsampled = train_df_upsampled.fillna(value=text_fill_values)
test_df = test_df.fillna(value=text_fill_values)

# One vectorizer per text column, each with its own vocabulary
tfidf_vectorizer_nouns = TfidfVectorizer()
tfidf_vectorizer_raw = TfidfVectorizer()

# Fit on the training text only and reuse the fitted vectorizer to transform
# the test text.
X_train_nouns = tfidf_vectorizer_nouns.fit_transform(train_df_upsampled['nouns_comm'])
X_test_nouns = tfidf_vectorizer_nouns.transform(test_df['nouns_comm'])

X_train_raw = tfidf_vectorizer_raw.fit_transform(train_df_upsampled['raw_comm'])
X_test_raw = tfidf_vectorizer_raw.transform(test_df['raw_comm'])

# Targets: the sentiment labels of the upsampled training set and of the test set
y_train = train_df_upsampled['sentiment']
y_test = test_df['sentiment']


## Naive Bayes Model

### Train the model

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# One multinomial Naive Bayes classifier per text representation. fit()
# returns the estimator itself, so construction and training are chained.
nb_nouns = MultinomialNB().fit(X_train_nouns, y_train)
nb_raw = MultinomialNB().fit(X_train_raw, y_train)

# Predictions of the 'nouns_comm' model on the training and test matrices
y_train_pred_nouns = nb_nouns.predict(X_train_nouns)
y_test_pred_nouns = nb_nouns.predict(X_test_nouns)

# Predictions of the 'raw_comm' model on the training and test matrices
y_train_pred_raw = nb_raw.predict(X_train_raw)
y_test_pred_raw = nb_raw.predict(X_test_raw)


### Compute models' accuracy

In [None]:
# (header line, test-set predictions) for each Naive Bayes model, reported in
# the order 'nouns_comm' then 'raw_comm'.
nb_test_reports = (
    ("\nTest Set - Naive Bayes Model on 'nouns_comm'", y_test_pred_nouns),
    ("\nTest Set - Naive Bayes Model on 'raw_comm'", y_test_pred_raw),
)

# Print accuracy and the per-class report of each model against y_test
for report_header, predicted_labels in nb_test_reports:
    print(report_header)
    print("Accuracy:", accuracy_score(y_test, predicted_labels))
    print("Classification Report:\n", classification_report(y_test, predicted_labels))


### Add predictions in the test_df

In [None]:
# Attach the Naive Bayes test-set predictions as two new columns. assign()
# returns a new frame, so test_df is rebound rather than written into:
# test_df originates from train_test_split, and column assignment on such a
# slice triggers pandas' SettingWithCopyWarning.
test_df = test_df.assign(NB_pred_noun=y_test_pred_nouns, NB_pred_raw=y_test_pred_raw)

test_df

## FNN

### FNN with noun comments

#### Create the FNN architecture

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

def create_fnn_model(input_dim, layers=1, nodes=32, learning_rate=0.001, dropout_rate=0.5, l2_reg=0.01,
                     n_classes=3):
    """
    Creates a feedforward neural network classifier with L2 regularization and dropout.

    The network always has one L2-regularized hidden layer connected to the
    input, followed by ``layers - 1`` further hidden layers. Dropout is added
    only after those further layers, so with ``layers=1`` the model contains
    no dropout and ``dropout_rate`` has no effect.

    Args:
        input_dim (int): Number of input features.
        layers (int): Total number of hidden layers. Values below 1 still
            produce the single input-connected hidden layer.
        nodes (int): Number of nodes per hidden layer.
        learning_rate (float): Learning rate for the Adam optimizer.
        dropout_rate (float): Dropout rate applied after each hidden layer
            other than the first.
        l2_reg (float): L2 regularization strength for the hidden-layer kernels.
        n_classes (int): Number of output classes, i.e. the width of the
            softmax output layer. Defaults to 3.

    Returns:
        model: Compiled Keras model. It is compiled with
        ``sparse_categorical_crossentropy``, so targets must be integer class
        ids rather than one-hot vectors or strings.
    """
    model = Sequential()

    # First hidden layer; it also declares the input width of the network
    model.add(Dense(nodes, input_dim=input_dim, activation='relu', kernel_regularizer=l2(l2_reg)))

    # Remaining hidden layers, each followed by dropout
    for _ in range(layers - 1):
        model.add(Dense(nodes, activation='relu', kernel_regularizer=l2(l2_reg)))
        model.add(Dropout(dropout_rate))

    # Softmax output layer with one node per class
    model.add(Dense(n_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

#### Wrap the model in a KerasClassifier and set the hyperparameter grid

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import activations  


# Function to create the KerasClassifier model for grid search
def build_keras_classifier(input_dim, layers, nodes, learning_rate, dropout_rate=0.5, l2_reg=0.01):
    """
    Wraps create_fnn_model in a scikit-learn compatible KerasClassifier.

    Args:
        input_dim (int): Number of input features.
        layers (int): Number of hidden layers.
        nodes (int): Number of nodes per hidden layer.
        learning_rate (float): Learning rate for the optimizer.
        dropout_rate (float): Dropout rate.
        l2_reg (float): L2 regularization strength.

    Returns:
        KerasClassifier: Silent (verbose=0) wrapper that carries the given
        values as its parameters.
    """
    # Values handed to the wrapper alongside the model-building function
    builder_settings = dict(
        input_dim=input_dim,
        layers=layers,
        nodes=nodes,
        learning_rate=learning_rate,
        dropout_rate=dropout_rate,
        l2_reg=l2_reg,
    )
    return KerasClassifier(model=create_fnn_model, verbose=0, **builder_settings)

# Hyperparameter grid: three values each for depth, width and learning rate,
# with a single fixed setting for epochs and batch size.
param_grid = dict(
    layers=[3, 5, 7],
    nodes=[128, 256, 512],
    learning_rate=[0.01, 0.001, 0.0001],
    epochs=[20],
    batch_size=[32],
)

# Base estimator sized to the width of the noun TF-IDF matrix
input_dim = X_train_nouns.shape[1]
model = build_keras_classifier(
    input_dim=input_dim,
    layers=1,
    nodes=32,
    learning_rate=0.001,
)

# Stop when val_loss has not improved by at least 0.001 for 10 epochs, and
# restore the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss', min_delta=0.001, patience=10,
    verbose=0, restore_best_weights=True,
)

# Multiply the learning rate by 0.2 when val_loss has not improved for
# 5 epochs, never going below 0.0001.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=5,
    verbose=0, min_lr=0.0001,
)

### Train the model

In [None]:
# Grid search over param_grid with 3-fold cross-validation on the noun features.
#
# EarlyStopping and ReduceLROnPlateau monitor 'val_loss', which Keras only
# reports when validation data exists; without it the callbacks never act.
# A 20% validation split is therefore configured together with the callbacks.
# Both are set as estimator parameters (SciKeras' `callbacks` and
# `validation_split`) instead of being passed through fit(), so every clone
# that GridSearchCV fits receives them.
# NOTE: the refit of the best estimator holds out the same 20% for validation.
model.set_params(callbacks=[early_stopping, reduce_lr], validation_split=0.2)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_nouns, y_train)

#### Save the model

In [None]:
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

# Best estimator (the scikit-learn style wrapper) found by the grid search
best_model_nouns = grid_result.best_estimator_

# Access the underlying Keras model
keras_model_nouns = best_model_nouns.model_

# Save the model in the recommended format
keras_model_nouns.save('FNN_nouns.keras')

# Load the model back from the saved file. From here on best_model_nouns is
# the plain Keras model, not the wrapper.
best_model_nouns = load_model('FNN_nouns.keras')

# The plain Keras model needs integer class ids. The encoder is fitted on the
# training labels and only applied to the test labels, so the label -> id
# mapping does not depend on which classes happen to occur in the test set.
# LabelEncoder numbers the classes in sorted order.
# NOTE(review): this assumes the wrapper encoded the classes in the same
# sorted order during training - confirm against best_estimator_.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Evaluate the loaded model with the encoded labels
best_model_nouns.evaluate(X_test_nouns, y_test_encoded)


#### Print the mean cross validation accuracy per combination 

In [None]:
# Headline: best mean cross-validation score and the parameters that achieved it
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Per-candidate mean score, score standard deviation and parameter dict
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[column]
    for column in ('mean_test_score', 'std_test_score', 'params')
)

# One line per candidate, in the order GridSearchCV recorded them
for candidate_index, param in enumerate(params):
    mean = means[candidate_index]
    stdev = stds[candidate_index]
    print("%f (%f) with: %r" % (mean, stdev, param))


#### Retrain the best model to store the history

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder

# Retrieve the best hyperparameters from GridSearchCV
best_params = grid_result.best_params_

# Build a fresh, untrained network with the winning architecture. dropout_rate
# and l2_reg are not passed, so create_fnn_model's defaults apply.
best_model = create_fnn_model(
    input_dim=X_train_nouns.shape[1],
    layers=best_params['layers'],
    nodes=best_params['nodes'],
    learning_rate=best_params['learning_rate'],
)

# Model summary. summary() prints the layer table itself and returns None, so
# it is called directly; wrapping it in print() appended a stray "None".
best_model.summary()

# EarlyStopping to stop training when validation loss has not improved
early_stopping = EarlyStopping(
    monitor='val_loss', 
    min_delta=0.001, 
    patience=10, 
    verbose=1, 
    restore_best_weights=True
)

# ReduceLROnPlateau to reduce the learning rate when validation loss has stopped improving
# NOTE(review): patience equals the EarlyStopping patience, so a reduction can
# only happen at the point training is already being stopped - confirm this is
# intended.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.2, 
    patience=10, 
    verbose=1, 
    min_lr=0.0005
)

# ModelCheckpoint with save_best_only=True: the file is overwritten only on
# epochs where val_loss improves, not after every epoch.
checkpoint = ModelCheckpoint(
    'best_model.keras', 
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Convert string labels to numerical labels; the encoder is fitted on the
# training labels and reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Fit the model using the encoded labels; the returned History object holds
# the per-epoch metrics.
history = best_model.fit(
    X_train_nouns, y_train_encoded,  # integer class ids, not the raw strings
    epochs=best_params['epochs'],  # Use the best number of epochs from the grid search
    batch_size=best_params['batch_size'],  # Use the best batch size from the grid search
    validation_split=0.2,  # Using 20% of the training data as validation data
    callbacks=[early_stopping, reduce_lr, checkpoint],
    verbose=1
)


#### Plot the history of the loss function and accuracy

In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """
    Draws accuracy and loss curves for training and validation side by side.

    Args:
        history: Keras History object returned by model.fit(). Its ``history``
            dict must contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    recorded = history.history

    # (subplot position, training key, validation key, title, y-axis label)
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
    )

    plt.figure(figsize=(14, 6))

    # Left panel: accuracy; right panel: loss
    for position, train_key, val_key, panel_title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(recorded[train_key])
        plt.plot(recorded[val_key])
        plt.title(panel_title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')

    plt.show()

# Draw the curves for the most recent training run
plot_training_history(history)


### Predict on the test set with the best model

In [None]:
# Retrieve the best estimator from GridSearchCV. This rebinds best_model_nouns,
# which an earlier cell pointed at the model reloaded from 'FNN_nouns.keras'.
best_model_nouns = grid_result.best_estimator_

# Predict on the noun TF-IDF features of the test set; the result is compared
# against y_test in the metrics cell.
y_test_pred = best_model_nouns.predict(X_test_nouns)


#### Compute metrics of the predictions

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Accuracy of the best estimator on the noun test features
test_accuracy = best_model_nouns.score(X_test_nouns, y_test)

# Per-class report and confusion matrix for the stored test predictions
nouns_report = classification_report(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Output order: accuracy, classification report, confusion matrix
print("Test set accuracy: {:.4f}".format(test_accuracy))
print(nouns_report)
print(conf_matrix)


## FNN with raw comments

### Train model with raw comments

In [None]:
# Grid search over param_grid with 3-fold cross-validation on the raw-comment
# features.
#
# EarlyStopping and ReduceLROnPlateau monitor 'val_loss', which Keras only
# reports when validation data exists; without it the callbacks never act.
# A 20% validation split is therefore configured together with the callbacks.
# Both are set as estimator parameters (SciKeras' `callbacks` and
# `validation_split`) instead of being passed through fit(), so every clone
# that GridSearchCV fits receives them.
# NOTE: the refit of the best estimator holds out the same 20% for validation.
model.set_params(callbacks=[early_stopping, reduce_lr], validation_split=0.2)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_raw, y_train)


### Save the model

In [None]:
#### Save the model
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

# Best estimator (the scikit-learn style wrapper) found by the raw-comment
# grid search
best_model_raw = grid_result.best_estimator_

# Access the underlying Keras model
keras_model_raw = best_model_raw.model_

# Save the model in the recommended format
keras_model_raw.save('FNN_raw.keras')

# Load the model back from the saved file. From here on best_model_raw is the
# plain Keras model, not the wrapper.
best_model_raw = load_model('FNN_raw.keras')

# The plain Keras model needs integer class ids. The encoder is fitted on the
# training labels and only applied to the test labels, so the label -> id
# mapping does not depend on which classes happen to occur in the test set.
# NOTE(review): this assumes the wrapper encoded the classes in the same
# sorted order during training - confirm against best_estimator_.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Evaluate the reloaded raw-comment model on the raw-comment test features.
# (This previously evaluated best_model_nouns on X_test_nouns, a copy-paste
# slip from the nouns section.)
best_model_raw.evaluate(X_test_raw, y_test_encoded)


### Plot training history

In [None]:
# Retrieve the best estimator from GridSearchCV
best_model_raw = grid_result.best_estimator_

# Access the underlying Keras model
keras_model_raw = best_model_raw.model_

# Save the best model trained on raw comments
keras_model_raw.save('fnn_raw_model.keras')

# Build a fresh, untrained network from the winning hyperparameters, the same
# way the nouns section does. Fitting best_model_raw.model_ again would only
# continue training an already-trained network and would change the estimator
# that the metrics cell evaluates afterwards.
best_params = grid_result.best_params_
retrained_model_raw = create_fnn_model(
    input_dim=X_train_raw.shape[1],
    layers=best_params['layers'],
    nodes=best_params['nodes'],
    learning_rate=best_params['learning_rate'],
)

# The network is compiled with sparse_categorical_crossentropy, which needs
# integer class ids; y_train holds the sentiment strings, so encode it first.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Retrain on the training set and store the training history
history = retrained_model_raw.fit(
    X_train_raw, y_train_encoded,
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    validation_split=0.2,  # Use 20% of the training data for validation
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Plot the training and validation accuracy and loss using the history
plot_training_history(history)

### Metrics with raw comments

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Best estimator of the raw-comment grid search
best_model_raw = grid_result.best_estimator_

# Test-set predictions and accuracy, both on the raw-comment features
y_test_pred = best_model_raw.predict(X_test_raw)
test_accuracy = best_model_raw.score(X_test_raw, y_test)

# Per-class report and confusion matrix for those predictions
raw_report = classification_report(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Output order: accuracy, classification report, confusion matrix
print("Test set accuracy: {:.4f}".format(test_accuracy))
print(raw_report)
print(conf_matrix)


## RNN with nouns comments