# Optimising the Logistic Regression Model for Fake News Detection

This notebook focuses on optimizing the previously trained Logistic Regression model.
We will use grid search with cross-validation to find the best hyperparameters.

# Import Libraries

In [22]:
import h5py
import pickle
import numpy as np
import os
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

# Load Necessary Data and Files

In [19]:
model_path = '../models/logistic_regression_model.pkl'
print("Path exists:", os.path.exists(model_path))

Path exists: True


In [12]:
# Load the training and testing datasets
with h5py.File('../data/splits/train_test_split.h5', 'r') as f:
    X_train = f['X_train'][:]
    X_test = f['X_test'][:]
    y_train = f['y_train'][:]
    y_test = f['y_test'][:]

print("Training and testing datasets loaded successfully.")

Training and testing datasets loaded successfully.


In [13]:
# Load the training and testing datasets
with h5py.File('../data/splits/train_test_split.h5', 'r') as f:
    X_train = f['X_train'][:]
    X_test = f['X_test'][:]
    y_train = f['y_train'][:]
    y_test = f['y_test'][:]

print("Training and testing datasets loaded successfully.")


Training and testing datasets loaded successfully.


In [21]:
# Load the initial Logistic Regression model
model_path = '../models/logistic_regression_model.pkl'

try:
    initial_model = joblib.load(model_path)
    print("Initial Logistic Regression model loaded successfully.")
except Exception as e:
    print(f"An unexpected error occurred while loading the model: {e}")

Initial Logistic Regression model loaded successfully.


# Initialise the Logistic Regression Model

In [23]:
# Initialise the Logistic Regression model
initial_model = LogisticRegression(max_iter=1000)


# Define the Parameter Grid for GridSearchCV

In [24]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularisation strength
    'penalty': ['l1', 'l2'],  # Type of regularisation
    'solver': ['liblinear']   # Solver that supports L1 and L2
}


# Initialise GridSearchCV and Fit the Model

In [25]:
######## WE MAY NEED TO USE GOOGLE COLLAB FOR BETTER PERFORMANCE!!! I have insufficient computational resources to run this on my local machine.

# Initialise GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=initial_model, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Output the best parameters and best score
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits


MemoryError: Unable to allocate 8.26 GiB for an array with shape (46141, 24017) and data type float64

# Evaluate the Best Model on the Test Set

In [None]:
# # Evaluate the best model on the test set
# y_pred = best_model.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)

# print(f"Test Accuracy: {accuracy:.4f}")
# print(f"Test Precision: {precision:.4f}")
# print(f"Test Recall: {recall:.4f}")
# print(f"Test F1 Score: {f1:.4f}")


# Display the Classification Report and Confusion Matrix

In [None]:
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))

# print("\nConfusion Matrix:")
# print(confusion_matrix(y_test, y_pred))


# Save the Optimised Model

In [None]:
# # Save the optimised model
# model_save_path = '../models/optimized_logistic_regression_model.pkl'
# joblib.dump(best_model, model_save_path)
# print(f"Optimized Logistic Regression model saved to {model_save_path}.")


# Save Performance Metrics as a JSON File

In [None]:
# # Save the performance metrics and best parameters as a JSON file
# metrics = {
#     "accuracy": accuracy,
#     "precision": precision,
#     "recall": recall,
#     "f1_score": f1,
#     "best_params": grid_search.best_params_,
#     "best_cross_val_score": grid_search.best_score_
# }

# metrics_save_path = '../models/optimized_logistic_regression_metrics.json'
# with open(metrics_save_path, 'w') as f:
#     json.dump(metrics, f)

# print(f"Model performance metrics saved to {metrics_save_path}.")
