# **Optimising the Logistic Regression Model for Fake News Detection**

This notebook focuses on optimising the previously trained Logistic Regression model.
We will use grid search with cross-validation to find the best hyperparameters.

> *Running this notebook requires significant computational resources and a Google account with access to Google Drive and `Google Colab`.*

# 1. Setup Environment

In [None]:
# Quick environment check
!pip list


## Install the required packages

In [8]:
!pip install scikit-learn==1.5.1 joblib==1.2.0 h5py==3.11.0 tqdm==4.66.5




## Import Libraries

In [9]:
# Import necessary libraries
import h5py
import numpy as np
import joblib
import sklearn
import tqdm

# Report the version of every imported library, one line per library
for lib_name, lib_module in (
    ("h5py", h5py),
    ("numpy", np),
    ("joblib", joblib),
    ("scikit-learn", sklearn),
    ("tqdm", tqdm),
):
    print(f"{lib_name} version: {lib_module.__version__}")


h5py version: 3.11.0
numpy version: 1.26.4
joblib version: 1.2.0
scikit-learn version: 1.5.1
tqdm version: 4.66.5


# 2. Mount Google Drive

In [None]:
# Colab helper used to attach Google Drive to the runtime's filesystem
from google.colab import drive

# Directory under which the Drive contents are mounted
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)



# 3. Check and Define Paths

In [10]:
import os


## Define the correct absolute paths

In [12]:
# Root folder of the project on the mounted Google Drive.
# The location can be overridden with the FAKE_NEWS_PROJECT_DIR environment
# variable, so the notebook does not have to be edited when the project lives
# elsewhere; when the variable is unset the original location is used.
folder_path = os.environ.get(
    'FAKE_NEWS_PROJECT_DIR',
    '/content/drive/My Drive/Shared with Others/For Uth Tam Sin/Project-4/fake-news-predictor/',
)
# Pickled Logistic Regression model, relative to the project root
model_path = os.path.join(folder_path, 'models/logistic_regression_model.pkl')
# HDF5 file holding the train/test split, relative to the project root
train_test_split_path = os.path.join(folder_path, 'data/splits/train_test_split.h5')

## Verify if paths exist


In [13]:
# Report, for each required input file, whether it is present on disk
for path_label, path_to_check in (
    ("Model path", model_path),
    ("Train/test split path", train_test_split_path),
):
    print(f"{path_label} exists:", os.path.exists(path_to_check))

Model path exists: True
Train/test split path exists: True


# 4. Load Data

In [16]:
# Number of leading rows read from each split (named here so the sample sizes
# are easy to find and adjust instead of being repeated as magic numbers)
N_TRAIN_SAMPLE = 10000
N_TEST_SAMPLE = 2000

# Load a sample of the training and testing datasets
with h5py.File(train_test_split_path, 'r') as h5_file:
    # Features and labels are sliced with the same bound so they stay aligned
    X_train_sample = h5_file['X_train'][:N_TRAIN_SAMPLE]
    y_train_sample = h5_file['y_train'][:N_TRAIN_SAMPLE]
    X_test_sample = h5_file['X_test'][:N_TEST_SAMPLE]
    y_test_sample = h5_file['y_test'][:N_TEST_SAMPLE]

# Fail early, with a clear message, if features and labels do not line up
assert len(X_train_sample) == len(y_train_sample), "X_train and y_train samples differ in length"
assert len(X_test_sample) == len(y_test_sample), "X_test and y_test samples differ in length"

print("Sample of training and testing datasets loaded successfully.")


Sample of training and testing datasets loaded successfully.


# 5. Load the Pre-trained Model

In [17]:
# Load the previously trained model from disk.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
try:
    initial_model = joblib.load(model_path)
    print("Initial Logistic Regression model loaded successfully.")
except Exception as e:
    print(f"An unexpected error occurred while loading the model: {e}")
    # Re-raise so the failure stops the run instead of silently continuing
    # with `initial_model` left undefined.
    raise

Initial Logistic Regression model loaded successfully.


# 6. Hyperparameter Tuning with GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

## Initialise a fresh Logistic Regression estimator for the grid search

In [24]:
# Fresh, unfitted estimator used as the template for the grid search
# (this rebinds `initial_model`, replacing whatever was stored under that name).
# The liblinear solver shuffles the data internally; fixing random_state makes
# the cross-validation results reproducible from run to run.
initial_model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)

## Define the parameter grid for GridSearchCV

In [25]:
# Hyperparameter candidates explored by the grid search:
#   C       - regularisation strength
#   penalty - type of regularisation
#   solver  - liblinear, which supports both L1 and L2
param_grid = dict(
    C=[0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

## Initialise GridSearchCV with cross-validation

In [26]:
# Settings of the search itself, kept apart from the estimator and the grid
search_settings = {
    'cv': 5,                # 5-fold cross-validation
    'scoring': 'accuracy',  # candidates are ranked by accuracy
    'verbose': 3,           # detailed progress output
    'n_jobs': -1,           # use all available cores
}
grid_search = GridSearchCV(initial_model, param_grid, **search_settings)

## Fit the grid search to the training data

In [27]:
# Run the cross-validated search on the training sample
grid_search.fit(X=X_train_sample, y=y_train_sample)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


## Output the best parameters and best score

In [28]:
# Summarise the winning configuration and its mean cross-validation accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Keep the estimator fitted with the best parameters for the evaluation below
best_model = grid_search.best_estimator_

Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.9444


## Evaluate the best model on the test set

In [29]:
# Predict labels for the held-out test sample with the tuned model
y_pred = best_model.predict(X=X_test_sample)

## Calculate the evaluation metrics

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [32]:
# Score the test-sample predictions with each metric, in a fixed order
accuracy, precision, recall, f1 = (
    metric_fn(y_test_sample, y_pred)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

In [33]:
# Show every test metric rounded to four decimal places
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"Test {metric_label}: {metric_value:.4f}")

Test Accuracy: 0.9415
Test Precision: 0.9442
Test Recall: 0.9433
Test F1 Score: 0.9437


## Save the optimised model and metrics

In [34]:
import json

In [36]:
# Persist the tuned estimator next to the original model file
optimized_model_filename = 'models/optimized_logistic_regression_model.pkl'
optimized_model_path = os.path.join(folder_path, optimized_model_filename)
joblib.dump(value=best_model, filename=optimized_model_path)
print(f"Optimized Logistic Regression model saved to {optimized_model_path}.")

Optimized Logistic Regression model saved to /content/drive/My Drive/Shared with Others/For Uth Tam Sin/Project-4/fake-news-predictor/models/optimized_logistic_regression_model.pkl.


In [37]:
# Collect the test-sample scores and the grid-search outcome in one record
# that is written to a JSON file afterwards
metrics = dict(
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1_score=f1,
    best_params=grid_search.best_params_,
    best_cross_val_score=grid_search.best_score_,
)

In [39]:
# Write the metrics record as JSON into the project's models folder
metrics_filename = 'models/optimized_logistic_regression_metrics.json'
metrics_save_path = os.path.join(folder_path, metrics_filename)
with open(metrics_save_path, mode='w') as metrics_file:
    json.dump(metrics, fp=metrics_file)

In [40]:
# Confirm where the metrics file was written
print("Model performance metrics saved to {}.".format(metrics_save_path))

Model performance metrics saved to /content/drive/My Drive/Shared with Others/For Uth Tam Sin/Project-4/fake-news-predictor/models/optimized_logistic_regression_metrics.json.
