This notebook is dedicated to the initial modeling phase, including baseline model creation. The subsequent auto-optimisation will be handled in a separate notebook.

# Import necessary libraries

In [8]:
import h5py
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
import joblib
import json


# Load the data

In [9]:
# Pull the pre-computed train/test arrays out of the HDF5 split file.
# Slicing each dataset with [:] reads it fully, so the arrays remain
# available once the file handle is closed at the end of the `with` block.
with h5py.File('../data/splits/train_test_split.h5', 'r') as split_file:
    X_train, X_test, y_train, y_test = (
        split_file[key][:]
        for key in ('X_train', 'X_test', 'y_train', 'y_test')
    )

print("Training and testing sets loaded successfully.")

Training and testing sets loaded successfully.


# Initialise and Train the Model

Initialise a Logistic Regression model and train it on the training data (`X_train` and `y_train`).

In [3]:
# Build the baseline Logistic Regression (fixed seed, max_iter=1000) and fit it
# on the training split in one step; `fit` returns the fitted estimator.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Persist the fitted estimator to disk
joblib.dump(model, filename='../models/logistic_regression_model.pkl')
print("Model trained and saved successfully.")


Model trained and saved successfully.


# Evaluate the Model

In [4]:
# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Headline metrics, computed in the order they are reported
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# One metric per line, four decimal places each
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

# Per-class breakdown followed by the raw confusion counts
test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)


Accuracy: 0.9547
Precision: 0.9533
Recall: 0.9588
F1 Score: 0.9561

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.95      7010
           1       0.95      0.96      0.96      7409

    accuracy                           0.95     14419
   macro avg       0.95      0.95      0.95     14419
weighted avg       0.95      0.95      0.95     14419

Confusion Matrix:
 [[6662  348]
 [ 305 7104]]


# Cross-Validation Score

In [5]:
# 5-fold cross-validation of the model on the training split
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)

# Summarise the per-fold scores as mean ± standard deviation
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"Cross-Validation Accuracy: {cv_mean:.4f} ± {cv_std:.4f}")


Cross-Validation Accuracy: 0.9548 ± 0.0027


# Saving the Outputs

In [11]:
# Destination for the serialised model (the same file the training cell writes)
model_path = '../models/logistic_regression_model.pkl'

# Best-effort save: a failure is reported but does not stop the notebook
try:
    joblib.dump(model, model_path)
except Exception as save_error:
    print(f"An unexpected error occurred while saving the model with joblib: {save_error}")
else:
    print("Model saved successfully using joblib.")

Model saved successfully using joblib.


In [12]:
# Recompute the test-set metrics from the existing predictions so this cell
# only depends on `y_test`, `y_pred` and `cv_scores`.
# NOTE: the F1 value is bound to `f1`, never to `f1_score`. Assigning to
# `f1_score` would overwrite the imported scikit-learn function, so re-running
# this cell (or the evaluation cell) would fail with "object is not callable".
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Mean and standard deviation of the cross-validation scores
cross_val_mean_accuracy = cv_scores.mean()
cross_val_std_accuracy = cv_scores.std()

# Collect everything under the same keys as before; float(...) normalises any
# NumPy scalar to a built-in float before it is serialised.
metrics = {
    "accuracy": float(accuracy),
    "precision": float(precision),
    "recall": float(recall),
    "f1_score": float(f1),
    "cross_val_mean_accuracy": float(cross_val_mean_accuracy),
    "cross_val_std_accuracy": float(cross_val_std_accuracy),
}

# Write the metrics next to the saved model as a JSON file
with open('../models/logistic_regression_metrics.json', 'w') as metrics_file:
    json.dump(metrics, metrics_file)

print("Model performance metrics saved successfully.")


Model performance metrics saved successfully.
