# Fitting a Random Forest Machine Learning Model

In [7]:
# Import necessary libraries
import json
import os

import h5py
import joblib
import pandas as pd
from sklearn.decomposition import IncrementalPCA, PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the pre-split feature matrices and label arrays from the HDF5 file.
# Each dataset is sliced with [:] inside the `with` block, so the resulting
# arrays stay usable after the file has been closed.
with h5py.File('../data/splits/train_test_split.h5', 'r') as split_file:
    X_train, X_test, y_train, y_test = (
        split_file[name][:] for name in ('X_train', 'X_test', 'y_train', 'y_test')
    )

print("Training and testing sets loaded successfully.")

Training and testing sets loaded successfully.


In [8]:
# Standardise the features ahead of IPCA. The scaler is fitted on the training
# set only and then reused, unchanged, on the test set, so the test split
# never influences the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
# Configure Incremental PCA to shrink the feature space to a manageable size.
# Both values below were picked by hand; adjust them to suit your data size.
ipca = IncrementalPCA(
    n_components=100,  # number of components kept after the reduction
    batch_size=200,    # batch size handed to IncrementalPCA
)

In [12]:
# Learn the IPCA projection from the scaled training data, then project that
# same data onto the learned components.
ipca.fit(X_train_scaled)
X_train_ipca = ipca.transform(X_train_scaled)

In [13]:
# Transform the test data with the IPCA that was fitted on the training data
# (transform only: the test set is not used for fitting).
X_test_ipca = ipca.transform(X_test_scaled)

# Report the feature count before and after the reduction.
print(f"IPCA completed. Reduced dimensions from {X_train.shape[1]} to {X_train_ipca.shape[1]}.")


IPCA completed. Reduced dimensions from 24017 to 100.


In [14]:
# Initialise the Random Forest model.
# random_state is pinned to a fixed seed; the remaining hyperparameters are
# applied separately via set_params.
model = RandomForestClassifier(random_state=42)

In [15]:
# Hyperparameters applied to the classifier before training.
params = dict(
    n_estimators=100,      # number of trees in the forest
    max_depth=None,        # maximum depth of each tree (None = no explicit limit)
    min_samples_split=2,   # fewest samples required to split an internal node
    min_samples_leaf=1,    # fewest samples required at a leaf node
    criterion='gini',      # measure used to judge the quality of a split
)

# Left as the final expression so the cell displays the configured estimator.
model.set_params(**params)

In [16]:
# Train the Random Forest model on the IPCA-reduced training features and the
# training labels.
model.fit(X_train_ipca, y_train)

In [17]:
# Evaluate the model: predict labels for the IPCA-reduced test features.
# The predictions are compared against y_test when the metrics are computed.
y_pred = model.predict(X_test_ipca)

In [18]:
# Score the test-set predictions.
# Overall accuracy:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Classification report: the dict form is kept in a variable for later use,
# while the default text form is what gets printed.
classification_report_dict = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))


Accuracy: 0.9239891809418129
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      7010
           1       0.92      0.93      0.93      7409

    accuracy                           0.92     14419
   macro avg       0.92      0.92      0.92     14419
weighted avg       0.92      0.92      0.92     14419



In [19]:
# Save the fitted artefacts to .pkl files using joblib.
#
# The classifier was trained on standardised, IPCA-reduced features, so the
# fitted `scaler` and `ipca` have to be applied to any new data before calling
# model.predict. They are therefore persisted next to the model; the model
# file itself is written exactly as before.
model_save_path = '../models/random_forest_ipca_model.pkl'
preprocessing_save_path = '../models/random_forest_ipca_preprocessing.pkl'

# Create the output directory when it is missing, so that saving does not
# fail on a fresh checkout after the model has already been trained.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

joblib.dump(model, model_save_path)
print(f"Random Forest model saved to {model_save_path}.")

# Stored as a dict so both transformers can be restored with one joblib.load.
joblib.dump({'scaler': scaler, 'ipca': ipca}, preprocessing_save_path)
print(f"Fitted scaler and IPCA saved to {preprocessing_save_path}.")


Random Forest model saved to ../models/random_forest_ipca_model.pkl.


In [20]:
# Bundle the evaluation results for export.
metrics = dict(
    accuracy=accuracy,
    classification_report=classification_report_dict,  # already a dict (built with output_dict=True)
)

# Destination file for the metrics.
metrics_save_path = '../models/random_forest_ipca_metrics.json'

# Write the metrics dictionary out as JSON.
with open(metrics_save_path, 'w') as metrics_file:
    json.dump(metrics, metrics_file)

print(f"Model performance metrics saved to {metrics_save_path}.")

Model performance metrics saved to ../models/random_forest_ipca_metrics.json.
