In [20]:
import sys
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import sys
import os
from pathlib import Path

# Extend system path to import project modules
PACKAGE_ROOT = Path(os.getcwd()).parent  #points to project's root directory
sys.path.append(str(PACKAGE_ROOT))

from prediction_model.config import config
from prediction_model.processing.data_handling import load_dataset, load_pipeline, save_pipeline



In [21]:
# Load and prepare data
df = load_dataset(config.TRAIN_FILE)
X = df[config.FEATURES]
y = df[config.TARGET].map({'Y': 1, 'N': 0})

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Load the full pipeline and remove the final classifier
pipeline = load_pipeline(config.MODEL_NAME)
preprocessor = Pipeline(pipeline.steps[:-1])

# Transform the data using the pipeline's preprocessing
X_train_processed = preprocessor.transform(X_train)
X_val_processed = preprocessor.transform(X_val)

Model has been loaded


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(self.mean_dict[col], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(self.mode_dict[col], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [23]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}


In [24]:
results = {}

# Step 4: Evaluate each model
for name, model in models.items():
    model.fit(X_train_processed, y_train)
    y_pred = model.predict(X_val_processed)

    f1 = f1_score(y_val, y_pred)
    roc = roc_auc_score(y_val, y_pred)

    # Store results
    results[name] = {
        "model": model,
        "F1 Score": f1,
        "ROC AUC": roc
    }

    print(f"\n[RESULT] {name}")
    print(f"Accuracy      : {accuracy_score(y_val, y_pred):.4f}")
    print(f"Precision     : {precision_score(y_val, y_pred):.4f}")
    print(f"Recall        : {recall_score(y_val, y_pred):.4f}")
    print(f"F1 Score      : {f1:.4f}")
    print(f"ROC-AUC Score : {roc:.4f}")


[RESULT] Logistic Regression
Accuracy      : 0.7886
Precision     : 0.7596
Recall        : 0.9875
F1 Score      : 0.8587
ROC-AUC Score : 0.7031

[RESULT] Random Forest
Accuracy      : 0.7724
Precision     : 0.7600
Recall        : 0.9500
F1 Score      : 0.8444
ROC-AUC Score : 0.6959


In [25]:
# Step 5: Select best model based on F1 score (you can switch to ROC AUC if needed)
best_model_name = max(results, key=lambda x: results[x]["F1 Score"])
best_model = results[best_model_name]["model"]

print(f"\n✅ Best model selected: {best_model_name} based on highest F1 Score")


✅ Best model selected: Logistic Regression based on highest F1 Score


In [26]:
# Step 6: Create new pipeline with best model
final_pipeline = Pipeline(steps=[
    *preprocessor.steps,
    ("classifier", best_model)
])

# Step 7: Save the upgraded pipeline
save_pipeline(final_pipeline)

print("[✅] New model pipeline saved successfully as the final production model.")

Model has been saved under the name classification.pkl
[✅] New model pipeline saved successfully as the final production model.
