In [1]:
import sys
sys.path.append('../src')  # 👈 allows importing from src/

import pandas as pd
import numpy as np
import joblib
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

from preprocessor import load_and_split_data, scale_data

# Set data path
data_path = "../data/ckd_simulated_input.csv"

# Load and split
# Load and split with correct target column name
X_train, X_test, y_train, y_test = load_and_split_data(data_path, target_column='CKD')

# Scale
X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)

print(f"✅ X_train shape: {X_train_scaled.shape}, y_train shape: {y_train.shape}")

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

✅ X_train shape: (1351, 13), y_train shape: (1351,)


In [2]:
import os

# Create models directory if it doesn't exist
os.makedirs("models", exist_ok=True)

# Now save
joblib.dump(model, "models/random_forest_model.pkl")
joblib.dump(scaler, "models/scaler.joblib")

['models/scaler.joblib']

In [3]:
y_pred = model.predict(X_test_scaled)

print("✅ Classification Report:\n", classification_report(y_test, y_pred))
print("✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save model and scaler
joblib.dump(model, "models/random_forest_model.pkl")
joblib.dump(scaler, "models/scaler.joblib")  # Overwrites same scaler for reuse
print("✅ Random Forest model and scaler saved successfully!")

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.81      0.75       214
           1       0.54      0.39      0.45       124

    accuracy                           0.65       338
   macro avg       0.62      0.60      0.60       338
weighted avg       0.64      0.65      0.64       338

✅ Confusion Matrix:
 [[173  41]
 [ 76  48]]
✅ Random Forest model and scaler saved successfully!


In [4]:
# Save models with version tags
joblib.dump(scaler, "models/scaler_v2.joblib")
joblib.dump(rf_model, "models/random_forest_model_v2.pkl")

NameError: name 'rf_model' is not defined

In [None]:
import pandas as pd

df = pd.read_csv("../data/ckd_simulated_input.csv")
print(df.columns)

In [None]:
# Use relative path from the notebook location
df = pd.read_csv("../data/ckd_simulated_input.csv")
print(df.head())

In [6]:
# STEP 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump
import os

# STEP 2: Load Data
data = pd.read_csv("../data/ckd_simulated_input.csv")

# STEP 3: Split features and target
X = data.drop(columns=["SEQN", "CKD"])  # drop ID and target
y = data["CKD"]

# STEP 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# STEP 5: Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# STEP 6: Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# STEP 7: Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# STEP 8: Evaluate model
y_pred = rf_model.predict(X_test_scaled)
print("✅ Classification Report:\n", classification_report(y_test, y_pred))
print("✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# STEP 9: Save model and scaler
os.makedirs("../models", exist_ok=True)
dump(rf_model, "../models/random_forest_model.joblib")
dump(scaler, "../models/scaler.joblib")

print("✅ Random Forest model and scaler saved successfully!")

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.69      0.71       214
           1       0.51      0.56      0.54       124

    accuracy                           0.64       338
   macro avg       0.62      0.63      0.62       338
weighted avg       0.65      0.64      0.65       338

✅ Confusion Matrix:
 [[147  67]
 [ 54  70]]
✅ Random Forest model and scaler saved successfully!


In [7]:
# Save model
import pickle
with open('../models/random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

# Save X_test and y_test for SHAP
import joblib
joblib.dump((X_test, y_test), '../data/test_set.joblib')

['../data/test_set.joblib']

In [8]:
import os
print("models/", os.listdir('../models'))
print("data/", os.listdir('../data'))

models/ ['logistic_regression_model.joblib', 'logistic_regression_model.pkl', 'random_forest_model.joblib', 'random_forest_model.pkl', 'scaler.joblib']
data/ ['.ipynb_checkpoints', 'ckd_simulated_input.csv', 'test_set.joblib']


In [9]:
import os
os.remove('../models/random_forest_model.pkl')

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

In [11]:
import pickle
with open('../models/random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

In [12]:
import os
print("Size of saved model file:", os.path.getsize('../models/random_forest_model.pkl'), "bytes")

Size of saved model file: 5349484 bytes
