In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [11]:
data = pd.read_csv('../dataset/cleaned-dataset.csv')
print("Dataset loaded:", data.shape)


Dataset loaded: (11162, 17)


In [23]:
# --- CORRECTION 1: Target Column ---
# Step 3: Separate features and target
X = data.drop('deposit', axis=1)   # Target column is 'deposit'
y = data['deposit']
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

Features (X) shape: (11162, 17)
Target (y) shape: (11162,)


In [24]:
# Step 4: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [25]:
# --- CORRECTION 2 & 3: Scaler Saving and Using Scaled Features for Training ---
# Step 5: Apply KMeans clustering and Prepare Scaled Features DataFrame
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

In [26]:
# Create a DataFrame with SCALED features to be used for Logistic Regression training
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X_scaled_df['Cluster'] = cluster_labels
X_scaled_df['deposit_target'] = y # Add target column

In [27]:
    # Save KMeans model and Scaler (crucial for backend)
with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("KMeans model and StandardScaler saved.")

KMeans model and StandardScaler saved.


In [28]:
# Add cluster labels to original data for overall reporting/reference
data['Cluster'] = cluster_labels

In [30]:
# Step 6: Train Logistic Regression per cluster using SCALED FEATURES
logistic_models = {}
accuracies = []
# Iterate over the SCALED features DataFrame (X_scaled_df)
for cluster_id in sorted(X_scaled_df['Cluster'].unique()):
    cluster_data = X_scaled_df[X_scaled_df['Cluster'] == cluster_id]
    
    # Features are all columns EXCEPT 'deposit_target' and 'Cluster'
    X_cluster = cluster_data.drop(['deposit_target', 'Cluster'], axis=1)
    y_cluster = cluster_data['deposit_target'] # Use the target column
    
    X_train, X_test, y_train, y_test = train_test_split(X_cluster, y_cluster, test_size=0.2, random_state=42)
    
    # Train Logistic Regression (max_iter=5000 is safer for convergence)
    model = LogisticRegression(max_iter=5000) 
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'Cluster {cluster_id} Accuracy: {acc:.2f}')
    
    logistic_models[cluster_id] = model
    accuracies.append(acc)

# Save all logistic models
with open('logistic_models.pkl', 'wb') as f:
    pickle.dump(logistic_models, f)
print("Logistic models saved.")

Cluster 0 Accuracy: 0.76
Cluster 1 Accuracy: 0.88
Cluster 2 Accuracy: 0.78
Logistic models saved.


In [31]:
# Step 7: Overall Hybrid Accuracy
print("Average Hybrid Model Accuracy:", np.mean(accuracies))

Average Hybrid Model Accuracy: 0.8067442950202469
