In [5]:
!pip3 install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 11.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import joblib

In [4]:

def load_data(path):
    df = pd.read_csv(path)
    return df

def build_preprocessor():
    # Ordinal mapping for education
    education_order = ['unknown', 'primary', 'secondary', 'tertiary']
    ordinal_cols = ['education']
    nominal_cols = ['job', 'marital', 'contact', 'month', 'poutcome']
    binary_cols = ['default', 'housing', 'loan']
    numeric_cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous', 'day']

    ordinal_pipe = Pipeline([
        ('ord', OrdinalEncoder(categories=[education_order], handle_unknown='use_encoded_value', unknown_value=-1))
    ])
    nominal_pipe = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    binary_pipe = Pipeline([
        ('onehot', OneHotEncoder(drop='if_binary'))
    ])
    numeric_pipe = Pipeline([
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('ord', ordinal_pipe, ordinal_cols),
        ('nom', nominal_pipe, nominal_cols),
        ('bin', binary_pipe, binary_cols),
        ('num', numeric_pipe, numeric_cols)
    ])
    return preprocessor


def preprocess(df, preprocessor=None, fit=True):
    X = df.drop(['y', 'id'], axis=1)
    y = df['y']
    # Optionally create interaction features here (example)
    X['balance_x_duration'] = X['balance'] * X['duration']
    X['age_x_education'] = X['age'] * X['education'].map({'unknown':0, 'primary':1, 'secondary':2, 'tertiary':3})
    if fit:
        X_processed = preprocessor.fit_transform(X)
    else:
        X_processed = preprocessor.transform(X)
    return X_processed, y

In [5]:
def get_models():
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "RandomForest": RandomForestClassifier(n_estimators=100),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        "LightGBM": LGBMClassifier(),
        "SVM": SVC(probability=True),
        "NaiveBayes": GaussianNB(),
        "MLPClassifier": MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500)
    }
    return models

def train_and_evaluate(model_name,model, X, y):

    results = {}
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
   
    model.fit(X_train, y_train)
    results[model_name] = {
        "model": model,
        "score": model.score(X_test, y_test)
    }
    joblib.dump(model, f"models_new/{model_name}.joblib")
    return results, X_test, y_test

In [6]:

df = load_data(r"D:\GUVI-Projects\Capstone\Project1_Bank\code-models\data\train.csv")
preprocessor = build_preprocessor()
X, y = preprocess(df, preprocessor)

# Save preprocessor object for later use (e.g., in Streamlit/app)
joblib.dump(preprocessor, "models_new/preprocessor.joblib")
print("Preprocessor saved to models_new/preprocessor.joblib")

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Baseline model
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Logistic Regression Score:", lr.score(X_test, y_test))

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print("Random Forest Score:", rf.score(X_test, y_test))





Preprocessor saved to models/preprocessor.joblib
Logistic Regression Score: 0.91506
Random Forest Score: 0.9305


In [None]:

models = get_models()
model_results = {}
for name, model in models.items():
    results, X_test, y_test = train_and_evaluate(name, model, X, y)
    print({k: v["score"] for k,v in results.items()})
    model_results = results

print(model_results)

{'LogisticRegression': 0.91506}
{'RandomForest': 0.93092}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'XGBoost': 0.9337733333333333}
[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021938 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1031
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289




{'LightGBM': 0.93342}


In [6]:
df = load_data(r"D:\GUVI-Projects\Capstone\Project1_Bank\code-models\data\train.csv")
preprocessor = build_preprocessor()
X, y = preprocess(df, preprocessor)

In [7]:



models = {
    "NaiveBayes": GaussianNB(),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500)
}
model_results = {}
for name, model in models.items():
    results, X_test, y_test = train_and_evaluate(name, model, X, y)
    print({k: v["score"] for k,v in results.items()})
    model_results = results

print(model_results)

{'NaiveBayes': 0.8599933333333334}
{'MLPClassifier': 0.9316133333333333}
{'MLPClassifier': {'model': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500), 'score': 0.9316133333333333}}
