In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,29,services,single,secondary,no,1282,no,yes,unknown,4,jul,1006,2,-1,0,unknown,1
749996,749996,69,retired,divorced,tertiary,no,631,no,no,cellular,19,aug,87,1,-1,0,unknown,0
749997,749997,50,blue-collar,married,secondary,no,217,yes,no,cellular,17,apr,113,1,-1,0,unknown,0
749998,749998,32,technician,married,secondary,no,-274,no,no,cellular,26,aug,108,6,-1,0,unknown,0


In [5]:
df.columns

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [7]:
df.isnull().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
for col in df.columns:
    uni_value = df[col].unique()
    print(f"{col} has {len(uni_value)} values")
    if len(uni_value) <=15:
        print(f"    values are : {uni_value}")

id has 750000 values
age has 78 values
job has 12 values
    values are : ['technician' 'blue-collar' 'student' 'admin.' 'management' 'entrepreneur'
 'self-employed' 'unknown' 'services' 'retired' 'housemaid' 'unemployed']
marital has 3 values
    values are : ['married' 'single' 'divorced']
education has 4 values
    values are : ['secondary' 'primary' 'tertiary' 'unknown']
default has 2 values
    values are : ['no' 'yes']
balance has 8217 values
housing has 2 values
    values are : ['no' 'yes']
loan has 2 values
    values are : ['no' 'yes']
contact has 3 values
    values are : ['cellular' 'unknown' 'telephone']
day has 31 values
month has 12 values
    values are : ['aug' 'jun' 'may' 'feb' 'apr' 'nov' 'jul' 'jan' 'oct' 'mar' 'sep' 'dec']
duration has 1760 values
campaign has 52 values
pdays has 596 values
previous has 50 values
poutcome has 4 values
    values are : ['unknown' 'other' 'failure' 'success']
y has 2 values
    values are : [0 1]


MODEL TRAINING

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

X = df.drop(columns=['y'], axis=1)
y = df['y']

num_col = X.select_dtypes(exclude = 'object').columns
cat_col = X.select_dtypes(include = 'object').columns

preprocessor = ColumnTransformer(
    [
        ("numerical", StandardScaler(), num_col),
        ("categorical", OneHotEncoder(), cat_col)
    ]
)

X = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import xgboost as xgb 
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score

models = {
    "LogisticRegression" : LogisticRegression(max_iter=1000),
    "RandomForestClassifier" : RandomForestClassifier(n_estimators=200, random_state=42),
    "XGBoost" : xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM" : lgb.LGBMClassifier(random_state=42),
    "CatBoost" : CatBoostClassifier(verbose=0, random_state=42)
}

result = {}

for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)
    result[name] = acc_score

for classifier , acc_score in result.items():
    print(f"{classifier} has predicted with an accuracy score of {acc_score :.4f}")



[LightGBM] [Info] Number of positive: 60495, number of negative: 442005
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1299
[LightGBM] [Info] Number of data points in the train set: 502500, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120388 -> initscore=-1.988760
[LightGBM] [Info] Start training from score -1.988760
LogisticRegression has predicted with an accuracy score of 0.9157
RandomForestClassifier has predicted with an accuracy score of 0.9317
XGBoost has predicted with an accuracy score of 0.9341
LightGBM has predicted with an accuracy score of 0.9339
CatBoost has predicted with an accuracy score of 0.9356


HYPERPARAMETER TUNING  --> the 2 best performing models (CatBoost , XGBoost)

In [11]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import xgboost

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# CatBoost parameter grid
cat_params = {
    'iterations': [200, 400, 600],
    'depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

# XGBoost parameter grid
xgb_params = {
    'n_estimators': [200, 500, 800],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

cat = CatBoostClassifier(verbose=0, random_state=42)
xgb_clf = xgboost.XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)

cat_random = RandomizedSearchCV(
    estimator=cat,
    param_distributions=cat_params,
    n_iter=20,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

xgb_random = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=xgb_params,
    n_iter=20,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# Fit models separately
cat_random.fit(X_train, y_train)
xgb_random.fit(X_train, y_train)

# Print best results
print("\nCatBoost :")
print("     Best params: ", cat_random.best_params_)
print("     Best score: ", cat_random.best_score_)

print("\nXGBoost :")
print("     Best params: ", xgb_random.best_params_)
print("     Best score: ", xgb_random.best_score_)


CatBoost :
     Best params:  {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 600, 'depth': 7}
     Best score:  0.9352199004975125

XGBoost :
     Best params:  {'subsample': 1.0, 'n_estimators': 800, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
     Best score:  0.9361990049751243


In [12]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import  accuracy_score, classification_report

cat_model = CatBoostClassifier( learning_rate=0.1, l2_leaf_reg=3, iterations=600, depth=7, verbose=0, random_state=42 )

xgb_model = XGBClassifier( subsample=1.0, n_estimators=800, max_depth=9, learning_rate=0.05, colsample_bytree=0.8, random_state=42,
                           eval_metric='logloss', use_label_encoder=False )


xgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

cat_preds = cat_model.predict_proba(X_test)[:, 1]
xgb_preds = xgb_model.predict_proba(X_test)[:, 1]

cat_weight = 0.5
xgb_weight = 0.5
ensemble_preds = (cat_weight * cat_preds) + (xgb_weight * xgb_preds)

ensemble_class_preds = (ensemble_preds >= 0.5).astype(int)

print("Ensemble Accuracy:", accuracy_score(y_test, ensemble_class_preds))

Ensemble Accuracy: 0.9359070707070707


In [21]:
test_df = pd.read_csv('test.csv')
X_test_data = test_df  # no need to drop y

print(test_df.columns) 

print("Transforming test data...")
X_test_transformed = preprocessor.transform(X_test_data)

print("Making predictions...")
cat_preds_test = cat_model.predict_proba(X_test_transformed)[:, 1]
xgb_preds_test = xgb_model.predict_proba(X_test_transformed)[:, 1]

ensemble_preds_test = (cat_weight * cat_preds_test) + (xgb_weight * xgb_preds_test)
ensemble_class_preds_test = (ensemble_preds_test >= 0.5).astype(int)

# Create submission dataframe
submission_df = pd.DataFrame({
    'id': test_df['id'] if 'id' in test_df.columns else range(len(ensemble_class_preds_test)),
    'y': ensemble_class_preds_test
})

Index(['id', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome'],
      dtype='object')
Transforming test data...
Making predictions...


In [None]:
submission_df.to_csv('submission.csv', index=False)

print(f"\nSubmission file created with {len(submission_df)} predictions")
print("\nFirst few predictions:")
print(submission_df.head())

print("\nPrediction distribution:\n", submission_df['y'].value_counts())


Submission file created with 250000 predictions

First few predictions:
       id  y
0  750000  0
1  750001  0
2  750002  0
3  750003  0
4  750004  0

Prediction distribution:
 y
0    223433
1     26567
Name: count, dtype: int64
