In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,29,services,single,secondary,no,1282,no,yes,unknown,4,jul,1006,2,-1,0,unknown,1
749996,749996,69,retired,divorced,tertiary,no,631,no,no,cellular,19,aug,87,1,-1,0,unknown,0
749997,749997,50,blue-collar,married,secondary,no,217,yes,no,cellular,17,apr,113,1,-1,0,unknown,0
749998,749998,32,technician,married,secondary,no,-274,no,no,cellular,26,aug,108,6,-1,0,unknown,0


In [11]:
X = train.drop(["y", "id"], axis=1)
y = train["y"]
X_test = test.drop("id", axis=1)

In [12]:
cat_cols = X.select_dtypes(include="object").columns
cat_cols

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [13]:
# Encode categorical features (SAFE)

for col in cat_cols:
    le = LabelEncoder() # label encoder to convert category vals into machine lang
    combined = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

In [14]:
dtrain = lgb.Dataset(X, label=y)

In [15]:
# Minimal parameters (LightGBM handles rest)
params = {
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "seed": 42,
    'learning_rate': 0.2,
    "num_leaves": 48,
    "min_data_in_leaf": 80,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 0.5,
    "lambda_l2": 0.5
}

In [16]:
# Cross Validation

cv_results = lgb.cv(
    params,
    dtrain,
    num_boost_round=3000,
    nfold=5,
    stratified=True,
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=200)
    ]
)

best_iter = len(cv_results["valid auc-mean"])
best_auc = cv_results["valid auc-mean"][-1]

print("Best number of trees:", best_iter)
print("Best CV ROC-AUC:", best_auc)

Training until validation scores don't improve for 200 rounds
[200]	valid's auc: 0.968016 + 0.000268884
[400]	valid's auc: 0.968235 + 0.000274739
Early stopping, best iteration is:
[367]	valid's auc: 0.968245 + 0.000286389
Best number of trees: 367
Best CV ROC-AUC: 0.9682446633528216


In [17]:
# Train FINAL model on full data

model = lgb.train(
    params,
    dtrain,
    num_boost_round=best_iter
)

In [18]:
# Predict test.csv

test_preds = model.predict(X_test)

# Submission file

submission = pd.DataFrame({
    "id": test["id"],
    "y": test_preds
})

submission.to_csv("submission.csv", index=False)
print("submission.csv created successfully")

submission.csv created successfully
