# Data Overview

In [1]:
import pandas as pd


# Load the training file (path is relative to the notebook's working directory).
df_train = pd.read_csv('train.csv')

In [2]:
def explore_dataframe(df):
    """Print and display a quick structural overview of ``df``.

    Sections are emitted in a fixed order: shape, column names, dtypes,
    per-column null counts, per-column unique counts, ``describe()``,
    ``info(memory_usage='deep')``, and the first and last five rows.

    Text sections go through ``print``; tabular sections go through
    ``display``, which this function expects to be available in the
    running environment (it is not imported here).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    print(" Shape of DataFrame:", df.shape)

    # Sections whose result is rendered as plain text, in output order.
    printed_sections = (
        ("\n Column Names:", lambda: df.columns.tolist()),
        ("\n Data Types:", lambda: df.dtypes),
        ("\n Missing Values (per column):", lambda: df.isnull().sum()),
        ("\n Number of Unique Values (per column):", lambda: df.nunique()),
    )
    for header, compute in printed_sections:
        print(header)
        print(compute())

    print("\n Descriptive Statistics (numeric columns):")
    display(df.describe())

    # info() writes its own report to stdout, so its return value is not printed.
    print("\n Memory Usage:")
    df.info(memory_usage='deep')

    # Row previews, rendered through display like the statistics table.
    for header, preview in (
        ("\n First 5 Rows:", df.head),
        ("\n Last 5 Rows:", df.tail),
    ):
        print(header)
        display(preview())

In [3]:
# Summarise the raw training frame (shape, dtypes, nulls, cardinality, previews).
explore_dataframe(df_train)

 Shape of DataFrame: (750000, 18)

 Column Names:
['id', 'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']

 Data Types:
id            int64
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object

 Missing Values (per column):
id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

 Number of Unique Values (per column):
id           750000
age          

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,374999.5,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545,0.120651
std,216506.495284,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926,0.325721
min,0.0,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,187499.75,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,374999.5,39.0,634.0,17.0,133.0,2.0,-1.0,0.0,0.0
75%,562499.25,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0,0.0
max,749999.0,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0,1.0



 Memory Usage:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 402.1 MB

 First 5 Rows:


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1



 Last 5 Rows:


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
749995,749995,29,services,single,secondary,no,1282,no,yes,unknown,4,jul,1006,2,-1,0,unknown,1
749996,749996,69,retired,divorced,tertiary,no,631,no,no,cellular,19,aug,87,1,-1,0,unknown,0
749997,749997,50,blue-collar,married,secondary,no,217,yes,no,cellular,17,apr,113,1,-1,0,unknown,0
749998,749998,32,technician,married,secondary,no,-274,no,no,cellular,26,aug,108,6,-1,0,unknown,0
749999,749999,42,technician,married,secondary,no,1559,no,no,cellular,4,aug,143,1,1,7,failure,0


# Data Preprocessing

### Binary Features

In [4]:
binary_features = ['default','housing','loan']

# Encode yes/no flags as 1/0 in place. The mapping also sends 1 -> 1 and 0 -> 0 so that
# re-running this cell is a no-op; with only the 'yes'/'no' keys a second run would map the
# already-encoded integers to NaN and break the later integer cast.
_binary_map = {'yes': 1, 'no': 0, 1: 1, 0: 0}
df_train[binary_features] = df_train[binary_features].apply(lambda x: x.map(_binary_map))

### One Hot Encoding

In [5]:
# Nominal columns that get expanded into indicator (dummy) columns.
categorical_cols = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']

# drop_first=True omits one level per encoded column; the trailing cast converts
# every column of the resulting frame to int.
df_encoded = (
    pd.get_dummies(df_train, columns=categorical_cols, drop_first=True)
    .astype(int)
)

In [6]:
# Sanity-check the encoded frame: its dimensions and how many columns hold each dtype.
encoded_shape = df_encoded.shape
encoded_dtype_counts = df_encoded.dtypes.value_counts()
print("Encoded DataFrame Shape:", encoded_shape)
print("Encoded DataFrame Types:", encoded_dtype_counts)
df_encoded.head()

Encoded DataFrame Shape: (750000, 44)
Encoded DataFrame Types: int64    44
Name: count, dtype: int64


Unnamed: 0,id,age,default,balance,housing,loan,day,duration,campaign,pdays,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,0,42,0,7,0,0,25,117,3,-1,...,0,0,0,0,0,0,0,0,0,1
1,1,38,0,514,0,0,18,185,1,-1,...,0,1,0,0,0,0,0,0,0,1
2,2,36,0,602,1,0,14,111,2,-1,...,0,0,0,1,0,0,0,0,0,1
3,3,27,0,34,1,0,28,10,2,-1,...,0,0,0,1,0,0,0,0,0,1
4,4,26,0,889,1,0,3,902,1,-1,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Full list of column names after encoding.
list(df_encoded.columns)

['id',
 'age',
 'default',
 'balance',
 'housing',
 'loan',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'y',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'job_unknown',
 'marital_married',
 'marital_single',
 'education_secondary',
 'education_tertiary',
 'education_unknown',
 'contact_telephone',
 'contact_unknown',
 'month_aug',
 'month_dec',
 'month_feb',
 'month_jan',
 'month_jul',
 'month_jun',
 'month_mar',
 'month_may',
 'month_nov',
 'month_oct',
 'month_sep',
 'poutcome_other',
 'poutcome_success',
 'poutcome_unknown']

### Transformation function

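This function applies the same preprocessing steps (binary mapping, one-hot encoding, and column alignment) to the test data so that its columns match those of the encoded training data.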

In [8]:
def preprocess_test_data(df_test, binary_features, categorical_cols, train_columns):
    """Encode a raw frame so its columns line up with the encoded training frame.

    Steps:
      1. Map the yes/no columns in ``binary_features`` to 1/0.
      2. One-hot encode ``categorical_cols`` and cast the frame to int.
      3. Reindex to ``train_columns``: columns absent from the input are
         created and filled with 0, columns not in ``train_columns`` are dropped.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Raw frame to transform. It is not modified; the work is done on a copy.
    binary_features : list of str
        Columns holding 'yes'/'no' values.
    categorical_cols : list of str
        Columns to one-hot encode.
    train_columns : sequence of str
        Column names, in order, of the encoded training frame.

    Returns
    -------
    pandas.DataFrame
        Integer frame with exactly ``train_columns`` as its columns.

    Raises
    ------
    Whatever ``astype(int)`` raises if a value cannot be cast, e.g. a binary
    column containing something other than 'yes'/'no' (mapped to NaN).
    """
    # Work on a copy so the caller's frame stays intact and the function can be
    # called repeatedly on the same input.
    df_prepared = df_test.copy()
    df_prepared[binary_features] = df_prepared[binary_features].apply(
        lambda col: col.map({'yes': 1, 'no': 0})
    )

    # Deliberately no drop_first here: it would drop the first level *present in this
    # frame*, which is not necessarily the baseline level dropped from the training data.
    # Encoding every level and letting reindex discard the ones that are not in
    # train_columns removes the training baseline without touching real features.
    df_test_encoded = pd.get_dummies(df_prepared, columns=categorical_cols).astype(int)

    df_test_encoded = df_test_encoded.reindex(columns=train_columns, fill_value=0)

    return df_test_encoded


In [9]:
# binary_features and categorical_cols are reused as defined in the encoding cells above,
# so the lists applied to the test data cannot drift from the ones used on the training data.

# Column layout of the encoded training frame (it still contains 'id' and 'y');
# preprocess_test_data reindexes its output to exactly these columns.
train_columns = df_encoded.columns

# Model training

In [10]:
from sklearn.model_selection import train_test_split

# Target vector, and the feature matrix without the row identifier and the target.
train_labels = df_encoded['y']
train_data = df_encoded.drop(columns=['id', 'y'])

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=42,
)



In [11]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report

In [12]:
# Hyperparameters for the hold-out experiment, collected in one mapping.
xgb_params = {
    'colsample_bytree': 0.8,
    'gamma': 0,
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 200,
    'subsample': 0.8,
    'eval_metric': 'logloss',   # for binary classification
    'random_state': 42,
}

model = xgb.XGBClassifier(**xgb_params)

# Fit on the 80% training partition; the fitted estimator is the cell's displayed output.
model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [13]:
# Score the hold-out partition; column 1 of predict_proba is kept as the prediction score.
holdout_proba = model.predict_proba(X_test)
y_pred_prob = holdout_proba[:, 1]

# ROC AUC of those scores against the hold-out labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f"ROC AUC Score: {roc_auc:.6f}")

ROC AUC Score: 0.965813


### Training on full data

In [21]:
from sklearn.base import clone

# Build the final estimator from the hold-out model's configuration instead of repeating
# the hyperparameters by hand: clone() returns a new, unfitted estimator with the same
# parameters, so the model trained on all rows always matches the one that was validated.
xgb_model = clone(model)

# Refit on every labelled row (training + hold-out partitions together).
xgb_model.fit(train_data, train_labels)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [22]:
# In-sample check only: xgb_model was fitted on these same rows, so this score is not an
# estimate of performance on unseen data. Separate names keep the hold-out results
# (y_pred_prob / roc_auc) from being overwritten.
train_pred_prob = xgb_model.predict_proba(train_data)[:, 1]

#  ROC AUC score on the training data
train_roc_auc = roc_auc_score(train_labels, train_pred_prob)
print(f"Train ROC AUC Score (in-sample): {train_roc_auc:.6f}")

ROC AUC Score: 0.970566


# Final Prediction

In [23]:
# Read the file to be scored (path is relative to the notebook's working directory).
df2 = pd.read_csv('test.csv')

In [24]:
# Pass a copy so df2 stays exactly as read from disk and this cell can be re-run safely,
# regardless of what the helper does to the frame it receives.
X = preprocess_test_data(df2.copy(), binary_features, categorical_cols, train_columns)
# 'id' and 'y' are part of train_columns but were excluded from the model's training features.
X = X.drop(['id', 'y'], axis=1)

In [25]:
# Keep column 1 of predict_proba as the score written to the submission file.
y = xgb_model.predict_proba(X)[:, 1]

In [26]:
# Preview the predicted scores.
y

array([2.9266151e-03, 1.2693128e-01, 6.1816379e-04, ..., 8.4975928e-01,
       7.4402714e-04, 7.2696842e-02], shape=(250000,), dtype=float32)

In [28]:
import numpy as np

# Prefer the ids that came with the test file so each prediction is paired with the row it
# was computed from. The hard-coded range is kept only as a fallback for a file without an
# 'id' column; it assumes ids are contiguous and start at 750000.
if "id" in df2.columns:
    submission_ids = df2["id"].to_numpy()
else:
    submission_ids = np.arange(750000, 750000 + len(y))

# One id per prediction, otherwise the file would be misaligned.
assert len(submission_ids) == len(y), "number of ids does not match number of predictions"

submission = pd.DataFrame({
    "id": submission_ids,
    "y": y
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")

Saved submission.csv
