In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import pandas as pd

In [2]:
train_data = pd.read_csv('bank-full_train.csv')

In [3]:
train_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,ID,y
0,45,blue-collar,married,secondary,no,2,no,no,cellular,26,aug,105,10,-1,0,unknown,22944,no
1,34,admin.,divorced,secondary,no,0,no,no,cellular,10,jul,268,1,-1,0,unknown,13870,no
2,40,technician,divorced,secondary,no,311,no,no,cellular,6,aug,738,2,-1,0,unknown,19301,yes
3,58,self-employed,married,tertiary,no,5810,no,no,cellular,12,mar,139,1,-1,0,unknown,31334,yes
4,59,blue-collar,married,secondary,no,169,yes,no,unknown,16,may,181,3,-1,0,unknown,3849,no


In [4]:
# Display the data types of the features in the training data
data_types = train_data.dtypes
print(data_types)

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
ID            int64
y            object
dtype: object


In [5]:
# Define features and target variable
X = train_data.drop(columns=['y'])
y = train_data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [6]:
cat_cols

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [7]:
# Preprocessing for numerical data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Create a pipeline that includes the preprocessor and the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model on the entire training data
model.fit(X, y)

In [8]:
# Load the test data
test_data = pd.read_csv('bank-full_test.csv')

# Make predictions on the test data
test_preds_prob = model.predict_proba(test_data)[:, 1]

# Convert probabilities to 'yes' or 'no' based on a threshold of 0.5
test_preds = ['yes' if prob > 0.5 else 'no' for prob in test_preds_prob]

# Create a submission DataFrame
submission = pd.DataFrame({'y': test_preds})

# Save the submission to a CSV file
submission.to_csv('bank-full_test_predictions.csv', index=False)


In [9]:
submission.head()

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no


In [10]:
import numpy as np
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_curve

# Perform cross-validation and get predicted probabilities
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_prob = cross_val_predict(model, X, y, cv=cv, method='predict_proba')[:, 1]

# Calculate KS statistic
fpr, tpr, thresholds = roc_curve(y, y_pred_prob)
ks_statistic = max(tpr - fpr)

print(f'Cross-Validation KS Score: {ks_statistic}')

Cross-Validation KS Score: 0.7639855117662754
