In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import itertools

bank_train = pd.read_csv("D:\\data ANALYTICS AND SCIENCE\\R EDVANCER\\PROJECT SUBMISSION DATA\\banking\\bank-full_train.csv")
bank_test = pd.read_csv("D:\\data ANALYTICS AND SCIENCE\\R EDVANCER\\PROJECT SUBMISSION DATA\\banking\\bank-full_test.csv")

# Step 2: Data preprocessing
X_train = bank_train.drop("y", axis=1)  # Features
y_train = bank_train["y"]  # Target variable

X_test = bank_test  # Test data

# Encode categorical variables
categorical_features = X_train.select_dtypes(include=["object"]).columns
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_test[feature] = le.transform(X_test[feature])
    label_encoders[feature] = le

# Step 3: Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 4: Create the gradient boosting classifier
gbm = GradientBoostingClassifier(random_state=42)

# Step 5: Define hyperparameter grid
param = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200, 500, 700],
    'max_depth': [1, 2, 3, 4, 5, 6, 7],
    'min_samples_leaf': [1, 2, 5, 10]
}

# Step 6: Hyperparameter tuning using grid search
num_trials = 10
my_params = list(itertools.islice(itertools.product(*param.values()), num_trials))

best_score = 0
best_params = None

for params in my_params:
    learning_rate, n_estimators, max_depth, min_samples_leaf = params

    gbm.set_params(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf
    )

    gbm.fit(X_train, y_train)
    y_val_pred = gbm.predict_proba(X_val)[:, 1]
    ks_score = roc_auc_score(y_val, y_val_pred)

    if ks_score > best_score:
        best_score = ks_score
        best_params = params

best_learning_rate, best_n_estimators, best_max_depth, best_min_samples_leaf = best_params
 
print("KS score:", ks_score)

# Step 7: Train the best model on the full training set
gbm.set_params(
    learning_rate=best_learning_rate,
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    min_samples_leaf=best_min_samples_leaf
)

gbm.fit(X_train, y_train)

# Step 8: Make predictions on the test set using the best model
y_test_pred = gbm.predict_proba(X_test)[:, 1]
y_test_pred 


KS score: 0.938666583379479


array([0.00811653, 0.0088039 , 0.00955491, ..., 0.64134021, 0.76314037,
       0.63978902])

In [33]:
threshold = 0.4
binary_predictions = ["yes" if pred >= threshold else "no" for pred in y_test_pred]
binary_predictions
predictions_df = pd.DataFrame({"ID": bank_test["ID"], "y": binary_predictions})
predictions_df


Unnamed: 0,ID,y
0,3,no
1,4,no
2,6,no
3,7,no
4,14,no
...,...,...
13559,45196,yes
13560,45198,yes
13561,45200,yes
13562,45202,yes


**The code block performs a manual hyperparameter tuning using a nested loop. It generates all possible combinations of hyperparameters from the param dictionary using itertools.product. It then trains the Gradient Boosting Classifier (gbm) with each combination of hyperparameters and evaluates its performance on the validation set using the roc_auc_score. The best performing model based on the roc_auc_score is selected and trained on the full training set. Finally, this best model is used to make predictions on the test set.**