# 8_1. Tuning parameters of XGBC(SMOTE)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import pickle
from sklearn.metrics import accuracy_score, classification_report

In [2]:
#load the files

X_train = pd.read_csv('../data/x_y_data/SMOTE/X_resampled.csv')
y_train = pd.read_csv('../data/x_y_data/SMOTE/y_resampled.csv')
X_test = pd.read_csv('../data/x_y_data/SMOTE/X_test.csv')
y_test = pd.read_csv('../data/x_y_data/SMOTE/y_test.csv')
data = pd.read_csv('../data/cleaned_data/cleaned_data.csv')

y_train.loc[y_train['churn'] == 'Yes', 'churn'] = 1
y_train.loc[y_train['churn'] == 'No', 'churn'] = 0

y_train['churn'] =y_train['churn'].astype('int64')


y_test.loc[y_test['churn'] == 'Yes', 'churn'] = 1
y_test.loc[y_test['churn'] == 'No', 'churn'] = 0

y_test['churn'] =y_test['churn'].astype('int64')


In [19]:
#set the high-parameter

param_grid = {
    'learning_rate': [0.1,0.01],
    'n_estimators': [100, 500,1000],
    'max_depth':[3,5],
    }
kappa_scorer = make_scorer(cohen_kappa_score)
xgb = XGBClassifier(random_state=12345)

# make_scorer(error_metric_kappa)
grid_search = GridSearchCV(xgb, param_grid, cv=5,return_train_score=True,n_jobs=-1, verbose = 20,  scoring=kappa_scorer)
grid_search.fit(X_train,y_train)
grid_search.best_params_ #To check the best set of parameters returned

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 3/5; 1/12] START learning_rate=0.1, max_depth=3, n_estimators=100...........
[CV 3/5; 1/12] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=0.679, test=0.718) total time=  15.3s
[CV 5/5; 2/12] START learning_rate=0.1, max_depth=3, n_estimators=500...........
[CV 5/5; 2/12] END learning_rate=0.1, max_depth=3, n_estimators=500;, score=(train=0.751, test=0.805) total time=  52.8s
[CV 3/5; 4/12] START learning_rate=0.1, max_depth=5, n_estimators=100...........
[CV 3/5; 4/12] END learning_rate=0.1, max_depth=5, n_estimators=100;, score=(train=0.750, test=0.757) total time=  16.0s
[CV 1/5; 5/12] START learning_rate=0.1, max_depth=5, n_estimators=500...........
[CV 1/5; 5/12] END learning_rate=0.1, max_depth=5, n_estimators=500;, score=(train=0.938, test=0.419) total time= 1.5min
[CV 4/5; 6/12] START learning_rate=0.1, max_depth=5, n_estimators=1000..........
[CV 4/5; 6/12] END learning_rate=0.1, max_depth=5,

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}

[CV 4/5; 1/12] START learning_rate=0.1, max_depth=3, n_estimators=100...........
[CV 4/5; 1/12] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=(train=0.678, test=0.748) total time=  15.3s
[CV 2/5; 3/12] START learning_rate=0.1, max_depth=3, n_estimators=1000..........
[CV 2/5; 3/12] END learning_rate=0.1, max_depth=3, n_estimators=1000;, score=(train=0.856, test=0.556) total time= 1.7min
[CV 5/5; 5/12] START learning_rate=0.1, max_depth=5, n_estimators=500...........
[CV 5/5; 5/12] END learning_rate=0.1, max_depth=5, n_estimators=500;, score=(train=0.885, test=0.823) total time= 1.6min
[CV 4/5; 7/12] START learning_rate=0.01, max_depth=3, n_estimators=100..........
[CV 4/5; 7/12] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=(train=0.553, test=0.540) total time=  11.6s
[CV 2/5; 8/12] START learning_rate=0.01, max_depth=3, n_estimators=500..........
[CV 2/5; 8/12] END learning_rate=0.01, max_depth=3, n_estimators=500;, score=(train=0.673, test=0.607) total 

The parameter: {'criterion': 'squared_error','loss': 'log_loss','max_depth': 15,'max_features': 'sqrt','min_samples_leaf': 1,'min_samples_split': 2, 'n_estimators': 150} was chosen.

####  Cross validation socre - Kappa-

In [15]:
xgb1= XGBClassifier(learning_rate= 0.01,max_depth = 5,n_estimators = 1000)
xgb_tuned1 = xgb.fit(X_train, y_train)

In [None]:
xgb= XGBClassifier(random_state=12345)

results = []
kfold = KFold(n_splits=10, shuffle=False)
cv_results = cross_val_score(xgb, X_train, y_train, cv=kfold, scoring=kappa_scorer)

results.append(cv_results)
    
msg = f" GB: {cv_results.mean()} ({cv_results.std()})"
print(msg)

In [None]:
#save the model
gb1.fit(X_train, y_train)

filename = "../models/TOMEK_gb.pickle" # Path with filename

with open(filename, "wb") as file:
        pickle.dump(gb1,file)

### Variable Importance 

In [None]:
X_train2 = X_train.copy()
y_train2 = y_train.copy()

xgb= XGBClassifier(random_state=12345)
xgb_tuned = xgb.fit(X_train2, y_train2)

from sklearn.inspection import permutation_importance


# Calculate permutation feature importances
result = permutation_importance(xgb_tuned, X_train2, y_train2, n_repeats=10, random_state=42)

# Sort and display the results
feature_importances = pd.Series(result.importances_mean, index=X_train2.columns).sort_values(ascending=False)

# Plot the feature importances
import matplotlib.pyplot as plt

sns.barplot(x=feature_importances, y=feature_importances.index)
plt.xlabel('Significance Score Of Variables')
plt.ylabel('Variables')
plt.title("Variable Importance for Gradient Boosting  Model")
plt.show()


In [None]:
feature_importances_df = pd.DataFrame(feature_importances)
feature_importances_df = feature_importances_df.rename(columns={0:'Significance Score'})
feature_importances_df.head(2)

In [None]:
feature_importances_df.to_csv('../data/Significant_Score/TOMEK_GB_Significance_Score.csv', index=False)

### the classification report

In [16]:
from sklearn.metrics import confusion_matrix


y_pred = xgb_tuned1.predict(X_test)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
display(cm)

Confusion Matrix:


array([[851, 182],
       [147, 227]])

In [17]:
y_pred_df = pd.DataFrame(y_pred)

y_pred_df.value_counts()

0    998
1    409
Name: count, dtype: int64

In [18]:
y_test_df = pd.DataFrame(y_test)

y_test_df.value_counts()

churn
0        1033
1         374
Name: count, dtype: int64

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score


y_pred = xgb_tuned1.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

# Create a DataFrame
metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-Score", "Kappa"],
    "Score": [accuracy, precision, recall, f1, kappa]
})

display(metrics_df)

Unnamed: 0,Metric,Score
0,Accuracy,0.766169
1,Precision,0.555012
2,Recall,0.606952
3,F1-Score,0.579821
4,Kappa,0.41828


In [None]:
metrics_df.to_csv('../data/metrics/SMOTE_gb_metrics.csv', index=False)