In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
np.random.seed(42)
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

In [None]:
# Load the BankChurners CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv")
# Show the first rows as the cell output.
df.head()

In [None]:
# Display the column index of the frame.
df.columns

Removing unwanted columns

In [None]:
# Columns the notebook treats as unwanted: the CLIENTNUM column and the two
# columns whose names start with "Naive_Bayes_Classifier_".
unwanted_cols = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
# Remove them from the frame in place.
df.drop(columns=unwanted_cols, inplace=True)

In [None]:
# Display the column index again to inspect the remaining columns.
df.columns

In [None]:
# Columns stored with the object dtype are the ones treated as categorical.
is_object_col = df.dtypes == 'object'
cat_cols = df.columns[is_object_col]
cat_cols

Categorical columns: Attrition_Flag, Gender, Education_Level, Marital_Status, Income_Category, Card_Category

In [None]:
# Count missing (NaN) entries per column.
df.isna().sum()

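No null values are present. Note that missing information still appears in some categorical columns as the literal category `Unknown`.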

# Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of absolute pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(16, 10))

# Select numeric columns explicitly: at this point the frame still contains
# string-valued (object) columns, and DataFrame.corr() raises on those in
# pandas >= 2.0 (older versions silently dropped them).
corr = df.select_dtypes(include="number").corr()

# Draw on the axes created above so the requested figure size is the one used.
sns.heatmap(
    corr.abs(),
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    ax=ax,
)
ax.set_title("Absolute correlation between numeric features")
plt.show()

**Visualize categorical columns**

In [None]:

# One count plot per categorical column, with bars split by Attrition_Flag.
count_plot_options = dict(
    hue="Attrition_Flag",
    data=df,
    kind="count",
    height=6,
    aspect=.8,
)
for cat_col in cat_cols:
    g = sns.catplot(x=cat_col, **count_plot_options)
plt.show()

# Pre-processing

Encode the categorical columns as integers using hand-written mappings.

In [None]:
# Display the categorical column names that need encoding.
cat_cols

In [None]:
df["Attrition_Flag"] = df["Attrition_Flag"].map({'Existing Customer':0, 'Attrited Customer':1})

In [None]:
# Binary encoding of Gender: 'M' -> 0, 'F' -> 1.
gender_codes = {
    'M': 0,
    'F': 1,
}
df["Gender"] = df["Gender"].map(gender_codes)

In [None]:
# List the distinct Education_Level values present in the data.
df["Education_Level"].unique()

In [None]:
# Ordinal encoding of Education_Level, ascending by level of education.
# "College" must rank below "Graduate": the original codes had the two swapped,
# which made the encoding non-monotone in education level.
# "Unknown" keeps its negative sentinel so it sits outside the ordinal range.
education_mappping = {
    "Uneducated": 0,
    "High School": 1,
    "College": 2,
    "Graduate": 3,
    "Post-Graduate": 4,
    "Doctorate": 5,
    "Unknown": -9,
}
df["Education_Level"] = df["Education_Level"].map(education_mappping)

In [None]:
# List the distinct Marital_Status values present in the data.
df["Marital_Status"].unique()

In [None]:
# Integer codes for Marital_Status; 'Unknown' gets a negative sentinel.
marital_codes = {
    'Single': 0,
    'Married': 1,
    'Divorced': 2,
    'Unknown': -99,
}
df["Marital_Status"] = df["Marital_Status"].map(marital_codes)

In [None]:
# List the distinct Income_Category values present in the data.
df["Income_Category"].unique()

In [None]:
# Ordinal codes for Income_Category, listed from lowest to highest bracket;
# 'Unknown' gets a negative sentinel.
earning_map = {
    'Less than $40K': 0,
    '$40K - $60K': 1,
    '$60K - $80K': 2,
    '$80K - $120K': 3,
    '$120K +': 4,
    'Unknown': -999,
}
income_codes = df["Income_Category"].map(earning_map)
df["Income_Category"] = income_codes

In [None]:
# List the distinct Card_Category values present in the data.
df["Card_Category"].unique()

In [None]:
# Integer codes for Card_Category, listed in ascending code order.
card_codes = {
    'Blue': 0,
    'Silver': 1,
    'Gold': 2,
    'Platinum': 3,
}
df["Card_Category"] = df["Card_Category"].map(card_codes)

In [None]:
# Preview the first rows after the categorical columns were mapped to integers.
df.head()

In [None]:
# Display the dtype of every column.
df.dtypes

# Model

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from numpy import mean
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

In [None]:
# Separate the label column from the features: `target` holds Attrition_Flag,
# and `df` keeps every other column (the label is removed in place).
target_col = "Attrition_Flag"
target = df[target_col]
df.drop(columns=[target_col], inplace=True)

In [None]:
# (rows, columns) of the feature frame.
df.shape

In [None]:
# Shape of the target series.
target.shape

**Prepare the train-test sets**

In [None]:
# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same class proportions.
# random_state is pinned so the split no longer depends on how much of the
# global NumPy random stream earlier cells consumed, and re-running this cell
# yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [None]:
# Stratified, shuffled 10-fold splitter (used by the disabled cross-validation below).
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Hyperparameters for the XGBoost classifier, kept together in one place.
xgb_params = dict(
    learning_rate=0.09,
    n_estimators=250,
    max_depth=3,
    objective='binary:logistic',
    subsample=0.75,
    alpha=0.01,
    gamma=0.1,
    min_child_weight=3,
    use_label_encoder=False,
    colsample_bytree=0.8,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(**xgb_params)

# Cross-validation on the training split is currently disabled:
# scores = cross_val_score(xgb_model, X_train, y_train, scoring='accuracy', cv=cv)
# print(scores*100)
# print("XGB cross validation score:",mean(scores)*100)

**Testing the XGB model**

In [None]:
from sklearn.metrics import roc_auc_score

# Fit the XGBoost model on the training split.
xgb_model.fit(X_train, y_train)

# Hard class predictions give the accuracy; the predicted probability of the
# positive class (column 1) gives the ROC AUC.
# roc_auc_score expects (y_true, y_score) in that order and a continuous score;
# the original call passed hard labels with the arguments swapped and labelled
# the result "accuracy" (and "LGB" for the train line).
y_train_pred = xgb_model.predict(X_train)
y_train_proba = xgb_model.predict_proba(X_train)[:, 1]
print("XGB train accuracy:", (y_train_pred == y_train).mean() * 100)
print("XGB train ROC AUC:", roc_auc_score(y_train, y_train_proba) * 100)

y_pred = xgb_model.predict(X_test)
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]
print("XGB test accuracy:", (y_pred == y_test).mean() * 100)
print("XGB test ROC AUC:", roc_auc_score(y_test, y_test_proba) * 100)

In [None]:
# Hyperparameters for the LightGBM classifier, kept together in one place.
lgb_params = dict(
    learning_rate=0.09,
    num_leaves=250,
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss,auc',
    max_depth=3,
    n_estimators=2500,
    subsample_for_bin=40000,
    min_split_gain=2,
    min_child_weight=2,
    min_child_samples=5,
    subsample=0.9,
)
lgb_model = LGBMClassifier(**lgb_params)

**Testing the LGB model**

In [None]:
# Fit the LightGBM model on the training split.
lgb_model.fit(X_train, y_train)

# Hard class predictions give the accuracy; the predicted probability of the
# positive class (column 1) gives the ROC AUC.
# roc_auc_score expects (y_true, y_score) in that order and a continuous score;
# the original call passed hard labels with the arguments swapped and labelled
# the result "accuracy".
y_train_pred = lgb_model.predict(X_train)
y_train_proba = lgb_model.predict_proba(X_train)[:, 1]
print("LGB train accuracy:", (y_train_pred == y_train).mean() * 100)
print("LGB train ROC AUC:", roc_auc_score(y_train, y_train_proba) * 100)

y_pred = lgb_model.predict(X_test)
y_test_proba = lgb_model.predict_proba(X_test)[:, 1]
print("LGB test accuracy:", (y_pred == y_test).mean() * 100)
print("LGB test ROC AUC:", roc_auc_score(y_test, y_test_proba) * 100)