# Version 3: Using Catboost to train the data without SMOTE
# Version 2: Using OneHotEncoder to deal with Unknown Data
# Version 1: Using LabelEncoder to deal with Unknown Data 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries 

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix, make_scorer, recall_score
#!pip install imblearn
from imblearn.over_sampling import SMOTE #Here SMOTE is used to generate samples.
#!pip install xgboost
from xgboost.sklearn import XGBClassifier
!pip install catboost

# Import Data

In [None]:
# Load the raw customer table from the Kaggle dataset mounted under /kaggle/input.
data = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv")

# EDA

**1. This is a classification problem, the first thing is to check whether the samples are balanced.**

In [None]:
# Set the global seaborn plotting style used by every figure below:
# a plain white background and the "talk" context with fonts enlarged by a factor of 1.8.
sns.set_style("white")
sns.set_context("talk", font_scale = 1.8) 

In [None]:
# Count the customers per label to check whether the classes are balanced or imbalanced.
# Series.value_counts() is used because the top-level pd.value_counts() is deprecated in recent pandas.
data["Attrition_Flag"].value_counts()

In [None]:
#Use pie chart to demonstrate
# The slice labels come from the same value_counts() result as the slice sizes,
# so a label can never be attached to the wrong slice.
churn_counts = data["Attrition_Flag"].value_counts()
churn = churn_counts.tolist()
plt.figure(figsize = (20,11))
plt.title("The Ratio of Normal and Churned Customers")
plt.pie(x = churn, labels = churn_counts.index.tolist(), autopct='%.2f%%');

**We could find that the data is imbalanced.**

**2. Age with Churned or not**

In [None]:
# Number of customers per age, split by churn label.
fig_age, ax_age = plt.subplots(figsize = (28,15))
ax_age.set_title("Age with Churned or not", fontsize = 30)
sns.countplot(data = data, x = "Customer_Age", hue = "Attrition_Flag", ax = ax_age)

**The age distribution looks roughly Gaussian.**

**3. Gender with Churned or not**

In [None]:
#countplot
# Number of customers per gender, split by churn label.
fig_gender, ax_gender = plt.subplots(figsize = (28,15))
ax_gender.set_title("Gender with Churned or not")
sns.countplot(data = data, x = "Gender", hue = "Attrition_Flag", ax = ax_gender)

In [None]:
#Pie
# Gender share inside each customer group.
# "Normal" means existing customers and "churned" means attrited customers; each chart is
# labelled from its own value_counts() index so slices and labels always agree.
# Assumes Gender is coded "F"/"M"; any other code is shown unchanged.
gender_names = {"F": "Female", "M": "Male"}
normal_gender_counts = (data.loc[data["Attrition_Flag"] == "Existing Customer", "Gender"]
                        .value_counts().rename(index = gender_names))
churned_gender_counts = (data.loc[data["Attrition_Flag"] == "Attrited Customer", "Gender"]
                         .value_counts().rename(index = gender_names))
gender_normal = normal_gender_counts.tolist()
gender_churned = churned_gender_counts.tolist()
fig, ax = plt.subplots(1, 2, dpi = 200, figsize = (28,15))
ax[0].set_title("Gender in Normal Customers")
ax[0].pie(x = gender_normal, labels = normal_gender_counts.index.tolist(), autopct='%.2f%%')
ax[1].set_title("Gender in Churned Customers")
ax[1].pie(x = gender_churned, labels = churned_gender_counts.index.tolist(), autopct='%.2f%%');

**It seems that female and male customers churn (or stay) at similar rates.**

**4. Education Level with Churned or not**

In [None]:
#countplot
# Number of customers per education level (shown in the explicit order below), split by churn label.
education_order = ["Unknown","Uneducated", "High School", "College", "Graduate", "Post-Graduate", "Doctorate"]
fig_edu, ax_edu = plt.subplots(figsize = (28,15))
ax_edu.set_title("Education Level with Churned or not")
sns.countplot(data = data, x = "Education_Level", hue = "Attrition_Flag",
              order = education_order, ax = ax_edu)

In [None]:
#Pie
# Education-level share inside each customer group.
# "Normal" means existing customers and "churned" means attrited customers. value_counts()
# orders categories by frequency, which can differ between the two groups, so each chart
# takes its labels from its own counts.
education_normal = data.loc[data["Attrition_Flag"] == "Existing Customer", "Education_Level"].value_counts()
education_churned = data.loc[data["Attrition_Flag"] == "Attrited Customer", "Education_Level"].value_counts()
fig, ax = plt.subplots(1, 2, dpi = 200, figsize = (32,20))
ax[0].set_title("Education Level in Normal Customers")
ax[0].pie(x = education_normal.tolist(), labels = education_normal.index.tolist(), autopct='%.2f%%')
ax[1].set_title("Education Level in Churned Customers")
ax[1].pie(x = education_churned.tolist(), labels = education_churned.index.tolist(), autopct='%.2f%%');

**5. Marriage Status with Churned or not**

In [None]:
#countplot
# Number of customers per marital status, split by churn label.
fig_marital, ax_marital = plt.subplots(figsize = (28,15))
ax_marital.set_title("Marriage Status with Churned or not")
sns.countplot(data = data, x = "Marital_Status", hue = "Attrition_Flag", ax = ax_marital)

In [None]:
#pie
# Marital-status share inside each customer group.
# "Normal" means existing customers and "churned" means attrited customers. Each chart is
# labelled from its own value_counts() index because the frequency order may differ per group.
marriage_normal = data.loc[data["Attrition_Flag"] == "Existing Customer", "Marital_Status"].value_counts()
marriage_churned = data.loc[data["Attrition_Flag"] == "Attrited Customer", "Marital_Status"].value_counts()
fig, ax = plt.subplots(1, 2, dpi = 200, figsize = (32,18))
ax[0].set_title("Marital Status in Normal Customers", fontsize = 30)
ax[0].pie(x = marriage_normal.tolist(), autopct='%.2f%%', labels = marriage_normal.index.tolist())
ax[1].set_title("Marital Status in Churned Customers", fontsize = 30)
ax[1].pie(x = marriage_churned.tolist(), autopct='%.2f%%', labels = marriage_churned.index.tolist());

**The ratios are quite similar: single, married and divorced customers churn at about the same rate, so churn appears mostly unrelated to marital status.**

# Data Preprocessing

**So in this notebook, logistic regression is used to compare the results of undersampling, oversampling and samples without sampling. This might give you some ideas about dealing with imbalanced data.**

**Convert string columns to numeric data / handle "Unknown" values using one-hot encoding**

In [None]:
#Attrition_Flag/ Label
# Encode the target in place: existing customers become 0, attrited (churned) customers become 1.
for flag_text, flag_code in (("Existing Customer", 0), ("Attrited Customer", 1)):
    data.loc[data["Attrition_Flag"] == flag_text, ["Attrition_Flag"]] = flag_code
# The column still has object dtype after the replacement, so cast it to int.
data["Attrition_Flag"] = data["Attrition_Flag"].astype(int)

**OneHotEncoder**

In [None]:
#OneHotEncoder
# Replace every categorical column with its indicator columns ("<column>_<category>").
# The indicator columns are appended after the remaining columns, in the order listed here.
categorical_cols = ["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"]
data = pd.get_dummies(data, columns = categorical_cols)

**Standardization**

In [None]:
#Standardization
# Rescale the numeric feature columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the whole table here, before the train/test split
# performed further down, so test rows contribute to the mean/variance used for scaling.
ss = StandardScaler()
cols_for_scaler = ["Customer_Age", "Dependent_count", "Months_on_book", "Total_Relationship_Count",
                   "Months_Inactive_12_mon", "Contacts_Count_12_mon", "Credit_Limit", "Total_Revolving_Bal",
                   "Avg_Open_To_Buy", "Total_Amt_Chng_Q4_Q1", "Total_Trans_Amt", "Total_Trans_Ct",
                   "Total_Ct_Chng_Q4_Q1", "Avg_Utilization_Ratio"]
data[cols_for_scaler] = ss.fit_transform(data[cols_for_scaler])

**Other Things**

In [None]:
#Drop two columns because these two columns are not features(check the dataset description),
#then use the client number as index to make it quick to spot customers by their client number
non_feature_cols = [
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
]
data = data.drop(columns = non_feature_cols).set_index("CLIENTNUM")

#Check what data looks like now
data

In [None]:
#Make independent/dependent variables
# x holds every feature column; y is a one-column frame with the label.
x = data.drop(columns = "Attrition_Flag")
y = data[["Attrition_Flag"]]
#Make train/test data (70% train / 30% test, fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size = 0.3,
    random_state = 66,
)

**1. No Sampling**

**It means we would use the original imbalanced data to predict who would possibly churn.**

In [None]:
#This method is to print the best C for logisticregression 
def printing_Kfold_scores(x_train_data,y_train_data):
    """Pick the logistic-regression ``C`` with the best mean 5-fold recall.

    For every candidate ``C`` an L2-regularised ``LogisticRegression`` (lbfgs solver) is
    fitted on the training folds and scored with recall on the held-out fold. The per-fold
    and mean recall scores are printed along the way.

    Parameters
    ----------
    x_train_data : pandas.DataFrame
        Feature rows; folds are selected positionally through ``.iloc``.
    y_train_data : pandas.DataFrame
        Label frame aligned row-for-row with ``x_train_data`` (expected to hold one label column).

    Returns
    -------
    float
        The candidate ``C`` with the highest mean recall (the first one wins on ties).
    """
    # 5-fold cross validation; shuffle=False means the folds are contiguous row blocks.
    fold = KFold(n_splits=5,shuffle=False)
    # Candidate regularisation strengths.
    c_param_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    mean_recalls = []
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')
        recall_accs = []
        # fold.split yields (train_indices, test_indices) pairs of positional indices.
        for iteration, (train_indices, test_indices) in enumerate(fold.split(x_train_data)):
            #solver = lbfgs, L2 Regularization is used
            lr = LogisticRegression(C = c_param,penalty = 'l2',solver='lbfgs', max_iter = 10000)
            lr.fit(x_train_data.iloc[train_indices,:],y_train_data.iloc[train_indices,:].values.ravel())
            # Predict on a DataFrame slice (not a bare array) so that the input has the same
            # feature names the model was fitted with.
            y_pred_fold = lr.predict(x_train_data.iloc[test_indices,:])
            #Recall score calculation (labels flattened to 1-D, as the metric expects)
            recall_acc = recall_score(y_train_data.iloc[test_indices,:].values.ravel(),y_pred_fold)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)
        #calculate the average recall score
        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score ', mean_recall)
        print('')
    # One row per candidate C. Building the table from complete columns avoids the empty
    # placeholder index the previous implementation created with range(len(...), 2).
    results_table = pd.DataFrame({'C_parameter': c_param_range, 'Mean recall score': mean_recalls})
    best_c = float(results_table.loc[results_table['Mean recall score'].idxmax(), 'C_parameter'])
    #Print the best c
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    return best_c

In [None]:
#This method is to print the scores used in classification
def as_rs_ps(y_test, y_pred):
    """Print accuracy, recall, precision and F1 for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    score_functions = (
        ("The Accuracy is:", metrics.accuracy_score),          # (TP + TN) / (TP + TN + FP + FN)
        ("The Recall Score is:", metrics.recall_score),        # TP / (TP + FN)
        ("The Precision Score is:", metrics.precision_score),  # TP / (TP + FP)
        ("The F1 Score is:", metrics.f1_score),                # combines recall and precision
    )
    for caption, score_function in score_functions:
        print(caption, score_function(y_test, y_pred))

In [None]:
#This method could help you to draw the confusion matrix
def ploting_confusion_matrix(cm, title, cmap= "GnBu_r"):
    """Draw a confusion matrix as an annotated heatmap on a new 20x12 figure.

    Parameters
    ----------
    cm : array-like
        Confusion matrix of integer counts (cells are formatted with ``"d"``).
    title : str
        Title shown above the heatmap.
    cmap : str, optional
        Colormap passed to ``sns.heatmap``; defaults to ``"GnBu_r"``.
    """
    plt.figure(figsize = (20,12))
    plt.title(title)
    # Honour the caller's colormap; it was previously hard-coded and the argument ignored.
    sns.heatmap(cm, annot=True, fmt ="d", cmap=cmap)

In [None]:
#No sampling
# Logistic regression trained on the original (imbalanced) training split.
best_c = printing_Kfold_scores(x_train, y_train) #C with the best mean cross-validated recall
lr = LogisticRegression(solver = "lbfgs", penalty= "l2", C = best_c, max_iter = 99999) #build the model with the best C
lr.fit(x_train, y_train.values.ravel()) #fit the model on the training data (labels flattened to 1-D)
predictions_lr = lr.predict(x_test) #predict the labels of the test split

**1. The best C for no-sampling model is 10.**

**2. If there are many features, L2 regularization usually has better results than L1 regularization because L2 regularization has a stronger penalty in preventing from overfitting.**

**3. The L1 penalty is usually used for feature selection (filtering out features that are useless or meaningless).**

In [None]:
#Check the score and the confusion matrix
# Evaluate the no-sampling model on the test split: print the scores, then plot the confusion matrix.
as_rs_ps(y_test, predictions_lr) 
cm_lr = confusion_matrix(y_test, predictions_lr) 
ploting_confusion_matrix(cm_lr, "Confusion Matrix Result without Sampling Using Logistic Regression")

**No Sampling Method Is Used**

**1. The accuracy score is good (90%).**

**2. The recall score is relatively low as well as the precision score(60%/77%). This is a relatively bad recall score.**

**2. Undersampling**

**This means we randomly choose samples whose label is "Existing Customer"(8500 samples -> 1627 samples) and then merge these samples with samples whose label are "Attrited Customer"(1627 samples).**

In [None]:
# Build a balanced subset: every attrited customer plus an equally sized random draw of
# existing customers (8500 Existing Customer / 1627 Attrited Customer in the full data).
# NOTE: the draw is made from the whole of `y`, not only from the training split, and it is
# not seeded, so the selected customers change from run to run.
number_of_churned = len(y[y.Attrition_Flag == 1]) 
normal_indices = y[y["Attrition_Flag"] == 0].index #get the indices of existing customers
churned_indices = y[y["Attrition_Flag"] == 1].index #get the indices of attrited customers
# replace=False: np.random.choice samples WITH replacement by default, which would pick some
# customers more than once and leave fewer than `number_of_churned` distinct existing customers.
random_normal_indices = np.random.choice(normal_indices, number_of_churned, replace = False)
undersample_indices = np.concatenate([churned_indices,random_normal_indices]) #concatenate these two sets of indices
data_undersample = data.loc[undersample_indices,:] #pick up undersampling data using their indices

In [None]:
#Make dependent/independent variables in undersample data
# Features are every column except the label; the label is kept as a one-column frame.
x_undersample = data_undersample.drop(columns = "Attrition_Flag")
y_undersample = data_undersample[["Attrition_Flag"]]

In [None]:
#Make train/test data in undersample data
# 70% train / 30% test split of the balanced (undersampled) subset, with a fixed seed.
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undersample =train_test_split(
    x_undersample, y_undersample, test_size = 0.3, random_state = 66)

In [None]:
#build the model
# Choose C by cross-validated recall on the undersampled training split, then fit
# a logistic regression with that C on the same split.
best_c= printing_Kfold_scores(x_train_undersample, y_train_undersample)
lr_undersample = LogisticRegression(solver = "lbfgs", penalty = "l2", C = best_c, max_iter = 99999)
lr_undersample.fit(x_train_undersample, y_train_undersample.values.ravel())

In [None]:
#Check the result
#Here the undersampled model is evaluated on the undersampled (balanced) test split:
#print the scores, then plot the confusion matrix.
prediciton_lr_undersample = lr_undersample.predict(x_test_undersample)
cm_lr_undersample_undersample = confusion_matrix(y_test_undersample, prediciton_lr_undersample)
as_rs_ps(y_test_undersample, prediciton_lr_undersample) 
ploting_confusion_matrix(cm_lr_undersample_undersample, "Confusion Matrix of Undersample")

**This could suggest that the undersample model is not bad when testing the undersample test data.**

In [None]:
#What do we get if the model built on undersampled data is evaluated on the whole (imbalanced) test split?
predictions_sample = lr_undersample.predict(x_test)
cm_lr_undersample_sample = confusion_matrix(y_test, predictions_sample)
as_rs_ps(y_test, predictions_sample) 
ploting_confusion_matrix(cm_lr_undersample_sample, "Confusion Matrix of Data Test in Undersample") 

**1. The accuracy is not bad (tip: each run may give slightly different accuracy/recall/precision/F1 scores, because the undersampled set contains existing customers drawn at random from the 8500 available).**

**2. The recall score is improved compared to the result in which no sampling method is used. However, we get a bad precision score.**

**3. Mostly we should improve our recall score because this kind of error generally affects a business or project to a larger extent than an precision error does. Things go like that we have to first make customers stay because left customers mean less profit or other direct loss. But for customers that we mistakenly think they would leave, we just have to do some precautions like offering some benefits. Different business or project may attach different importance to these two statistical errors(F-Score could be used to assign the weights of the recall score and the precision score). Generally, achieving a high recall score is more important than getting a high precision score.**

**Notice: The result of the undersampling model on the whole test set might be misleading. We first split the data into training and test sets, and then performed the undersampling on the full dataset, so some identical samples are in both the whole test set and the undersampled training set. A more rigorous way to get this prediction is to exclude the samples that are already in the whole test set when undersampling.**


**3. Oversampling (SMOTE is used)**

**This means we generate samples whose label is "attrited customer" using features that are quite similar to those data whose label is "attrited customer"(This description may have flaws and be misleading).**

**Learn more about SMOTE: https://github.com/scikit-learn-contrib/imbalanced-learn**

In [None]:
#make train/test data
#using SMOTE to generate data whose label is "Attrited Customer"
oversampler = SMOTE(random_state = 66)
x_train_smote, x_test_smote, y_train_smote, y_test_smote = train_test_split(x, y, test_size = 0.3,
                                                                            random_state = 66)
# Only the training split is oversampled; the test split keeps its original class ratio.
# fit_resample is the current imbalanced-learn API (the older fit_sample alias was removed).
x_oversample, y_oversample = oversampler.fit_resample(x_train_smote, y_train_smote)
y_oversample.value_counts() 

**Now we have a balanced data.**

In [None]:
#get best C in oversampling data
# Choose C by cross-validated recall on the SMOTE-balanced training data, then fit
# a logistic regression with that C on the same data.
best_c = printing_Kfold_scores(x_oversample, y_oversample)
lr_oversample = LogisticRegression(solver = "lbfgs", penalty = "l2", C = best_c, max_iter = 99999)
lr_oversample.fit(x_oversample, y_oversample.values.ravel())

In [None]:
#check the result and confusion matrix
# The oversampled model is evaluated on the test split, which was not oversampled.
prediction_lr_oversample = lr_oversample.predict(x_test_smote)
cm_lr_oversample = confusion_matrix(y_test_smote, prediction_lr_oversample)
as_rs_ps(y_test_smote, prediction_lr_oversample) 
ploting_confusion_matrix(cm_lr_oversample, "Confusion Matrix of Oversample")

**1. The accuracy score is relatively good(each time we would get the same score(s) because of SMOTE's generating rules).**

**2. The recall score is somewhat lower than the one achieved with undersampling, because undersampling cheated in some way (see the notice in the undersampling section above). Otherwise this recall score should be quite similar to the one achieved by undersampling; I have tested this several times.**

**3. The precision score is slightly improved(Actually I have done several projects, sometimes the precision score would be improved significantly when using the oversampling method. But for this data/case, it is just slightly improved).**

**Therefore, a comparison of the no-sampling, undersampling and oversampling methods suggests that oversampling might be a good choice when dealing with imbalanced data (labels).**

# Using xgboost to predict the churned customers with oversampling data

In [None]:
# Grid search over XGBoost hyper-parameters on the oversampled training data.
# Candidates are ranked by recall; make_scorer wraps recall_score into a scorer for GridSearchCV.
recallscore = make_scorer(recall_score)
#the params of xgbc explored by the grid search
params_xgbc = {
    "learning_rate": [0.05],
    "max_depth": [9],
    "min_child_weight": [1],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.8, 0.9],
    "lambda": [0.1],
    "gamma": [0.1, 0.2],
}
xgbc = XGBClassifier() #instantiate the model
grid_xgbc = GridSearchCV(
    xgbc,
    params_xgbc,
    cv = 5,
    scoring = recallscore,
)
#grid search for best params/This may take several minutes to get results.
grid_xgbc.fit(x_oversample, y_oversample.values.ravel())
grid_xgbc.best_params_ #get the best params

**best params: (If you run and wait, you should get this result.)**


{'colsample_bytree': 0.8,
 'gamma': 0.2,
 'lambda': 0.1,
 'learning_rate': 0.05,
 'max_depth': 9,
 'min_child_weight': 1,
 'subsample': 0.9}

In [None]:
#use the best params to fit the xgbc model
best_params_xgbc = {
    "colsample_bytree": 0.8,  #the ratio of features used in each tree
    "learning_rate": 0.05,    #just learning rate
    "max_depth": 9,           #just max_depth
    "min_child_weight": 1,    #the minimum sum of instance weight
    "subsample": 0.9,         #the ratio of samples used when fitting a tree
    "reg_lambda": 0.1,        #L2 regularization param/reg_lambda = lambda
    "gamma": 0.2,             #the minimum loss function decrease value needed to split
}
xgbc = XGBClassifier(**best_params_xgbc)
#train on the oversampled training data, then predict the test split
xgbc.fit(x_oversample, y_oversample.values.ravel())
predictions_xgbc_oversample = xgbc.predict(x_test_smote)
#check the score and plot the confusion matrix
cm_xgbc_oversample = confusion_matrix(y_test_smote, predictions_xgbc_oversample)
as_rs_ps(y_test_smote, predictions_xgbc_oversample)
ploting_confusion_matrix(cm_xgbc_oversample, "Confusion Matrix for XGBClassifier in Oversampling Data")

**1. The accuracy score is very good (around 96.5%).**

**2. The recall score is good (nearly 90%), and the precision score is good (89%).**

**3. Generally, we have got a relatively good prediction result using XGBClassifier.**

# Version 3 Starts Here

**What I found was that using SMOTE could cause severe overfitting on this dataset, so I abandoned SMOTE as a sampling method. CatBoost lets us handle the class imbalance through a parameter such as auto_class_weights = "Balanced" (other models offer similar parameters), so I'll use this parameter instead of SMOTE.**

In [None]:
from catboost import CatBoostClassifier

#Import the dataset
data = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv")
#Drop two columns because these two columns are not features(check the dataset description)
data = data.drop(columns = [
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
])
#Labels: existing customers become 0, attrited (churned) customers become 1
data["Attrition_Flag"] = data["Attrition_Flag"].map({"Existing Customer": 0, "Attrited Customer": 1}).astype(int)

le = LabelEncoder()
#Gender and Card_Category have no "Unknown" handling: plain label encoding
for col in ["Gender", "Card_Category"]:
    data[col] = le.fit_transform(data[col])
#Education_Level, Marital_Status, Income_Category: label-encode, then treat the "Unknown"
#category as missing and fill it with the median code of the known categories.
#The code of "Unknown" is looked up from the fitted encoder instead of being hard-coded, and
#the result is assigned back to the column (a chained fillna(..., inplace=True) on data[col]
#is not guaranteed to modify `data` in recent pandas).
#NOTE(review): LabelEncoder assigns codes in sorted (alphabetical) category order, so the
#median is taken over codes that carry no natural ranking - confirm this imputation is intended.
for col in ["Education_Level", "Marital_Status", "Income_Category"]:
    codes = pd.Series(le.fit_transform(data[col]), index = data.index, dtype = float)
    if "Unknown" in le.classes_:
        unknown_code = le.transform(["Unknown"])[0]
        codes = codes.mask(codes == unknown_code)
    data[col] = codes.fillna(codes.median())

#Using the client numbers as indices
data = data.set_index("CLIENTNUM")
#split the train and test set
x = data.loc[:, data.columns != "Attrition_Flag"]
y = data.loc[:, data.columns == "Attrition_Flag"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 66)
#Standardization: the scaler is fitted on the training split only and then applied to the
#test split, so no test-set statistics leak into the features the model is trained on.
ss = StandardScaler()
cols_standard = ['Customer_Age', 'Dependent_count', 
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
x_train = x_train.copy() #work on copies so that the slices of `data` are not modified
x_test = x_test.copy()
x_train[cols_standard] = ss.fit_transform(x_train[cols_standard])
x_test[cols_standard] = ss.transform(x_test[cols_standard])
#Using the Catboost to train and validate
#NOTE(review): no eval_set is passed to fit() below, so early_stopping_rounds presumably has
#no validation data to monitor - confirm against the CatBoost documentation.
cbc = CatBoostClassifier(loss_function="Logloss", #using logloss as the loss of binary classification
                         eval_metric= "Recall", #ranking as the recall score
                         custom_metric= "Recall", #showing as the recall score
                         one_hot_max_size= 10, #If a feature has less then 10 unique elements, it would be OneHotEncoded.
                         #cat_features = [1, 3, 4, 5, 6],
                         early_stopping_rounds = 50, #stop early to prevent from overfitting
                         auto_class_weights = "Balanced", #balance the ratio of the imbalanced data weights
                         depth = 4, #just depth
                         l2_leaf_reg = 12, #L2 regularization param
                         learning_rate = 0.2, #just learning_rate
                         rsm = 0.6, #AKa colsample_bylevel
                         subsample = 0.3, #what is the ratio of samples to be randomly used to train each tree.
                         )

#You can explore your own params using the gridsearch
#GridSearch for the best parameters within the model
#params_cbc = {"depth": [3, 4, 5],
#              "subsample": [0.2, 0.3, 0.4],
#              "learning_rate": [0.1, 0.2, 0.3],
#              "l2_leaf_reg": [11, 12, 13],
#              'colsample_bylevel':[0.5, 0.6, 0.7],
#             }
#grid_search_result = cbc.grid_search(params_cbc, 
#                                     X= x_train, 
#                                     y= y_train, 
#                                     plot=True,
#                                     cv = 5,)

#When the model is done with gridsearching, the model would be ready to use.
#cbc.get_params() #you could check the model's parameters
cbc.fit(x_train, y_train.values.ravel()) #train the model on the training split

In [None]:
#Check the score and plot the confusion matrix
# Evaluate the CatBoost model on the test split.
predictions_cbc = cbc.predict(x_test) #predicted labels for the test split
as_rs_ps(y_test, predictions_cbc) #print accuracy / recall / precision / F1
cm_cbc_no_sampling = confusion_matrix(y_test, predictions_cbc) #confusion matrix
ploting_confusion_matrix(cm_cbc_no_sampling, "Confusion Matrix for Catboost Without SMOTE")

**The Accuracy is: 0.9776242184929254**

**The Recall Score is: 0.9337349397590361**

**The Precision Score is: 0.93**

**The F1 Score is: 0.9318637274549098**

**Generally this is a good result with a relatively higher recall score and remains with a high accuracy score and F1 score.**

**The result suggests that sampling might not always be necessary, because some sampling methods can cause overfitting. It might be better to handle an imbalanced dataset by setting class weights according to the imbalance ratio.**

# Thanks


 **If I find another model that achieves a better recall score, I will update this notebook.**
 
 
 **This is my first try on Kaggle. I would appreciate it very much if you like or upvote this notebook :)**