In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h2 style=color:blue align="left"> Table of Contents </h2>

#### 1) Introduction
#### 2) Load Required Libraries
#### 3) Read Data
#### 4) EDA (Exploratory Data Analysis)
       - i) Missing Values
       - ii) Data Visualization
         - a) Regular method
         - b) Pandas Profiling
         - c) AutoViz
         - d) DataPrep
       - iii) Data Preprocessing
#### 5) EvalML (AutoML)

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 1) Introduction</h1>

<h2 style=color:green align="left"> About Features in data </h2>

|Feature                   |Description                       |
|--------------------------|----------------------------------|
|CLIENTNUM                 |Client number. Unique identifier for the customer holding the account |
|Attrition_Flag            |Internal event (customer activity) variable - if the account is closed then 1 else 0 |
|Customer_Age              |Demographic variable - Customer's Age in Years    |
|Gender                    |Demographic variable - M=Male, F=Female           |
|Dependent_count           |Demographic variable - Number of dependents       |
|Education_Level           |Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.)   |
|Marital_Status            |Demographic variable - Married, Single, Divorced, Unknown    |
|Income_Category           |Demographic variable - Annual Income Category of the account holder (< 40K, 40K - 60K, 60K - 80K, 80K - 120K, > 120K)    |
|Card_Category             |Product Variable - Type of Card (Blue, Silver, Gold, Platinum)        |
|Months_on_book            |Period of relationship with bank            |
|Total_Relationship_Count  |Total no. of products held by the customer  |
|Months_Inactive_12_mon    |No. of months inactive in the last 12 months         |
|Contacts_Count_12_mon     |No. of Contacts in the last 12 months                |
|Credit_Limit              |Credit Limit on the Credit Card                      |
|Total_Revolving_Bal       |Total Revolving Balance on the Credit Card           |
|Avg_Open_To_Buy           |Open to Buy Credit Line (Average of last 12 months)  |
|Total_Amt_Chng_Q4_Q1      |Change in Transaction Amount (Q4 over Q1)            |
|Total_Trans_Amt           |Total Transaction Amount (Last 12 months)            |
|Total_Trans_Ct            |Total Transaction Count (Last 12 months)             |
|Total_Ct_Chng_Q4_Q1       |Change in Transaction Count (Q4 over Q1)             |            
|Avg_Utilization_Ratio     |Average Card Utilization Ratio                       |

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 2) Load Required Libraries </h1>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn —
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

# Apply the matplotlib "fivethirtyeight" style sheet first, then seaborn's
# 'darkgrid' axes style on top of it.
plt.style.use("fivethirtyeight")
sns.set_style('darkgrid')

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 3) Read Data </h1>

In [None]:
# Load the BankChurners CSV from the Kaggle input directory into the `bank` DataFrame.
bank = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv")

In [None]:
# Show the first and the last rows of the frame, one rich table each.
display(bank.head(), bank.tail())

In [None]:
# (rows, columns) of `bank` at this point in the notebook.
bank.shape

In [None]:
# CLIENTNUM is only a unique identifier for the customer holding the account, so it is
# useless as a feature. The two Naive_Bayes_Classifier_* columns are dropped with it.
columns_to_drop = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
bank = bank.drop(columns=columns_to_drop)

In [None]:
# Summarise `bank`: column names, non-null counts and dtypes.
bank.info()

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 4) EDA (Exploratory Data Analysis) </h1>

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > i) Missing Values </h1>

#### Visualize missing values (NaN) using the Missingno library
a) Visualize missing values as a matrix

b) Visualize missing values as a barplot

c) Visualize missing values as a heatmap

d) Visualize missing values as a dendrogram

In [None]:
import missingno as msno

In [None]:
# Count NaN entries per column.
# NOTE(review): a later cell treats the string "Unknown" as the missing-value marker;
# such entries are not NaN and are therefore not counted here — confirm this is intended.
bank.isna().sum()

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > a) Visualize missing values as a matrix </h1>

In [None]:
# Draw the missingno nullity matrix for `bank`.
msno.matrix(
    bank,
    figsize=(11, 7),
    fontsize=12,
    color=(1, 0.38, 0.27),
);

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > b) Visualize missing values as a barplot </h1>

In [None]:
# Bar chart of per-column completeness, columns sorted in ascending order.
# Alternative colours that were tried: "dodgerblue", "orangered".
msno.bar(
    bank,
    figsize=(13, 7),
    fontsize=12,
    sort="ascending",
    color="dodgerblue",
);

In [None]:
# Two completeness bar charts side by side: linear scale (left) and log scale (right).
fig, (ax_linear, ax_log) = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))

msno.bar(bank, color="tomato", fontsize=12, ax=ax_linear);
msno.bar(bank, log=True, color="tab:green", fontsize=12, ax=ax_log);

plt.tight_layout()

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > c) Visualize missing values as a heatmap </h1>

In [None]:
# Heatmap of the nullity correlation between columns (how strongly the presence or
# absence of a value in one column goes together with that in another).
# The trailing `;` suppresses the Axes repr, matching the other missingno cells.
msno.heatmap(bank, cmap="RdYlGn", figsize=(10,5), fontsize=12);

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > d) Visualize missing values as a dendrogram </h1>

In [None]:
# Dendrogram that clusters columns by how similar their missing-value patterns are.
# The trailing `;` suppresses the Axes repr, matching the other missingno cells.
msno.dendrogram(bank, figsize=(12,7), fontsize=12);

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > ii) Data Visualization </h1>

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > a) Regular Method </h1>

In [None]:
# For every column that is neither int64 nor float64, print its distinct values.
for name in bank.columns:
    series = bank[name]
    if series.dtype in ['int64', 'float64']:
        continue
    print(f'{name}: {series.unique()}')

In [None]:
# One count plot per categorical column, stacked vertically in a single figure.
cols_ = ["Education_Level", "Marital_Status", "Income_Category"]
fig, ax = plt.subplots(nrows=len(cols_), ncols=1, figsize=(16, 28))
sns.set(font_scale=1.6)

for axis, col in zip(ax, cols_):
    sns.countplot(x=col, data=bank, ax=axis)

In [None]:
# Pearson correlation heatmap of the numeric features, lower triangle only.
fig, ax = plt.subplots(figsize=(20, 16))
# Restrict to numeric columns before correlating: `bank` still contains string columns
# (e.g. Gender, Card_Category), and pandas >= 2.0 raises on them in DataFrame.corr().
df_corr = bank.select_dtypes(include="number").corr(method="pearson")
# Boolean mask that hides the upper triangle (diagonal included), which only
# duplicates the information shown in the lower triangle.
mask = np.triu(np.ones_like(df_corr, dtype=bool))
ax = sns.heatmap(df_corr, mask=mask, annot=True, ax=ax)

In [None]:
# Number of customers in each Attrition_Flag class.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=bank, x='Attrition_Flag', edgecolor='black', saturation=0.55, ax=ax)
plt.show()

In [None]:
# Number of customers per card category, split by gender.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(
    data=bank,
    x='Card_Category',
    hue='Gender',
    edgecolor='black',
    saturation=0.55,
    ax=ax,
)
plt.show()

In [None]:
# Number of customers per education level.
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=bank, x='Education_Level', edgecolor='black', saturation=0.55, ax=ax)
plt.show()

In [None]:
# Pie chart of how many columns there are of each dtype.
plt.figure(figsize=(9,7))
dtype_counts = bank.dtypes.value_counts()
# One explode offset per slice: a fixed-length list breaks as soon as the number of
# distinct dtypes in `bank` differs from the length of that list.
dtype_counts.plot.pie(explode=[0.1] * len(dtype_counts), autopct='%1.1f%%', shadow=True)
plt.title('type of our data');

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > b) Pandas Profiling </h1>

#### Refer to the notebook below on **Pandas Profiling**
 - https://www.kaggle.com/sureshmecad/pandas-profiling-autoeda

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > c) AutoViz </h1>

#### Refer to the notebook below on **AutoViz**
 - https://www.kaggle.com/sureshmecad/autoviz-autoeda

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > d) DataPrep </h1>

#### Refer to the notebook below on **DataPrep**
 - https://www.kaggle.com/sureshmecad/dataprep-autoeda

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > iii) Data Preprocessing </h1>

In [None]:
# Feature matrix: every column of `bank` except Credit_Limit, built as a new frame so
# `bank` itself is left untouched. The target column is then popped out of X into y.
X = bank.drop(columns=['Credit_Limit'])
y = X.pop('Attrition_Flag')

# Integer codes for the ordered categorical columns. Labels that are not listed in a
# mapping are left as they are by `replace`.
ordinal_codes = {
    'Income_Category': {
        'Less than $40K': 0,
        '$40K - $60K': 1,
        '$60K - $80K': 2,
        '$80K - $120K': 3,
        '$120K +': 4,
    },
    'Card_Category': {
        'Blue': 0,
        'Silver': 1,
        'Gold': 2,
        'Platinum': 3,
    },
    'Education_Level': {
        'Uneducated': 0,
        'High School': 1,
        'College': 2,
        'Graduate': 3,
        'Post-Graduate': 4,
        'Doctorate': 5,
    },
}
for column, codes in ordinal_codes.items():
    X[column] = X[column].replace(codes)

# Binary target: 1 marks an attrited customer, 0 an existing one.
y = y.replace({'Existing Customer': 0, 'Attrited Customer': 1})

In [None]:
# Encoding the Target feature: 'Existing Customer' -> 0, 'Attrited Customer' -> 1.
# NOTE(review): the preceding cell already applies this exact mapping to y, so by the
# time this cell runs there are no string labels left to replace — this looks redundant.
y = y.replace({'Existing Customer':0,
               'Attrited Customer':1})

<h1 style='background-color:magenta; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;' > 5) EvalML (AutoML) </h1>

<img src="https://imgur.com/vOXQtGh.png" width='600px'>

<img src="https://imgur.com/m1roGCX.png" width='1000px'>

<h2 style=color:green align="left"> Problem and Dataset </h2>

- The Predicting Customer Churn dataset on Kaggle poses a supervised classification task whose objective is to predict whether or not a customer will end up leaving their bank's credit card service.

 - We use **Dataprep** for auto visualisation.
 
 - We use **EvalML** to search and select a pipeline that performs the best at identifying customers who are more likely to leave their bank.

<h2 style=color:green align="left"> Approach </h2>

- We will show how EvalML can be leveraged to perform preprocessing, visualization, and automated machine learning. While EvalML allows for plenty of options for customization to improve prediction outcomes, we'll focus on a fairly high-level implementation.

<h2 style=color:green align="left"> Our approach will be as follows: </h2>

 - Read in the data and analyze it.
 - Understand the data through visualization.
 - Perform basic preprocessing.
 - Search for a best performing pipeline based on our objective.
 - Review the best pipeline chosen and analyze its performance.

#### a) Install EvalML (AutoML)
#### b) Configure search
#### c) Pipeline Rankings
#### d) Get pipeline
#### e) Select Best pipeline
#### f) Describe pipeline
#### g) Access raw results

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > a) Install EvalML (AutoML) </h1>

In [None]:
!pip install evalml

In [None]:
import evalml

In [None]:
from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer

def preprocessing(X, y):
    """Impute the "Unknown" placeholder entries of X with each column's most frequent value.

    Args:
        X: Feature table to impute.
        y: Target values, passed through to the imputer's ``fit_transform``.

    Returns:
        The imputed feature table produced by the imputer.
    """
    unknown_imputer = SimpleImputer(impute_strategy="most_frequent", missing_values="Unknown")
    return unknown_imputer.fit_transform(X, y)

# Overwrite X with its imputed version. Because X is reassigned in place, re-running
# this cell feeds the already-imputed frame through the imputer again.
X = preprocessing(X, y)

In [None]:
from evalml.utils import infer_feature_types

# Let EvalML infer the feature types, but force the two integer-coded columns below to
# be treated as categorical.
categorical_overrides = dict.fromkeys(['Income_Category', 'Education_Level'], 'categorical')
X = infer_feature_types(X, feature_types=categorical_overrides)
X

In [None]:
# Hold out 20% of the rows as the test set; the remaining 80% is the training set.
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(
    X,
    y,
    test_size=0.2,
    problem_type='binary',
)

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > b) Configure search </h1>

In [None]:
from evalml import AutoMLSearch

# Search configuration: optimise F1 on a binary problem, also report binary accuracy,
# consider only the three listed model families, and stop after five batches.
search_settings = dict(
    problem_type="binary",
    objective="F1",
    additional_objectives=['accuracy binary'],
    allowed_model_families=['random_forest', 'xgboost', 'lightgbm'],
    max_batches=5,
)
automl = AutoMLSearch(X_train=X_train, y_train=y_train, **search_settings)
automl.search()

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > c) Pipeline Rankings </h1>

In [None]:
# Rankings table of the pipelines evaluated by the search (includes each pipeline's id).
automl.rankings

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > d) Get pipeline </h1>

In [None]:
# Any pipeline from the search can be fetched by its id; here the one with id 1.
pipeline = automl.get_pipeline(1)
for heading, value in (('Name:\n', pipeline.name),
                       ('\n\nParameters:\n\n', pipeline.parameters)):
    print(heading, value)

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > e) Select Best pipeline </h1>

In [None]:
# Keep a handle on the pipeline the search selected as best and print its name and parameters.
best_pipeline_ = automl.best_pipeline
for heading, value in (('Best pipeline:\n\n', best_pipeline_.name),
                       ('\n\nBest pipeline parameters:\n\n', best_pipeline_.parameters)):
    print(heading, value)

In [None]:
# Render the structure of the best pipeline as a graph of its components.
best_pipeline_.graph()

In [None]:
# Display the hyperparameters of the best pipeline found by the search.
automl.best_pipeline.hyperparameters

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > f) Describe pipeline </h1>

- Each **pipeline is given an id.** We can get more information about any **particular pipeline** using that id. Here, we will get more information about the **pipeline with id = 3.**

In [None]:
# Print a detailed description of the pipeline whose id is 3
# (ids are listed in the "id" column of automl.rankings).
automl.describe_pipeline(3)

In [None]:
# Describe the pipeline found in the second row (position 1) of the rankings table.
automl.describe_pipeline(automl.rankings.iloc[1]["id"])

<h1 style='background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:center; border-radius: 15px 50px;' > g) Access raw results </h1>

- The **AutoMLSearch** class records detailed results information under the results field, including information about the **cross-validation scoring and parameters.**

In [None]:
# Raw results recorded by the search.
automl.results

In [None]:
# Score the best pipeline on the held-out test split against EvalML's core binary
# objectives, then report the binary accuracy.
core_binary_objectives = evalml.objectives.get_core_objectives('binary')
scores = best_pipeline_.score(X_test, y_test, objectives=core_binary_objectives)
print(f'Accuracy Binary: {scores["Accuracy Binary"]}')

In [None]:
# Fit the selected pipeline on the training split, then predict labels for the test split.
# NOTE(review): the pipeline was already scored on X_test in the previous cell, so it
# presumably arrives here fitted — confirm whether this re-fit is needed.
best_pipeline_.fit(X_train, y_train)
predictions = best_pipeline_.predict(X_test)

In [None]:
from evalml.model_understanding.graphs import (
    graph_binary_objective_vs_threshold, 
    graph_permutation_importance, 
    graph_confusion_matrix
)

# Plot the F1 objective against the binary decision threshold, evaluated on the test split.
graph_binary_objective_vs_threshold(best_pipeline_, X_test, y_test, "F1")

### Feature Importance

In [None]:
# Plot permutation importance of the features for the best pipeline, measured with F1 on the test split.
graph_permutation_importance(best_pipeline_, X_test, y_test, "F1")

### Confusion Matrix

In [None]:
# Plot the confusion matrix of the test labels against the predicted labels.
graph_confusion_matrix(y_test, predictions)

In [None]:
from evalml.objectives.standard_metrics import AccuracyBinary, AUC, F1, PrecisionWeighted, Recall

# Objective instances used to summarise performance on the held-out test split.
acc = AccuracyBinary()
auc = AUC()
f1 = F1()
pre_w = PrecisionWeighted()
rec = Recall()

# AUC measures how well the model ranks examples, so it must be given the predicted
# probability of the positive class (label 1, the second predict_proba column) rather
# than the already-thresholded 0/1 labels held in `predictions`.
positive_proba = best_pipeline_.predict_proba(X_test).iloc[:, 1]

print(f"Accuracy (Binary): {acc.score(y_true=y_test, y_predicted=predictions)}")
print(f"Area Under Curve: {auc.score(y_true=y_test, y_predicted=positive_proba)}")
print(f"F1: {f1.score(y_true=y_test, y_predicted=predictions)}")
print(f"Precision (Weighted): {pre_w.score(y_true=y_test, y_predicted=predictions)}")
print(f"Recall: {rec.score(y_true=y_test, y_predicted=predictions)}")