In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### **Foreword**
  
Hello everyone, this is the first notebook I have made since I started learning data science. I hope that whoever reads this notebook can give me feedback, whether on the coding, the visualizations used, the insights, the business recommendations, etc. Thank you very much!!!

### **Problem Background**

Customer churn is **the percentage of customers that stopped using the bank's products or services during a certain time frame**. Losing customers means losing revenue, so in the long term it might have a serious impact on the bank. As most of us already know, customer acquisition costs more than customer retention; moreover, in terms of revenue, a `retained customer` will bring the bank more money than a `new customer`.

Therefore, when a `new customer` churns (higher acquisition cost, lower revenue), the bank might suffer a short-term loss, but when a `retained customer` churns (lower retention cost, higher revenue), the bank's revenue will decrease. That is why predicting which customers are going to churn is very important. In fact, by simply reaching out to the customer early enough, 11% of the churn can be avoided.

Note: In this dataset, the column `Exited` indicates churn.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw churn dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/churn-for-bank-customers/churn.csv')
# Preview five random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(5, random_state=42)

# 1. General Information about the Dataset

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Remove the three columns that are not used in the analysis; every later cell
# works from df_drop.
unused_columns = ['RowNumber', 'CustomerId', 'Surname']
df_drop = df.drop(unused_columns, axis=1)
df_drop.head()

In [None]:
# Number of rows that are exact copies of an earlier row.
duplicate_mask = df_drop.duplicated()
duplicate_mask.sum()

In [None]:
# Split the working frame by dtype. Later cells use both the sub-frames and
# their column indexes.
numeric = ['float64', 'int64']

df_numerical, df_categorical = (
    df_drop.select_dtypes(include=numeric),
    df_drop.select_dtypes(include='object'),
)
column_numerical, column_categorical = df_numerical.columns, df_categorical.columns

In [None]:

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_numerical.describe()

In [None]:
# Summary of the text columns (count, number of unique values, most frequent value and its frequency).
df_categorical.describe()

Summary highlights:
1. The bank has 10,000 customers.
2. The dataset has **14 features**, but `RowNumber`, `CustomerId` and `Surname` are removed.
3. There are **no duplicates** and **no null values** in the dataset.
4. Most of the customers are located in **France**.
5. Most customers have a credit score of around 650, which is considered **fair**.
6. Customers' `Age` varies, ranging from 19 to 92.
7. The dataset only has tenure of up to 10 years.

# 2. Exploratory Data Analysis (EDA)

## 2.1 Ratio of customer exited and retained

In [None]:
# Share of churned customers, computed from the data rather than hard-coded so
# the chart stays correct if the dataset changes.
exited_pct = (df_drop['Exited'] == 1).mean() * 100

labels = 'Exited', ' Retained'
sizes = [exited_pct, 100 - exited_pct]
explode = (0, 0.1)  # pull the second ("Retained") wedge slightly out of the pie

fig1, ax1 = plt.subplots(figsize=(8, 5))

ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.2f%%',
        shadow=False, startangle=0, textprops = {'size': 'x-large'}, colors = ['lightseagreen','lightgrey'])
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

plt.title("Proportion of customer exited and retained", size = 16)

## 2.2 Boxplot and distribution

In [None]:
# One box plot per numeric column on a three-row grid.
n_rows = 3
# Ceiling division: plt.subplot() needs integer grid sizes, and this also keeps
# room for every panel when the column count is not a multiple of three.
n_cols = -(-len(column_numerical) // n_rows)

plt.figure(figsize=(12,10))
for i, column in enumerate(column_numerical):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(y=df_numerical[column], color='green', orient='v')
# Adjust the spacing once, after every panel exists.
plt.tight_layout()

In [None]:
# Histogram with a KDE overlay for every numeric column on a three-row grid.
n_rows = 3
n_cols = -(-len(column_numerical) // n_rows)  # ceiling division; subplot() needs integers

plt.figure(figsize=(12,8))
for i, column in enumerate(column_numerical):
    plt.subplot(n_rows, n_cols, i + 1)
    # histplot(kde=True, stat='density') is the replacement for the deprecated
    # distplot: a density-normalised histogram plus a kernel density estimate.
    sns.histplot(df_numerical[column], kde=True, stat='density', color='green')
# Adjust the spacing once, after every panel exists.
plt.tight_layout()

Highlights:
1. From the boxplots we can see that `CreditScore` and `Age` have outliers; before machine learning (ML) we need to remove those outliers.
2. `NumOfProducts` and `Exited` also show outliers, but these two features are effectively categorical, so we will not remove those outliers.
3. `EstimatedSalary` is distributed roughly uniformly.

In [None]:
# Feature-only view of the working frame: everything except the `Exited` target.
df_wo_exited = df_drop.drop('Exited', axis=1)
df_wo_exited

In [None]:
# Distribution of every feature, coloured by churn status, on a two-row grid.
feature_columns = df_wo_exited.columns
n_rows = 2
n_cols = -(-len(feature_columns) // n_rows)  # ceiling division; subplot() needs integers

plt.figure(figsize=(25,10))
for i, column in enumerate(feature_columns):
    plt.subplot(n_rows, n_cols, i + 1)
    # Plot by column name taken from the feature list itself, rather than by
    # position in df_drop, so the loop no longer depends on `Exited` being the
    # last column.
    sns.histplot(data=df_drop, x=column, hue='Exited')
# Adjust the spacing once, after every panel exists.
plt.tight_layout()

Summary:
1. Customers with a low `CreditScore` have a tendency to exit.
2. Most of the bank's customers come from France, but in Germany more customers exit.
3. Customers aged 50-60 have a tendency to exit.
4. `NumOfProducts` 3 and 4 have more customers exiting, although only a small number of customers fall in those groups.
5. An interesting pattern in `Balance`: most customers with a `Balance` around 0 do not exit.

## 2.3 Heatmap

In [None]:
# One-hot encode the text columns so that every column is numeric for the
# correlation matrix. Plain integer category codes would impose an arbitrary
# order on nominal values (e.g. the countries in `Geography`), which makes a
# Pearson correlation on that column meaningless.
# dtype=int keeps the indicator columns as 0/1 integers.
df_onehot = pd.get_dummies(df_drop, columns=list(column_categorical), dtype=int)
    

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df_onehot.
fig, ax = plt.subplots(figsize=(12,8))
correlations = df_onehot.corr()
sns.heatmap(correlations, cmap='Blues', annot=True, fmt='.2f', ax=ax)

# 3. Insights

## 3.1 Age

In [None]:
df_age = df_drop.copy()

# Ten-year age buckets. pd.cut uses right-inclusive bins by default, which
# matches the original "Age <= edge" comparisons; the infinite outer edges make
# sure every age lands in a bucket.
age_edges = [-np.inf, 20, 30, 40, 50, 60, 70, 80, np.inf]
# The open-ended bucket holds every age above 80, so it is labelled '81+'
# (it was previously labelled '90++', which mislabelled ages 81-89).
age_labels = ['1-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81+']

# Cast back to plain strings so downstream grouping only sees buckets that
# actually occur, exactly as with the previous string labels.
df_age['age_segmentation'] = pd.cut(df_drop['Age'], bins=age_edges, labels=age_labels).astype(str)

# Fixed seed keeps the preview identical across re-runs.
df_age.sample(5, random_state=42)

In [None]:
# Count customers per (age bucket, churn flag) pair.
# NOTE: this overwrites df_age with the aggregated table, so the cell cannot be
# re-run on its own; re-run the cell that builds df_age first.
df_age = (
    df_age.groupby(['age_segmentation', 'Exited'])['Gender']
    .count()
    .reset_index(name='user_count')
)
# Replace the numeric flag with a readable legend label.
df_age['Exited'] = df_age['Exited'].map(lambda flag: 'Exited' if flag == 1 else 'Retained')
df_age

In [None]:
# Customer counts per age bucket, split by churn status.
fig, ax = plt.subplots(figsize=(12,8))
sns.barplot(
    data=df_age,
    x='age_segmentation',
    y='user_count',
    hue='Exited',
    palette='Purples',
    ax=ax,
)
ax.set_ylabel('Total User')

## 3.2 Geography

In [None]:
# Count customers per (country, churn flag) pair.
geo_counts = df_drop.groupby(['Geography', 'Exited'])['Gender'].count()
df_geo = geo_counts.reset_index(name='user_count')

df_geo

In [None]:
# Customer counts per country, split by the raw `Exited` flag.
sns.barplot(
    data=df_geo,
    x='Geography',
    y='user_count',
    hue='Exited',
    palette='Purples',
)

In [None]:
## 3.3 Product type

In [None]:
# Count customers per (number of products, churn flag) pair.
df_product = (
    df_drop.groupby(['NumOfProducts', 'Exited'])['Gender']
    .count()
    .reset_index(name='user_count')
)

# Group total, repeated on every row of the group, for the share calculation.
df_product['total_user'] = df_product.groupby('NumOfProducts')['user_count'].transform('sum')

# Share of each churn status within its product group, as display text such as "12.34%".
share = (df_product['user_count'] / df_product['total_user'] * 100).round(2)
df_product['Percentage'] = share.astype(str) + '%'

# Replace the numeric flag with a readable legend label.
df_product['Exited'] = df_product['Exited'].map(lambda flag: 'Exited' if flag == 1 else 'Retained')
df_product

In [None]:
# Fix the category and hue order explicitly so each bar's position is known and
# the percentage labels can be placed from the data instead of by hand.
product_order = sorted(df_product['NumOfProducts'].unique())
hue_order = list(df_product['Exited'].unique())

plt.figure(figsize=(12,8))
ax = sns.barplot(data= df_product, x = 'NumOfProducts', y ='user_count', hue='Exited',
                 order=product_order, hue_order=hue_order, palette='Purples')

plt.title('Product Types and the Churn Rate', loc = 'center', fontweight = 'bold', fontsize=20)

plt.xlabel('Number of Products', fontsize = 16, fontdict = {'weight' :'bold'})
plt.ylabel('User Count', fontsize=16, fontdict = {'weight' :'bold'})

plt.legend(loc = 'center right')

# seaborn centres each category on an integer position and, with its default
# group width of 0.8, splits that width evenly between the hue levels.
bar_width = 0.8 / len(hue_order)
for row in df_product.itertuples(index=False):
    group_centre = product_order.index(row.NumOfProducts)
    hue_offset = (hue_order.index(row.Exited) - (len(hue_order) - 1) / 2) * bar_width
    # Put the percentage just above its bar, centred horizontally. Combinations
    # that do not occur in the data simply have no row and get no label.
    ax.text(group_centre + hue_offset, row.user_count, row.Percentage,
            ha='center', va='bottom', fontsize=12)

## 3.4 Credit Score

In [None]:
df_credit = df_drop.loc[:, ['CreditScore','Exited']].copy()

# 100-point bands: [300, 400), [400, 500), ..., [800, 900).
# The top edge has to lie above the highest score. With the previous top edge of
# 800, pd.cut returned NaN for every score of 800 or more and the groupby below
# silently dropped those customers (common credit-score scales run up to 850).
band_edges = list(range(300, 1000, 100))
labels = ["{0} - {1}".format(low, low + 99) for low in band_edges[:-1]]
df_credit['Classification'] = pd.cut(df_credit.CreditScore, band_edges, right=False, labels=labels)

# Count customers per (churn flag, score band). observed=False keeps every band
# in the result, including empty ones.
df_credit = df_credit.groupby(['Exited','Classification'], observed=False).agg({'CreditScore':'count'}).reset_index()
# After this rename the `CreditScore` column holds the band label, not the raw
# score; the plotting cell relies on these column names.
df_credit.columns = ['Exited','CreditScore','user_count']

# Band total, repeated on every row of the band, for the share calculation.
df_credit['sum'] = df_credit.groupby('CreditScore', observed=False)['user_count'].transform('sum')

# Share of each churn status within its band, as display text such as "12.34%".
df_credit['percentage'] = round(df_credit['user_count'] * 100 / df_credit['sum'],2)
df_credit['percentage'] = df_credit['percentage'].astype(str)
df_credit['percentage'] = df_credit['percentage'] + '%'

# Replace the numeric flag with a readable legend label.
df_credit['Exited'] = df_credit['Exited'].apply(lambda x : 'Exited' if x == 1 else 'Retained')
df_credit.transpose()

In [None]:
# Customer counts per credit-score band, split by churn status.
fig, ax = plt.subplots(figsize=(12,8))
sns.barplot(
    data=df_credit,
    x='CreditScore',
    y='user_count',
    hue='Exited',
    palette='Blues',
    ax=ax,
)

Insight summary:
1. The majority of customers are adults around 31-40 years old.
2. Older customers around 51-60 years old make up only a small portion of the customer base, but they have a tendency to exit the bank.
3. 30% of customers in Germany exit the bank.
4. Product numbers 1 and 2 are the most common among customers, while product numbers 3 and 4 are rare but have a higher exit rate.
5. Most customers have a credit score of 600-699.


# 4. Business Recommendation

1. Considering geography, customers in Germany have a tendency to churn. The bank needs to focus on improving its facilities in Germany. For example: are there enough bank branches in `Germany`? Are there enough ATMs in `Germany`? Are there any advertisements or promotions for new and retained users?

2. Older customers (51-60 years old) might need a better product or promotion, such as a better retirement plan, a higher interest rate, and so on.

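3. The bank needs to stop using `product number` `3` and `4`, because they are less likely to be chosen and their churn rate is high. The bank can therefore focus on promoting `product number` `1` and `2`. For product number 1, the bank can review customer satisfaction and opinions, because the churn rate reaches up to 27%. (Note: `NumOfProducts` appears to record how many of the bank's products a customer holds, so "product number 3 and 4" here may mean customers holding 3 or 4 products rather than two specific products; this should be confirmed against the dataset description before acting on the recommendation.)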