In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

plt.rcParams['font.size'] = 15

## Analysis Objective

A manager at the bank is concerned that more and more customers are leaving the bank's credit card services. They would like to be able to predict which customers are likely to churn (leave), so that they can proactively reach out, offer better services, and persuade those customers to stay.

Now, this dataset consists of 10,000+ customers mentioning their age, salary, marital_status, credit card limit, credit card category, etc. There are exactly 21 features.

We have only 16.07% of customers who have churned (have left). Thus, it's a bit difficult to train our model to predict churning (leaving) customers.

A business manager of a consumer credit card portfolio is facing the problem of customer attrition. They want to analyze the data to find out the reason behind this and leverage the same to predict customers who are likely to drop off.

In [None]:
# Load the raw customer table. Following the warning noted for this dataset,
# the final two columns are discarded positionally straight after the read.
bank_churners = pd.read_csv(
    "../input/credit-card-customers/BankChurners.csv"
).iloc[:, :-2]

In [None]:
bank_churners.head(3)

In [None]:
bank_churners.columns

In [None]:
bank_churners.shape

Index each customer by their unique client ID, using the `CLIENTNUM` column.

In [None]:
# Use the client identifier as the row label. The guard keeps this cell safe to
# re-run: once CLIENTNUM has become the index it is no longer a column, and a
# second set_index call would raise KeyError.
if 'CLIENTNUM' in bank_churners.columns:
    bank_churners = bank_churners.set_index('CLIENTNUM')

### Data Exploration Functions

In [None]:
def pie_plot(feature, data=None, figsize=(10, 6),
             autopct='%1.1f%%', explode=0.1, shadow=True):
    """
    Plot a pie chart of how often each category of ``feature`` occurs.

    seaborn does not provide a pie-chart helper, so this wraps ``plt.pie``
    to keep the per-feature plotting cells DRY.

    Parameters
    ----------
    feature : str
        Name of the categorical column to summarise.
    data : pandas.DataFrame, optional
        Frame containing ``feature``. When omitted, the notebook-level
        ``bank_churners`` frame is looked up at call time, so the plot
        reflects the current frame rather than whichever object existed
        when this definition cell was executed.
    figsize : tuple
        Figure size, passed to ``plt.figure``.
    autopct : str
        Format string for the percentage label drawn on each wedge.
    explode : float
        Offset applied uniformly to every wedge.
    shadow : bool
        Whether to draw a shadow beneath the pie.

    Returns
    -------
    None
        The chart is drawn on a newly created current figure.
    """
    if data is None:
        data = bank_churners

    # One grouped size() replaces the per-category get_group() loop.
    # sort=False keeps categories in order of first appearance (the order
    # unique() produced before), and rows whose feature value is missing are
    # left out instead of raising KeyError.
    counts = data.groupby(feature, sort=False).size()

    plt.figure(figsize=figsize)

    # Hand matplotlib plain lists so the conversion to an array is unambiguous.
    plt.pie(counts.tolist(), labels=counts.index.tolist(),
            explode=[explode] * len(counts), autopct=autopct, shadow=shadow)
    

In [None]:
def relationship_ratio(feature, hue, data=None):
    """
    Report, for every category of ``feature``, the share of each ``hue`` value.

    Parameters
    ----------
    feature : str
        Column whose categories form the outer keys of the result.
    hue : str
        Column whose values are compared inside each category.
    data : pandas.DataFrame, optional
        Frame containing both columns. When omitted, the notebook-level
        ``bank_churners`` frame is looked up at call time, so the result
        reflects the current frame rather than whichever object existed
        when this definition cell was executed.

    Returns
    -------
    dict
        ``{category: {hue_value: "<percentage>%"}}`` where each percentage is
        the number of rows in the category having that hue value divided by
        the number of rows in the category, rounded to 3 decimals. A hue value
        that never occurs in a category is reported as ``"0.0%"``. Categories
        and hue values appear in order of first appearance in ``data``.
    """
    if data is None:
        data = bank_churners

    # Missing hue values cannot be counted, so they are not reported.
    hue_values = data[hue].dropna().unique()

    ratios = {}
    # groupby skips rows whose feature value is missing, so such rows no longer
    # trigger a KeyError; sort=False preserves first-appearance order.
    for category, group in data.groupby(feature, sort=False):
        total = len(group)
        hue_counts = group[hue].value_counts()
        ratios[category] = {
            value: f"{round((hue_counts.get(value, 0) / total) * 100, 3)}%"
            for value in hue_values
        }
    return ratios

## Exploratory Data Analysis

1. **CLIENTNUM**: Unique ID to represent each customer
2. **Attrition_Flag**: Internal event (customer activity) variable - if the account is closed then 1 else 0; Shows existing and non-existing customers `['Existing Customer', 'Attrited Customer']`
3. **Customer_Age**: Demographic variable - Customer's Age in Years
4. **Gender**: Demographic variable - M=Male, F=Female (M / F)
5. **Dependent_count**: Demographic variable - Number of dependents. `[range(6), dtype('int64')]`
6. **Education_Level**: Demographic variable - Educational Qualification of the account holder `['High School', 'Graduate', 'Uneducated', 'Unknown', 'College', 'Post-Graduate', 'Doctorate']`
7. **Marital_Status**: Information on customers marital status `['Married', 'Single', 'Unknown', 'Divorced']`
8. **Income_Category**: Demographic variable - Annual Income Category of the account holder `['$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K', '$120K +', 'Unknown']`
9. **Card_Category**: Product Variable - Type of Card `['Blue', 'Gold', 'Silver', 'Platinum']`
10. **Months_on_book**: Period of relationship with bank (measured by months) `[range(13, 57), dtype('int64')]`
11. **Total_Relationship_Count**: Total no. of products held by the customer `[range(1, 7), dtype('int64')]`
12. **Months_Inactive_12_mon**: No. of months inactive in the last 12 months `[range(7), dtype('int64')]`
13. **Contacts_Count_12_mon**: No. of Contacts in the last 12 months `[range(7), dtype('int64')]`
14. **Credit_Limit**: Credit Limit on the Credit Card
15. **Total_Revolving_Bal**: Total Revolving Balance on the Credit Card `[range(2518), dtype('int64')]`
16. **Avg_Open_To_Buy**: Open to Buy Credit Line (Average of last 12 months) `[range(3, 34517), dtype('float64')]`
17. **Total_Amt_Chng_Q4_Q1**: Change in Transaction Amount (Q4 over Q1)
18. **Total_Trans_Amt**: Total Transaction Amount (Last 12 months) `[range(510, 18484), dtype('int64')]`
19. **Total_Trans_Ct**: Total Transaction Count (Last 12 months) `[range(10, 140), dtype('int64')]`
20. **Total_Ct_Chng_Q4_Q1**: Change in Transaction Count (Q4 over Q1)
21. **Avg_Utilization_Ratio**: Average Card Utilization Ratio


In [None]:
sns.catplot(x="Attrition_Flag", kind="count", data=bank_churners)

In [None]:
pie_plot("Attrition_Flag")

<p style="text-align:center; font-size: 18px">
    The <b>Attrition Flag</b> shows the distribution of both churned and un-churned customers
</p>

<p style="text-align:center; font-size: 18px">
We can see that about 84% of the customers (over 8,000) are un-churned, while about 16% (slightly more than 1,500) have churned.
</p>

<p style="text-align:center; font-size: 18px">
The Attrition Flag column will be essential in creating a model that properly predicts churning customers, especially as the large majority of the customers are un-churned.
</p>

In [None]:
sns.displot(bank_churners, x="Customer_Age", kde=True)

In [None]:
sns.displot(bank_churners, x="Customer_Age", hue="Attrition_Flag", multiple="stack")

<p style="text-align:center; font-size: 18px">
    The <b>Customer Age</b> plot shows the distribution of registered customers across their ages.
</p>

<p style="text-align:center; font-size: 18px">
Notice the degree of distortion from the symmetrical bell curve (skewness) of the ages of the customers and the similar distribution among churned and un-churned customers
</p>

<p style="text-align:center; font-size: 18px">
This suggests that the customer age column has little impact on differentiating between churned and un-churned customers, so it is not strictly necessary.
</p>

In [None]:
pie_plot("Gender")

In [None]:
sns.catplot(x="Gender", hue="Attrition_Flag", kind="count", data=bank_churners)

<p style="text-align:center; font-size: 18px">
    The <b>Gender</b> shows the distribution of customers gender.
</p>

<p style="text-align:center; font-size: 18px">
    The split between churned and un-churned customers is similar for both genders, so this column does little to differentiate between churned and un-churned customers.
</p>

In [None]:
# NOTE(review): Dependent_count is a small integer count, so a KDE smooths over
# discrete values; a count plot may represent it more faithfully.
sns.displot(bank_churners, x="Dependent_count", kind="kde", multiple="stack", hue="Attrition_Flag")

<p style="text-align:center; font-size: 18px">
    The <b>Dependent Count</b> shows the distribution of customers by their number of dependents.
</p>

<p style="text-align:center; font-size: 18px">
    The churned and un-churned distributions over dependent count are closely similar, thus this column won't be necessary.
</p>

In [None]:
pie_plot("Education_Level")

In [None]:
sns.catplot(y="Education_Level", hue="Attrition_Flag", kind="count", data=bank_churners)

<p style="text-align:center; font-size: 18px">
    The <b>Education Level</b> shows the distribution of customers on their level of education.
</p>

<p style="text-align:center; font-size: 18px">
    Across every level of education there's a slight count of churned customers similar in proportion to un-churned customers, this signifies that information from this column may not be overly effective.
</p>

In [None]:
pie_plot("Marital_Status")

In [None]:
sns.catplot(y="Marital_Status", hue="Attrition_Flag", kind="count", data=bank_churners)

<p style="text-align:center; font-size: 18px">
    The <b>Marital Status</b> shows the distribution of customers marital status.
</p>

<p style="text-align:center; font-size: 18px">
    Similar to the Education Level, counts of churned customers are similar in proportion to un-churned customers, signifying that information from this column may not be effective.
</p>

In [None]:
pie_plot("Income_Category")

In [None]:
sns.catplot(y="Income_Category", hue="Attrition_Flag", kind="count", data=bank_churners)

In [None]:
relationship_ratio("Income_Category", "Attrition_Flag")

<p style="text-align:center; font-size: 18px">
    The <b>Income Category</b> shows the distribution of customers annual income.
</p>

<p style="text-align:center; font-size: 18px">
    Similar to Education Level and Marital Status, counts of churned customers are similar in proportion to un-churned customers, signifying that information from this column may not be effective. A closer look at the data shows that, within every income category, over 80% of customers are un-churned and less than 20% are churned.
</p>

In [None]:
sns.catplot(x="Card_Category", hue="Attrition_Flag", kind="count", data=bank_churners)

In [None]:
# Compare the age of churned and un-churned customers within each card type:
# the point plot draws one point per (card type, attrition flag) pair, so the
# gap between the two hues shows how far apart the groups are in age.

sns.catplot(x="Card_Category", y="Customer_Age", hue="Attrition_Flag", kind="point", data=bank_churners)

In [None]:
sns.displot(bank_churners, x="Card_Category", y="Income_Category")

<p style="text-align:center; font-size: 18px">
    The <b>Card Category</b> shows the distribution of customers preferred/chosen card type.
</p>

<p style="text-align:center; font-size: 18px">
    Across the card categories, counts of churned customers are similar in proportion to un-churned customers, signifying that information from this column may not be effective.
</p>

<p style="text-align:center; font-size: 18px">
    <b>Observation:</b> Most customers prefer the blue card, and most blue card users earn less than $40K annually, but there is very little difference in age (46-48) between churned and un-churned blue card customers.
</p>

In [None]:
sns.displot(bank_churners, x="Months_on_book", hue="Attrition_Flag", kind="kde", multiple="stack")

<p style="text-align:center; font-size: 18px">
    The <b>Months On Book</b> shows the distribution of customers period of relationship with the bank (measured by months).
</p>

<p style="text-align:center; font-size: 18px">
    The churned and un-churned distributions over Months On Book are closely similar, thus this column won't be necessary.
</p>

In [None]:
sns.displot(bank_churners, x="Total_Relationship_Count", hue="Attrition_Flag", kind="kde", multiple="stack")

In [None]:
sns.catplot(data=bank_churners, x="Total_Relationship_Count", hue="Attrition_Flag", kind="count")

In [None]:
relationship_ratio("Total_Relationship_Count", "Attrition_Flag")

<p style="text-align:center; font-size: 18px">
    The <b>Total Relationship Count</b> shows the distribution of number of products held by the customers.
</p>

<p style="text-align:center; font-size: 18px">
    Across the Total Relationship Count values, counts of churned customers are similar in proportion to un-churned customers, signifying that information from this column may not be effective.
</p>

In [None]:
sns.catplot(data=bank_churners, x="Months_Inactive_12_mon", kind="count", hue="Attrition_Flag")

In [None]:
relationship_ratio("Months_Inactive_12_mon", "Attrition_Flag")

<p style="text-align:center; font-size: 18px">
    The <b>Months_Inactive_12_mon</b> shows the distribution of number of months inactive in the last 12 months by the customers.
</p>

<p style="text-align:center; font-size: 18px">
    Across the Months_Inactive_12_mon values, counts of churned customers are similar in proportion to un-churned customers, signifying that information from this column may not be too effective.
</p>

In [None]:
sns.catplot(data=bank_churners, x="Contacts_Count_12_mon", kind="count", hue="Attrition_Flag")

In [None]:
relationship_ratio("Contacts_Count_12_mon", "Attrition_Flag")

<p style="text-align:center; font-size: 18px">
    The <b>Contacts_Count_12_mon</b> shows the distribution of number of Contacts in the last 12 months by the customers.
</p>

<p style="text-align:center; font-size: 18px">
    This column provides a pretty useful information. The higher the Contacts_Count_12_mon the higher the churned customers and the lower the un-churned customers.
</p>

In [None]:
sns.displot(bank_churners, x="Credit_Limit", hue="Attrition_Flag", kind="kde", multiple="stack")

<p style="text-align:center; font-size: 18px">
    The <b>Credit Limit</b> shows the distribution of credit limit on the credit card of the customers.
</p>

<p style="text-align:center; font-size: 18px">
    There's a similar sparse relationship between churned and un-churned customers on Credit Limit, thus this column won't be too effective.
</p>

In [None]:
sns.displot(bank_churners, x="Total_Revolving_Bal", hue="Attrition_Flag", multiple="stack")

In [None]:
sns.displot(bank_churners, x="Total_Revolving_Bal", hue="Attrition_Flag", multiple="stack", kind="kde")

<p style="text-align:center; font-size: 18px">
    The <b>Total Revolving Balance</b> shows the distribution of the total revolving balance on the customers credit card .
</p>

<p style="text-align:center; font-size: 18px">
    This column provides fairly useful information. The balance between churned and un-churned customers shifts between roughly 500 and 2500 of total revolving balance, implying that customers in this range are much less likely to churn.
</p>

In [None]:
sns.displot(bank_churners, x="Avg_Open_To_Buy", hue="Attrition_Flag", multiple="stack", kind="kde")

<p style="text-align:center; font-size: 18px">
    The <b>Avg_Open_To_Buy</b> shows the distribution of open to buy credit line (average of last 12 months) for customers.
</p>

<p style="text-align:center; font-size: 18px">
    The churned and un-churned distributions over Avg_Open_To_Buy are similar, thus this column won't be effective.
</p>

In [None]:
sns.displot(bank_churners, x="Total_Amt_Chng_Q4_Q1", hue="Attrition_Flag", multiple="stack", kind="kde")

<p style="text-align:center; font-size: 18px">
    The <b>Total_Amt_Chng_Q4_Q1</b> shows the distribution of the change in transaction amount (Q4 over Q1) for customers.
</p>

<p style="text-align:center; font-size: 18px">
    There's a similar sparse relationship between churned and un-churned customers on the Total_Amt_Chng_Q4_Q1, thus this column won't be effective.
</p>

In [None]:
sns.displot(bank_churners, x="Total_Trans_Amt", hue="Attrition_Flag", multiple="stack")

In [None]:
sns.displot(bank_churners, x="Total_Trans_Amt", hue="Attrition_Flag", multiple="stack", kind="kde")

<p style="text-align:center; font-size: 18px">
    The <b>Total Transaction Amount</b> shows the distribution of the total transaction amount in the last 12 months by the customers.
</p>

<p style="text-align:center; font-size: 18px">
    This column provides a pretty useful information. There are fewer to no churned customers for transactions exceeding 5000 implying most churned customers have a total transaction amount between 0 - 5000 in the last 12 months
</p>

In [None]:
sns.displot(bank_churners, x="Total_Trans_Ct", hue="Attrition_Flag", multiple="stack", kde=True)

<p style="text-align:center; font-size: 18px">
    The <b>Total Transaction Count</b> shows the distribution of the total transaction count in the last 12 months by the customers.
</p>

<p style="text-align:center; font-size: 18px">
    This column provides a pretty useful information. There are fewer to no churned customers for transaction counts exceeding 85 implying most churned customers have a total transaction count between 0 - 85 in the last 12 months
</p>

In [None]:
sns.displot(bank_churners, x="Total_Ct_Chng_Q4_Q1", hue="Attrition_Flag", multiple="stack", kind="kde")

<p style="text-align:center; font-size: 18px">
    The <b>Total_Ct_Chng_Q4_Q1</b> shows the distribution of the change in transaction count (Q4 over Q1) by customers.
</p>

<p style="text-align:center; font-size: 18px">
    Like the Total_Amt_Chng_Q4_Q1 there's a similar sparse relationship between churned and un-churned customers on the Total_Ct_Chng_Q4_Q1, thus this column won't be effective.
</p>

In [None]:
sns.displot(bank_churners, x="Avg_Utilization_Ratio", hue="Attrition_Flag", multiple="stack")

<p style="text-align:center; font-size: 18px">
    The <b>Average Utilization Ratio</b> shows the distribution of the average card utilization ratio by the customers.
</p>

<p style="text-align:center; font-size: 18px">
    This column provides fairly useful information. Customers whose average card utilization ratio exceeds 0.1 have a lower tendency to churn.
</p>

In [None]:
# required features for predicting churning customers
# NOTE: "Attrition_Flag" (the label) must stay first in this list, because later
# cells separate the label from the features positionally with iloc[:, 1:].

required_fields = ["Attrition_Flag", "Contacts_Count_12_mon", "Total_Trans_Ct", "Total_Trans_Amt",
                   "Total_Revolving_Bal", "Avg_Utilization_Ratio"]

In [None]:
bank_churners_copy = bank_churners.copy()

In [None]:
bank_churners_copy = bank_churners_copy.loc[:, required_fields]

In [None]:
bank_churners_copy.head(3)

<p style="font-size: 18px">
    From the problem statement, the manager wishes to detect churning customers (that is, existing customers who are likely to leave). I believe the best way to do this is to:
</p>

<ol style="font-size: 18px">
    <li>Clearly distinguish between churned and un-churned customers (grouping).</li>
    <li>Use factors from our data exploration that have clearly separated churned from un-churned customers to select distinctive un-churned customers.</li>
    <li>Use the distinctive un-churned customers together with the churned customers to create a model that identifies churning customers among the remaining (non-distinctive) un-churned customers.</li>
</ol>

In [None]:
customers_type = bank_churners_copy.groupby("Attrition_Flag")

In [None]:
# 1. Clearly distinguish between churned and un-churned customers (grouping).

churned_customers = customers_type.get_group("Attrited Customer")
unchurned_customers = customers_type.get_group("Existing Customer")

In [None]:
# 2. Use the factors from the data exploration that clearly separate churned
# from un-churned customers to select the distinctive un-churned customers.
# NOTE: despite its name, `dist_churned_customers` holds *existing* customers
# whose activity sits in the ranges where churn was rarely observed.

# A boolean row mask keeps the original integer dtypes; the previous
# `.where(...).dropna()` upcast the integer columns to float and would also
# have discarded any row that merely contained a missing value.
is_distinctive = (
    (unchurned_customers.Total_Trans_Ct > 85) &
    (unchurned_customers.Total_Trans_Amt > 5000) &
    (unchurned_customers.Total_Revolving_Bal > 500) &
    (unchurned_customers.Total_Revolving_Bal < 2500) &
    (unchurned_customers.Avg_Utilization_Ratio > 0.1)
)
dist_churned_customers = unchurned_customers.loc[is_distinctive]

In [None]:
# Every client that was not picked as a distinctive un-churned customer.
# Index.isin performs the membership test in one vectorised pass; the previous
# list comprehension rebuilt a Python list and scanned it for every single row.
non_dist_indexes = bank_churners_copy.index[
    ~bank_churners_copy.index.isin(dist_churned_customers.index)
].tolist()

# The remaining existing customers, i.e. the ones the model will score.
# Dropping the distinctive rows directly avoids the reindex/dropna round trip,
# which created all-NaN rows for churned clients and upcast integers to float.
non_dist_churned_customers = unchurned_customers.drop(index=dist_churned_customers.index)

## Model Creation

distinctive un-churned customers will be used along with the distinctive churned customers to create a model to identify churning customers from what is left in the un-churned (non-distinctively churned) customers

In [None]:
# NOTE(review): with an even neighbour count and two classes, a 1-vs-1 vote is
# possible; confirm how scikit-learn breaks such ties or consider an odd k.
# The features are also used unscaled, so large-valued columns such as
# Total_Trans_Amt presumably dominate the distance; consider standardising.
knn = KNeighborsClassifier(n_neighbors=2)

In [None]:
train_data = pd.concat([churned_customers, dist_churned_customers])

In [None]:
# Separate features from the label by column name, so the split no longer
# depends on "Attrition_Flag" happening to be the first column.
data = train_data.drop(columns="Attrition_Flag")
target = train_data["Attrition_Flag"]

In [None]:
knn.fit(data, target)

In [None]:
# Score the remaining existing customers on exactly the columns the model was
# fitted with. Selecting by name (instead of iloc[:, 1:]) keeps this cell
# re-runnable after the 'Status' column is appended to the frame further down.
churning_customers = knn.predict(non_dist_churned_customers[data.columns])

In [None]:
attriting_customer = pd.Series(list(churning_customers), name="Status")

In [None]:
def change_name(name):
    """
    Translate a predicted label into the status wording used in the results.

    Parameters
    ----------
    name : str
        A predicted ``Attrition_Flag`` value, or an already translated status.

    Returns
    -------
    str
        ``"Churning"`` for ``"Attrited Customer"`` (or ``"Churning"`` itself),
        ``"Stable"`` for anything else.
    """
    # "Churning" is accepted as well so that re-running the relabelling cell on
    # already translated values does not flip every customer to "Stable".
    if name in ("Attrited Customer", "Churning"):
        return "Churning"
    return "Stable"

# Replace each predicted label with its status wording ("Churning" / "Stable").
attriting_customer = attriting_customer.apply(change_name)

In [None]:
non_dist_churned_customers['Status'] = attriting_customer.values

In [None]:
non_dist_churned_customers

In [None]:
customers = non_dist_churned_customers.groupby("Status")

In [None]:
# NOTE(review): this rebinds `churning_customers`, which earlier held the raw
# model predictions, to the frame of customers labelled "Churning"; re-running
# the earlier cells out of order will therefore misbehave.
# get_group raises KeyError if no customer was labelled "Churning".
churning_customers = customers.get_group("Churning")

In [None]:
churning_customers.head(3)