In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Credit Card customers**

**Predict Churning customers**

Problem :

A bank manager is concerned that more and more customers are leaving the bank's credit card services. The bank would like to predict which customers are likely to churn, so that it can proactively reach out to them, offer better services, and change their minds. Only 16.07% of the customers in the data have churned, so the classes are imbalanced, which makes it harder to train a model that predicts churning customers.

In [None]:
import pandas as pd
# Read the raw BankChurners export. `data` holds the frame exactly as loaded
# (later cells still index it directly); `df_data` is a second DataFrame
# handle built from it, and is what the column-dropping step starts from.
csv_path = '/kaggle/input/credit-card-customers/BankChurners.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)
df_data = pd.DataFrame(data=data)
df_data

# 1. Understanding the Data

> Based on the note from the data source, the last 2 columns of the data should be ignored, so the first step is to delete them so that they do not interfere with the analysis of the rest of the data.

In [None]:
# Deleting the last 2 columns (both Naive Bayes classifier outputs) in one
# non-mutating call, so `df_data` itself is left untouched.
naive_bayes_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
df_data1 = df_data.drop(columns=naive_bayes_columns)
df_data1.head()

In [None]:
# Show the dtype pandas assigned to each remaining column; the column
# groups defined in the next cell are based on this listing.
df_data1.dtypes

> Grouping columns by data type

In [None]:
# Continuous measurements: age, tenure, balances, transaction amounts and ratios.
numerical_columns = [
    "Customer_Age", "Months_on_book",
    "Credit_Limit", "Total_Revolving_Bal", "Avg_Open_To_Buy",
    "Total_Amt_Chng_Q4_Q1", "Total_Trans_Amt",
    "Total_Trans_Ct", "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
]

# Text-valued columns, including the churn label `Attrition_Flag`.
categorical_columns = [
    'Attrition_Flag',
    'Gender', 'Education_Level', 'Marital_Status',
    'Income_Category', 'Card_Category',
]

# Integer-valued columns.
# NOTE(review): CLIENTNUM looks like a customer identifier rather than a
# feature -- confirm whether it should really be plotted with the others.
discrete_columns = [
    'CLIENTNUM',
    'Dependent_count', 'Total_Relationship_Count',
    'Months_Inactive_12_mon', 'Contacts_Count_12_mon',
]

# **2. Identify the Data**

In [None]:
# Print the frame summary (per-column non-null counts and dtypes) to check
# for missing values.
df_data1.info()

In [None]:
# Count the rows that DataFrame.duplicated() flags as repeats of an earlier row.
df_data1.duplicated().sum()

Because no null values and no duplicated rows were found, the data can be considered clean, and the analysis can continue by looking at the distribution of each column.

In [None]:
# Numerical Value 

import seaborn as sns
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt
def histo(data, x):
    """Draw a seaborn histogram of one column.

    Parameters
    ----------
    data : DataFrame
        Frame handed to ``sns.histplot`` as its ``data`` argument.
    x : str
        Name of the column in ``data`` to plot on the x-axis.
    """
    # Plot the frame the caller passed in. The argument used to be ignored
    # in favour of the global `df_data1`, so every call drew the same frame
    # no matter what was supplied.
    sns.histplot(data=data, x=x)
    
# One histogram per continuous feature, each shown as its own figure.
for numeric_col in numerical_columns:
    histo(df_data1, x=numeric_col)
    plt.show()


In [None]:
# Discrete Value
# Pass the cleaned frame `df_data1` (the raw `data` frame was passed before,
# which only worked because `histo` ignored its argument).
# NOTE(review): discrete_columns includes CLIENTNUM, which looks like an
# identifier -- confirm whether its histogram is wanted.
for col in discrete_columns:
    histo(df_data1, x=col)
    plt.show()


In [None]:
# Categorical Value 
def prop(categorical_columns):
    """Plot the share of each category of one column as a bar chart, in percent.

    Parameters
    ----------
    categorical_columns : str
        Name of a single column of ``df_data1``. The plural parameter name
        is kept so existing callers continue to work.
    """
    # value_counts(normalize=True) yields fractions in [0, 1]; scale by 100
    # so the values really are the percentages the "(%)" label promises.
    proportion = df_data1[categorical_columns].value_counts(normalize=True) * 100
    proportion.rename('proportion(%)').plot(kind='bar')

# One proportion bar chart per categorical feature, each preceded by a
# printed header naming the column.
for cat_col in categorical_columns:
    print('Percentage of', cat_col)
    prop(cat_col)
    plt.show()
    

# **3. Analyze the Data**

> **To analyze the data better, focus on the `Attrition_Flag` column to dig out more information about churning customers**

> The first analysis looks for outliers, because these values have to be identified to understand the shape of each distribution and study the data better.

> To find outliers, we use a boxplot chart

In [None]:
def boxplot(data, x, y):
    """Draw a seaborn box plot of column ``x`` split by column ``y``.

    All three arguments are forwarded unchanged to ``sns.boxplot`` as the
    keyword arguments of the same name.
    """
    sns.boxplot(x=x, y=y, data=data)

# Box plot of every continuous feature, grouped by churn status.
for feature in numerical_columns:
    boxplot(data=df_data1, x=feature, y='Attrition_Flag')
    plt.show()

> To see it another way, we can use a kernel density (KDE) plot like the one below

In [None]:
# Split the customers by churn status so the two groups can be compared
# (means and density curves are computed per group further down).
is_existing = df_data1['Attrition_Flag'].eq("Existing Customer")
is_attrited = df_data1['Attrition_Flag'].eq("Attrited Customer")
existing_customer_df = df_data1.loc[is_existing]
attried_customer_df = df_data1.loc[is_attrited]

def plot(column, dataset1, dataset2, label1, label2):
    """Overlay the KDE curves of one column for two datasets and show the figure.

    Parameters
    ----------
    column : str
        Column to plot; must exist in both datasets.
    dataset1, dataset2 : DataFrame
        Frames to compare. ``dataset1`` is drawn in blue, ``dataset2`` in red.
    label1, label2 : str
        Legend labels for ``dataset1`` and ``dataset2`` respectively.
    """
    ax = sns.kdeplot(dataset1[column], color='blue', label=label1)
    sns.kdeplot(dataset2[column], color='red', label=label2, ax=ax)
    # Both curves share one axes, so set the x-range once, wide enough for
    # both inputs. The limits used to be set twice from the global frames,
    # so the second call silently overrode the first and could clip a curve.
    upper = max(dataset1[column].max(), dataset2[column].max())
    ax.set(xlim=(0, upper))
    plt.legend()
    plt.show()

# For every continuous feature: print the mean of each churn group, then
# overlay the two density curves.
for col in numerical_columns:
    mean_existing = existing_customer_df[col].mean()
    mean_attried = attried_customer_df[col].mean()
    # "{:.2f}" already rounds to two decimals, so no separate round() is needed.
    print("The average", col, "of Existing Customer: {:.2f}".format(mean_existing))
    print("The average", col, "of Attrited Customer: {:.2f}".format(mean_attried))
    # The legend label matches the data value and the printed text
    # ("Attrited Customer"); it used to read "Attrition Customer".
    plot(col, existing_customer_df, attried_customer_df, "Existing Customer", "Attrited Customer")

> The analysis then continues by looking at the level of correlation between all columns

In [None]:
plt.figure(figsize=(15,11))
# Correlation is only defined for numeric columns. Select them explicitly:
# pandas >= 2.0 no longer drops the string columns silently, so calling
# .corr() on the whole frame raises there.
numeric_corr = df_data1.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

**Hypothesis**: The correlation between `Credit_Limit` and `Avg_Open_To_Buy` is 1, which makes us curious:
do the two columns hold the same values, or are they merely similar? To check, we test the conjecture that the mean of `Credit_Limit` and the mean of `Avg_Open_To_Buy` are the same.

**Parameters of `ztest`** :

x1 (array_like, 1-D or 2-D) = first of the two independent samples

x2 (array_like, 1-D or 2-D) = second of the two independent samples

value (float) = 
*   In the one sample case, value is the mean of x1 under the Null hypothesis. 
*   In the two sample case, value is the difference between mean of x1 and mean of x2 under the Null hypothesis.


In [None]:
# import library
import pandas as pd
import numpy as np

import scipy.stats as stats
from statsmodels.stats.weightstats import ztest
import matplotlib.pyplot as plt
import math

In [None]:
# Population means of the two columns (reported for reference only)
credit_limit_mean = data['Credit_Limit'].mean()
avg_open_to_buy_mean = data['Avg_Open_To_Buy'].mean()

# Difference between mean of x1 and mean of x2
difference = credit_limit_mean - avg_open_to_buy_mean
print('Difference between population mean: {:.2f}'.format(difference))

# Draw both samples from one seeded generator: the two draws stay independent
# of each other, but the cell gives the same result on every run.
sample_rng = np.random.RandomState(42)
sample_data_credit_limit = data['Credit_Limit'].sample(n=50, random_state=sample_rng)
sample_data_avg_open_to_buy = data['Avg_Open_To_Buy'].sample(n=50, random_state=sample_rng)

# set the value
confidence_level = 0.95
alpha = 1 - confidence_level

# ztest hypothesis
# H0: both columns have the same mean, i.e. mean(x1) - mean(x2) = 0.
# `value` is the difference assumed under H0, so it must be 0 here; passing
# the observed population difference would test a different hypothesis.
ztest_Score, pvalue = ztest(x1=sample_data_credit_limit, x2=sample_data_avg_open_to_buy, value=0)
display((ztest_Score, pvalue))

# test: H0 is rejected only when the p-value falls below alpha
# (the comparison used to be inverted).
if pvalue < alpha:
    print("Reject the hypothesis")
    print("Because the pvalue {:.2f} < {:.2f}".format(pvalue, alpha))
else:
    print("Fail to reject the hypothesis")
    print("Because the pvalue {:.2f} >= {:.2f}".format(pvalue, alpha))

# **4. Data Pre-Processing**

# **5. Develop Model**