# Analyzing CC Customers
Problem: A business manager of a consumer credit card portfolio is facing the problem of customer attrition. They want to analyze the data to find out the reason behind this and leverage the same to predict customers who are likely to drop off.

In [None]:
# Import Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot
import matplotlib.mlab as mlab
plt.style.use('ggplot') # default plot style.

import scipy
from scipy import stats
from scipy.stats import norm

In [None]:
import os

# Print the full path of every file under the Kaggle input directory,
# so the dataset location can be confirmed before loading it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Load the credit-card customer data and preview the first five rows
CD = pd.read_csv(filepath_or_buffer="/kaggle/input/credit-card-customers/BankChurners.csv")
CD.head(n=5)

# Data Preprocessing:
A required step that gets the dataset into a workable state for analysis.

* Step 1 - Set Index Value to Client Number
* Step 2 - Describe the Customer Dataset (i.e., descriptive statistics)
* Step 3 - Identify NA Values and Replace w/ appropriate values
* Step 4 - Understand Factor Variables (i.e., Income_Category, Marital Status, Card Category)
* Step 5 - Convert Factor Variables to Dummy Variables for Regression Analysis

In [None]:
# Step 1 - Use the client number as the row index
# (the CLIENTNUM column is moved out of the data columns and into the index)
CD = CD.set_index(keys="CLIENTNUM")

## Numerical Features
* Customer_Age: Customer's Age in Years
* Dependent_count: Number of dependents
* Months_on_book: Period of relationship with bank
* Total_Relationship_Count: Total no. of products held by the customer
* Months_Inactive_12_mon: No. of months inactive in the last 12 months
* Contacts_Count_12_mon: No. of Contacts in the last 12 months
* Credit_Limit: Credit Limit on the Credit Card
* Total_Revolving_Bal: Total Revolving Balance on the Credit Card
* Avg_Open_To_Buy: Open to Buy Credit Line (Average of last 12 months)
* Total_Amt_Chng_Q4_Q1: Change in Transaction Amount (Q4 over Q1)
* Total_Trans_Amt: Total Transaction Amount (Last 12 months)
* Total_Trans_Ct: Total Transaction Count (Last 12 months)
* Total_Ct_Chng_Q4_Q1: Change in Transaction Count (Q4 over Q1)
* Avg_Utilization_Ratio: Average Card Utilization Ratio

In [None]:
# Step 2 - Describe the Customer Dataset (i.e., Customer Age, Credit Limits, etc)
# Displays summary statistics (count, mean, std, min, quartiles, max) as the cell output.
# NOTE(review): with default arguments describe() is expected to cover only the numeric columns - confirm.
CD.describe()

In [None]:
# Step 3 - Identify NA Values: count the missing entries in each column.
# The per-column totals are displayed as the cell output; nothing is replaced here.
CD.isna().sum()

In [None]:
# Step 3 (continued) - Remove the two 'Naive_Bayes_Classifier_...' columns from the dataset.
naive_bayes_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
]
CD = CD.drop(columns=naive_bayes_columns)

## Categorical Features

Mapping attributes with dummy variables 
* Attrition_Flag    (1: Existing Customer, 0: Attrited Customer)
* Gender            (1: Male, 2: Female)
* Education_Level   (1: Unknown, 1: Uneducated, 2: High School, 3: College, 4: Graduate, 5: Post-Graduate, 6: Doctorate)
* Marital_Status    (1: Unknown, 2: Divorced, 3: Single, 4: Married)
* Income_Category   (1: Unknown, 2: Less than 40K, 3: 40K - 60K, 4: 60K - 80K, 5:80K - 120K, 6: 120K +)
* Card_Category     (1: Blue, 2: Silver, 3: Gold, 4: Platinum)

In [None]:
# Step 4 - Understand Factor Variables: count how many customers fall into each level
# of Income_Category, Marital_Status, Card_Category and Education_Level.
Incomes, MS, CC, EL = (
    CD.groupby(factor)[factor].count()
    for factor in ("Income_Category", "Marital_Status", "Card_Category", "Education_Level")
)

# Print the four tables separated by one blank line each.
print(Incomes, MS, CC, EL, sep="\n\n")


In [None]:
# Step 5 - Convert Factor Variables to Dummy Variables for Regression Analysis
# For each categorical column, the integer code assigned to every level.
# The encoded values are written to a new '<column>_Num' column; the original column is kept.
# NOTE(review): in Education_Level both 'Unknown' and 'Uneducated' are coded 1 - confirm this is intended.
level_codes = {
    'Attrition_Flag': {'Existing Customer':1, 'Attrited Customer':0},
    'Gender': {'M':1, 'F':2},
    'Income_Category': {'Unknown':1, 'Less than $40K':2, '$40K - $60K':3, '$60K - $80K':4, '$80K - $120K':5, '$120K +':6},
    'Marital_Status': {'Unknown':1, 'Divorced':2, 'Single':3, 'Married':4},
    'Education_Level': {'Unknown':1, 'Uneducated':1, 'High School':2, 'College':3,'Graduate':4,'Post-Graduate':5,'Doctorate':6},
    'Card_Category': {'Blue':1, 'Silver':2, 'Gold':3, 'Platinum':4},
}

for source_column, codes in level_codes.items():
    # Levels missing from the dictionary would become NaN in the encoded column.
    CD[f'{source_column}_Num'] = CD[source_column].map(codes)

# Customer Visualizations

## Categorical Plotting

In [None]:
# One stacked bar chart per categorical feature: for every level of the feature,
# the number of existing customers with the number of attrited customers stacked on top.
Categories = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

width = 0.5       # the width of the bars: can also be len(x) sequence

for category in Categories:
    # Count level x attrition status in a single pass. The crosstab index is sorted,
    # so the bar order is the same on every run (iterating a set of strings is not).
    counts = pd.crosstab(CD[category], CD['Attrition_Flag'])
    # Guarantee both status columns exist even if one status never occurs.
    counts = counts.reindex(columns=['Existing Customer', 'Attrited Customer'], fill_value=0)

    names = list(counts.index)
    EC = counts['Existing Customer'].tolist()
    AC = counts['Attrited Customer'].tolist()

    fig, ax = plt.subplots(figsize = (10, 5))

    ax.bar(names, EC , width, label='Existing Customer')
    ax.bar(names, AC , width, bottom=EC, label='Attrited Customer')

    # Label the figure so it can be identified without reading the code.
    ax.set_title('Customers by ' + category)
    ax.set_xlabel(category)
    ax.set_ylabel('Number of customers')
    ax.legend()
    plt.show()


## Numerical Plotting

In [None]:
# For each numeric feature: a density-normalised histogram with a normal curve
# (same mean and standard deviation as the data) overlaid, to eyeball skewness.
Numericals = ['Customer_Age','Credit_Limit','Months_on_book','Avg_Utilization_Ratio','Avg_Open_To_Buy','Total_Trans_Amt']

num_bins = 20

for element in Numericals:
    x = CD[element]
    sigma = x.std()
    mu = x.mean()

    fig, ax = plt.subplots(figsize = (10, 5))

    # density=True rescales the bars so their total area is 1, which puts the
    # histogram on the same scale as the probability density curve below.
    n, bins, patches = ax.hist(x, num_bins, density=True)

    # Normal probability density evaluated at the bin edges.
    y = norm.pdf(bins, loc=mu, scale=sigma)
    ax.plot(bins, y, '--')

    ax.set_title('Distribution of ' + element)
    ax.set_xlabel(element)
    # The bars are densities, not raw counts, so label the axis accordingly.
    ax.set_ylabel('Probability density')
    plt.show()
    

## Visualization Takeaways
<br>

**Categorical**
* Balanced amount of Male/Female cardholders. Males appear slightly more likely to leave the firm. 
* Majority of cardholders hold a graduate degree; graduate degree holders appear slightly more likely to leave the firm. 
* Majority of cardholders make < 40K, which is interesting given the point above
* Blue Card is the most popular offering

**Numerical**
* Mean Age is between 40-50
* Most cardholders have a low credit limit & low credit utilization rate
* Most customers have been with the bank for 3 years
* It appears there is some skewness in our data, we should analyze to ensure no issues with modeling. 

   # Customer Analytics
   
   

###  Logistic Regression to review Independent Variables
Logistic Regressions are used to identify the statistical impact that independent variables have on a binary dependent variable. In our case, the dependent variable is whether or not the customer leaves the bank (leave or stay, binary). Our independent variables are all datapoints we have on the customer (except for whether they are current customers or have left the bank)

In [None]:
# Independent variables: the numeric customer attributes plus the integer-encoded factor columns.
feature_columns = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
    'Gender_Num',
    'Income_Category_Num',
    'Marital_Status_Num',
    'Education_Level_Num',
    'Card_Category_Num',
]
x1 = CD[feature_columns]

# Dependent variable: the integer-encoded attrition flag.
y1 = CD['Attrition_Flag_Num']

In [None]:
# Review skewness of the independent variables, least to most skewed.
# Highly skewed variables are candidates for a log transformation.
skew_by_feature = x1.skew()
df_skew = skew_by_feature.to_frame('Skewness').sort_values(by='Skewness')
df_skew

In [None]:
# Import Statsmodels
import statsmodels.api as sm

# statsmodels does not add an intercept on its own. Without a constant column the
# model is forced through the origin, which distorts the coefficients and p-values
# and is inconsistent with scikit-learn's LogisticRegression (intercept on by default).
x1_const = sm.add_constant(x1)

# Perform Logistic Regression & output summary
# NOTE(review): adding the intercept changes the fitted numbers, so the written
# takeaways under this cell should be re-checked against the new summary.
logit_model = sm.Logit(y1, x1_const).fit(method = 'minimize')
print(logit_model.summary())

### Regression Takeaways

* Regression runs w/ outputs in Jupyter Notebook. Results are not showing up in Kaggle. See below for our analysis.
* Customer Age, Months on Book, Credit Limit, Revolving Balance, Open to Buy, Utilization Ratio, and Education Level are not statistically significant; our model cannot show that these attributes influence whether a customer decides to leave the bank. 
* Our simple Logistic Regression only explains ~45% of the variation within customer accounts. A stronger model is needed

In [None]:
# Import Packages for ML Tool
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Hold out 30% of the customers for evaluation; the models are fitted on the other 70%.
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x1,
    y1,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fit a logistic regression on the training split, then label the held-out customers.
# max_iter is set to 10000 iterations for the solver.
logistic_regression = LogisticRegression(max_iter=10000).fit(x_train, y_train)
y_pred = logistic_regression.predict(x_test)

In [None]:
# Build a confusion matrix (rows = actual class, columns = predicted class),
# which shows where the model is right and wrong for each class.
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)
print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))

# Accuracy alone can look good when one class dominates, so also report precision
# and recall per class (Attrition_Flag_Num: 0 = attrited, 1 = existing customer).
print(metrics.classification_report(y_test, y_pred))

### Feature Importance
Feature Importance helps "humans" understand the logic behind the black box. Please review the information below objectively.

In [None]:
# Take the feature names from the frame the model was fitted on, so each coefficient
# is always labelled with its own column (a hand-copied list can silently drift).
Features = list(x_train.columns)

# One coefficient per feature for the single binary decision function.
# NOTE(review): the features are not standardised, so coefficient magnitudes depend on
# each column's units and are not directly comparable - confirm before ranking on them.
importance = logistic_regression.coef_[0]

cm = sns.light_palette("green", as_cmap=True)

# Table of features sorted by coefficient, shaded from light (low) to dark green (high).
FeatureImportance = pd.DataFrame({'Feature': Features, 'Score': importance})
FeatureImportance = FeatureImportance.sort_values(by=['Score']).style.background_gradient(cmap=cm)
FeatureImportance

### Logistic ML Model Takeaways

* The most important features to determine if a customer will leave the bank or not are: Total Transaction Counts, Marital Status, Total Amount Change Quarter over Quarter, Total Relationships with Bank, and Total Count of Transactions Quarter over Quarter

* Our logistic model can accurately predict if a customer will leave the bank ~90% of the time.

# Random Forest Machine Learning Model

In [None]:
# Import Packages for ML Tool
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a 20-tree random forest on the training split, then label the held-out customers.
# random_state is fixed so the forest is reproducible.
RF = RandomForestClassifier(n_estimators=20, random_state=0).fit(x_train, y_train)
y_predRF = RF.predict(x_test)

In [None]:
# Build a confusion matrix (rows = actual class, columns = predicted class),
# which shows where the model is right and wrong for each class.
confusion_matrixRF = pd.crosstab(y_test, y_predRF, rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrixRF)
print('Accuracy: ',metrics.accuracy_score(y_test, y_predRF))

# Accuracy alone can look good when one class dominates, so also report precision
# and recall per class (Attrition_Flag_Num: 0 = attrited, 1 = existing customer).
print(metrics.classification_report(y_test, y_predRF))

### Feature Importance

In [None]:
# Take the feature names from the frame the forest was fitted on, so each importance
# is always labelled with its own column (a hand-copied list can silently drift).
Features = list(x_train.columns)

# One importance score per feature, in the same order as the training columns.
importanceRF = RF.feature_importances_

cm = sns.light_palette("green", as_cmap=True)

# Table of features sorted by importance, shaded from light (low) to dark green (high).
FeatureImportanceRF = pd.DataFrame({'Feature': Features, 'Score': importanceRF})
FeatureImportanceRF = FeatureImportanceRF.sort_values(by=['Score']).style.background_gradient(cmap=cm)
FeatureImportanceRF

### Random Forest ML Model Takeaways

* The most important features to determine if a customer will leave the bank or not are: Total Transaction Counts, Total Transaction Amount, and Total Revolving Balance. 

* Our random forest model can accurately predict if a customer will leave the bank ~95% of the time.