In [None]:

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

from IPython.display import Image
import os
# IPython shell escape: list the contents of the input directory.
!ls ../input/
# Last expression of the cell, so the image is rendered as the cell output.
Image('../input/bank-picture/bank_churns.jpg')


The goal of this kernel is to find the features that have the highest impact on the target variable in the Credit Card Customers dataset. I present two approaches: simple visualisations and the K-Best feature selection algorithm.

# Table of contents
* [First glance at the dataset](#First_glance)
* [Visualizations for numerical features](#Num_vis)
* [Visualizations for categorical features](#Cat_vis)
* [Data preprocessing](#data_pre)
* [Logistic Regression](#Log_reg)
* [KBest features selection](#Kb_sel)

<a id = "First_glance"></a>
# First glance at the dataset

In [None]:

# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from scipy import stats
from scipy.stats import ttest_ind
import warnings
# Hide every Python warning for the rest of the notebook.
# Note that this also hides deprecation and convergence warnings.
warnings.filterwarnings(action="ignore")

# Use seaborn's "ticks" style for all figures.
sns.set(style="ticks")

In [None]:
# Importing the dataset and removing the variables related to the Naive Bayes classifier.
# The two columns are dropped by position (indices 21 and 22), so this relies on
# the column order of the CSV file.
data_path = "../input/credit-card-customers/BankChurners.csv"
df = pd.read_csv(data_path)
df = df.drop(columns=df.columns[[21, 22]])

Let's take a glance at the Bank Churn dataset. After removing the variables related to the Naive Bayes classifier, 21 variables are left. Most of them are integers or floats. Attrition_Flag is the dependent variable - it tells whether the customer has churned from the bank or not.

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Preview the first rows of the dataset.
df.head()

There are no missing values in the dataset.

In [None]:
# Number of missing values in each column.
df.isna().sum()

<a id = "Num_vis"></a>
# Visualizations for numerical features

Let’s visualise the distribution of each numeric variable. To plot each variable I will call the `distplot` function from the seaborn package inside a for loop.

In [None]:
# In the first step I create a new dataframe including only numeric variables.
# CLIENTNUM is the client identifier, not a feature, so it is excluded.

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)
newdf2 = newdf.loc[:, newdf.columns != 'CLIENTNUM']

# Slicing the column with the Attrition_Flag variable: the plots present the
# distribution for existing and churned customers separately.

df_AttiredFlag = pd.DataFrame(df['Attrition_Flag'])

# Joining newdf2 and df_AttiredFlag (with inner join)
df_joined = pd.concat([newdf2, df_AttiredFlag], axis = 1, join = 'inner')

# Numeric features to plot, one figure per feature.
numeric_features = ['Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

# The two customer groups are filtered once, outside the loop.
attrited_customers = df_joined[df_joined['Attrition_Flag'] == 'Attrited Customer']
existing_customers = df_joined[df_joined['Attrition_Flag'] == 'Existing Customer']

# Plotting multiple variables in the for loop
for col in numeric_features:
    fig, axis2 = plt.subplots(figsize=(15,5))
    # Each curve carries the label of the group it actually shows.
    sns.distplot(attrited_customers[col], bins = 20, label = 'Attrited customers', color = 'skyblue', ax = axis2)
    sns.distplot(existing_customers[col], bins = 20, label = 'Existing customers', color = 'red', ax = axis2)
    axis2.set_title(col)
    # Without a legend the labels above are never displayed.
    axis2.legend()

    plt.show()

Based on analysis of the plots created for numerical variables we can conclude that the most influential features are:
* Contacts_Count_12_mon
* Total_Revolving_Bal
* Total_Amt_Chng_Q4_Q1
* Total_Trans_Amt
* Total_Trans_Ct
* Total_Ct_Chng_Q4_Q1
* Avg_Utilization_Ratio

In the next part of this kernel we will confirm or reject the hypothesis that factors mentioned above have the highest impact on the final result.

<a id = "Cat_vis"></a>
# Visualizations for categorical features

In [None]:
# In the first step I create a new dataframe with only categorical variables
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf_non_num = df.select_dtypes(exclude=numerics)
non_numeric_df = pd.DataFrame(newdf_non_num.loc[:, newdf_non_num.columns != 'CLIENTNUM'])

# Categorical features whose levels are collected below.
categorical_features = ["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"]

# One index of levels per feature, ordered by frequency in the whole dataset.
list_labels = [non_numeric_df[col].value_counts().index for col in categorical_features]
# Printed once, after the list is complete, instead of on every loop iteration.
print(list_labels)

# Short labels: only the first word of each level is kept.
lists = [[i.split()[0] for i in labels] for labels in list_labels]

In [None]:
# Creating one figure per categorical feature, with a pie chart for each customer group.

categorical_features = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
# (value of Attrition_Flag, title of the subplot)
customer_groups = [('Attrited Customer', "Attrited Customers"), ('Existing Customer', "Existing Customers")]

for col in categorical_features:
    fig, ax = plt.subplots(1,2, dpi = 100, figsize=(15,15))
    for group_ax, (flag, title) in zip(ax, customer_groups):
        counts = non_numeric_df.loc[non_numeric_df['Attrition_Flag'] == flag, col].value_counts()
        # Labels are taken from the very same counts that size the slices.
        # value_counts() sorts by frequency, so the order of levels can differ
        # between the groups and from the whole dataset; a shared label list
        # would attach names to the wrong slices.
        # As before, only the first word of each level is used as the label.
        slice_labels = [str(level).split()[0] for level in counts.index]
        group_ax.pie(counts, labels = slice_labels, autopct='%.2f%%')
        group_ax.set_title(title)
    fig.suptitle(col)
    plt.show()

There are no significant differences between the categorical characteristics of churned and existing customers shown in the plots.

<a id = "data_pre"></a>
# Data preprocessing
To improve the accuracy of the model, it is advisable to remove highly correlated features from the dataset.

In [None]:
# Computing correlations between the numeric features.
# The frame also contains text columns (e.g. Attrition_Flag, Gender); calling
# corr() on them directly fails on recent pandas versions, so only the numeric
# columns are selected explicitly.
correlations_frame = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(correlations_frame, annot = True, cmap = 'viridis', ax = ax)
ax.set_title('Correlations between numeric features')
plt.show()

We will remove Client Id and the most correlated features.

In [None]:
# Data preprocessing: drop the client identifier together with the features
# selected for removal as highly correlated.
columns_to_drop = [
    "CLIENTNUM",
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Avg_Open_To_Buy',
    'Avg_Utilization_Ratio',
]
df_prepared = df.drop(columns = columns_to_drop)
df_prepared.head()

In the next step we will encode the categorical features as dummy variables.

In [None]:
# Encoding the categorical columns (including the target) as dummy variables
categorical_columns = [
    'Attrition_Flag',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]
df_encoded = pd.get_dummies(df_prepared, columns = categorical_columns)
# Only the 'Attrited Customer' dummy of the target is kept; its complement is dropped.
df_encoded = df_encoded.drop(columns = ['Attrition_Flag_Existing Customer'])
df_encoded.info()

<a id = "Log_reg"></a>
# Logistic regression

The idea is to create a simple logistic regression model and check which variable has an impact on the final result.

In [None]:
# Importing libraries
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import  train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


# Splitting data into features (X) and target (Y), then into train and test
X = df_encoded.loc[:, df_encoded.columns != 'Attrition_Flag_Attrited Customer']
Y = df_encoded['Attrition_Flag_Attrited Customer']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

# The features are on very different scales (e.g. Credit_Limit next to 0/1 dummies),
# which can keep the solver from converging within its default iteration budget;
# that warning would be invisible because warnings are suppressed in this notebook.
# The scaler is fitted on the training split only, so nothing leaks from the test split.
# X_train / X_test themselves are left untouched.
scaler = StandardScaler().fit(X_train)

# Creating logistic regression model
log_reg = LogisticRegression(max_iter = 1000)
model1 = log_reg.fit(scaler.transform(X_train), Y_train)
y_predicted = log_reg.predict(scaler.transform(X_test))

print(classification_report(Y_test, y_predicted))

<a id = "Kb_sel"></a>
# K-best features selection 

To check which variables are the most influential for the final result we will use the SelectKBest function.

Find out more about Select KBest function: https://www.kaggle.com/jepsds/feature-selection-using-selectkbest?utm_campaign=News&utm_medium=Community&utm_source=DataCamp.com 

In [None]:
# Features selection with KBest
from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature against the target with f_classif on the training split.
selected_KBest = SelectKBest(score_func = f_classif, k = 5)
selected_KBest.fit(X_train, Y_train)

# One row per feature with its score; the sorted table is the cell output.
selected_KBest_df = pd.DataFrame({
    'Features': X_train.columns.tolist(),
    'Scores': selected_KBest.scores_,
})
selected_KBest_df.sort_values(by = 'Scores', ascending = False)

According to the “Select KBest” algorithm, the five features that have the highest impact on the target variable are:
* Total_Trans_Ct
* Total_Ct_Chng_Q4_Q1
* Total_Revolving_Bal
* Contacts_Count_12_mon
* Months_Inactive_12_mon

Results given by the KBest features selection algorithm are consistent with what we observed on the basis of the plots.