## Credit Card customers

* CLIENTNUM          :  Client number. Unique identifier for the customer holding the account
* Attrition_Flag     :  Internal event (customer activity) variable - if the account is closed then 1 else 0
* Customer_Age       :  Demographic variable - Customer's Age in Years
* Gender             :  Demographic variable - M=Male, F=Female
* Dependent_count    :  Demographic variable - Number of dependents
* Education_Level    :  Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.)
* Marital_Status     :  Demographic variable - Married, Single, Divorced, Unknown
* Income_Category    :  Demographic variable - Annual Income Category of the account holder (< $40K, $40K - $60K, $60K - $80K, $80K - $120K, > $120K, Unknown)
* Card_Category      :  Product Variable - Type of Card (Blue, Silver, Gold, Platinum)
* Months_on_book     :  Period of relationship with bank
* Total_Relationship_Count :  Total no. of products held by the customer
* Months_Inactive_12_mon   :  No. of months inactive in the last 12 months
* Contacts_Count_12_mon    :  No. of Contacts in the last 12 months
* Credit_Limit             :  Credit Limit on the Credit Card
* Total_Revolving_Bal      :  Total Revolving Balance on the Credit Card
* Avg_Open_To_Buy          :  Open to Buy Credit Line (Average of last 12 months)
* Total_Amt_Chng_Q4_Q1     :  Change in Transaction Amount (Q4 over Q1)
* Total_Trans_Amt          :  Total Transaction Amount (Last 12 months)
* Total_Trans_Ct           :  Total Transaction Count (Last 12 months)
* Total_Ct_Chng_Q4_Q1      :  Change in Transaction Count (Q4 over Q1)
* Avg_Utilization_Ratio    :  Average Card Utilization Ratio
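* Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 : Not a customer attribute - judging by its name, an output of a Naive Bayes classifier for Attrition_Flag that ships with the dataset
* Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 : Not a customer attribute - judging by its name, the second output of the same Naive Bayes classifier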

![](https://www.madd.org/wp-content/uploads/2017/11/Card-Handoff-2.jpg)



In [None]:
!pip install dataprep

In [None]:
# Import only the name this notebook uses instead of a star import, so it is
# clear where create_report comes from and no other names leak into the namespace.
from dataprep.eda import create_report

# 1- Libraries and packages

In [None]:
# manipulation data
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns 
import matplotlib
import plotly.graph_objects as go
import plotly.express as px

# Default theme: seaborn dark grid with the colour-blind-safe palette.
sns.set(
    context='notebook',
    style='darkgrid',
    palette='colorblind',
    font='sans-serif',
    font_scale=1,
    rc=None,
)
# Matplotlib defaults: square 8x8 figures, 15pt sans-serif text.
matplotlib.rcParams.update({
    'figure.figsize': [8, 8],
    'font.size': 15,
    'font.family': 'sans-serif',
})

# 2- Load and analyse the data

In [None]:
# Read the BankChurners CSV from the notebook's input directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer='../input/credit-card-customers/BankChurners.csv')
df.head(5)

In [None]:
# CLIENTNUM is a unique identifier for the customer holding the account, so it
# is useless as a feature and is dropped.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns='CLIENTNUM', errors='ignore')
df.head()

In [None]:
# (number of rows, number of columns) of the current frame
df.shape

So, as we can see, we have:
* 10127 Rows
* 22 columns

In [None]:
# Per-column overview of the frame (pandas prints dtypes and non-null counts)
df.info()

In [None]:
# data type plot
# Build one explode offset per dtype actually present, so the pie keeps working
# when the number of distinct dtypes is not exactly three.
dtype_counts = df.dtypes.value_counts()
dtype_counts.plot.pie(explode=[0.1] * len(dtype_counts), autopct='%1.1f%%', shadow=True)
plt.title('type of our data');

In [None]:
# Summary statistics for every column; include='all' covers the non-numeric ones too
df.describe(include='all')

#### some notes :
By applying `describe()` we can see that the most frequent value in each categorical column is:
1. Attrition_Flag : Existing Customer 
2. Gender : Female
3. Education_Level : Graduate 
4. Marital_Status : Married
5. Income_Category : Less than $40K
6. Card_Category : Blue

# 3- finding missing values

In [None]:
# Count the nulls in each column, then express that count as a percentage of
# the number of rows.
missing_values = df.isnull().sum()
percent_missing = missing_values / len(df) * 100

# Two-column summary table: absolute count and percentage per column.
value = {'missing_values ': missing_values, 'percent_missing %': percent_missing}
frame = pd.DataFrame(value)
frame

Our data is clean - there are no missing values. Happy news :)

# 4- data visualization

In [None]:
# Generate an exploratory report for the whole frame.
# NOTE(review): create_report presumably comes from the `dataprep.eda` star import above - confirm.
create_report(df)

# 5- features transformation

In [None]:
# Integer code for every label of every categorical column. The codes are
# arbitrary identifiers (not an ordinal ranking) and are the same values this
# notebook has always assigned.
# Note: 'Existing Customer' is encoded as 1, so the positive class downstream
# is a retained customer, not a churned one.
category_codes = {
    'Attrition_Flag': {'Existing Customer': 1, 'Attrited Customer': 0},
    'Gender': {'F': 0, 'M': 1},
    'Education_Level': {'Graduate': 0, 'High School': 1, 'Unknown': 2, 'Uneducated': 3,
                        'College': 4, 'Post-Graduate': 5, 'Doctorate': 6},
    'Marital_Status': {'Married': 0, 'Single': 1, 'Unknown': 2, 'Divorced': 3},
    'Income_Category': {'Less than $40K': 0, '$80K - $120K': 1, '$60K - $80K': 2,
                        'Unknown': 3, '$40K - $60K': 4, '$120K +': 5},
    'Card_Category': {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3},
}

# Assign each encoded column back to the frame. The previous
# `df.<col>.replace(..., inplace=True)` form is a chained in-place operation:
# with pandas Copy-on-Write it acts on a temporary object and leaves df unchanged.
for column, codes in category_codes.items():
    if pd.api.types.is_numeric_dtype(df[column]):
        continue  # already encoded - keeps the cell safe to re-run
    encoded = df[column].map(codes)
    if encoded.isna().any():
        # Fail loudly rather than letting an unmapped label reach the models.
        unexpected = df.loc[encoded.isna(), column].unique().tolist()
        raise ValueError(f'{column}: no integer code defined for {unexpected}')
    df[column] = encoded.astype('int64')

df.head()

In [None]:
# dtype of each column at this point in the notebook
df.dtypes

In [None]:
# data type plot
# Build one explode offset per dtype actually present, so the pie keeps working
# when the number of distinct dtypes is not exactly two.
dtype_counts = df.dtypes.value_counts()
dtype_counts.plot.pie(explode=[0.1] * len(dtype_counts), autopct='%1.1f%%', shadow=True)
plt.title('type of our data');

# 6- Feature selection

## A- corr map

In [None]:
# Pairwise correlation matrix, colour-coded and displayed with two decimals.
# Styler.set_precision was removed in pandas 2.0; Styler.format(precision=...)
# is its replacement.
df.corr().style.background_gradient(cmap='coolwarm').format(precision=2)

## B- The 10 most important features

In [None]:
# Feature Selection
# Rank the candidate features by the importance an extra-trees ensemble assigns
# to them and plot the ten highest-ranked ones.

plt.rcParams['figure.figsize']=15,6 
sns.set_style("darkgrid")

# The Naive_Bayes_Classifier_Attrition_Flag_* columns are, judging by their
# names, outputs of a classifier for the target itself; ranking them alongside
# real customer attributes would leak the label, so they are left out.
leaky_columns = [col for col in df.columns if col.startswith('Naive_Bayes_Classifier')]
x = df.drop(columns=['Attrition_Flag', *leaky_columns])
y = df.Attrition_Flag

from sklearn.ensemble import ExtraTreesClassifier

# Fixed seed: the ensemble is randomized, so without it the ranking changes
# from run to run.
model = ExtraTreesClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_) 
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title('the most 10 important feature are')
plt.show()

In [None]:
# Column labels currently in the frame
df.columns

In [None]:
# Reduce the frame to the target plus the features kept for modelling.
# - dropped_features: the columns this notebook discards (the duplicated
#   'Customer_Age' entry of the earlier list is removed).
# - leaky_columns: the Naive_Bayes_Classifier_* columns, which by their names are
#   predictions of Attrition_Flag; training on them would leak the target.
# errors='ignore' keeps the cell re-runnable after the columns are gone.
dropped_features = [
    'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
    'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book',
    'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon',
    'Credit_Limit', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1',
]
leaky_columns = [col for col in df.columns if col.startswith('Naive_Bayes_Classifier')]
df = df.drop(columns=dropped_features + leaky_columns, errors='ignore')

In [None]:
# Preview the reduced frame; the first rows are enough to check which columns remain.
df.head()

# 7- data split

In [None]:
# Target vector first, then the feature matrix (everything except the target).
y = df.Attrition_Flag
X = df.drop("Attrition_Flag", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and hold out 33% of them for evaluation; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=44,
    test_size=0.33,
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 depth-3 estimators and a fixed seed, fitted on the
# training split (fit returns the estimator itself).
GBCModel = GradientBoostingClassifier(
    random_state=33,
    max_depth=3,
    n_estimators=100,
).fit(X_train, y_train)

# Mean accuracy on the training rows and on the held-out rows.
print('GBCModel Train Score is : ' , GBCModel.score(X_train, y_train))
print('GBCModel Test Score is : ' , GBCModel.score(X_test, y_test))

# Class probabilities and hard labels for the held-out rows.
y_pred_prob = GBCModel.predict_proba(X_test)
y_pred = GBCModel.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

#Calculating Confusion Matrix (rows: true class, columns: predicted class)
CM = confusion_matrix(y_test, y_pred)
print('Confusion Matrix is : \n', CM)

# drawing confusion matrix
# heatmap's `center` expects the numeric value at which to centre the colormap,
# so the former center=True was interpreted as 1. Write the counts into the
# cells instead and label the axes so the figure can be read on its own.
sns.heatmap(CM, annot=True, fmt='d', cmap='GnBu')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

## A- LOGISTIC REGRESSION

In [None]:
# Fit a logistic regression, then report its confusion matrix and accuracy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

model = LogisticRegression()

#Fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Running list of the accuracies of the models evaluated from here on.
mylist = []
# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
# accuracy score (a fraction between 0 and 1)
acc_logreg = accuracy_score(y_test, y_pred)

mylist.append(acc_logreg)
print(cm)
# Format the fraction as a percentage; printing the raw fraction followed by
# '%' displayed e.g. "0.89 %".
print(f'{acc_logreg:.2%}')
# drawing confusion matrix
# heatmap's `center` expects a numeric colormap midpoint (center=True meant 1);
# annotate the cells with their counts instead.
sns.heatmap(cm, annot=True, fmt='d', cmap='GnBu')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

## B- KNN

In [None]:
# Finding the optimum number of neighbors

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit one KNN model per candidate k (1 to 4), record its accuracy on the
# held-out rows, and plot accuracy against k.
neighbor_counts = list(range(1, 5))
list1 = []
for neighbors in neighbor_counts:
    classifier = KNeighborsClassifier(metric='minkowski', n_neighbors=neighbors)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    list1 += [accuracy_score(y_test, y_pred)]
plt.plot(neighbor_counts, list1)
plt.show()

In [None]:
# Fit the K Nearest Neighbor classifier with k = 3 on the training split
# (fit returns the estimator itself).
classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict the labels of the held-out rows and show them.
y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Score the KNN predictions: accuracy plus confusion matrix.

from sklearn.metrics import confusion_matrix, accuracy_score
acc_knn = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
# Record the accuracy in the running list of model scores.
mylist += [acc_knn]
print(cm)
print(acc_knn)

In [None]:
# drawing confusion matrix
# heatmap's `center` expects the numeric value at which to centre the colormap,
# so the former center=True was interpreted as 1. Write the counts into the
# cells instead and label the axes.
sns.heatmap(cm, annot=True, fmt='d', cmap='GnBu')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

## C- SUPPORT VECTOR MACHINE

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit one RBF-kernel SVC per candidate C, record its accuracy on the held-out
# rows, and plot accuracy against C.
c_values = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
list1 = []
for c in c_values:
    classifier = SVC(kernel='rbf', C=c, random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    list1 += [accuracy_score(y_test, y_pred)]
plt.plot(c_values, list1)
plt.show()

In [None]:
# Fit the final Support Vector Classifier (RBF kernel, C = 1.0) on the training split

from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0, C=1.0)
classifier.fit(X_train, y_train)

In [None]:
# Predicting the test set results with the classifier fitted in the previous
# cell, and printing the predicted labels

y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Making the confusion matrix and calculating accuracy score

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
acc_svc = accuracy_score(y_test, y_pred)
print(cm)
# accuracy_score is a fraction between 0 and 1: format it as a percentage
# instead of printing the raw fraction followed by '%'.
print(f'{acc_svc:.2%}')
# Record the accuracy in the running list of model scores.
mylist.append(acc_svc)

## D- DecisionTreeClassifier

In [None]:

# Finding the optimum number of max_leaf_nodes

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit one entropy-criterion tree per leaf budget (2 to 14), record its accuracy
# on the held-out rows, and plot accuracy against the budget.
leaf_counts = list(range(2, 15))
list1 = []
for leaves in leaf_counts:
    classifier = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=leaves, random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    list1 += [accuracy_score(y_test, y_pred)]
plt.plot(leaf_counts, list1)
plt.show()

In [None]:
# Fit the final decision tree (entropy criterion, at most 5 leaves) on the training split

classifier = DecisionTreeClassifier(criterion='entropy', random_state=0, max_leaf_nodes=5)
classifier.fit(X_train, y_train)

In [None]:
# Predicting the test set results with the classifier fitted in the previous
# cell, and printing the predicted labels

y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Score the decision-tree predictions: accuracy plus confusion matrix.

from sklearn.metrics import confusion_matrix, accuracy_score
acc_decisiontree = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(acc_decisiontree)
# Record the accuracy in the running list of model scores.
mylist += [acc_decisiontree]

## E- ANN (neural network )

In [None]:

import tensorflow as tf

# Seed NumPy and TensorFlow. np.random.seed alone does not seed TensorFlow's
# own random generator, so without tf.random.set_seed the initial weights (and
# therefore the trained network) differ between runs.
np.random.seed(0)
tf.random.set_seed(0)

# Initialising the ANN

ann = tf.keras.models.Sequential()

# Four identical hidden layers: 7 ReLU units each. The first one added also
# acts as the input layer.

for _ in range(4):
    ann.add(tf.keras.layers.Dense(units = 7, activation = 'relu'))

# Adding the output layer: one sigmoid unit, i.e. a probability for class 1

ann.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

# Compiling the ANN

ann.compile(optimizer = 'adam', loss = 'binary_crossentropy' , metrics = ['accuracy'] )

# Training the ANN on the training set

ann.fit(X_train, y_train, batch_size = 16, epochs = 100)

In [None]:
# Predicting the test set results
# The network's output layer is a sigmoid, so ann.predict yields values in (0, 1);
# a row is labelled True (class 1) only when its output exceeds 0.9.
# NOTE(review): 0.9 is much stricter than the conventional 0.5 cut-off - confirm it is intentional.

y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.9)
# NOTE(review): called with no arguments this appears to change nothing - confirm it can be removed.
np.set_printoptions()

In [None]:
# Score the thresholded ANN predictions: confusion matrix and accuracy.

from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test,y_pred)
ac_ann = accuracy_score(y_test,y_pred)

# Heading, matrix, then one blank line.
print("Confusion Matrix", cm, "", sep="\n")

# Heading followed by the accuracy value.
print("Accuracy", ac_ann, sep="\n")

# Record the accuracy in the running list of model scores.
mylist += [ac_ann]

## F- xgboost

In [None]:

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit one XGBoost model per ensemble size (10 to 29 trees), record its accuracy
# on the held-out rows, and plot accuracy against the number of trees.
estimator_counts = list(range(10, 30))
list1 = []
for estimators in estimator_counts:
    classifier = XGBClassifier(subsample=0.7, max_depth=12, n_estimators=estimators)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    list1 += [accuracy_score(y_test, y_pred)]
plt.plot(estimator_counts, list1)
plt.show()

In [None]:
# Fit the final XGBoost model: 15 trees, depth up to 12, 70% row subsampling.
from xgboost import XGBClassifier
classifier = XGBClassifier(subsample=0.7, max_depth=12, n_estimators=15)
classifier.fit(X_train,y_train)

In [None]:
# Predict the held-out labels with the classifier fitted in the previous cell
# and print them
y_pred = classifier.predict(X_test)
print(y_pred)

In [None]:
# Score the XGBoost predictions: accuracy plus confusion matrix.

from sklearn.metrics import confusion_matrix, accuracy_score
ac_xgboost = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
# Record the accuracy in the running list of model scores.
mylist += [ac_xgboost]
print(cm)
print(ac_xgboost)

In [None]:
# One (model name, test accuracy) pair per classifier evaluated above.
model_scores = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_logreg),
    ('ANN', ac_ann),
    ('Decision Tree', acc_decisiontree),
    ('xgboost', ac_xgboost),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])
# Display the table ordered from best to worst; `models` itself keeps its original order.
models.sort_values(by='Score', ascending=False)

In [None]:
# Bar chart of the accuracy of every model in `models`, with each bar labelled
# by its value formatted as a percentage.
plt.rcParams['figure.figsize']=15,6 
sns.set_style("darkgrid")
ax = sns.barplot(x=models.Model, y=models.Score, palette = "rocket", saturation =1.5)
plt.xlabel("Classifier Models", fontsize = 20 )
plt.ylabel("% of Accuracy", fontsize = 20)
plt.title("Accuracy of different Classifier Models", fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)
for p in ax.patches:
    # These names deliberately differ from the notebook-level `x` / `y`
    # (feature frame and target), which the previous `x, y = p.get_xy()`
    # overwrote with bar coordinates.
    bar_left, bar_bottom = p.get_xy()
    bar_width, bar_height = p.get_width(), p.get_height()
    # Place the label just above the top of the bar, centred horizontally.
    ax.annotate(f'{bar_height:.2%}', (bar_left + bar_width/2, bar_bottom + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()