
# Bank data Analysis
  - Using KNN algorithm
  - Goal is to predict whether particular client is willing to subscribe for term deposit in bank
  - Lets see.

## import all the libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('whitegrid')

# Read Data from source

In [2]:
banks = pd.read_csv('bank-additional-full.csv',delimiter=";")

#### Start phase1 exploring data

In [3]:
banks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

## As per data catalog columns can be Categorised 
- 1. Column 1:6 as Client Data
- 2. Column 7:10 as Campaign data
- 3. Column 11:19 as Social and Ecomonic details
- 4. Column 20 i.e. y is Lable data which shows client subscription status

## Lets explore Client Data

In [4]:
#Extract client data from bank
banks_clients = banks.iloc[:, :7]
banks_clients

Unnamed: 0,age,job,marital,education,default,housing,loan
0,56,housemaid,married,basic.4y,no,no,no
1,57,services,married,high.school,unknown,no,no
2,37,services,married,high.school,no,yes,no
3,40,admin.,married,basic.6y,no,no,no
4,56,services,married,high.school,no,no,yes
...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no
41184,46,blue-collar,married,professional.course,no,no,no
41185,56,retired,married,university.degree,no,yes,no
41186,44,technician,married,professional.course,no,no,no


### since our machine learning model can only understand numeric values.. we need to convert our string types into numeric values

#### lets do it

In [6]:
#Lets see the individual column details
print('jobs: \n',banks['job'].unique())
print('marital:\n',banks['marital'].unique())
print('education: \n',banks['education'].unique())
print('default: \n',banks['default'].unique())
print('housing:\n',banks['housing'].unique())
print('loan:\n',banks['loan'].unique())

jobs: 
 ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']
marital:
 ['married' 'single' 'divorced' 'unknown']
education: 
 ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate']
default: 
 ['no' 'unknown' 'yes']
housing:
 ['no' 'yes' 'unknown']
loan:
 ['no' 'yes' 'unknown']


In [7]:
#Each column has limited categories.. so that we can go for label encoding of each column
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
banks_clients['job']=lb.fit_transform(banks_clients['job'])
banks_clients['marital']=lb.fit_transform(banks_clients['marital'])
banks_clients['education'] = lb.fit_transform(banks_clients['education'])
banks_clients['default']=lb.fit_transform(banks_clients['default'])
banks_clients['housing']=lb.fit_transform(banks_clients['housing'])
banks_clients['loan']=lb.fit_transform(banks_clients['loan'])
banks_clients

Unnamed: 0,age,job,marital,education,default,housing,loan
0,56,3,1,0,0,0,0
1,57,7,1,3,1,0,0
2,37,7,1,3,0,2,0
3,40,0,1,1,0,0,0
4,56,7,1,3,0,0,2
...,...,...,...,...,...,...,...
41183,73,5,1,5,0,2,0
41184,46,1,1,5,0,0,0
41185,56,5,1,6,0,2,0
41186,44,9,1,5,0,0,0


### now lets do the data explory and labeling string colums of Campaign Data

In [9]:
bank_campaign = banks.iloc[:,7:11]
bank_campaign

Unnamed: 0,contact,month,day_of_week,duration
0,telephone,may,mon,261
1,telephone,may,mon,149
2,telephone,may,mon,226
3,telephone,may,mon,151
4,telephone,may,mon,307
...,...,...,...,...
41183,cellular,nov,fri,334
41184,cellular,nov,fri,383
41185,cellular,nov,fri,189
41186,cellular,nov,fri,442


- we can convert month values to int.
- we can converty day_of_week to int
- lable contact column

In [10]:
look_up = {'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04', 'may': '05',
            'jun': '06', 'jul': '07', 'aug': '08', 'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'}
bank_campaign['month'] = bank_campaign['month'].map(look_up)

#change day of a month into number 
day_num = {'mon':'01','tue':'02','wed':'03','thu':'04','fri':'05'}
bank_campaign['day_of_week'] = bank_campaign['day_of_week'].map(day_num)

bank_campaign['contact'] = bank_campaign['contact'].map({'telephone':'01','cellular':'02'})

bank_campaign

Unnamed: 0,contact,month,day_of_week,duration
0,01,05,01,261
1,01,05,01,149
2,01,05,01,226
3,01,05,01,151
4,01,05,01,307
...,...,...,...,...
41183,02,11,05,334
41184,02,11,05,383
41185,02,11,05,189
41186,02,11,05,442


## Time to explore the last social and ecomonic data

In [12]:
bank_se = banks.iloc[:,11:-1]
bank_se

Unnamed: 0,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
1,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
2,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
3,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
4,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...
41183,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6
41184,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6
41185,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6
41186,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6


- only poutcome column is of string type. We need to convert into int
- first see whats inside poutcome column

In [13]:
print("poutcome: \n", bank_se['poutcome'].unique())

poutcome: 
 ['nonexistent' 'failure' 'success']


- As it has only 3 unique values we can convert them into categories

In [14]:
bank_se['poutcome'] = lb.fit_transform(bank_se['poutcome'])
bank_se

Unnamed: 0,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
2,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
3,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
4,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...
41183,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41184,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41185,2,999,0,1,-1.1,94.767,-50.8,1.028,4963.6
41186,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6


## Time to join the divided data into single source.
- Below we can see all out data is converted into numeric type so that our model can understand

In [16]:
banks = pd.concat([banks_clients, bank_campaign,bank_se,banks['y'].map({'yes':1,'no':0})], axis=1)
banks

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,0,0,0,01,05,01,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,01,05,01,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,01,05,01,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,01,05,01,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,01,05,01,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,5,1,5,0,2,0,02,11,05,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1
41184,46,1,1,5,0,0,0,02,11,05,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0
41185,56,5,1,6,0,2,0,02,11,05,...,2,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0
41186,44,9,1,5,0,0,0,02,11,05,...,1,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1


### scaling our model to normalize distance between points. Its very usefull for models which are based on distance like KNN.

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(banks.drop('y',axis=1))
scaled_feature = scaler.transform(banks.drop('y',axis=1))

- X is Feature which is scaled
- y is lable

In [19]:
X = pd.DataFrame(scaled_feature, columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'])
y = banks['y']

## Below we can see in column age of dataframe banks and X value is scaled

In [21]:
banks.head(1)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,0,0,0,1,5,1,...,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0


In [20]:
X.head(1)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.533034,-0.201579,-0.283741,-1.753925,-0.5136,-1.087707,-0.452491,-1.31827,-0.787808,-1.402469,0.010471,-0.565922,0.195414,-0.349494,0.192622,0.648092,0.722722,0.886447,0.71246,0.33168


#### Time to split our source data into Train and Test Category 

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

- test_size = 0.3 means our data will be spllited into 70% for training and 30% for testing
- Checking the split size

In [23]:
print("X_train count: \n",X_train['age'].count())

print("X_test count: \n", X_test['age'].count())

X_train count: 
 28831
X_test count: 
 12357


- simply call our algorithm and train it with our train data and predict the results with test data

In [24]:
from sklearn.neighbors import KNeighborsClassifier
Knn = KNeighborsClassifier(n_neighbors=4)

In [25]:
# fit our data
Knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

In [26]:
# Predict output with test features(x_test)
pred = Knn.predict(X_test)

- yeah.. time to check how much accuracy our model acheived.

In [28]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
print("Accuracy:", round(accuracy_score(y_test, pred),2)*100)

[[10726   242]
 [  992   397]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     10968
           1       0.62      0.29      0.39      1389

    accuracy                           0.90     12357
   macro avg       0.77      0.63      0.67     12357
weighted avg       0.88      0.90      0.88     12357

Accuracy: 90.0


### A rough guide for classifying the accuracy of a diagnostic test is the traditional academic point system:

- 90 - 100 = excellent (A)

- 80 - 90 = good (B)

- 70 - 80 = fair (C)

- 60 - 70 = poor (D)

- 50 - 60 = fail (F)

## ok, its my first Machine learning model.. will improve more data visualization and Exploring. Most importantly should start implementing Feature Selection.

- Bye.