### **Import Libraries**

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import math
from collections import Counter
import operator

### **Load The Dataset**

In [4]:
df = pd.read_csv('bank-additional-full.csv', delimiter=';')

In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

### **Data Preprocessing**

#### **Slicing The Dataframe**

In [7]:
bank_client = df.iloc[:, 0:7]
bank_client.head()

Unnamed: 0,age,job,marital,education,default,housing,loan
0,56,housemaid,married,basic.4y,no,no,no
1,57,services,married,high.school,unknown,no,no
2,37,services,married,high.school,no,yes,no
3,40,admin.,married,basic.6y,no,no,no
4,56,services,married,high.school,no,no,yes


In [8]:
bank_related = df.iloc[: , 7:11]
bank_related.head()

Unnamed: 0,contact,month,day_of_week,duration
0,telephone,may,mon,261
1,telephone,may,mon,149
2,telephone,may,mon,226
3,telephone,may,mon,151
4,telephone,may,mon,307


In [9]:
bank_se = df.loc[: , ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']]
bank_se.head()

Unnamed: 0,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.1,93.994,-36.4,4.857,5191.0
1,1.1,93.994,-36.4,4.857,5191.0
2,1.1,93.994,-36.4,4.857,5191.0
3,1.1,93.994,-36.4,4.857,5191.0
4,1.1,93.994,-36.4,4.857,5191.0


In [10]:
bank_o = df.loc[: , ['campaign', 'pdays','previous', 'poutcome']]
bank_o.head()

Unnamed: 0,campaign,pdays,previous,poutcome
0,1,999,0,nonexistent
1,1,999,0,nonexistent
2,1,999,0,nonexistent
3,1,999,0,nonexistent
4,1,999,0,nonexistent


#### **Reformatting 'bank_client' Dataframe Column Values**

In [11]:
print(bank_client['job'].unique())
print(bank_client['education'].unique())
print(bank_client['marital'].unique())
print(bank_client['default'].unique())
print(bank_client['housing'].unique())
print(bank_client['loan'].unique())

['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']
['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate']
['married' 'single' 'divorced' 'unknown']
['no' 'unknown' 'yes']
['no' 'yes' 'unknown']
['no' 'yes' 'unknown']


In [12]:
print('1º Quartile: ', bank_client['age'].quantile(q=0.25))
print('2º Quartile: ', bank_client['age'].quantile(q=0.50))
print('3º Quartile: ', bank_client['age'].quantile(q=0.75))
print('4º Quartile: ', bank_client['age'].quantile(q=1.00))

print('Ages above ', bank_client['age'].quantile(q=0.75) 
                      + 1.5*(bank_client['age'].quantile(q=0.75)
                      - bank_client['age'].quantile(q=0.25)), 
      'are outliers')

1º Quartile:  32.0
2º Quartile:  38.0
3º Quartile:  47.0
4º Quartile:  98.0
Ages above  69.5 are outliers


In [13]:
bank_client['job'].replace(['housemaid' , 'services' , 'admin.' , 'blue-collar' , 'technician', 'retired' , 'management', 
                            'unemployed', 'self-employed', 'unknown' , 'entrepreneur', 'student'] , 
                           [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], inplace=True)
bank_client['education'].replace(['basic.4y' , 'high.school', 'basic.6y', 'basic.9y', 'professional.course', 'unknown' , 
                                  'university.degree' , 'illiterate'], [1, 2, 3, 4, 5, 6, 7, 8], inplace=True)
bank_client['marital'].replace(['married', 'single', 'divorced', 'unknown'], [1, 2, 3, 4], inplace=True)
bank_client['default'].replace(['yes', 'no', 'unknown'],[1, 2, 3], inplace=True)
bank_client['housing'].replace(['yes', 'no', 'unknown'],[1, 2, 3], inplace=True)
bank_client['loan'].replace(['yes', 'no', 'unknown'],[1, 2, 3], inplace=True)

def age(data):
    data.loc[data['age'] <= 32, 'age'] = 1
    data.loc[(data['age'] > 32) & (data['age'] <= 47), 'age'] = 2
    data.loc[(data['age'] > 47) & (data['age'] <= 70), 'age'] = 3
    data.loc[(data['age'] > 70) & (data['age'] <= 98), 'age'] = 4 
    return data

age(bank_client);

#### **Reformatting 'bank_related' Dataframe Column Values**

In [14]:
print(bank_related['contact'].unique())
print(bank_related['month'].unique())
print(bank_related['day_of_week'].unique())

['telephone' 'cellular']
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep']
['mon' 'tue' 'wed' 'thu' 'fri']


In [15]:
print('1º Quartile: ', bank_related['duration'].quantile(q=0.25))
print('2º Quartile: ', bank_related['duration'].quantile(q=0.50))
print('3º Quartile: ', bank_related['duration'].quantile(q=0.75))
print('4º Quartile: ', bank_related['duration'].quantile(q=1.00))
    
print('Duration calls above: ', bank_related['duration'].quantile(q=0.75)
                                + 1.5*(bank_related['duration'].quantile(q=0.75) 
                                - bank_related['duration'].quantile(q=0.25)), 
      'are outliers')

1º Quartile:  102.0
2º Quartile:  180.0
3º Quartile:  319.0
4º Quartile:  4918.0
Duration calls above:  644.5 are outliers


In [16]:
bank_related['contact'].replace(['telephone', 'cellular'], [1, 2], inplace=True)
bank_related['month'].replace(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 
                               'mar', 'apr', 'sep'], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], inplace=True)
bank_related['day_of_week'].replace(['mon', 'tue', 'wed', 'thu', 'fri'], [1, 2, 3, 4, 5], inplace=True)

def duration(data):
    data.loc[data['duration'] <= 102, 'duration'] = 1
    data.loc[(data['duration'] > 102) & (data['duration'] <= 180)  , 'duration'] = 2
    data.loc[(data['duration'] > 180) & (data['duration'] <= 319)  , 'duration'] = 3
    data.loc[(data['duration'] > 319) & (data['duration'] <= 644.5), 'duration'] = 4
    data.loc[data['duration']  > 644.5, 'duration'] = 5
    return data

duration(bank_related);

#### **Reformatting 'bank_o' Dataframe Column Values**

In [17]:
print(bank_o['poutcome'].unique())

['nonexistent' 'failure' 'success']


In [18]:
bank_o['poutcome'].replace(['nonexistent', 'failure', 'success'], [1, 2, 3], inplace=True)

#### **Concatenating The Dataframes**

In [19]:
bank_final= pd.concat([bank_client, bank_related, bank_se, bank_o, df['y']], axis=1)
bank_final = bank_final[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 
                         'contact', 'month', 'day_of_week', 'duration', 'emp.var.rate', 'cons.price.idx', 
                         'cons.conf.idx', 'euribor3m', 'nr.employed', 'campaign', 'pdays', 'previous', 
                         'poutcome', 'y']]
                         
bank_final.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,pdays,previous,poutcome,y
0,3,1,1,1,2,2,2,1,1,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,no
1,3,2,1,2,3,2,2,1,1,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,no
2,2,2,1,2,2,1,2,1,1,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,no
3,2,3,1,3,2,2,2,1,1,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,no
4,3,2,1,2,2,2,1,1,1,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,no


#### **Reformatting The Label**

In [20]:
bank_final['y'].replace(['no', 'yes'], [0, 1], inplace=True)

#### **Shuffling The Dataset**

In [21]:
bank_final_shuffled = bank_final.sample(frac=1)

In [22]:
bank_final_shuffled.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,pdays,previous,poutcome,y
20980,2,3,3,7,2,2,2,2,4,4,...,1.4,93.444,-36.1,4.964,5228.1,1,999,0,1,0
25974,2,3,3,7,2,1,2,2,6,3,...,-0.1,93.2,-42.0,4.12,5195.8,1,999,0,1,0
21356,1,5,2,5,2,1,2,2,4,1,...,1.4,93.444,-36.1,4.963,5228.1,2,999,0,1,0
13172,1,2,2,5,2,1,2,2,3,3,...,1.4,93.918,-42.7,4.962,5228.1,1,999,0,1,0
35348,2,3,2,2,2,2,1,2,1,5,...,-1.8,92.893,-46.2,1.25,5099.1,3,999,1,2,0


#### **Splitting The Dataset**

Train Set (70%), Validation Set (20%), Test Set (10%)

In [23]:
length_train_low = int(bank_final_shuffled.shape[0]*0)
length_train_up = int(bank_final_shuffled.shape[0]*0.7)
length_val_low = int(bank_final_shuffled.shape[0]*0.7)
length_val_up = int(bank_final_shuffled.shape[0]*0.9)
length_test_low = int(bank_final_shuffled.shape[0]*0.9)
length_test_up = int(bank_final_shuffled.shape[0]*1)

train_bank_final = bank_final_shuffled.iloc[length_train_low:length_train_up, :]
val_bank_final = bank_final_shuffled.iloc[length_val_low:length_val_up, :]
test_bank_final = bank_final_shuffled.iloc[length_test_low:length_test_up, :].reset_index().drop(["index"], axis=1)

#### **Reformatting The Column Names**

In [24]:
train_bank_final.rename(columns={'age':'', 'job':'', 'marital':'', 'education':'', 'default':'', 'housing':'',
                                 'loan':'', 'contact':'', 'month':'', 'day_of_week':'', 'duration':'', 'emp.var.rate':'',
                                 'cons.price.idx':'', 'cons.conf.idx':'', 'euribor3m':'', 'nr.employed':'',
                                 'campaign':'', 'pdays':'', 'previous':'', 'poutcome':'', 'y':''})

val_bank_final.rename(columns={'age':'', 'job':'', 'marital':'', 'education':'', 'default':'', 'housing':'',
                                    'loan':'', 'contact':'', 'month':'', 'day_of_week':'', 'duration':'', 
                                    'emp.var.rate':'', 'cons.price.idx':'', 'cons.conf.idx':'', 'euribor3m':'',
                                    'nr.employed':'', 'campaign':'', 'pdays':'', 'previous':'', 'poutcome':'', 'y':''})

test_bank_final.rename(columns={'age':'', 'job':'', 'marital':'', 'education':'', 'default':'', 'housing':'', 'loan':'',
                                'contact':'', 'month':'', 'day_of_week':'' ,'duration':'' ,'emp.var.rate':''
                                ,'cons.price.idx':'', 'cons.conf.idx':'', 'euribor3m':'', 'nr.employed':'', 
                                'campaign':'', 'pdays':'', 'previous':'', 'poutcome':'', 'y':''})

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,3,2,1,1,2,2,2,1,2,4,...,1.4,94.465,-41.8,4.958,5228.1,1,999,0,1,0
1,3,6,1,4,3,2,2,2,3,4,...,1.4,93.918,-42.7,4.968,5228.1,4,999,0,1,0
2,2,4,1,4,3,1,2,2,1,5,...,-1.8,92.893,-46.2,1.313,5099.1,3,999,1,2,0
3,1,2,1,2,2,1,2,2,1,3,...,-1.8,92.893,-46.2,1.281,5099.1,3,999,0,1,0
4,1,4,2,4,2,2,2,2,1,3,...,-1.8,92.893,-46.2,1.281,5099.1,1,999,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4114,2,3,2,2,2,3,3,1,1,4,...,1.1,93.994,-36.4,4.860,5191.0,1,999,0,1,0
4115,2,2,1,2,2,1,2,2,9,5,...,-1.8,93.075,-47.1,1.405,5099.1,1,999,1,2,0
4116,3,2,1,4,3,1,2,2,3,4,...,1.4,93.918,-42.7,4.962,5228.1,2,999,0,1,0
4117,2,3,1,6,3,2,2,1,1,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0


#### **Reformatting The Splitted Sets**

In [25]:
train_bank_final = train_bank_final.astype(float).values.tolist()
val_bank_final = val_bank_final.astype(float).values.tolist()
test_bank_final = test_bank_final.astype(float).values.tolist()

In [26]:
train_set = {0:[], 1:[]}
test_set = {0:[], 1:[]}
val_set = {0:[], 1:[]}

for i in train_bank_final:
  train_set[i[-1]].append(i[:-1])
  
for i in val_bank_final:
  val_set[i[-1]].append(i[:-1])

for i in test_bank_final:
  test_set[i[-1]].append(i[:-1])

### **Model Development**

#### **Creating The KNN Algorithm from Scratch**

In [27]:
def knn_algo(set, predict, k):
  distances = []
  
  for data in set:
    for records in set[data]:
      euclidean_distance = math.sqrt((records[0] - predict[0])**2 + (records[1] - predict[1])**2)
      distances.append([euclidean_distance, data])
  votes = [i[1] for i in sorted(distances)[:k]]
  vote_result = Counter(votes).most_common(1)[0][0]

  return vote_result

#### **Training The Created KNN Algorithm**

In [28]:
k_range = range(1, 5)
correct = 0
total = 0
max_accuracy = 0.0
optimal_k = 0
accuracy = 0.0
accuracies = []

for k in k_range:
  for data in train_set:
    for records in train_set[data]:
      vote = knn_algo(train_set, records, k)
      if data == vote:
        correct+=1
      total+=1
      accuracy = correct / total

  print("Accuracy with K =", k)
  print(accuracy)
  if max_accuracy < accuracy:
    max_accuracy = accuracy
    optimal_k = k

  accuracies.append(accuracy)

Accuracy with K = 1
0.8876556484339773
Accuracy with K = 2
0.8876556484339773
Accuracy with K = 3
0.8876787716925069
Accuracy with K = 4
0.8876903333217717


In [29]:
print('Maximum Accuracy: ', max_accuracy)
print('Optimal K Value: ', optimal_k)

Maximum Accuracy:  0.8876903333217717
Optimal K Value:  4


#### **Testing The Test Set by Using Created Algorithm and Optimal K Value**

In [30]:
for data in test_set:
    for records in test_set[data]:
        vote = knn_algo(test_set, records, optimal_k)
        if data == vote:
            correct+=1
        total+=1
        accuracy = correct/total

print('Accuracy: ', accuracy)

Accuracy:  0.8876367807238599
