# Input Variables

## Bank client data:
1 - Age (numeric)
2 - Job (Categorical) - Type of job (Admin, Unknown, Unemployed, Management, Housemaid, Entrepreneur, Student, Blue-collar, Self-employed, Retired, Technician, Services)
3 - Marital - Marital Status (Categorical: Married, Divorced, Single. Note: Widowed falls under the divorced category)
4 - Education - Categorical (Unknown, Primary, Secondary, Tertiary)
5 - Default - Binary(Has credit in default? Yes/No)
6 - Balance - Numeric (Average yearly balance, in euros)
7 - Housing - Binary (Have housing loan? Yes/No)
8 - Loan - Binary(Have a personal loan? Yes/No)

## Related with the last contact of current campaign:
9 - Contact - Categorical (Contact communication type: Unknown, telephone, cellular)
10 - Day - Numeric (Last contact day of the month)
11 - Month - Categorical (Last contact month of the year: "Jan", "Feb", "Mar", ... , "Nov", "Dec")
12 - Duration - Numeric (Last contact duration, in seconds)

## Other attributes:
13 - Campaign - Numeric (Number of contacts performed during this campaign and for this client, also includes last contact)
14 - pdays - Numeric (Number of days that have passed after the client was last contacted from a previous campaign; -1 means the client was not previously contacted.)
15 - Previous - Numeric (Number of contacts performed before this campaign and for this client.)
16 - poutcome - Categorical (Outcome of the previous marketing campaign: Unknown, Other, Failure, Success)

<b> Output variable </b> - Desired target
17 - y - Binary (Yes/No)

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import statsmodels
from sklearn.preprocessing import (LabelEncoder, OrdinalEncoder, StandardScaler)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings("ignore")

# <i>Importing the dataset with pandas</i>

In [2]:
# Load the raw, semicolon-delimited bank marketing file.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as literal text, so the
# column names and string values still carry '"' characters after loading.
# NOTE(review): the path is absolute and machine-specific; point it at a
# project-relative data directory before sharing the notebook.
raw_csv_path = 'C:\\Users\\rxbro\\Documents\\Capstone 2\\bank-full.csv'
bank = pd.read_csv(raw_csv_path, sep=';', quoting=3)
bank.head()

Unnamed: 0,"""age""","""job""","""marital""","""education""","""default""","""balance""","""housing""","""loan""","""contact""","""day""","""month""","""duration""","""campaign""","""pdays""","""previous""","""poutcome""","""y"""
0,58,"""management""","""married""","""tertiary""","""no""",2143,"""yes""","""no""","""unknown""",5,"""may""",261,1,-1,0,"""unknown""","""no"""
1,44,"""technician""","""single""","""secondary""","""no""",29,"""yes""","""no""","""unknown""",5,"""may""",151,1,-1,0,"""unknown""","""no"""
2,33,"""entrepreneur""","""married""","""secondary""","""no""",2,"""yes""","""yes""","""unknown""",5,"""may""",76,1,-1,0,"""unknown""","""no"""
3,47,"""blue-collar""","""married""","""unknown""","""no""",1506,"""yes""","""no""","""unknown""",5,"""may""",92,1,-1,0,"""unknown""","""no"""
4,33,"""unknown""","""single""","""unknown""","""no""",1,"""no""","""no""","""unknown""",5,"""may""",198,1,-1,0,"""unknown""","""no"""


In [3]:
# The column names still carry literal double quotes from the raw file; remove them.
bank.columns = [name.replace('"', '') for name in bank.columns]
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,"""management""","""married""","""tertiary""","""no""",2143,"""yes""","""no""","""unknown""",5,"""may""",261,1,-1,0,"""unknown""","""no"""
1,44,"""technician""","""single""","""secondary""","""no""",29,"""yes""","""no""","""unknown""",5,"""may""",151,1,-1,0,"""unknown""","""no"""
2,33,"""entrepreneur""","""married""","""secondary""","""no""",2,"""yes""","""yes""","""unknown""",5,"""may""",76,1,-1,0,"""unknown""","""no"""
3,47,"""blue-collar""","""married""","""unknown""","""no""",1506,"""yes""","""no""","""unknown""",5,"""may""",92,1,-1,0,"""unknown""","""no"""
4,33,"""unknown""","""single""","""unknown""","""no""",1,"""no""","""no""","""unknown""",5,"""may""",198,1,-1,0,"""unknown""","""no"""


In [4]:
# The string values still contain literal quotes (they are removed in a later cell).
# First inspect the column dtypes and non-null counts to see which columns hold text.
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
# Strip the literal double quotes that are still embedded in the text values.
# Only non-numeric columns expose the `.str` accessor, so every numeric dtype is
# skipped -- not just int64, which would break as soon as a float or int32 column appeared.
for column in bank.columns:
    if not pd.api.types.is_numeric_dtype(bank[column]):
        # regex=False: the quote is a plain character, not a pattern.
        bank[column] = bank[column].str.replace('"', '', regex=False)

In [6]:
# Confirm the quotes are gone from both the headers and the values.
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Data Preprocessing
<b> Convert the Age feature into age groups. </b>

In [7]:
def _age_bucket(age):
    """Map an age in years to its age-group label (upper bounds are exclusive)."""
    brackets = ((25, '17-24'), (35, '25-34'), (45, '35-44'), (55, '45-54'), (65, '55-64'))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '65+'  # 65 year olds will be in the last category.


bank['age_group'] = bank['age'].apply(_age_bucket)

<b> Convert the pdays feature into groups. </b>

In [8]:
# List the distinct pdays values; -1 marks clients that were not previously contacted.
bank['pdays'].unique()

array([ -1, 151, 166,  91,  86, 143, 147,  89, 140, 176, 101, 174, 170,
       167, 195, 165, 129, 188, 196, 172, 118, 119, 104, 171, 117, 164,
       132, 131, 123, 159, 186, 111, 115, 116, 173, 178, 110, 152,  96,
       103, 150, 175, 193, 181, 185, 154, 145, 138, 126, 180, 109, 158,
       168,  97, 182, 127, 130, 194, 125, 105, 102,  26, 179,  28, 183,
       155, 112, 120, 137, 124, 187, 190, 113, 162, 134, 169, 189,   8,
       144, 191, 184, 177,   5,  99, 133,  93,  92,  10, 100, 156, 198,
       106, 153, 146, 128,   7, 121, 160, 107,  90,  27, 197, 136, 139,
       122, 157, 149, 135,  30, 114,  98, 192, 163,  34,  95, 141,  31,
       199,  94, 108,  29, 268, 247, 253, 226, 244, 239, 245, 204, 231,
       238, 258, 230, 254, 265,  71, 223, 246, 250, 266, 240, 205, 261,
       259, 241, 260, 234, 251, 225, 161, 237, 262, 248, 255, 220, 227,
       206, 224, 249, 235, 228, 263,   2, 270, 232, 252, 207, 200, 269,
       233, 256, 273, 272, 242, 264, 208, 214, 222, 271, 203, 22

In [9]:
# Distinct pdays values of clients who were contacted before (the -1 marker is excluded),
# in order of first appearance. Derived from the data instead of a pasted copy of a
# cell output, so the list cannot drift from the dataset.
bank_bin = bank.loc[bank['pdays'] >= 0, 'pdays'].unique().tolist()

In [10]:
# Split the distinct pdays values into four quantile bins; the interval edges
# printed here motivate the cut-offs used for the pdays groups.
pd.qcut(bank_bin, q=4)

[(143.25, 282.5], (143.25, 282.5], (0.999, 143.25], (0.999, 143.25], (0.999, 143.25], ..., (427.75, 871.0], (427.75, 871.0], (427.75, 871.0], (427.75, 871.0], (427.75, 871.0]]
Length: 558
Categories (4, interval[float64]): [(0.999, 143.25] < (143.25, 282.5] < (282.5, 427.75] < (427.75, 871.0]]

In [11]:
def _pdays_bucket(days):
    """Label a pdays value; negative values mean the client was never contacted before."""
    if days < 0:
        return 'Not Previous Contacted'
    if days < 144:
        return '1 to 143 days'
    if days < 283:
        return '144 to 282 days'
    if days < 428:
        return '283 to 427 days'
    # NOTE(review): this bucket also receives exactly 428 days, despite the label wording.
    return 'More than 428 days'


bank['pdays_group'] = bank['pdays'].apply(_pdays_bucket)

<b> Unknown value in Jobs. </b>

In [12]:
# List the job categories to spot placeholder values such as 'unknown'.
bank['job'].unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin.', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

In [13]:
# Number of rows whose job is recorded as 'unknown'.
(bank['job'] == 'unknown').sum()

288

In [14]:
# Share of rows whose job is 'unknown', computed from the data rather than from a
# count pasted out of an earlier cell's output.
unknown_job_count = (bank['job'] == 'unknown').sum()
print(f'Percent of unknowns in column {round(unknown_job_count / len(bank) * 100, 2)} %')

Percent of unknowns in column 0.64 %


In [15]:
# Turn the 'unknown' placeholder into a real missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
bank['job'] = bank['job'].replace('unknown', np.nan)

In [16]:
# Count the missing job values after the replacement.
bank['job'].isna().sum()

288

<b>Unknown value in Education.</b>

In [17]:
# List the education categories to spot placeholder values such as 'unknown'.
bank['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [18]:
# Number of rows whose education is recorded as 'unknown'.
(bank['education'] == 'unknown').sum()

1857

In [19]:
# Share of rows whose education is 'unknown', computed from the data rather than
# from a count pasted out of an earlier cell's output.
unknown_education_count = (bank['education'] == 'unknown').sum()
print(f'Percent of unknowns in education column: {round(unknown_education_count / len(bank) * 100, 2)} %')

Percent of unknowns in education column: 4.11 %


In [20]:
# Turn the 'unknown' placeholder into a real missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
bank['education'] = bank['education'].replace('unknown', np.nan)

In [21]:
# Count the missing education values after the replacement.
bank['education'].isna().sum()

1857

<b>Unknown values in poutcome column.</b>

In [22]:
# List the poutcome categories to spot placeholder values such as 'unknown'.
bank['poutcome'].unique()

array(['unknown', 'failure', 'other', 'success'], dtype=object)

In [23]:
# Number of rows whose poutcome is recorded as 'unknown'.
(bank['poutcome'] == 'unknown').sum()

36959

In [24]:
# Share of rows whose poutcome is 'unknown', computed from the data rather than
# from a count pasted out of an earlier cell's output.
unknown_poutcome_count = (bank['poutcome'] == 'unknown').sum()
print(f'Percent of unknown in poutcome column: {round(unknown_poutcome_count / len(bank) * 100, 2)} %')

Percent of unknown in poutcome column: 81.75 %


Since the percentage of unknowns in this column is very high, this column will be removed.

<b> Unknown value in Contact column. </b>

In [25]:
# List the contact categories to spot placeholder values such as 'unknown'.
bank['contact'].unique()

array(['unknown', 'cellular', 'telephone'], dtype=object)

In [26]:
# Number of rows whose contact type is recorded as 'unknown'.
(bank['contact'] == 'unknown').sum()

13020

In [27]:
# Share of rows whose contact type is 'unknown', computed from the data rather than
# from a count pasted out of an earlier cell's output.
unknown_contact_count = (bank['contact'] == 'unknown').sum()
print(f'Percent unknown in the contact column: {round(unknown_contact_count / len(bank) * 100, 2)} %')

Percent unknown in the contact column: 28.8 %


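<li> About 29% of the contact values are unknown: too many rows to drop, but not enough to discard the column. We will therefore train a classifier on the rows with a known contact type and use it to predict (impute) the missing values. </li>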

<b>Drop</b>
<p>We drop the rows that contain missing values (the job and education entries that were set to NaN above).</p>

In [28]:
# Remove every row that has at least one missing value, then confirm no nulls remain.
bank = bank.dropna(how='any')
bank.isna().sum()

age            0
job            0
marital        0
education      0
default        0
balance        0
housing        0
loan           0
contact        0
day            0
month          0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
age_group      0
pdays_group    0
dtype: int64

In [29]:
# Drop poutcome together with the raw age and pdays columns; age_group and
# pdays_group were derived from the latter two in earlier cells.
columns_to_drop = ['poutcome', 'age', 'pdays']
bank = bank.drop(columns_to_drop, axis=1)

# Standardization

In [30]:
# Work on a copy so the cleaned `bank` frame keeps its unscaled values.
bank_std_mv = bank.copy()

In [31]:
# Scaler used to standardize the numeric columns.
scaler_mv = StandardScaler()

In [32]:
# Fit the scaler on the numeric columns only.
numeric_features = ['balance', 'day', 'duration', 'campaign', 'previous']
scaler_mv.fit(bank_std_mv[numeric_features])

StandardScaler()

In [33]:
# Replace the numeric columns with their standardized values.
numeric_features = ['balance', 'day', 'duration', 'campaign', 'previous']
bank_std_mv[numeric_features] = scaler_mv.transform(bank_std_mv[numeric_features])
bank_std_mv

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,management,married,tertiary,no,0.259354,yes,no,unknown,-1.301418,may,0.010368,-0.573827,-0.250730,no,55-64,Not Previous Contacted
1,technician,single,secondary,no,-0.435568,yes,no,unknown,-1.301418,may,-0.415726,-0.573827,-0.250730,no,35-44,Not Previous Contacted
2,entrepreneur,married,secondary,no,-0.444443,yes,yes,unknown,-1.301418,may,-0.706245,-0.573827,-0.250730,no,25-34,Not Previous Contacted
5,management,married,tertiary,no,-0.369166,yes,no,unknown,-1.301418,may,-0.462209,-0.573827,-0.250730,no,35-44,Not Previous Contacted
6,management,single,tertiary,no,-0.298161,yes,yes,unknown,-1.301418,may,-0.160070,-0.573827,-0.250730,no,25-34,Not Previous Contacted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,-0.173904,no,no,cellular,0.143343,nov,2.783852,0.078925,-0.250730,yes,45-54,Not Previous Contacted
45207,retired,divorced,primary,no,0.123262,no,no,cellular,0.143343,nov,0.765716,-0.247451,-0.250730,yes,65+,Not Previous Contacted
45208,retired,married,secondary,no,1.433555,no,no,cellular,0.143343,nov,3.364890,0.731677,1.035364,yes,65+,144 to 282 days
45209,blue-collar,married,secondary,no,-0.225513,no,no,telephone,0.143343,nov,0.967143,0.405301,-0.250730,no,55-64,Not Previous Contacted


## Label Encoder

In [34]:
# Encode on a copy so the standardized frame keeps its original text categories.
bank_encode = bank_std_mv.copy()

In [35]:
# Single encoder instance; it is re-fitted for each column it is applied to.
label_encoder_X = LabelEncoder()

In [36]:
# Integer-encode the unordered categorical columns. The shared encoder is re-fitted
# on every column in turn, so after the loop it only remembers the mapping for 'y'.
nominal_columns = ['job', 'marital', 'default', 'housing', 'pdays_group', 'loan', 'y']
for column in nominal_columns:
    bank_encode[column] = label_encoder_X.fit_transform(bank_encode[column])

In [37]:
# Explicit category orders so the encoded integers follow the natural ranking.
education_levels = ['primary', 'secondary', 'tertiary']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
age_brackets = ['17-24', '25-34', '35-44', '45-54', '55-64', '65+']

labelOE_ed = OrdinalEncoder(categories=[education_levels])
labelOE_mo = OrdinalEncoder(categories=[month_order])
labelOE_age = OrdinalEncoder(categories=[age_brackets])

In [38]:
# Apply each ordinal encoder to its column (passed as a one-column frame).
ordinal_encoders = (
    ('education', labelOE_ed),
    ('month', labelOE_mo),
    ('age_group', labelOE_age),
)
for column, encoder in ordinal_encoders:
    bank_encode[column] = encoder.fit_transform(bank_encode[[column]])

In [39]:
# The ordinal encoders return floats; store the codes as 64-bit integers.
for column in ('education', 'month', 'age_group'):
    bank_encode[column] = bank_encode[column].astype('int64')

In [40]:
# Preview the encoded frame; 'contact' is still text at this point.
bank_encode.head()

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,4,1,2,0,0.259354,1,0,unknown,-1.301418,4,0.010368,-0.573827,-0.25073,0,4,4
1,9,2,1,0,-0.435568,1,0,unknown,-1.301418,4,-0.415726,-0.573827,-0.25073,0,2,4
2,2,1,1,0,-0.444443,1,1,unknown,-1.301418,4,-0.706245,-0.573827,-0.25073,0,1,4
5,4,1,2,0,-0.369166,1,0,unknown,-1.301418,4,-0.462209,-0.573827,-0.25073,0,2,4
6,4,2,2,0,-0.298161,1,1,unknown,-1.301418,4,-0.16007,-0.573827,-0.25073,0,1,4


# Prepare the data for Testing

In [41]:
# Rows whose contact type is unknown: these are the ones the classifier will fill in.
# .copy() makes this an independent frame, so the prediction column added later is
# written to it directly instead of to a slice of bank_encode.
bank_test_miss = bank_encode[bank_encode['contact'] == 'unknown'].copy()
bank_test_miss

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,4,1,2,0,0.259354,1,0,unknown,-1.301418,4,0.010368,-0.573827,-0.250730,0,4,4
1,9,2,1,0,-0.435568,1,0,unknown,-1.301418,4,-0.415726,-0.573827,-0.250730,0,2,4
2,2,1,1,0,-0.444443,1,1,unknown,-1.301418,4,-0.706245,-0.573827,-0.250730,0,1,4
5,4,1,2,0,-0.369166,1,0,unknown,-1.301418,4,-0.462209,-0.573827,-0.250730,0,2,4
6,4,2,2,0,-0.298161,1,1,unknown,-1.301418,4,-0.160070,-0.573827,-0.250730,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45061,6,2,1,0,-0.106187,0,0,unknown,0.504533,9,-0.973522,-0.573827,-0.250730,0,1,4
45062,5,1,0,0,-0.201188,0,0,unknown,0.504533,9,-0.981269,-0.573827,-0.250730,0,4,4
45122,2,2,2,0,-0.358975,1,1,unknown,1.226913,9,-0.934786,-0.573827,-0.250730,0,2,4
45135,1,1,0,0,-0.019732,0,0,unknown,1.467707,9,-0.725612,-0.573827,-0.250730,0,3,4


In [42]:
# Separate copy that will hold only the rows with a known contact type.
bank_mv = bank_encode.copy()

In [43]:
# Mark unknown contact types as missing so they can be dropped from the training data.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
bank_mv['contact'] = bank_mv['contact'].replace('unknown', np.nan)

In [44]:
# Keep only the rows without missing values (i.e. with a known contact type).
bank_mv = bank_mv.dropna(how='any')

In [45]:
# Preview the rows that remain for training the contact classifier.
bank_mv.head()

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
12657,4,2,1,0,-0.433596,0,0,cellular,-1.421815,6,-0.012873,-0.573827,-0.25073,0,1,4
12658,1,1,0,0,-0.291916,0,0,cellular,-1.421815,6,0.149817,-0.573827,-0.25073,0,3,4
12659,1,1,1,0,-0.410585,0,1,cellular,-1.421815,6,1.586916,-0.247451,-0.25073,0,2,4
12660,9,2,1,0,-0.438855,0,0,telephone,-1.421815,6,-0.748854,-0.247451,-0.25073,0,1,4
12661,9,2,1,0,-0.403682,1,1,cellular,-1.421815,6,0.688245,0.405301,-0.25073,0,1,4


In [46]:
# Encode the contact type as an integer label: cellular -> 0, anything else (telephone) -> 1.
is_not_cellular = bank_mv['contact'] != 'cellular'
bank_mv['contact'] = is_not_cellular.astype('int64')

In [47]:
# Check the encoded contact column (0 = cellular, 1 = telephone).
bank_mv.head(10)

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
12657,4,2,1,0,-0.433596,0,0,0,-1.421815,6,-0.012873,-0.573827,-0.25073,0,1,4
12658,1,1,0,0,-0.291916,0,0,0,-1.421815,6,0.149817,-0.573827,-0.25073,0,3,4
12659,1,1,1,0,-0.410585,0,1,0,-1.421815,6,1.586916,-0.247451,-0.25073,0,2,4
12660,9,2,1,0,-0.438855,0,0,1,-1.421815,6,-0.748854,-0.247451,-0.25073,0,1,4
12661,9,2,1,0,-0.403682,1,1,0,-1.421815,6,0.688245,0.405301,-0.25073,0,1,4
12662,1,2,1,0,-0.486849,1,0,0,-1.421815,6,3.043382,0.078925,-0.25073,1,1,4
12663,1,1,1,0,-0.008227,1,1,1,-1.421815,6,-0.454462,-0.247451,-0.25073,0,3,4
12664,1,2,0,0,-0.332349,0,0,0,-1.421815,6,-0.849567,-0.247451,-0.25073,0,1,4
12665,1,2,1,0,-0.381986,0,0,1,-1.421815,6,-0.566796,-0.247451,-0.25073,0,1,4
12666,1,1,1,0,-0.430966,0,1,0,-1.421815,6,-0.477703,0.078925,-0.25073,0,2,4


## Splitting the Dataset for predicting missing values

In [48]:
# Target: the encoded contact type. Features: every other column (including 'y').
y1 = bank_mv['contact']
x1 = bank_mv.drop('contact', axis=1)

In [49]:
# 70/30 train/test split with a fixed seed so the split is reproducible.
split_parts = train_test_split(x1, y1, train_size=0.7, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [50]:
# Sanity check: features and labels must have matching lengths in each split.
len(X_train), len(X_test), len(y_train), len(y_test)

(21634, 9273, 21634, 9273)

### Modeling

In [51]:
# Rule of thumb: use the square root of the training-set size as the neighbour count.
n_train_rows = len(X_train)
k = round(n_train_rows ** 0.5)
k

147

In [52]:
# Train the k-nearest-neighbours classifier and report its accuracy on the held-out split.
# NOTE(review): compare this score against the majority-class share of y_test before trusting it.
model_KNN = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
knn_accuracy = model_KNN.score(X_test, y_test)
print(f'KNN score : {knn_accuracy}')

KNN score : 0.9132966677450663


In [53]:
# Rows with an unknown contact type that the fitted model will now label.
bank_test_miss

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group
0,4,1,2,0,0.259354,1,0,unknown,-1.301418,4,0.010368,-0.573827,-0.250730,0,4,4
1,9,2,1,0,-0.435568,1,0,unknown,-1.301418,4,-0.415726,-0.573827,-0.250730,0,2,4
2,2,1,1,0,-0.444443,1,1,unknown,-1.301418,4,-0.706245,-0.573827,-0.250730,0,1,4
5,4,1,2,0,-0.369166,1,0,unknown,-1.301418,4,-0.462209,-0.573827,-0.250730,0,2,4
6,4,2,2,0,-0.298161,1,1,unknown,-1.301418,4,-0.160070,-0.573827,-0.250730,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45061,6,2,1,0,-0.106187,0,0,unknown,0.504533,9,-0.973522,-0.573827,-0.250730,0,1,4
45062,5,1,0,0,-0.201188,0,0,unknown,0.504533,9,-0.981269,-0.573827,-0.250730,0,4,4
45122,2,2,2,0,-0.358975,1,1,unknown,1.226913,9,-0.934786,-0.573827,-0.250730,0,2,4
45135,1,1,0,0,-0.019732,0,0,unknown,1.467707,9,-0.725612,-0.573827,-0.250730,0,3,4


In [54]:
# Predict the contact type for the rows where it is unknown.
# - errors='ignore' also drops a previously added 'predicted_contact' column, so re-running
#   the cell does not feed an extra feature to the model.
# - assign() returns a new frame, so nothing is written into a slice of bank_encode.
contact_features = bank_test_miss.drop(columns=['contact', 'predicted_contact'], errors='ignore')
bank_test_miss = bank_test_miss.assign(predicted_contact=model_KNN.predict(contact_features))

In [55]:
# Check which classes were predicted (the training labels were 0 = cellular, 1 = telephone).
bank_test_miss['predicted_contact'].unique()

array([0], dtype=int64)

In [56]:
# Plain Python list of the predictions, in the row order of bank_test_miss.
transfer = bank_test_miss['predicted_contact'].tolist()

In [57]:
# Write the predictions into the rows of `bank` whose contact type is still unknown.
# The list is assigned positionally, so it relies on `bank` and bank_test_miss sharing row order.
unknown_contact_rows = bank['contact'] == 'unknown'
bank.loc[unknown_contact_rows, 'contact'] = transfer

In [58]:
# Translate the predicted integer labels back to text. Both classes are mapped:
# previously only 0 was decoded, so a predicted 1 would have stayed in the column as
# the integer 1 instead of 'telephone'. Existing text values pass through unchanged.
contact_labels = {0: 'cellular', 1: 'telephone'}
bank['contact'] = bank['contact'].apply(lambda value: contact_labels.get(value, value))

In [59]:
# Should display an empty frame: no 'unknown' contact values may remain.
bank[bank['contact'] == 'unknown']

Unnamed: 0,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,previous,y,age_group,pdays_group


In [60]:
# Save the cleaned frame to the working directory, without the index column.
bank.to_csv('Bank_Clean.csv', index=False)