In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
%config InlineBackend.figure_format = 'retina'

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

**First look at the datasets**

In [4]:
# Full dataset CSV (semicolon-separated); loaded below as data_1.
path_1 = './dataset/bank-additional-full.csv'
# Presumably the plain-text attribute description for the dataset; not read by the cells shown here.
path_2 = './dataset/bank-additional-names.txt'
# Smaller CSV with the same column layout; loaded below as data_3.
path_3 = './dataset/bank-additional.csv'

In [5]:
def read_files(path):
    """Load one semicolon-delimited source file and report its size.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file. The path and the frame's shape are printed as a side effect.
    """
    frame = pd.read_csv(path, sep=';')
    n_rows_cols = frame.shape
    print('\nDetails of dataset: ', path)
    print('Shape: ', n_rows_cols)
    return frame

In [6]:
# Load the full dataset and the smaller one; read_files prints each file's path and shape.
data_1 = read_files(path_1)
data_3 = read_files(path_3)


Details of dataset:  ./dataset/bank-additional-full.csv
Shape:  (41188, 21)

Details of dataset:  ./dataset/bank-additional.csv
Shape:  (4119, 21)


In [7]:
# Check that the smaller and larger datasets expose the same columns in the same order.
# Index.equals is used instead of an element-wise `==`, which raises ValueError when
# the two frames have a different number of columns.
if not data_1.columns.equals(data_3.columns):
    print('Not similar')

**What do we understand from the first look at the dataset(csv files)?**
- There are 2 datasets of small and large size with similar features.
- Approach for modelling: Train the model on the smaller dataset and test on the larger dataset. This will ensure the model does not see the test set.

**Problem statement:**
- This is a binary classification task: predict whether the client will subscribe to a bank term deposit (variable y).

**Summary of Input variables**
   
### Bank client data:
   1. age (numeric)
   2. job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
   3. marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed)
   4. education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown")
   5. default: has credit in default? (categorical: "no","yes","unknown")
   6. housing: has housing loan? (categorical: "no","yes","unknown")
   7. loan: has personal loan? (categorical: "no","yes","unknown")
### Related with the last contact of the current campaign:
   8. contact: contact communication type (categorical: "cellular","telephone") 
   9. month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
   10. day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri")
   11. duration: last contact duration, in seconds (numeric). Important note:  this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
### Other attributes:
   12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
   13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
   14. previous: number of contacts performed before this campaign and for this client (numeric)
   15. poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")
### Social and economic context attributes
   16. emp.var.rate: employment variation rate - quarterly indicator (numeric)
   17. cons.price.idx: consumer price index - monthly indicator (numeric)     
   18. cons.conf.idx: consumer confidence index - monthly indicator (numeric)     
   19. euribor3m: euribor 3 month rate - daily indicator (numeric)
   20. nr.employed: number of employees - quarterly indicator (numeric)
### Output variable (desired target):
   21. y - has the client subscribed a term deposit? (binary: "yes","no")

## Exploratory data analysis

In [8]:
# Work on an independent copy of the full dataset (data_1) so that the cleaning steps
# below cannot write through to the originally loaded frame. A `[:]` slice does not
# guarantee a separate copy; DataFrame.copy() does.
data = data_1.copy()

In [9]:
# List the column names of the working frame.
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

**Check for null values and dtypes**
- In this dataset, missing values are encoded as the string 'unknown'. They must therefore be replaced with nulls (NaN) before the null counts can be checked.


In [10]:
def change_unknown(df):
    """Return a copy of ``df`` with every 'unknown' placeholder replaced by NaN.

    The input frame is left untouched, so re-running the calling cell is safe and
    a frame that shares data with ``df`` is never modified as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values are encoded as the string 'unknown'.

    Returns
    -------
    pandas.DataFrame
        New frame with 'unknown' replaced by ``np.nan``.
    """
    return df.replace('unknown', np.nan)

In [11]:
def checknullcount(df):
    """Summarise the columns of ``df`` that contain at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column with nulls, holding the column name, its null count,
        dtype, number of distinct non-null values and the distinct values
        themselves, sorted by null count in descending order.
    """
    header = ('Feature', 'Nullcount', 'Dtypes', 'Unique Count', 'Unique')
    # Pair every column with its null count, then keep only the columns that have any.
    rows = [
        [name, n_missing, df[name].dtypes, df[name].nunique(), df[name].unique()]
        for name, n_missing in ((col, df[col].isnull().sum()) for col in df.columns)
        if n_missing != 0
    ]
    summary = pd.DataFrame(columns=header, data=rows)
    return summary.sort_values('Nullcount', ascending=False)

In [12]:
# Convert the 'unknown' placeholders to NaN, then summarise the columns that contain nulls.
data = change_unknown(data)
checknullcount(data)

Unnamed: 0,Feature,Nullcount,Dtypes,Unique Count,Unique
3,default,8597,object,2,"[no, nan, yes]"
2,education,1731,object,7,"[basic.4y, high.school, basic.6y, basic.9y, pr..."
4,housing,990,object,2,"[no, yes, nan]"
5,loan,990,object,2,"[no, yes, nan]"
0,job,330,object,11,"[housemaid, services, admin., blue-collar, tec..."
1,marital,80,object,3,"[married, single, divorced, nan]"


**Analysis of Null values**
- 6 features have null values (counts as shown in the table above):
    - default - 8597. These cannot all be deleted! Need to check for ways to impute. KNN imputation might be a good method.
    - education - 1731. These can be deleted after a quick check
    - housing - 990
    - loan - 990
    - job - 330
    - marital - 80

**After a quick look at the unknown values in Tableau, the understanding is that it is better to delete clients who have unknown values in housing and loan.**

In [13]:
# Keep only the rows where housing, job and marital are all known.
data = data.dropna(subset=['housing', 'job', 'marital'])

In [14]:
# Class balance of the 'default' flag among the remaining rows (value_counts skips NaN by default).
data['default'].value_counts()

no     31587
yes        3
Name: default, dtype: int64

**The distribution of default credit is heavily skewed towards 'no'. It makes sense to either not use this feature or impute the missing values with 'no'.**

In [15]:
# Impute the missing 'default' values with 'no', the overwhelmingly dominant class.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on an
# attribute-accessed column of a filtered frame is a chained in-place update that is
# not guaranteed to write through to `data`.
data = data.assign(default=data['default'].fillna('no'))

In [16]:
# Re-check which columns still contain nulls after the row filtering and the 'default' imputation.
checknullcount(data)

Unnamed: 0,Feature,Nullcount,Dtypes,Unique Count,Unique
0,education,1558,object,7,"[basic.4y, high.school, basic.6y, basic.9y, pr..."


##  Imputing with KNN

K-Nearest Neighbors can be used to impute missing values in datasets. What we will do is estimate the most likely value for the missing data based on a KNN model.

- The column with missing data is education.


Here, we are trying to predict the missing values of education from the other client attributes.
- Assuming a client's bank data will be similar to that of other clients with similar attributes, we will use KNN imputation with the other client data as the predictors.

**1. Create a subset of the large dataset with only client features**

In [17]:
# Client-level attributes only; 'education' is the column to be imputed.
clientdata = data[['age', 'job', 'marital', 'education', 'housing', 'loan', 'default']]
# Every selected client attribute except 'education' serves as a KNN predictor.
predictors = ['age', 'job', 'marital', 'default', 'housing', 'loan']

In [18]:
# One-hot encode the categorical predictors; drop_first=True drops one level per
# categorical feature so each is represented by k-1 indicator columns.
dummy_pred = pd.get_dummies(clientdata[predictors], drop_first=True)
print(dummy_pred.shape)
dummy_pred.head(2)

(39803, 16)


Unnamed: 0,age,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_married,marital_single,default_yes,housing_yes,loan_yes
0,56,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,57,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0


In [19]:
# Attach the (partly missing) education column as the imputation target; rows align on the shared index.
dummy_pred['education'] = clientdata['education']

In [20]:
# Split on the education column: rows with a missing value are the ones to impute,
# rows with a known value form the training set for the KNN model.
clientdata_missing = dummy_pred[dummy_pred['education'].isna()]
clientdata_valid = dummy_pred[dummy_pred['education'].notna()]

In [21]:
# Sizes of the two partitions: rows to impute, then rows with a known education level.
print(clientdata_missing.shape)
print(clientdata_valid.shape)

(1558, 17)
(38245, 17)


In [22]:
# Class distribution of the known education levels (the labels the KNN model is trained on).
clientdata_valid.education.value_counts()

university.degree      11821
high.school             9244
basic.9y                5856
professional.course     5100
basic.4y                4002
basic.6y                2204
illiterate                18
Name: education, dtype: int64

**2. Set the predictor and target variable**
- All the predictors except age are categorical variables.
- The categorical predictors have been converted to dummy variables.

In [23]:
# Features: every dummy-encoded predictor column. Target: the known education labels.
X_train = clientdata_valid.drop(columns='education')
y_train = clientdata_valid['education'].values

**3. Standardize the predictor matrix**
- Most of the predictors are categorical in nature. Hence, standardisation will not help.
- Choose not to standardise the data

In [24]:
# ss = StandardScaler()
# Xs = ss.fit_transform(X)

**4. Find the best K for the imputation using KNN**

In [25]:
def find_best_k_cls(X, y, k_min=1, k_max=50, step=2, cv=5):
    """Pick the KNN neighbour count with the highest mean cross-validated score.

    Parameters
    ----------
    X : array-like
        Predictor matrix forwarded to ``cross_val_score``.
    y : array-like
        Target labels forwarded to ``cross_val_score``.
    k_min, k_max, step : int
        The candidate neighbour counts are ``range(k_min, k_max + 1, step)``.
    cv : int or cross-validation splitter
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    int
        The candidate k with the highest mean score (the smallest such k on ties).
        The best mean score and the chosen k are also printed.

    Raises
    ------
    ValueError
        If the candidate range is empty.
    """
    k_range = range(k_min, k_max + 1, step)
    if len(k_range) == 0:
        raise ValueError('No candidate k values: check k_min, k_max and step')

    accs = []
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X, y, cv=cv)
        accs.append(np.mean(scores))

    # argmax must be taken over the scores, not over the candidate range:
    # np.argmax(k_range) is always the last index, whatever the scores are.
    best_idx = int(np.argmax(accs))
    best_k = k_range[best_idx]
    print(accs[best_idx], best_k)
    return best_k

In [None]:
# Search the default candidate range (k = 1, 3, ..., 49) with 5-fold cross-validation.
# This fits one model per candidate per fold, so it can take a while on the full training set.
find_best_k_cls(X_train, y_train)

**5. Fit KNN classifier**

In [None]:
# NOTE(review): n_neighbors=24 is hard-coded. 24 is the last index of the default candidate
# range (25 values), which is not necessarily the best-scoring k — confirm the intended value.
knn = KNeighborsClassifier(n_neighbors=24)
knn.fit(X_train,y_train)

**6. Predict missing education values using the knn classifier model**

In [None]:
# Predictor matrix for the rows whose education is missing (same columns as X_train).
X_test = clientdata_missing.drop('education', axis =1)

In [None]:
# Predict an education level for every row with a missing value.
y_test = knn.predict(X_test)

In [None]:
# Distribution of the imputed education levels.
pd.Series(y_test).value_counts()

**7. Impute the missing education values to the dataset**

In [None]:
# Write the predictions into the rows where education is missing.
# NOTE(review): this relies on y_test being in the same row order as data's null-education
# rows, which holds only if data's rows were not changed since clientdata was derived — confirm.
data.loc[data.education.isnull(), 'education'] = y_test

In [None]:
# Re-run the null summary; after the imputation it is expected to list no columns — verify.
checknullcount(data)

In [None]:
# Selector values for the `kind` argument of draw_boxplot.
plot_w_panda = 0
plot_w_seaborn = 1
def draw_boxplot(dict_of_values, list_of_names, kind = plot_w_panda, data = data):
    """Draw a box plot with either pandas or seaborn and show it.

    Parameters
    ----------
    dict_of_values : dict
        Plot values keyed by column name; used only by the pandas style.
    list_of_names : list
        Columns to plot (pandas style) or the order passed to seaborn.
    kind : int
        ``plot_w_panda`` selects the pandas style; any other value selects seaborn.
    data : pandas.DataFrame
        Frame plotted by the seaborn style. The default is the notebook-level
        ``data`` object as it existed when this function was defined.

    Returns
    -------
    None
    """
    if kind != plot_w_panda:
        # Seaborn style: horizontal boxes drawn straight from the frame.
        sns.boxplot(data = data, order = list_of_names, orient = 'h')
    else:
        # Pandas style: vertical boxes built from the supplied values.
        box_colors = {
            'boxes': 'DarkGreen',
            'whiskers': 'DarkOrange',
            'medians': 'DarkBlue',
            'caps': 'Gray',
        }
        values_frame = pd.DataFrame(dict_of_values, columns = list_of_names)
        values_frame.plot.box(color = box_colors, vert = True)
    plt.show()

## 1. What is the age range of the current customers?
- There are 67 unique values

In [None]:
# Report the youngest and oldest customer ages in the working frame.
print('Minimum age of customer: ', data.age.min())
print('Maximum age of customer: ', data.age.max())

In [None]:
# Horizontal box plot of customer age; points beyond the whiskers are drawn as outliers.
fig = plt.figure(figsize= (10,4))
sns.boxplot(x = data.age, data = data, orient = 'h')
plt.title('Distribution of customer age group')
plt.xlabel('Age')
plt.show()

**People from age group 70 - 90 seem like outliers**

## 2. What are the job types?
- There are 11 unique Job types.
- There are 39 null counts

In [None]:
# Plot the number of customers per job type
fig = plt.figure(figsize= (15,4))
sns.countplot(x=data.job, data=data, palette="Greens_d")
plt.title('Distribution of customer job')
plt.xlabel('Job Type')
plt.show()

**Have the clients with unknown job description subscribed to term deposit?**
- Seems equally distributed. Might be ok to drop it

In [None]:
# # Plot Age distribution of people with unknown job types
# fig = plt.figure(figsize= (15,4))
# sns.countplot(x=data[data.job.isnull()].y, data=data, palette="Greens_d")
# plt.title('Term deposit distribution of clients with unknown job types')
# plt.xlabel('Term deposit taken')
# plt.show()