# Naive Bayesian (kNN)

## Import required packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def evaluateBinaryClassification(predictions, actuals, positive='yes', negative='no'):
    """Compute accuracy, recall, precision and F-score for a binary classifier.

    Predictions and actuals are paired by position. Any pandas index on the
    inputs is ignored, so a prediction Series carrying a different index than
    the actual labels is still compared row by row.

    Parameters
    ----------
    predictions : array-like
        Predicted class labels, one per sample.
    actuals : array-like
        True class labels, in the same order as ``predictions``.
    positive : object, default 'yes'
        Label treated as the positive class.
    negative : object, default 'no'
        Label treated as the negative class.

    Returns
    -------
    tuple
        ``(Accuracy, Recall, Precision, FScore)``. A metric whose denominator
        is zero (for example precision when nothing is predicted positive)
        is reported as ``0.0`` instead of raising or producing NaN.

    Raises
    ------
    ValueError
        If ``predictions`` and ``actuals`` do not have the same shape.
    """
    # dtype=object keeps the comparisons below element-wise for any label type.
    predicted = np.asarray(predictions, dtype=object)
    observed = np.asarray(actuals, dtype=object)
    if predicted.shape != observed.shape:
        raise ValueError(
            'predictions and actuals must have the same shape, got {} and {}'.format(
                predicted.shape, observed.shape))

    def _ratio(numerator, denominator):
        # Guard against empty classes / empty input.
        return numerator / denominator if denominator else 0.0

    predicted_pos = predicted == positive
    predicted_neg = predicted == negative
    observed_pos = observed == positive
    observed_neg = observed == negative

    # Counting directly (instead of indexing a crosstab) means a label that
    # never occurs simply contributes a zero count.
    TP = int(np.sum(predicted_pos & observed_pos))
    TN = int(np.sum(predicted_neg & observed_neg))
    FP = int(np.sum(predicted_pos & observed_neg))
    FN = int(np.sum(predicted_neg & observed_pos))
    n = observed.size

    Acuracy = _ratio(TP + TN, n)
    Recall = _ratio(TP, TP + FN)
    Precision = _ratio(TP, TP + FP)
    FScore = _ratio(2 * Recall * Precision, Recall + Precision)

    return Acuracy, Recall, Precision, FScore
    

In [None]:
# Load the raw customer records from the CSV file into a DataFrame.
customer_df = pd.read_csv(filepath_or_buffer='Customer Subscription.csv')

In [None]:
# Deal with unknowns
# Recode the literal string 'unknown' as a missing value (NaN) in these columns.
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on a column accessed as an attribute is chained assignment, which leaves the
# DataFrame unchanged when pandas Copy-on-Write is active.
unknown_coded_columns = ['job', 'marital', 'education', 'loan', 'default', 'housing']
customer_df[unknown_coded_columns] = customer_df[unknown_coded_columns].replace('unknown', np.nan)

In [None]:
# Recode the value 999 in pdays as missing (NaN).
# NOTE(review): 999 looks like a placeholder rather than a real measurement;
# confirm against the data dictionary.
# Assigned back explicitly because an in-place replace on the column accessor
# does not modify the DataFrame under pandas Copy-on-Write.
customer_df['pdays'] = customer_df['pdays'].replace(999, np.nan)

# First let's do KNN

# Task: Classification

We would like to predict the class (subscriber/ no subscriber) of customers.

In [None]:
# Candidate input columns offered to every model in this notebook.
possible_predictors = [
    'age',
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
]

# Name of the column holding the class label to predict.
target = 'y'

# kNN Preprocess

In [None]:
# Class labels for every row.
y = customer_df[target]

# Dummy-code the categorical predictors; drop_first=True omits the first
# level of each encoded column.
Xs = pd.get_dummies(data=customer_df[possible_predictors], drop_first=True)

KNN can handle missing values, so we keep them as missing in order not to introduce bias into the data.

However, the case of missing values for pdays is different. The values are not missing because of our lack of knowledge; they are missing because of a difference in the population of data objects that leads to them not having a value. In these situations, we will use the MM method to fill the missing values.

#### MM method

We will fill the missing values with the Max+Mean (MM) of the attribute.

In [None]:
# MM imputation: fill missing pdays with (column max + column mean), a value
# above every observed pdays value.
# The filled column is assigned back because an in-place fillna on the column
# accessor does not modify the DataFrame under pandas Copy-on-Write.
pdays_fill_value = Xs['pdays'].max() + Xs['pdays'].mean()
Xs['pdays'] = Xs['pdays'].fillna(pdays_fill_value)

In [None]:
# standardize data
scaler = preprocessing.StandardScaler()

scaler.fit(Xs)  # Note the use of an array of column names

Xs = pd.DataFrame(scaler.transform(Xs),columns =Xs.columns)
Xs.describe()

# Set up experimentation 

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Xs, y, random_state=1, test_size=0.3
)

# Report the shape of each of the four pieces.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# Feature Selection

In [None]:
# Fit a 1000-tree random forest on the training split to rank the predictors.
rf = RandomForestClassifier(random_state=2, n_estimators=1000).fit(X_train, y_train)

# Forest-level importances plus their spread across the individual trees.
importances = rf.feature_importances_
per_tree_importances = [estimator.feature_importances_ for estimator in rf.estimators_]
std = np.std(per_tree_importances, axis=0)

# One row per feature, ordered from least to most important.
df = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances, 'std': std}
).sort_values(by='importance')
print(df)

# Horizontal bar chart with the per-tree standard deviation as error bars.
ax = df.plot(x='feature', kind='barh', xerr='std', legend=False)
ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Keep the names of the features whose importance is at least 0.05.
select_features = df.loc[df['importance'] >= 0.05, 'feature']

# Tuned KNN
Use the tuned KNN to predict.

In [None]:
# 1-nearest-neighbour classifier with uniform weights, trained on the
# selected features only.
knn = KNeighborsClassifier(weights='uniform', n_neighbors=1)
knn.fit(X_train[select_features], y_train)

# Predict the held-out rows and show the actual-vs-predicted table.
y_predict_knn = knn.predict(X_test[select_features])
pd.crosstab(index=y_test, columns=y_predict_knn)

In [None]:
# Accuracy, recall, precision and F-score of the KNN predictions on the test split.
evaluateBinaryClassification(predictions=y_predict_knn, actuals=y_test)

# Naive Bayesian

## Preprocess

NB can also handle missing values, but it does not need the data to be standardized. So some of the preprocessing steps will look different.

In [None]:
# Rebuild the model inputs from the raw frame, this time without standardizing.
y = customer_df[target]

Xs = pd.get_dummies(customer_df[possible_predictors], drop_first=True)

# MM imputation for pdays (column max + column mean). The filled column is
# assigned back because an in-place fillna on the column accessor does not
# modify the DataFrame under pandas Copy-on-Write.
Xs['pdays'] = Xs['pdays'].fillna(Xs['pdays'].max() + Xs['pdays'].mean())

# Same test_size and random_state as the KNN split.
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

We use the same features we selected using Random Forest.

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes model on the selected features.
nb = MultinomialNB().fit(X_train[select_features], y_train)

# Predict the held-out rows and show the actual-vs-predicted table.
y_predict_nb = nb.predict(X_test[select_features])
pd.crosstab(index=y_test, columns=y_predict_nb)

In [None]:
# Accuracy, recall, precision and F-score of the Naive Bayes predictions on the test split.
evaluateBinaryClassification(predictions=y_predict_nb, actuals=y_test)

In [None]:
# predict probabilities
y_prob = nb.predict_proba(X_test[select_features])

summary_df = pd.concat([pd.DataFrame({'actual': y_test, 'predicted': y_predict_nb}),
                pd.DataFrame(y_prob, index=y_test.index,columns = ['No_prob','Yes_prob'])], axis=1)
y_predict_nb = pd.Series(summary_df.Yes_prob>0.9999998888888889).replace({False:'no',True:'yes'})

In [None]:
# Result table: one row per method, one column per metric.
Methods = ['Random', 'KNN', 'NB', 'DT']
Metrics = ['Accuracy', 'Recall', 'Precision', 'Fscore']

compare_df = pd.DataFrame(index=Methods, columns=Metrics)

# Method 1: KNN
compare_df.loc['KNN'] = evaluateBinaryClassification(y_predict_knn, y_test)

# The random baseline predicts 'yes' as many times as KNN does.
number_Yes = np.sum(y_predict_knn == 'yes')

# Method 2: Random
# A random permutation of 0..n-1 is below number_Yes in exactly number_Yes
# positions, so exactly that many rows are labelled 'yes'.
# The Series carries y_test's index: pd.crosstab aligns Series by index, so a
# default 0..n-1 index would pair these predictions with the wrong rows.
random_yes_mask = np.random.permutation(len(y_test)) < number_Yes
y_predict_random = pd.Series(np.where(random_yes_mask, 'yes', 'no'), index=y_test.index)

# Evaluate once and reuse the result for both the printout and the table.
random_scores = evaluateBinaryClassification(y_predict_random, y_test)
print(random_scores)
compare_df.loc['Random'] = random_scores

# Method 3: NB
compare_df.loc['NB'] = evaluateBinaryClassification(y_predict_nb, y_test)
compare_df

In [None]:
# Report how many test rows each method labels 'yes'.
for method_name, method_predictions in (
    ('Random', y_predict_random),
    ('KNN', y_predict_knn),
    ('NB', y_predict_nb),
):
    yes_count = np.sum(method_predictions == 'yes')
    print(f'{method_name} Method number of yes prediction: {yes_count}')

# DT
Now let us start learning about DT

## DT Preprocess
Preprocessing is different for Decision Trees, as ordinal attributes need to be transformed with a ranking instead of binary coding. Why?

In [None]:
# Class labels for every row.
y = customer_df[target]

# Work on an explicit copy: pd.DataFrame(customer_df) does not copy the data,
# so the in-place edits made during DT preprocessing could write through to
# customer_df and change the result of re-running earlier cells.
Xs = customer_df.copy()

# MM imputation for pdays (column max + column mean), assigned back because an
# in-place fillna on the column accessor does not modify the DataFrame under
# pandas Copy-on-Write.
Xs['pdays'] = Xs['pdays'].fillna(Xs['pdays'].max() + Xs['pdays'].mean())

In [None]:
# Show the distinct education values before they are recoded.
Xs['education'].unique()

In [None]:
# Ordinal coding of education: each level is replaced by an integer rank.
replace_dic = {
    'illiterate': 0,
    'basic.4y': 1,
    'basic.6y': 2,
    'basic.9y': 3,
    'high.school': 4,
    'professional.course': 5,
    'university.degree': 6,
}

# map() yields a numeric column directly, without relying on replace()
# down-casting an object column, and the result is assigned back because an
# in-place replace on the column accessor does not modify the DataFrame under
# pandas Copy-on-Write. Values absent from replace_dic (including missing
# values) become NaN.
Xs['education'] = Xs['education'].map(replace_dic)

In [None]:
# Show the distinct education values after the recoding step.
Xs['education'].unique()

In [None]:
# Keep only the candidate predictors and dummy-code the remaining categorical
# columns; drop_first=True omits the first level of each encoded column.
Xs = pd.get_dummies(data=Xs[possible_predictors], drop_first=True)

The module we are going to use cannot handle missing values, so we have to deal with them first.

In [None]:
# Fill every remaining missing value with the median of its column.
column_medians = Xs.median()
Xs = Xs.fillna(value=column_medians)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Xs, y, random_state=1, test_size=0.3
)

# Report the shape of each of the four pieces.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from dmba import plotDecisionTree

# Fit a decision tree with default settings on all predictors.
classTree = DecisionTreeClassifier().fit(X_train, y_train)

# List the class labels the tree learned, then draw the fitted tree.
print(f"Classes: {', '.join(classTree.classes_)}")
plotDecisionTree(classTree, class_names=classTree.classes_, feature_names=X_train.columns)

In [None]:
# Predict the held-out rows with the untuned tree and score the predictions.
y_predict_dt = classTree.predict(X_test)
evaluateBinaryClassification(predictions=y_predict_dt, actuals=y_test)

In [None]:
# Report how many test rows each method labels 'yes'.
for method_name, method_predictions in (
    ('Random', y_predict_random),
    ('KNN', y_predict_knn),
    ('NB', y_predict_nb),
    ('DT', y_predict_dt),
):
    yes_count = np.sum(method_predictions == 'yes')
    print(f'{method_name} Method number of yes prediction: {yes_count}')

In [None]:
# Store the decision-tree metrics in the comparison table and display it.
dt_scores = evaluateBinaryClassification(predictions=y_predict_dt, actuals=y_test)
compare_df.loc['DT'] = dt_scores
compare_df

# Tune DT
parameters:

### criterion{“gini”, “entropy”}, default=”gini”
    The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

### splitter{“best”, “random”}, default=”best”
    The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.

### max_depth int, default=None
    The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

### min_samples_split int or float, default=2
    The minimum number of samples required to split an internal node:

    If int, then consider min_samples_split as the minimum number.

    If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.

### min_impurity_decrease float, default=0.0
    A node will be split if this split induces a decrease of the impurity greater than or equal to this value.

In [None]:
from sklearn.model_selection import GridSearchCV

# Coarse search over the tree's main hyper-parameters.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [20, 40, 60, 80, 100],
    'min_impurity_decrease': [0, 0.0005, 0.001, 0.005, 0.01],
}

# Encode the labels as integers (yes -> 1, no -> 0) for the 'recall' scorer.
# A boolean comparison always yields an integer column, whereas
# replace({'yes': 1, 'no': 0}) on an object Series depends on pandas
# down-casting behaviour that is deprecated.
y_train_binary = (y_train == 'yes').astype(int)

# 5-fold cross-validated search, selecting the combination with the best recall.
gridSearch = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='recall')
gridSearch.fit(X_train, y_train_binary)
print('score: ', gridSearch.best_score_)
print('parameters: ', gridSearch.best_params_)

In [None]:
# Finer search with criterion and splitter fixed and narrower ranges for the
# remaining hyper-parameters.
param_grid = {
    'criterion': ['entropy'],
    'splitter': ['best'],
    'max_depth': [6, 7, 8, 9, 10, 11, 12, 13, 14],
    'min_samples_split': [16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
    'min_impurity_decrease': [0.001, 0.003, 0.005, 0.007, 0.009],
}

# Encode the labels as integers (yes -> 1, no -> 0) for the 'recall' scorer.
# A boolean comparison always yields an integer column, whereas
# replace({'yes': 1, 'no': 0}) on an object Series depends on pandas
# down-casting behaviour that is deprecated.
y_train_binary = (y_train == 'yes').astype(int)

# 5-fold cross-validated search, selecting the combination with the best recall.
gridSearch = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='recall')
gridSearch.fit(X_train, y_train_binary)
print('score: ', gridSearch.best_score_)
print('parameters: ', gridSearch.best_params_)

In [None]:
# Refit the tree with one fixed set of hyper-parameters.
tuned_tree_params = {
    'criterion': 'entropy',
    'splitter': 'best',
    'max_depth': 6,
    'min_samples_split': 16,
    'min_impurity_decrease': 0.005,
}
classTree = DecisionTreeClassifier(**tuned_tree_params)
classTree.fit(X_train, y_train)

# Draw the tuned tree.
plotDecisionTree(classTree, class_names=classTree.classes_, feature_names=X_train.columns)

In [None]:
# Predict the held-out rows with the tuned tree and score the predictions.
y_predict_dt = classTree.predict(X_test)
evaluateBinaryClassification(predictions=y_predict_dt, actuals=y_test)

# Compare

In [None]:
# Overwrite the DT row with the tuned tree's metrics and display the table.
tuned_dt_scores = evaluateBinaryClassification(predictions=y_predict_dt, actuals=y_test)
compare_df.loc['DT'] = tuned_dt_scores
compare_df

In [None]:
# Report how many test rows each method labels 'yes'.
for method_name, method_predictions in (
    ('Random', y_predict_random),
    ('KNN', y_predict_knn),
    ('NB', y_predict_nb),
    ('DT', y_predict_dt),
):
    yes_count = np.sum(method_predictions == 'yes')
    print(f'{method_name} Method number of yes prediction: {yes_count}')