# Naive Bayesian (kNN)

## Import required packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def evaluateBinaryClassification(predictions, actuals):
    """Score 'yes'/'no' predictions against the actual labels.

    'yes' is treated as the positive class.

    Parameters
    ----------
    predictions : array-like of str
        Predicted labels ('yes' or 'no').
    actuals : array-like of str
        True labels ('yes' or 'no'), one per prediction. When both arguments
        are pandas Series, ``pd.crosstab`` pairs them by index label, not by
        position, so they should share the same index.

    Returns
    -------
    tuple
        ``(Accuracy, Recall, Precision, FScore)``. A ratio whose denominator
        is zero is returned as NaN.
    """
    labels = ['no', 'yes']

    # Rows are the actual labels, columns are the predicted labels.
    contingency = pd.crosstab(actuals, predictions)
    # Total number of cross-tabulated pairs, taken before restricting the table.
    n = contingency.sum().sum()
    # A label that never occurs (e.g. no 'yes' prediction at all) has no
    # row/column in the crosstab; add it with zero counts so the lookups below
    # cannot raise KeyError.
    contingency = contingency.reindex(index=labels, columns=labels, fill_value=0)

    TP = contingency.loc['yes', 'yes']
    TN = contingency.loc['no', 'no']
    FP = contingency.loc['no', 'yes']   # actual 'no', predicted 'yes'
    FN = contingency.loc['yes', 'no']   # actual 'yes', predicted 'no'

    def ratio(numerator, denominator):
        # Undefined ratios become NaN instead of warning or raising.
        return numerator / denominator if denominator != 0 else np.nan

    Accuracy = ratio(TP + TN, n)
    Recall = ratio(TP, TP + FN)
    Precision = ratio(TP, TP + FP)
    FScore = ratio(2 * Recall * Precision, Recall + Precision)

    return Accuracy, Recall, Precision, FScore
    

In [None]:
# Read the raw customer data from the CSV file next to this notebook
customer_df = pd.read_csv(filepath_or_buffer='Customer Subscription.csv')

# First let's do KNN

In [None]:
# Deal with unknowns: turn the literal string 'unknown' into a real missing
# value (NaN) in the categorical columns listed below.
# The result is assigned back to the frame rather than calling
# `replace(..., inplace=True)` on `customer_df.<column>`: that chained
# in-place call acts on a temporary Series and leaves `customer_df` unchanged
# under pandas Copy-on-Write.
columns_with_unknown = ['job', 'marital', 'education', 'loan', 'default', 'housing']
customer_df[columns_with_unknown] = customer_df[columns_with_unknown].replace('unknown', np.nan)

In [None]:
# Treat the value 999 in pdays as missing (NaN) rather than as a real number.
# Assigned back instead of a chained `inplace=True` call, which does not
# modify `customer_df` under pandas Copy-on-Write.
customer_df['pdays'] = customer_df['pdays'].replace(999, np.nan)

# Task: Classification

We would like to predict the class (subscriber/ no subscriber) of customers.

In [None]:
# Candidate input columns for the classifiers
possible_predictors = [
    'age',
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
]

# Column holding the class label to predict
target = 'y'

# kNN Preprocess

In [None]:
# Class labels
y = customer_df[target]

# One-hot encode the predictor columns; drop_first=True omits the first level
# of each encoded column
predictors_df = customer_df[possible_predictors]
Xs = pd.get_dummies(predictors_df, drop_first=True)

KNN can handle missing values, so we keep them as missing not to create bias in the data.

However, the case of missing values for pdays is different. These values are not missing because of a lack of knowledge on our part; they are missing because of a difference in the population of data objects that leads to them not having a value. In these situations, we will use the MM method to fill the missing values.

#### MM method

We will fill the missing values with the Max+Mean (MM) of the attribute.

In [None]:
# MM fill: replace missing pdays with (max + mean) of the observed pdays values.
# Assigned back instead of a chained `fillna(..., inplace=True)`, which does
# not modify `Xs` under pandas Copy-on-Write.
Xs['pdays'] = Xs['pdays'].fillna(Xs['pdays'].max() + Xs['pdays'].mean())

In [None]:
# Standardize every column of Xs with scikit-learn's StandardScaler
scaler = preprocessing.StandardScaler().fit(Xs)

# transform() returns a plain array, so rebuild the DataFrame with the
# original column names
scaled_values = scaler.transform(Xs)
Xs = pd.DataFrame(scaled_values, columns=Xs.columns)

Xs.describe()

# Set up experimentation 

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    Xs, y, test_size=0.3, random_state=1
)

# Shapes of the four pieces, in the order they were returned
print(*[part.shape for part in (X_train, X_test, y_train, y_test)])

# Feature Selection

In [None]:
# Fit a 1000-tree random forest on the training data to rank the features
rf = RandomForestClassifier(n_estimators=1000, random_state=2).fit(X_train, y_train)

# Forest-level importance per feature, plus its spread across the single trees
importances = rf.feature_importances_
std = np.array([est.feature_importances_ for est in rf.estimators_]).std(axis=0)

# One row per feature, ordered from least to most important
df = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances, 'std': std}
).sort_values('importance')
print(df)

# Horizontal bar chart with the per-tree standard deviation as error bars
ax = df.plot(x='feature', kind='barh', xerr='std', legend=False)
ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Keep the names of the features whose importance is at least 0.05
select_features = df.loc[df['importance'] >= 0.05, 'feature']

# Tuned KNN
Use the tuned KNN to predict

In [None]:
# 1-nearest-neighbour classifier with uniform weights, trained on the selected features
knn = KNeighborsClassifier(n_neighbors=1, weights='uniform')
knn.fit(X_train[select_features], y_train)

# Predict the held-out rows
y_predict_knn = knn.predict(X_test[select_features])

# Cross-tabulation: rows are actual labels, columns are predicted labels
pd.crosstab(y_test, y_predict_knn)

In [None]:
# (Accuracy, Recall, Precision, FScore) of the KNN predictions on the test set
evaluateBinaryClassification(predictions=y_predict_knn, actuals=y_test)

# Compare

In [None]:
Methods = ['Random','KNN','NB']
Metrics = ['Accuracy','Recall','Precision','Fscore']

# One row per method, one column per metric; filled in as results become available
compare_df = pd.DataFrame(index = Methods, columns = Metrics)

# Method 1: KNN
compare_df.loc['KNN'] = evaluateBinaryClassification(y_predict_knn,y_test)

# The random baseline predicts 'yes' exactly as many times as KNN did
number_Yes =  np.sum(y_predict_knn=='yes')

# Method 2: Random
# A permutation of 0..len-1 compared with number_Yes marks exactly number_Yes
# positions as True, at random places. The generator is seeded so the baseline
# is the same on every run.
random_baseline_rng = np.random.RandomState(1)
random_is_yes = random_baseline_rng.permutation(len(y_test)) < number_Yes
# Reuse y_test's index: pd.crosstab pairs two Series by index label, so a
# default 0..n-1 index would only be matched against the test rows that happen
# to carry those labels.
y_predict_random = pd.Series(random_is_yes, index=y_test.index).replace({False:'no',True:'yes'})

# Score the baseline once, then both print and store the result
random_scores = evaluateBinaryClassification(y_predict_random,y_test)
print(random_scores)

compare_df.loc['Random'] = random_scores
compare_df

# Naive Bayesian

## Preprocess

NB can also handle missing values, but it does not need the data to be standardized. So some of the preprocessing steps will look different.

In [None]:
# Class labels
y = customer_df[target]

# Rebuild the one-hot encoded predictors from customer_df; no standardization
# is applied in this cell
Xs = pd.get_dummies(customer_df[possible_predictors], drop_first=True)
# MM fill for pdays: missing values become (max + mean) of the observed values.
# Assigned back instead of a chained `fillna(..., inplace=True)`, which does
# not modify `Xs` under pandas Copy-on-Write.
Xs['pdays'] = Xs['pdays'].fillna(Xs['pdays'].max() + Xs['pdays'].mean())

# Same test_size and random_state as the earlier split
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

We use the same features we selected using Random Forest.

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes model on the selected features
nb = MultinomialNB().fit(X_train[select_features], y_train)

# Predict the held-out rows
y_predict_nb = nb.predict(X_test[select_features])

# Cross-tabulation: rows are actual labels, columns are predicted labels
pd.crosstab(y_test, y_predict_nb)

In [None]:
# (Accuracy, Recall, Precision, FScore) of the NB predictions on the test set
evaluateBinaryClassification(predictions=y_predict_nb, actuals=y_test)

In [None]:
# Store the NB metrics in the comparison table and display it
nb_scores = evaluateBinaryClassification(predictions=y_predict_nb, actuals=y_test)
compare_df.loc['NB'] = nb_scores
compare_df

In [None]:
# Report how many 'yes' predictions each method made
for template, preds in (
    ('Random Method number of yes prediction: {}', y_predict_random),
    ('KNN Method number of yes prediction: {}', y_predict_knn),
    ('NB Method number of yes prediction: {}', y_predict_nb),
):
    print(template.format(np.sum(preds == 'yes')))

In [None]:
# Per-class probabilities from the NB model for every test row
X_test_selected = X_test[select_features]
y_prob = nb.predict_proba(X_test_selected)
y_prob

In [None]:
# Actual and predicted label for each test row
labels_df = pd.DataFrame({'actual': y_test, 'predicted': y_predict_nb})

# The two probability columns from y_prob, indexed like the test rows
probabilities_df = pd.DataFrame(y_prob, index=y_test.index, columns=['No_prob', 'Yes_prob'])

# Side-by-side table, displayed with the highest Yes_prob first
summary_df = pd.concat([labels_df, probabilities_df], axis=1)
summary_df.sort_values('Yes_prob', ascending=False)

In [None]:
# Ten evenly spaced cut-offs between 0.9999999 and 1
Thresholds = np.linspace(start=0.9999999, stop=1, num=10)

# For each cut-off, count the test rows whose Yes_prob exceeds it
for tr in Thresholds:
    BM = summary_df['Yes_prob'] > tr
    yes_count = np.sum(BM)
    print('Number of Yes for threshold {} is {}.'.format(tr, yes_count))

In [None]:
# Relabel the test rows: 'yes' only where Yes_prob exceeds the chosen cut-off
passes_threshold = summary_df['Yes_prob'] > 0.9999999222222222
y_predict_nb = passes_threshold.replace({False: 'no', True: 'yes'})

In [None]:
# Report how many 'yes' predictions each method currently makes
for template, preds in (
    ('Random Method number of yes prediction: {}', y_predict_random),
    ('KNN Method number of yes prediction: {}', y_predict_knn),
    ('NB Method number of yes prediction: {}', y_predict_nb),
):
    print(template.format(np.sum(preds == 'yes')))

In [None]:
# Refresh the 'NB' row from the current y_predict_nb and display the table
nb_scores = evaluateBinaryClassification(predictions=y_predict_nb, actuals=y_test)
compare_df.loc['NB'] = nb_scores
compare_df