# k-Nearest Neighbors (kNN)

We use CustomerLoan.csv for the first case study in this lab.

## Import required packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


## Load the dataset

In [None]:
# Load the loan data set from the working directory and display it.
customer_df = pd.read_csv('CustomerLoan.csv')
customer_df

## Get to know the data

In [None]:
# (number of rows, number of columns) of the loaded frame.
customer_df.shape

In [None]:
# Summary statistics of the numeric columns.
customer_df.describe()

In [None]:
# Draw one histogram per numerical attribute, each in its own figure.
Numerical_attributes = ['income', 'score']
for col in Numerical_attributes:
    customer_df[col].plot.hist()
    plt.title(col)
    plt.show()

### Relationships between the attributes

In [None]:
# Scatter plot of the two predictors to eyeball how they relate.
customer_df.plot(kind='scatter', x='income', y='score')
plt.show()

In [None]:
# Pairwise correlation of the numerical attributes, rounded to two decimals.
customer_df[Numerical_attributes].corr().round(2)

In [None]:
# Bin income into 3 intervals and cross-tabulate the bins against the label.
income_discretized = pd.cut(customer_df['income'], bins=3)
contingency_tbl = pd.crosstab(customer_df['default'], income_discretized)
# Divide every column by its total so each income bin sums to 1.
probablity_tbl = contingency_tbl.div(contingency_tbl.sum(), axis='columns')
sns.heatmap(probablity_tbl, annot=True, center=0.5, cmap="Greys")
plt.show()

In [None]:
# Bin the credit score into 3 intervals and cross-tabulate the bins against the label.
score_discretized = pd.cut(customer_df['score'], bins=3)
contingency_tbl = pd.crosstab(customer_df['default'], score_discretized)
# Divide every column by its total so each score bin sums to 1.
probablity_tbl = contingency_tbl.div(contingency_tbl.sum(), axis='columns')
sns.heatmap(probablity_tbl, annot=True, center=0.5, cmap="Greys")
plt.show()

There is no high correlation between the predictors, which means there is no data redundancy.

## Classification Purpose
We want to create a classification model that predicts whether a loan will default, based on income and credit score.

In [None]:
# A single unseen applicant to classify, stored as a one-row frame.
newCustomer = pd.DataFrame({'income': [98487], 'score': [785]})
newCustomer

In [None]:
# Scatter plot of the labelled customers plus the new (unlabelled) customer.
fig, ax = plt.subplots()

# Compare labels case-insensitively: the original filters used 'Yes' and 'NO',
# so whichever casing the CSV does not use would silently plot nothing.
default_label = customer_df['default'].astype(str).str.upper()

subset = customer_df.loc[default_label == 'YES']
ax.scatter(subset.income, subset.score, marker='o', label='Default-YES', color='C1')

subset = customer_df.loc[default_label == 'NO']
ax.scatter(subset.income, subset.score, marker='D', label='Default-NO', color='C0')

ax.scatter(newCustomer.income, newCustomer.score, marker='*', label='New Customer', color='black', s=150)

ax.set_xlabel('income')  # set x-axis label
ax.set_ylabel('score')  # set y-axis label

# Write each customer's name next to its marker, shifted slightly from the point itself.
for _, row in customer_df.iterrows():
    ax.annotate(row.Name, (row.income - 700, row.score - 10))

# loc=4 places the legend in the lower-right corner.
ax.legend(loc=4)

plt.show()

In [None]:
# Standardize the predictors to zero mean and unit variance
from sklearn import preprocessing

predictor_cols = ['income', 'score']  # a list of column names selects a DataFrame, not a Series
scaler = preprocessing.StandardScaler()

Xs = pd.DataFrame(scaler.fit_transform(customer_df[predictor_cols]),
                  columns=predictor_cols)
y = customer_df.default

# Scale the new customer with the scaler fitted on the existing customers.
newCustomer_str = pd.DataFrame(scaler.transform(newCustomer),
                               columns=predictor_cols)

Use k-nearest neighbour

In [None]:
from sklearn.neighbors import NearestNeighbors

# Look up the 3 standardized customers closest to the new customer.
knn = NearestNeighbors(n_neighbors=3).fit(Xs)
distances, indices = knn.kneighbors(newCustomer_str)
# kneighbors returns one row of neighbour positions per query point; there is a single query here.
nearest_rows = indices[0]
print(Xs.iloc[nearest_rows])

In [None]:
# Show the same neighbours as rows of the original (unscaled) frame, with all columns.
print(customer_df.iloc[indices[0], :]) 

# Predict

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour classifier on the standardized data and classify the new customer.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(Xs, y)
knn.predict(newCustomer_str)

# Second Case Study
Who will subscribe for a long term deposit?

We will use Customer Subscription.csv from https://www.kaggle.com/rashmiranu/banking-dataset-classification

In [None]:
# Load the second data set; note that this rebinds customer_df, replacing the loan data.
customer_df = pd.read_csv('Customer Subscription.csv')
customer_df

# Get to know the data

In [None]:
categorical_attributes = [
    'job', 'marital', 'education', 'default', 'loan',
    'contact', 'month', 'day_of_week', 'y',
]
# One horizontal bar chart of category frequencies per attribute.
for col in categorical_attributes:
    customer_df[col].value_counts().plot.barh()
    plt.title(col)
    plt.show()

In [None]:
# Deal with unknowns: recode the 'unknown' placeholder as a real missing value.
# The result is assigned back to customer_df because
# `customer_df.col.replace(..., inplace=True)` acts on a temporary Series and
# does not reach the frame under pandas Copy-on-Write.
# NOTE(review): the original listed 'job' twice; 'housing' (used as a predictor
# later) may have been the intended sixth column — confirm before adding it.
columns_with_unknown = ['job', 'marital', 'education', 'loan', 'default']
customer_df[columns_with_unknown] = customer_df[columns_with_unknown].replace('unknown', np.nan)

In [None]:
# Redraw the category frequencies now that 'unknown' has been recoded as missing.
for col in categorical_attributes:
    customer_df[col].value_counts().plot.barh()
    plt.title(col)
    plt.show()

In [None]:
numerical_attributes = ['age', 'duration', 'campaign', 'previous', 'pdays']
# One histogram per numerical attribute, each in its own figure.
for col in numerical_attributes:
    customer_df[col].plot.hist()
    plt.title(col)
    plt.show()

In [None]:
# Recode the value 999 in pdays as missing. The result is assigned back because
# `customer_df.pdays.replace(..., inplace=True)` acts on a temporary Series and
# does not reach the frame under pandas Copy-on-Write.
customer_df['pdays'] = customer_df['pdays'].replace(999, np.nan)

In [None]:
# Histogram of pdays after 999 has been recoded as missing.
customer_df.pdays.plot(kind='hist')

In [None]:
# Grid of pairwise plots for the numerical attributes.
sns.pairplot(customer_df[numerical_attributes])
plt.show()

In [None]:
# Heatmap of the column-normalised contingency table for every unordered pair
# of distinct categorical attributes (each pair is drawn once).
for pos, col1 in enumerate(categorical_attributes):
    for col2 in categorical_attributes[pos + 1:]:
        contingency_tbl = pd.crosstab(customer_df[col1], customer_df[col2])
        probablity_tbl = contingency_tbl.div(contingency_tbl.sum(), axis='columns')
        sns.heatmap(probablity_tbl, annot=True, center=0.5, cmap="Greys")
        plt.show()

In [None]:
# Heatmap of every numerical attribute (cut into 3 bins) against every
# categorical attribute, normalised so each bin's column sums to 1.
for col1 in numerical_attributes:
    # The binning depends only on the numerical column, so compute it once
    # per numerical attribute rather than once per (numerical, categorical) pair.
    col_discretized = pd.cut(customer_df[col1], bins=3)
    for col2 in categorical_attributes:
        contingency_tbl = pd.crosstab(customer_df[col2], col_discretized)
        probablity_tbl = contingency_tbl / contingency_tbl.sum()
        sns.heatmap(probablity_tbl, annot=True, center=0.5, cmap="Greys")
        plt.show()

# Task: Classification

We would like to predict the class (subscriber/ no subscriber) of customers.

In [None]:
# Candidate input columns for the model; a mix of numerical and categorical attributes.
possible_predictors = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome']
# Name of the column holding the class label.
target = 'y'


# Preprocess

In [None]:
# Class labels.
y=customer_df[target]

# One-hot encode the non-numeric predictors; drop_first=True drops the first level of each
# attribute so only k-1 dummy columns are produced for k categories.
Xs = pd.get_dummies(customer_df[possible_predictors],drop_first=True)
Xs

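After one-hot encoding, a missing categorical value simply becomes a row of zeros across that attribute's dummy columns, so KNN can still use those rows. We therefore keep these values as missing rather than imputing them, so as not to create bias in the data.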

However, the case of missing values for pdays is different. The values are not missing because of our lack of knowledge; they are missing because of a difference in the population of data objects that leads to them not having a value. In these situations, we will use the MM method to fill the missing values.

#### MM method

We will fill the missing values with the Max + Mean (MM) of the attribute.

In [None]:
# MM fill: replace missing pdays with max + mean of the observed values.
# The result is assigned back because `Xs.pdays.fillna(..., inplace=True)` acts
# on a temporary Series and does not reach the frame under pandas Copy-on-Write.
Xs['pdays'] = Xs['pdays'].fillna(Xs['pdays'].max() + Xs['pdays'].mean())

In [None]:
# Histogram of pdays after the missing values have been filled.
Xs.pdays.plot(kind='hist')

In [None]:
# standardize data
scaler = preprocessing.StandardScaler()

# Fit on every column of the encoded predictor frame.
# NOTE(review): the scaler is fitted on all rows before the train/test split below is made,
# so test-set statistics influence the scaling — consider fitting on the training rows only.
scaler.fit(Xs)

# Rebuild a DataFrame so the column names survive the transform, then inspect the result.
Xs = pd.DataFrame(scaler.transform(Xs),columns =Xs.columns)
Xs.describe()

# Set up experimentation 

In [None]:
# Hold out 30% of the rows for the final evaluation. A fixed random_state makes
# the split (and everything derived from it) repeat on a fresh Run All.
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Feature Selection

In [None]:
# Rank the features with a random forest fitted on the training split.
# random_state is fixed so that the importances, and the feature subset
# selected from them afterwards, do not change between runs.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
# Spread of each feature's importance across the individual trees of the forest.
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

df = pd.DataFrame({'feature': X_train.columns, 'importance': importances, 'std': std})
df = df.sort_values('importance')
print(df)

# Horizontal bars with the per-tree standard deviation as error bars.
ax = df.plot(kind='barh', xerr='std', x='feature', legend=False)
ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Keep the names of the features whose forest importance is at least 0.01.
select_features=df[df.importance>=0.01].feature

In [None]:
# Untuned KNN (library defaults) on the selected features, scored on the test split.
knn = KNeighborsClassifier()
knn.fit(X_train[select_features], y_train)
predict_y = knn.predict(X_test[select_features])

In [None]:
# Confusion matrix: rows are the actual labels, columns are the predicted labels.
contigency = pd.crosstab(y_test,predict_y)
contigency

In [None]:
# Read the four confusion-matrix cells (row = actual label, column = predicted label).
TP = contigency.loc['yes', 'yes']
TN = contigency.loc['no', 'no']
FP = contigency.loc['no', 'yes']   # actual no, predicted yes
FN = contigency.loc['yes', 'no']   # actual yes, predicted no
n = contigency.to_numpy().sum()

Acuracy = (TP + TN) / n
Recall = TP / (TP + FN)
Precision = TP / (TP + FP)
FScore = 2 * Recall * Precision / (Recall + Precision)

print(f'Accuracy= {Acuracy}.')
print(f'Recall= {Recall}.')
print(f'Precision= {Precision}.')
print(f'FScore= {FScore}.')

In [None]:
def evaluateBinaryClassification(predictions, actuals, positive='yes', negative='no'):
    """Compute accuracy, recall, precision and F-score for a binary classifier.

    Predictions and actuals are paired by position. (Handing two pandas Series
    to ``pd.crosstab`` pairs them by index label instead, which silently drops
    or mis-pairs rows when the indexes differ, e.g. a freshly built Series of
    predictions against a shuffled ``y_test``.)

    Parameters
    ----------
    predictions : array-like
        Predicted class labels.
    actuals : array-like
        True class labels, in the same order and of the same length as
        ``predictions``.
    positive : label, default 'yes'
        Label counted as the positive class.
    negative : label, default 'no'
        Label counted as the negative class.

    Returns
    -------
    tuple of float
        ``(Accuracy, Recall, Precision, FScore)``. A ratio whose denominator
        is zero (for example precision when nothing is predicted positive)
        is reported as 0.0 instead of raising or producing NaN.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in shape.
    """
    # Strip any pandas index so the comparison is strictly positional.
    predicted = np.asarray(predictions)
    observed = np.asarray(actuals)
    if predicted.shape != observed.shape:
        raise ValueError(
            'predictions and actuals must have the same shape, got {} and {}'.format(
                predicted.shape, observed.shape))
    n = observed.size
    if n == 0:
        raise ValueError('cannot evaluate an empty set of predictions')

    predicted_pos = predicted == positive
    predicted_neg = predicted == negative
    observed_pos = observed == positive
    observed_neg = observed == negative

    # Counting with boolean masks works even when a class never occurs,
    # where indexing a contingency table would raise KeyError.
    TP = int(np.sum(predicted_pos & observed_pos))
    TN = int(np.sum(predicted_neg & observed_neg))
    FP = int(np.sum(predicted_pos & observed_neg))
    FN = int(np.sum(predicted_neg & observed_pos))

    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    Acuracy = (TP + TN) / n
    Recall = _ratio(TP, TP + FN)
    Precision = _ratio(TP, TP + FP)
    FScore = _ratio(2 * Recall * Precision, Recall + Precision)

    return Acuracy, Recall, Precision, FScore
    

In [None]:
# Same metrics as the inline calculation, now via the helper; returns (Accuracy, Recall, Precision, FScore).
evaluateBinaryClassification(predict_y,y_test)

# Tune KNN
Parameters:

    n_neighbors : int, default=5
    Number of neighbors to use by default for kneighbors queries.

    weights : {'uniform', 'distance'}, default='uniform'
    Weight function used in prediction.

In [None]:
# Create tuning (validation) set: divide the training set.
# A fixed random_state keeps the tuning results repeatable between runs.
X_train_s, X_tune, y_train_s, y_tune = train_test_split(X_train, y_train, test_size=0.3,
                                                        random_state=42)

print('X_train Shape: ', X_train.shape)
print('y_train Shape: ', y_train.shape)

print('X_train_s Shape: ', X_train_s.shape)
print('X_tune Shape: ', X_tune.shape)
print('y_train_s Shape: ', y_train_s.shape)
print('y_tune Shape: ', y_tune.shape)

In [None]:
# Create a placeholder for experimentations
num_repetition = 1

n_neighbors_options = range(1, 10)
weights_options = ['uniform', 'distance']

# Rows: every (n_neighbors, weights) combination. Columns: one per repetition (R0, R1, ...).
my_index = pd.MultiIndex.from_product(
    [n_neighbors_options, weights_options],
    names=('n_neighbors', 'weights'),
)
repetition_cols = [f'R{rep}' for rep in range(num_repetition)]
tune_df = pd.DataFrame(index=my_index, columns=repetition_cols)

tune_df

In [None]:
# Grid search: fit a KNN for every (n_neighbors, weights) combination on the reduced
# training split and score it on the tuning split, once per repetition column.
for neighbor_o in n_neighbors_options:
    for weights_o in weights_options:
        for rep in tune_df.columns:
            knn = KNeighborsClassifier(n_neighbors=neighbor_o,weights=weights_o)
            knn.fit(X_train_s[select_features], y_train_s)
            predict_y = knn.predict(X_tune[select_features])
            metrics = evaluateBinaryClassification(predict_y,y_tune)
            
            # metrics is (Accuracy, Recall, Precision, FScore), so index 1 stores Recall.
            # NOTE(review): this comment previously read "tune based on precision";
            # use metrics[2] instead if precision was actually intended.
            tune_df.at[(neighbor_o,weights_o),rep] = metrics[1]
        # Progress indicator: one line per finished combination.
        print(neighbor_o,weights_o)
            

In [None]:
# Best-scoring parameter combinations first (by the first repetition column).
tune_df.sort_values('R0',ascending=False)

# Tuned KNN
Use the tuned KNN to predict

In [None]:
# Refit on the full training split with fixed hyper-parameters and score on the held-out test split.
# NOTE(review): n_neighbors=1 / weights='uniform' are hard-coded, presumably copied from the top row
# of the tuning table — confirm they still match after re-running the tuning.
knn = KNeighborsClassifier(n_neighbors=1,weights='uniform').fit(X_train[select_features], y_train)
predict_y = knn.predict(X_test[select_features])
evaluateBinaryClassification(predict_y,y_test)

In [None]:
# Confusion matrix of the tuned model: rows are actual labels, columns are predicted labels.
pd.crosstab(y_test,predict_y)

In [None]:
# Scratch demo of a random labelling: a random permutation of the positions 0..len(y_test)-1 is
# thresholded at 1000, so (when len(y_test) >= 1000) exactly 1000 entries become 'yes' and the rest 'no'.
pd.Series(np.random.permutation(len(y_test))<1000).replace({False:'no',True:'yes'})


# Compare

In [None]:
Methods = ['Random','KNN']
Metrics = ['Accuracy','Recall','Precision','Fscore']

compare_df = pd.DataFrame(index = Methods, columns = Metrics)

# Method 1: KNN (predict_y holds the tuned model's test-set predictions)
compare_df.loc['KNN'] = evaluateBinaryClassification(predict_y,y_test)

number_Yes =  np.sum(predict_y=='yes')

# Method 2: random baseline that predicts 'yes' for exactly as many rows as KNN did.
# - Stored under its own name so predict_y keeps the KNN predictions and this
#   cell gives the same KNN row when it is re-run.
# - Built as a plain array (not a RangeIndex Series) so it is paired with y_test
#   by position rather than by index label.
# - Seeded so the baseline numbers repeat between runs.
random_rank = np.random.default_rng(42).permutation(len(y_test))
random_predict_y = np.where(random_rank < number_Yes, 'yes', 'no')

random_metrics = evaluateBinaryClassification(random_predict_y,y_test)
print(random_metrics)

compare_df.loc['Random'] = random_metrics
compare_df