## Loading libraries

In [None]:
import pandas as pd
import numpy as np
import datetime
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', None)
import re

## Loading data

In [None]:
# Read the lesson dataset into a DataFrame.
# The CSV is kept in the files_for_lesson_and_activities folder; the relative
# path below only resolves if the file is reachable from the working directory.
data = pd.read_csv('lesson_4.05_data.csv')
# Preview the first rows.
data.head()

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

## Splitting the dataset between the categorical and numerical columns

In [None]:
# Split the columns by dtype: object (text-like) columns are treated as
# categorical features, numeric columns as numerical features.
# The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24, where
# it raises AttributeError; the string alias "object" selects the same columns
# on every NumPy / pandas version.
categoricals = data.select_dtypes(include="object")
numericals = data.select_dtypes(include=np.number)


print("Dataframe of numerical columns: ")
print()

display(numericals)

print("Dataframe of categorical columns: ")
print()

display(categoricals)

In [None]:
# Standardise every numerical column except the last one.
# The last numerical column is left untouched because later cells use it as
# the y variable (average gift).
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so test-set statistics influence the scaling - confirm this is acceptable.

from sklearn.preprocessing import StandardScaler

numeric_features = numericals.iloc[:, :-1]

transformer = StandardScaler()
transformer.fit(numeric_features)
scaled_numericals = transformer.transform(numeric_features)

We now build a dataframe from the output of the standard scaler, which is a NumPy array.

As NumPy arrays don't have column names, we take the column names from the original numerical dataframe.

In [None]:
# The scaler output carries no labels, so re-attach the names of the columns
# that were scaled: every numerical column name except the last one.
scaled_feature_names = list(numericals.columns[:-1])
pdscaled_numericals = pd.DataFrame(scaled_numericals, columns=scaled_feature_names)

In [None]:
# Preview the scaled numerical features.
pdscaled_numericals.head()

## Dummifying categorical columns

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so a column with k categories yields k-1 indicators.
cat_encoded = pd.get_dummies(categoricals, drop_first=True)
# Preview the encoded columns.
cat_encoded.head()

## Alternative way to do it. However, you will have to load the corresponding libraries of sklearn.
#encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals)
#encoded = encoder.transform(categoricals).toarray()

## Putting everything together

In [None]:
# Assemble the modelling table: scaled numerical features, dummy-encoded
# categorical features, and the unscaled target as the LAST column.
# `pdscaled_numericals` was built from `numericals.iloc[:, :-1]`, so it already
# excludes the target. Slicing it with `.iloc[:, :-1]` a second time (as this
# cell used to do) silently dropped the last numerical feature from the model.
target = numericals.iloc[:, -1]
full = pd.concat([pdscaled_numericals, cat_encoded, target], axis=1)

# concat aligns on the index; misaligned inputs would add extra, NaN-filled rows.
assert len(full) == len(numericals), "row count changed while joining features and target"

print("The dataset size is: ", full.shape)
print()
full

In [None]:
# Column names of the assembled table, in order (the target is the last one).
list(full.columns)

In [None]:
# Feature table: every column of `full` except the last one.
feature_names = full.columns.tolist()[:-1]
full_x = pd.DataFrame(full, columns=feature_names)
full_x.head()

In [None]:
# Define the model inputs and the target.
# y: the variable we want to predict (AVGGIFT).
# X: the independent variables, i.e. the scaled numericals and the encoded
#    categoricals collected in `full_x`.

y = full['AVGGIFT']
X = full_x

## Splitting data into train and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

## Training the K-NN model

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# K is set through the "n_neighbors" keyword. fit() returns the estimator
# itself, so the fitted model can be bound in a single statement.
knn_model = KNeighborsRegressor(n_neighbors=4).fit(X_train, y_train)
knn_model

## Getting our predictions

In [None]:
# Predict the target for the held-out features in X_test.
# These predictions are compared against the real test values (y_test) when
# the error metrics are computed.
knn_predictions = knn_model.predict(X_test)

## Getting the error metrics of our K-NN model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Test-set error metrics of the K-NN predictions.
MAE = mean_absolute_error(y_test, knn_predictions)
MSE = mean_squared_error(y_test, knn_predictions)
RMSE = np.sqrt(MSE)  # same units as the target
R2 = r2_score(y_test, knn_predictions)

# One report line per metric: (printf-style template, value).
knn_report = [
    ("The mean absolute error of the model in the test set is: %6.2f", MAE),
    ("The mean squared error of the model in the test set is: %6.2f", MSE),
    ("The root mean squared error of the model in the test set is: %6.2f", RMSE),
    ("The R2 of the model in the test set is: %4.2f", R2),
]
for template, value in knn_report:
    print(template % (value))

# Activity

Train a linear model and compare the performance of both models in the test set.


In [None]:
from sklearn import linear_model

# Ordinary least-squares baseline, trained on the same split as the K-NN model.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# fit() returns the estimator itself, so both names refer to the fitted model.
lm_model = lm

In [None]:
# Linear-model predictions for the held-out features in X_test.
lm_predictions = lm_model.predict(X_test)

In [None]:
# Test-set error metrics of the linear model.
# These reuse (and therefore overwrite) the names MAE, MSE, RMSE and R2.
MAE = mean_absolute_error(y_test, lm_predictions)
MSE = mean_squared_error(y_test, lm_predictions)
RMSE = np.sqrt(MSE)  # same units as the target
R2 = r2_score(y_test, lm_predictions)

# One report line per metric: (printf-style template, value).
lm_report = [
    ("The mean absolute error of the model in the test set is: %6.2f", MAE),
    ("The mean squared error of the model in the test set is: %6.2f", MSE),
    ("The root mean squared error of the model in the test set is: %6.2f", RMSE),
    ("The R2 of the model in the test set is: %4.2f", R2),
]
for template, value in lm_report:
    print(template % (value))

K 
Let's try to find the best K value from a pre-defined set of values.

In [None]:
# R2 of one K-NN model per candidate K (2, 3, ..., 9), stored in that order.
# NOTE(review): K is scored on the test set here, so the chosen K is tuned to
# the same data used for the final evaluation - confirm this is intended.
scores = []
for k in range(2, 10):
    model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

Now let's plot how the score changes as a function of K.

In [None]:
# R2 against K. The x values must be the same K values that were used to
# fill `scores`.
k_values = range(2, 10)
marker_style = dict(marker='o', markerfacecolor='red', markersize=10)

plt.figure(figsize=(10, 6))
plt.plot(k_values, scores, color='blue', linestyle='dashed', **marker_style)
plt.title('R2 vs. K Value')
plt.xlabel('K')
plt.ylabel('R2')

#threshold = 0.01 # minimum difference we want between two R2 consecutive values.
#delta = 10. # difference between two consecutive R2 values.
#k = 3 # initival value
#old_score = 0.

#while (delta >= threshold):
#  knn_model = KNeighborsRegressor(n_neighbors=k) # The keyword "n_neighbors" is what sets the K.
#  knn_model.fit(X_train, y_train)
#  knn_predictions = knn_model.predict(X_test) 
#  new_score = r2_score(y_test,knn_predictions)
#  delta = new_score - old_score
#  k = k + 4 
#  old_score = new_score

# Activity

If you think a little bit about it, the number of neighbors might be very important for our results, but will it be the only parameter that matters? Go to the documentation and check the parameters and the values they can take, pick the one you think is more relevant and change its value in the model. Hint: If K (number of neighbors) is the most important one, maybe we could measure the way these K instances affect our prediction

In [None]:
# Reference model: K=9 with the estimator's default `weights` setting.
# The last expression shows its score on the test set.
uniform_model = KNeighborsRegressor(n_neighbors=9).fit(X_train, y_train)
uniform_model.score(X_test, y_test)

In [None]:
# Same K as the reference model, but with weights="distance".
# The last expression shows its score on the test set.
distance_model = KNeighborsRegressor(n_neighbors=9, weights="distance").fit(X_train, y_train)
distance_model.score(X_test, y_test)

# Activity

Let's visualize how KNN actually works. First of all, install the mlxtend library and create a dataframe containing the two most relevant numerical variables and the target, in that order. Once you have done that, take a sample of n = 100 rows and pass that sample to the function below with an arbitrary k:

In [None]:
from mlxtend.plotting import plot_decision_regions

In [None]:
def knn_comparison(data, k):
    """Fit a K-NN regressor on two features and plot the regions it predicts.

    Parameters
    ----------
    data : pandas.DataFrame
        The first two columns are used as features and the last column as the
        target; the target is cast to ``int`` before fitting.
    k : int
        Number of neighbours passed to ``KNeighborsRegressor``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    features = data.iloc[:, :2].to_numpy()
    target = data.iloc[:, -1].astype(int).to_numpy()

    regressor = KNeighborsRegressor(n_neighbors=k)
    regressor.fit(features, target)

    plt.figure(figsize=(16,12))
    plot_decision_regions(features, target, clf=regressor)
    plt.title("Knn with K="+ str(k), fontsize = 18)
    plt.show()

In [None]:
# Two numerical features followed by the target (target last, as
# knn_comparison expects), reduced to a reproducible 100-row sample.
plot_columns = ['HV1', 'IC1', 'AVGGIFT']
new = pd.concat([X, y], axis=1)[plot_columns].sample(n=100, random_state=100)
new.head()

In [None]:
# Plot the regions predicted by a K-NN model with K=4 on the 100-row sample.
knn_comparison(new,4)

What can you see in the plot? 

Now try to create a function plot_knn_boundaries to loop over the previous function and iterate over the ks = [1, 3, 5, 10, 25, 50]. And now, can you tell the difference between the plots?

In [None]:
def plot_knn_boundaries(data, ks = [1, 3, 5, 10, 25, 50]):
    """Draw one ``knn_comparison`` plot of ``data`` for each value in ``ks``.

    Parameters
    ----------
    data : object
        Passed unchanged to ``knn_comparison`` on every call.
    ks : iterable of int
        Values of K to plot, in the order given.
    """
    # The default list is only read, never modified, so sharing it between
    # calls is harmless.
    for n_neighbors in ks:
        knn_comparison(data, n_neighbors)

The lower the number of k the more over-fitted it will be. We can see that with k = 1, the boundaries are very clear and as we increase k the plots start turning very messy until the last two plots, where it is oversimplified.

In [None]:
# One plot per default K value (1, 3, 5, 10, 25, 50) on the 100-row sample.
plot_knn_boundaries(new)

R2adj can be computed as:

In [None]:
# Adjusted R2 penalises R2 for the number of predictors p, given n observations:
#     R2_adj = 1 - (1 - R2) * (n - 1) / (n - p - 1)
# `r_squared` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel. It is computed here from the linear model's
# test-set predictions.
# NOTE(review): swap in knn_predictions if the K-NN model was the intended one.
r_squared = r2_score(y_test, lm_predictions)
n_obs, n_features = X_test.shape
score_adj = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
score_adj

# Activity:

From all the regression metrics we have seen, which one do you think is the one to use in most cases?

Calculate and plot R2, MSE, RMSE, and MAE.

We have seen R2, MSE, RMSE and MAE. Of course, there is not a magic solution for which you should always use it, but there are some details worth knowing:

*  R2 is scaled, which means that it is independent of the scale of the data. This would be the one to go with if we don't know a lot about the data and want general information about our model. However, it can be misleading, as it is supposed to be between 0 and 1 but sometimes it is not; for example, it can be negative when the model fits worse than always predicting the mean (you can read about it here). In fact, R2 is a biased estimator (more information here).

*  MAE is the most neutral of the regression metrics: it measures the mean of the absolute distances between predicted and real values (errors), so it does not give special weight to really bad predictions. If that's what we want, this metric should do great.

*  MSE - It is the mean of the squared errors; squaring gives more weight to the bad predictions.

*  RMSE - Root MSE, essentially it is the same but it is easier to understand within the data context

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Fit a linear regression on the training split and predict the test split.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute each metric once and reuse it; MSE and R2 were previously
# recomputed for the RMSE and adjusted-R2 lines.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), with n test rows and
# p feature columns.
n_obs = len(y_test)
n_features = X_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R2:', r2)
# The label used to read 'Adjusted R:', but the value printed is adjusted R2.
print('Adjusted R2:', adjusted_r2)