In [1]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('seaborn-ticks')
%matplotlib inline

import gc
import math

  from pandas.core import datetools


In [2]:
medical = pd.read_pickle("data/medical.p")
print(medical.UCURNINS.unique())
medical["UCURNINS"] = (medical.UCURNINS=="Yes").astype(int)
print(medical.UCURNINS.unique())
features = ["UMARSTAT", "USATMED", "URELATE", "REGION", "FHOSP", "FDENT", "FEMER", "FDOCT", "UIMMSTAT", "UAGE", "U_FTPT", "U_WKSLY", "U_USHRS", "HOTHVAL", "HRETVAL", "HSSVAL", "HWSVAL", "UBRACE", "UEDUC3", "GENDER"]
levCols = []
numCols = []
for col in features:
    if medical[col].dtype==object:
        levCols.append(col)
    else:
        numCols.append(col)

# Binaryzacja zmiennych nominalnych
dummLev = pd.get_dummies(medical[levCols], drop_first=True)
dummLev.shape
medical2 = pd.concat([medical[numCols], dummLev], axis=1)
print(medical.shape, math.sqrt(medical.shape[0]))

['Yes' 'No']
[1 0]
(35072, 29) 187.275198571514


At this stage we have our dataset prepared. With all nominal variables in binary form. 

We have around 35 k rows. That is a considerable amount so chosing $k=\sqrt{N}$ might be too high. Maybe lets start with 50.

First we need to standarize our data.

In [3]:
medical2[numCols] = medical2[numCols].apply(lambda x: (x-x.mean())/x.std())
features = medical2.columns.tolist()

In [4]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import neighbors

In [5]:
kf = KFold(n_splits=5)
probs = []
indicies = []
aucs = []
bacc = []
accs = []
n_neighbors = 50
clf = neighbors.KNeighborsClassifier(n_neighbors, n_jobs=-1, p=2)
for train, test in kf.split(medical2.index.values):
    clf.fit(medical2.iloc[train][features].values, medical.iloc[train]["UCURNINS"].values)
    prob = clf.predict_proba(medical2.iloc[test][features].values)
    probs.append(prob)
    indicies.append(test)
    aucs.append(metrics.roc_auc_score(medical.iloc[test]["UCURNINS"].values, prob[:,1]))
    accs.append(metrics.accuracy_score(medical.iloc[test]["UCURNINS"].values, (prob[:,1]>0.5).astype(int)))
print(np.mean(aucs))
print(aucs)
print(np.mean(accs))
print(accs)

0.8100906626566241
[0.8068274352412889, 0.8227779243154852, 0.8068194266434432, 0.7989174537503217, 0.8151110733325813]
0.8636806785573544
[0.8498930862437634, 0.8687099073414113, 0.8671228970630168, 0.8554319931565441, 0.8772455089820359]


In [6]:
n_neighbors = 25
clf = neighbors.KNeighborsClassifier(n_neighbors, n_jobs=-1, p=2)
for train, test in kf.split(medical2.index.values):
    clf.fit(medical2.iloc[train][features].values, medical.iloc[train]["UCURNINS"].values)
    prob = clf.predict_proba(medical2.iloc[test][features].values)
    probs.append(prob)
    indicies.append(test)
    aucs.append(metrics.roc_auc_score(medical.iloc[test]["UCURNINS"].values, prob[:,1]))
    accs.append(metrics.accuracy_score(medical.iloc[test]["UCURNINS"].values, (prob[:,1]>0.5).astype(int)))
print(np.mean(aucs))
print(aucs)
print(np.mean(accs))
print(accs)

0.8036455110439705
[0.8068274352412889, 0.8227779243154852, 0.8068194266434432, 0.7989174537503217, 0.8151110733325813, 0.7938441819174347, 0.8082769313070204, 0.7968999042853857, 0.7866600007247408, 0.8003207789220039]
0.8642937015694707
[0.8498930862437634, 0.8687099073414113, 0.8671228970630168, 0.8554319931565441, 0.8772455089820359, 0.8507483962936565, 0.8704205274411975, 0.8671228970630168, 0.8579982891360137, 0.8782435129740519]


For now 50 seems to be the best value.

Now lets see if changing our distance metric from Euclidesian to Manhatan distance is any better.

In [7]:
n_neighbors = 50
clf = neighbors.KNeighborsClassifier(n_neighbors, n_jobs=-1, p=1)
for train, test in kf.split(medical2.index.values):
    clf.fit(medical2.iloc[train][features].values, medical.iloc[train]["UCURNINS"].values)
    prob = clf.predict_proba(medical2.iloc[test][features].values)
    probs.append(prob)
    indicies.append(test)
    aucs.append(metrics.roc_auc_score(medical.iloc[test]["UCURNINS"].values, prob[:,1]))
    accs.append(metrics.accuracy_score(medical.iloc[test]["UCURNINS"].values, (prob[:,1]>0.5).astype(int)))
print(np.mean(aucs))
print(aucs)
print(np.mean(accs))
print(accs)

0.8106434294762069
[0.8068274352412889, 0.8227779243154852, 0.8068194266434432, 0.7989174537503217, 0.8151110733325813, 0.7938441819174347, 0.8082769313070204, 0.7968999042853857, 0.7866600007247408, 0.8003207789220039, 0.8200906612887722, 0.8342881707866624, 0.8208078470921378, 0.8137184018116637, 0.8342912507241628]
0.8648401964018202
[0.8498930862437634, 0.8687099073414113, 0.8671228970630168, 0.8554319931565441, 0.8772455089820359, 0.8507483962936565, 0.8704205274411975, 0.8671228970630168, 0.8579982891360137, 0.8782435129740519, 0.8528866714183891, 0.8702779757662152, 0.8714000570287995, 0.8570002851439977, 0.8781009409751924]


Choosing a different metric did not change a lot. Lets see how much improvement can we get by chosing fewer variables.

In [8]:
feat2 = ["USATMED", "FDENT", "UEDUC3", "FDOCT", 'UMARSTAT_Married_live together','UIMMSTAT_Foreign-born, non-citizen',
 'UIMMSTAT_US-born citizen']
n_neighbors = 30
clf = neighbors.KNeighborsClassifier(n_neighbors, n_jobs=-1, p=1)
for train, test in kf.split(medical2.index.values):
    clf.fit(medical2.iloc[train][feat2].values, medical.iloc[train]["UCURNINS"].values)
    prob = clf.predict_proba(medical2.iloc[test][feat2].values)
    probs.append(prob)
    indicies.append(test)
    aucs.append(metrics.roc_auc_score(medical.iloc[test]["UCURNINS"].values, prob[:,1]))
    accs.append(metrics.accuracy_score(medical.iloc[test]["UCURNINS"].values, (prob[:,1]>0.5).astype(int)))
print(np.mean(aucs))
print(aucs)
print(np.mean(accs))
print(accs)

0.8077414892545176
[0.8068274352412889, 0.8227779243154852, 0.8068194266434432, 0.7989174537503217, 0.8151110733325813, 0.7938441819174347, 0.8082769313070204, 0.7968999042853857, 0.7866600007247408, 0.8003207789220039, 0.8200906612887722, 0.8342881707866624, 0.8208078470921378, 0.8137184018116637, 0.8342912507241628, 0.799687035079033, 0.8133730986451371, 0.7734201269074457, 0.7939119577993796, 0.8147861245162522]
0.8655482335400473
[0.8498930862437634, 0.8687099073414113, 0.8671228970630168, 0.8554319931565441, 0.8772455089820359, 0.8507483962936565, 0.8704205274411975, 0.8671228970630168, 0.8579982891360137, 0.8782435129740519, 0.8528866714183891, 0.8702779757662152, 0.8714000570287995, 0.8570002851439977, 0.8781009409751924, 0.8561653599429794, 0.8746970776906628, 0.8699743370402053, 0.8568577131451383, 0.8806672369546621]


As we can see the results is not much worse. Additionally for fairly large datasets our estimator is rather stable with different values of $k$.

## Exercises 6
### Exercise 6.1

Import titanic dataset, divide it into training and test data. Apply the KNN method to predict the value of the variable ‘survived’, test a few values of parameter k. Check whether the rescaling of variables allows to get better forecasts in the test sample.

### Exercise 6.2

Find the optimal value for the parameter k using cross validation.

### Exercise 6.3

Calculate the prediction error on the train and test sample for k = 1, 2, …. 100, and plot results. What conclusions can be drawn from this result?