In [1]:
# Adapted from http://www.science.smith.edu/~jcrouser/SDS293/
# Original R to Python adaptation by Jordi Warmenhoven

In [2]:
import pandas as pd
import numpy as np

### K-Nearest Neighbors

In [3]:
# Load the S&P Smarket data. The first CSV column is a plain row number
# (1, 2, 3, ...), so it is used as the index as-is; it is not a date and
# must not be passed through date parsing.
df = pd.read_csv('Smarket.csv', index_col=0)
df.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
1,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
3,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
4,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
5,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [4]:
# Inspect column dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1250 entries, 1 to 1250
Data columns (total 9 columns):
Year         1250 non-null int64
Lag1         1250 non-null float64
Lag2         1250 non-null float64
Lag3         1250 non-null float64
Lag4         1250 non-null float64
Lag5         1250 non-null float64
Volume       1250 non-null float64
Today        1250 non-null float64
Direction    1250 non-null object
dtypes: float64(7), int64(1), object(1)
memory usage: 97.7+ KB


In [5]:
from sklearn import neighbors
from sklearn.metrics import confusion_matrix, classification_report

In [6]:
# Hold-out split by calendar year: fit on every row up to and including 2004,
# evaluate on rows from 2005 onward. The frame is indexed by a plain row
# number, so the split filters on the Year column rather than slicing the index.
predictors = ['Lag1', 'Lag2']
is_train = df['Year'] <= 2004
is_test = df['Year'] >= 2005

# .loc selects rows and columns in one step (no chained indexing).
X_train = df.loc[is_train, predictors]
y_train = df.loc[is_train, 'Direction']

X_test = df.loc[is_test, predictors]
y_test = df.loc[is_test, 'Direction']

In [7]:
# K = 1: fit on the training split, then label the held-out rows.
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

In [8]:
# The confusion matrix is transposed before printing so that rows are the
# predicted labels and columns are the true labels.
print(np.transpose(confusion_matrix(y_true=y_test, y_pred=pred)))
# Per-class precision / recall / F1, rounded to three decimals.
print(classification_report(y_true=y_test, y_pred=pred, digits=3))

[[43 58]
 [68 83]]
             precision    recall  f1-score   support

       Down      0.426     0.387     0.406       111
         Up      0.550     0.589     0.568       141

avg / total      0.495     0.500     0.497       252



In [9]:
# Repeat the experiment with K = 3 neighbours on the same split.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Transposed confusion matrix (rows = predicted, columns = true), then the
# per-class precision / recall / F1 table.
print(np.transpose(confusion_matrix(y_true=y_test, y_pred=pred)))
print(classification_report(y_true=y_test, y_pred=pred, digits=3))

[[48 55]
 [63 86]]
             precision    recall  f1-score   support

       Down      0.466     0.432     0.449       111
         Up      0.577     0.610     0.593       141

avg / total      0.528     0.532     0.529       252



### An Application to Caravan Insurance Data

In [10]:
# Load the Caravan insurance data and count how many rows fall in each
# Purchase class, to see how balanced the response is.
df2 = pd.read_csv('Caravan.csv')
df2["Purchase"].value_counts()

No     5474
Yes     348
Name: Purchase, dtype: int64

In [11]:
from sklearn import preprocessing
# Response is the Purchase column; every other column is cast to float64
# and used as a predictor.
y = df2['Purchase']
X = df2.drop('Purchase', axis='columns').astype('float64')
# Put all predictors on a common scale (sklearn's preprocessing.scale) before
# computing KNN distances.
# NOTE(review): scaling uses all rows, i.e. it happens before the train/test
# split further down — confirm this is intended.
X_scaled = preprocessing.scale(X)

In [12]:
# Hold out the first N_TEST observations as the test set and train on the rest.
N_TEST = 1000

# X_scaled is positionally indexed; y is a pandas Series, so use .iloc to make
# the positional slice explicit and independent of the Series index labels.
X_train = X_scaled[N_TEST:, :]
y_train = y.iloc[N_TEST:]

X_test = X_scaled[:N_TEST, :]
y_test = y.iloc[:N_TEST]

In [13]:
# K = 1 on the standardized Caravan predictors: fit on the training rows,
# predict the held-out rows, then print per-class precision / recall / F1.
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred, digits=3))

             precision    recall  f1-score   support

         No      0.946     0.928     0.937       941
        Yes      0.117     0.153     0.132        59

avg / total      0.897     0.882     0.889      1000



In [14]:
# Transposed confusion matrix: rows are predicted labels, columns are true labels.
print(np.transpose(confusion_matrix(y_true=y_test, y_pred=pred)))

[[873  50]
 [ 68   9]]
