In [1]:
# Welcome! In this notebook we will see how to use sklearn to perform classification
# first let's import the necessary libraries:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
# sklearn is divided into several sub-modules, one per area of functionality; for
# a complete overview see the API reference: http://scikit-learn.org/stable/modules/classes.html

In [2]:
# Next let's load some data to play with. In this notebook we will use
# the Abalone dataset from the UCI repository. The dataset should be put 
# in the same folder as the notebook. 

# One-hot encoding used for the categorical feature 'gender' (first field of each row).
cat_gender = {'I':np.array([1,0,0]), 'M':np.array([0,1,0]), 'F':np.array([0,0,1])}
# Load the raw comma-separated rows. The `with` block closes the file handle
# as soon as reading is done, and blank lines (e.g. a trailing newline) are
# skipped because they would otherwise fail the gender lookup with a KeyError.
with open('abalone.data') as abalone_file:
    abalone_rows = [line.strip().split(',') for line in abalone_file if line.strip()]
# Build one numeric row per sample: the three one-hot gender columns followed
# by the remaining fields parsed as floats. A list comprehension is used
# instead of a bare map() so this also works where map() returns an iterator.
abalone_full = np.array([np.hstack((cat_gender[row[0]], [float(v) for v in row[1:]]))
                         for row in abalone_rows])
# Take last column as label
abalone_labels = abalone_full[:,-1].astype('int')
# The remaining columns are the data
abalone_data = abalone_full[:,:-1]
# We split the data randomly between train and test (half each)
naba = abalone_labels.shape[0]
# shuffle() works in place, so it needs a real list rather than a range object
sel = list(range(naba))
# Fixed seed so the split is the same on every run
np.random.seed(25)
np.random.shuffle(sel)
# Floor division keeps the split point an integer on both Python 2 and 3
half = naba // 2
aba_train_data = abalone_data[sel[:half],:]
aba_train_labs = abalone_labels[sel[:half]]
aba_test_data = abalone_data[sel[half:],:]
aba_test_labs = abalone_labels[sel[half:]]

In [3]:
# K-NN
# We define a function to run the K-NN classifier with a range of 'k' values
def runknn(tr_data, tr_labels, te_data, te_labels):
    """Fit and evaluate a K-NN classifier for every odd k from 1 to 13.

    For each k a fresh KNeighborsClassifier is fitted on the training split
    and its accuracy on the test split is printed.

    Parameters:
        tr_data: training samples, passed to the classifier's fit().
        tr_labels: labels of the training samples.
        te_data: test samples, passed to the classifier's score().
        te_labels: labels of the test samples.

    Returns:
        None; one 'Accuracy (k=..)' line is printed per value of k.
    """
    for k in range(1,15,2):
        # Here we use the KNN classifier from scikit-learn (in the
        # homework you will have to implement it yourselves).
        # First we create a classifier object
        knn = KNeighborsClassifier(k)
        # Then we "train" it with the training data. The function's own
        # arguments are used, so it works for any split that is passed in
        # rather than only for the notebook-level abalone variables.
        knn.fit(tr_data, tr_labels)
        # Finally we compute the accuracy on the test data
        acc = knn.score(te_data, te_labels)
        # Single parenthesised argument: prints the same on Python 2 and 3
        print('Accuracy (k=%d): %.4f' % (k, acc))
# Run the K-NN sweep on the abalone train and test splits
runknn(tr_data=aba_train_data, tr_labels=aba_train_labs,
       te_data=aba_test_data, te_labels=aba_test_labs)

Accuracy (k=1): 0.2030
Accuracy (k=3): 0.2121
Accuracy (k=5): 0.2437
Accuracy (k=7): 0.2623
Accuracy (k=9): 0.2623
Accuracy (k=11): 0.2657
Accuracy (k=13): 0.2666


In [4]:
# Logistic Regression
# Similarly as in KNN, we first create a classifier object, with C=1
LogReg = LogisticRegression(C=1)
# Then we train it with the training data
LogReg.fit(aba_train_data, aba_train_labs)
# And finally use the score function to get the accuracy on the test data
logreg_acc = LogReg.score(aba_test_data, aba_test_labs)
# A single formatted string inside parentheses prints the same text on
# Python 2 and is also valid on Python 3 (the bare print statement is not).
print('Log Reg accuracy %s' % logreg_acc)

Log Reg accuracy 0.244135950215


In [5]:
# But, is C=1 the best regularization parameter? We will
# find out with cross-validation + grid search
# sklearn has a handy class for that too. It lives in sklearn.model_selection;
# the older sklearn.grid_search location is kept as a fallback so the notebook
# still runs on installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
# we define the search space for the C parameter (powers of ten, 10**-7 .. 10**6)
parameters = {'C':[10**i for i in range(-7,7)]}
# and create a grid search object with the type of classifier we want to
# use and the parameters
clf = GridSearchCV(LogisticRegression(), parameters, verbose=False)
# and we train the classifier with grid search (this will use cross-validation
# to select the best parameter)
clf.fit(aba_train_data, aba_train_labs)
# Report test accuracy together with the C value the search selected.
# One formatted string keeps the output text unchanged while being valid
# on both Python 2 and Python 3.
print('Log Reg with Grid search %s with C = %s'
      % (clf.score(aba_test_data, aba_test_labs), clf.best_params_['C']))



Log Reg with Grid search 0.258496888463 with C = 100


In [6]:
# Linear Support Vector Machine
# The interface for the different classifiers is very similar; the Linear SVM
# works in the same way as the logistic regression
clf = LinearSVC(C=1)
clf.fit(aba_train_data, aba_train_labs)
# Parenthesised single-string prints: same text on Python 2, valid on Python 3
print('Linear SVM score %s' % clf.score(aba_test_data, aba_test_labs))
# Now search over C (powers of ten, 10**-3 .. 10**3) with cross-validation
parameters = {'C':[10**i for i in range(-3, 4)]}
clf = GridSearchCV(LinearSVC(), parameters, verbose=False)
clf.fit(aba_train_data, aba_train_labs)
print('Linear SVM score with Grid Search %s with C = %s'
      % (clf.score(aba_test_data, aba_test_labs), clf.best_params_['C']))

Linear SVM score 0.237912876975
Linear SVM score with Grid Search 0.238391574916 with C = 1


In [None]:
# Now we will do the same with the kernelized SVM
clf = SVC(C=1)
clf.fit(aba_train_data, aba_train_labs)
# Parenthesised single-string prints: same text on Python 2, valid on Python 3
print('RBF SVM score %s' % clf.score(aba_test_data, aba_test_labs))
# We can actually define several parameters we want to optimize over:
# here both C (powers of ten, 10**-3 .. 10**3) and the kernel type.
parameters = {'C':[10**i for i in range(-3, 4)], 'kernel':['linear', 'rbf']}
clf = GridSearchCV(SVC(), parameters, verbose=False)
clf.fit(aba_train_data, aba_train_labs)
# The selected values are stored on the fitted search object (clf.best_params_);
# `parameters` is only the plain dict describing the grid and has no such attribute.
print('RBF SVM score with Grid Search %s with C = %s and kernel = %s'
      % (clf.score(aba_test_data, aba_test_labs),
         clf.best_params_['C'], clf.best_params_['kernel']))