In [None]:
# Loading wine dataset and splitting into a train and test set

In [94]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
wine = load_wine()
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(wine.data,
                                                            wine.target,
                                                            random_state=1103)


In [None]:
# Checking shape

In [95]:
print(X_train_w.shape, X_test_w.shape)

(133, 13) (45, 13)


In [None]:
# Loading USPS dataset and splitting into a train and test set

In [101]:
train_USPS = np.genfromtxt("zip.train", delimiter = "")
test_USPS = np.genfromtxt("zip.test", delimiter = "")
data_USPS = np.r_[train_USPS, test_USPS]
target_USPS = data_USPS[:,0]

X_train_USPS, X_test_USPS, y_train_USPS, y_test_USPS = train_test_split(data_USPS,
                                                                       target_USPS,
                                                                       random_state=1104)

In [None]:
# Checking shape

In [102]:
print(X_train_USPS.shape, X_test_USPS.shape)

(6973, 257) (2325, 257)


In [103]:
# Using cross validation on wine dataset

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [160]:
cross_validation = cross_val_score(SVC(), X_train_w, y_train_w)
np.mean(cross_validation)

0.7296296296296296

In [166]:
# Using svm.fit and svm.score

svm.fit(X_train_w, y_train_w)
svm.score(X_test_w, y_test_w)

0.6888888888888889

In [None]:
# We see both cross validation and using SVM dont provide very accurate
# results, they are quite decent but nothing to be too sure with.
# Cross validation does prove to be slightly better.

In [106]:
# Now doing the same for USPS dataset

cross_validation_USPS = cross_val_score(svm, X_train_USPS, y_train_USPS)
np.mean(cross_validation_USPS)

0.990678329553694

In [255]:
# Using svm for USPS

svm.fit(X_train_USPS, y_train_USPS)
score =  svm.score(X_test_USPS, y_test_USPS)
score

0.9913978494623656

In [None]:
# These results are much more accurate, both are also very similar,
# so cross validation on USPS is better than on wine dataset.

In [None]:
# now for wine dataset: Task 5 and 6 below using code from lab 9

In [108]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer
from sklearn.model_selection import GridSearchCV

In [254]:
# Standard Scaler

SS_pipe = make_pipeline(Normalizer(), SVC())
param_grid = {'svc__C': [0.01, 0.1, 1, 10, 100], 'svc__gamma': [0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(SS_pipe, param_grid=param_grid, cv=5)
grid.fit(X_train_w, y_train_w)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_w, y_test_w))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9253561253561253
Test set score: 0.8888888888888888
Best parameters: {'svc__C': 100, 'svc__gamma': 100}


In [110]:
# MinMaxScaler
MMS_pipe = make_pipeline(MinMaxScaler(), SVC())

grid = GridSearchCV(MMS_pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test, y_test))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9774928774928775
Test set score: 1.0
Best parameters: {'svc__C': 1, 'svc__gamma': 1}


In [111]:
# Robust Scaler
RS_pipe = make_pipeline(RobustScaler(), SVC())

grid = GridSearchCV(RS_pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test, y_test))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.97008547008547
Test set score: 1.0
Best parameters: {'svc__C': 1, 'svc__gamma': 0.01}


In [112]:
# Normaliser
N_pipe = make_pipeline(Normalizer(), SVC())

grid = GridSearchCV(N_pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test, y_test))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9626780626780626
Test set score: 0.9111111111111111
Best parameters: {'svc__C': 100, 'svc__gamma': 100}


In [None]:
# MinMax and Robust Scalers provided the highest accuracies of 97%
# The worst is StandardScaler however even 92% isnt that bad. All 4
# Scaler methods provide a decent accuracy.

In [None]:
# Now for USPS dataset: Task 5 and 6 below using code from lab 9, again...
# I will use less parameters for C and gamma as it would take too long
# otherwise.

In [245]:
# Standard Scalar

# Using less parameters as it was taking hours to compute otherwise
param_grid = {'svc__C': [10, 100], 'svc__gamma': [0.1, 1]}


grid = GridSearchCV(SS_pipe, param_grid=param_grid)
grid.fit(X_train_USPS, y_train_USPS)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_USPS, y_test_USPS))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.994120012547374
Test set score: 0.993978494623656
Best parameters: {'svc__C': 100, 'svc__gamma': 0.1}


In [246]:
# MinMax Scaler

grid = GridSearchCV(MMS_pipe, param_grid=param_grid)
grid.fit(X_train_USPS, y_train_USPS)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_USPS, y_test_USPS))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9665850058880097
Test set score: 0.9655913978494624
Best parameters: {'svc__C': 10, 'svc__gamma': 0.1}


In [247]:
# Robust Scaler

grid = GridSearchCV(RS_pipe, param_grid=param_grid)
grid.fit(X_train_USPS, y_train_USPS)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_USPS, y_test_USPS))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.46048862765667503
Test set score: 0.4924731182795699
Best parameters: {'svc__C': 10, 'svc__gamma': 0.1}


In [248]:
# Normalised

grid = GridSearchCV(N_pipe, param_grid=param_grid)
grid.fit(X_train_USPS, y_train_USPS)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_USPS, y_test_USPS))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.994120012547374
Test set score: 0.993978494623656
Best parameters: {'svc__C': 100, 'svc__gamma': 0.1}


In [None]:
# Firstly we can see that the RobustScaler gives a very bad accuracy
# relative to the other 3. Standarf and Normalised with a accuracy of 99%
# are the best.

In [None]:
# Now to do a Neural Network for wine dataset

In [275]:
from sklearn.neural_network import MLPClassifier

classifier = MLPClassifier(max_iter=5000,
                           hidden_layer_sizes=[10],
                          random_state=1103)

# max_iter to get rid of errors.


In [None]:
# Task 3/4 using neural net, firstly using cross val score

In [227]:
cross_validation = cross_val_score(classifier, X_train_w, y_train_w)
np.mean(cross_validation)

0.46096866096866096

In [154]:
classifier.fit(X_train_w, y_train_w)
classifier.score(X_test_w, y_test_w)

0.4444444444444444

In [None]:
# These values are much less accurate using a neural net compared to
# what we tried before. These are bascially unusable.

In [None]:
# Now moving onto task 5/6 

In [273]:
# Standard Scalar

pipe_SS = make_pipeline(StandardScaler(), classifier)
param_grid = {'mlpclassifier__alpha': [0.001, 0.01, 0.1, 1],
  'mlpclassifier__hidden_layer_sizes': [(50,), (100,)]}
grid = GridSearchCV(pipe_SS, param_grid=param_grid, cv=10)
grid.fit(X_train_w, y_train_w)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_w, y_test_w))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9703296703296704
Test set score: 0.9555555555555556
Best parameters: {'mlpclassifier__alpha': 1, 'mlpclassifier__hidden_layer_sizes': (50,)}


In [244]:
# Normaliser

pipe_N = make_pipeline(Normalizer(), classifier)
param_grid = {'mlpclassifier__alpha': [0.001, 0.01, 0.1, 1],
  'mlpclassifier__hidden_layer_sizes': [(50,), (100,)]}
grid = GridSearchCV(pipe_N, param_grid=param_grid, cv=10)
grid.fit(X_train_w, y_train_w)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_w, y_test_w))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9104395604395605
Test set score: 0.8888888888888888
Best parameters: {'mlpclassifier__alpha': 0.001, 'mlpclassifier__hidden_layer_sizes': (100,)}


In [None]:
# By setting max iterations of the MLPClassifier to 5000, it gets rid of
# the errors from reaching them.

In [None]:
# Using neural nets, my standard scalar accuracies have improved as
# opposed to what we tried before.
# However, the normaliser's accuracy has diminished slightly, but these
# accuracies are still okay in general.

In [None]:
# Now to do a Neural Network for USPS dataset

In [264]:
# using classifier from neural net wine dataset.

cross_validation = cross_val_score(classifier, X_train_USPS, y_train_USPS)
np.mean(cross_validation)

0.9749030921049249

In [265]:
classifier.fit(X_train_USPS, y_train_USPS)
classifier.score(X_test_USPS, y_test_USPS)

0.9806451612903225

In [None]:
# Both accuracies of cross validation and neural net (MLPClassifer) are
# similar and quite high which is good.
# However svm (we did before) was 99% accuracy which was slightly better

In [267]:
# Standard Scaler

param_grid = {'mlpclassifier__alpha': [0.001, 0.01, 0.1, 1],
  'mlpclassifier__hidden_layer_sizes': [(50,), (100,)]}
grid = GridSearchCV(pipe_SS, param_grid=param_grid, cv=10)
grid.fit(X_train_USPS, y_train_USPS)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_USPS, y_test_USPS))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9810688871257497
Test set score: 0.9840860215053764
Best parameters: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__hidden_layer_sizes': (100,)}


In [268]:
# Normaliser

grid = GridSearchCV(pipe_N, param_grid=param_grid, cv=10)
grid.fit(X_train_USPS, y_train_USPS)
print("Best cross-validation accuracy:", grid.best_score_)
print("Test set score:", grid.score(X_test_USPS, y_test_USPS))
print("Best parameters:", grid.best_params_)

Best cross-validation accuracy: 0.9868067403074164
Test set score: 0.9896774193548387
Best parameters: {'mlpclassifier__alpha': 0.01, 'mlpclassifier__hidden_layer_sizes': (100,)}


In [None]:
# Neaural nets for USPS proved to be quite accurate with accuracy of 98%
# for both
# Before when we didn't use neural nets we got 99% for both so we see
# only a slight decrease.
# Perhaps it was better to stick without neural nets.