importing the wine dataset using sklearn

In [1]:
from sklearn.datasets import load_wine
wine = load_wine()
from sklearn.model_selection import train_test_split
Xw_train, Xw_test, yw_train, yw_test = train_test_split(wine['data'], wine['target'], random_state=2501)


importing US postal service training and test set and combining them into USPS and then using train_test_split

In [2]:
import numpy as np
USPS_traindata = np.genfromtxt("zip.train", delimiter=" ")
USPS_testdata = np.genfromtxt("zip.test", delimiter=" ")
USPS = np.append(USPS_traindata,USPS_testdata, axis = 0)

In [3]:
data = USPS[:,1:257]
target = USPS[:,0]
Xu_train,Xu_test,yu_train,yu_test = train_test_split(data,target, random_state=2501)

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC


svmW = SVC()
scoresw = cross_val_score(svmW, Xw_train, yw_train)

svmU = SVC()
scoresu = cross_val_score(svmU, Xu_train, yu_train )

print("the cross validation scores for wine training datset with 5 folds is ", scoresw)
print("the cross validation scores for USPS training datset with 5 folds is ", scoresu)

the cross validation scores for wine training datset with 5 folds is  [0.7037037  0.7037037  0.66666667 0.69230769 0.73076923]
the cross validation scores for USPS training datset with 5 folds is  [0.96487455 0.96917563 0.96774194 0.97274032 0.97274032]


In [5]:
svmW.fit(Xw_train,yw_train)
print("the test error rate for wine dataset with default values of parameter for SVM is ",1 - svmW.score(Xw_test,yw_test))
svmU.fit(Xu_train,yu_train)
print("the test error rate for USPS dataset with default values of parameter for SVM is ",1 - svmU.score(Xu_test,yu_test))

the test error rate for wine dataset with default values of parameter for SVM is  0.3111111111111111
the test error rate for USPS dataset with default values of parameter for SVM is  0.023655913978494647


The cross validation score for wine dataset was lower than expected in the range (66 to 73), so the test error rate was expected to be higher, which in this case is 31%. The same is not true for USPS dataset, the cross validation scores were high enough ( in the range of 96-98) so in turn we had a very little error rate of 2%. Hence, the test error rate not only depends on the model used but the training data used as well

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler , MinMaxScaler, RobustScaler, Normalizer
from sklearn.model_selection import GridSearchCV
from IPython.display import clear_output

pipeNormalizer = [StandardScaler(), MinMaxScaler(), RobustScaler(), Normalizer()]
parameters = {'svc__C':[0.01, 0.1, 1, 10], 'svc__gamma': [0.01, 0.1, 1, 10 ]}

best_score_w = 0
best_score_u = 0 

for i in pipeNormalizer:
    
    print("using the normalizer: ", i)
    pipe =  make_pipeline(i, SVC())
    print("pipe created: ",i)
    gridw =  GridSearchCV(pipe, param_grid = parameters, cv=3, n_jobs = -1)
    print("grid initialized for wine dataset")
    gridw.fit(Xw_train,yw_train)
    print("fit to train set complete for wine dataset")
    
    if gridw.best_score_ > best_score_w:
        best_score_w = gridw.best_score_
        best_normalizer_w = i
        best_param_w = gridw.best_params_
        best_fit_w = gridw
        print("the current best score for wine dataset is",best_score_w)
    
    
    gridu =  GridSearchCV(pipe, param_grid = parameters, cv=3, n_jobs = -1)
    print("grid initialized for USPS dataset")
    gridu.fit(Xu_train,yu_train)
    print("fit to train set complete for USPS dataset")
    
    if gridu.best_score_ > best_score_u:
        best_score_u = gridu.best_score_
        best_normalizer_u = i
        best_param_u = gridu.best_params_
        best_fit_u = gridu
        print("the current best score for USPS dataset is",best_score_u)
    clear_output()

In [7]:
print("For Wine dataset, the best score was ",best_score_w," with the parameters " ,best_param_w ," and using the normalizer: ",best_normalizer_w)

print("For USPS dataset, the best score was ",best_score_u,"with the parameters:",best_param_u,"and using normalizer:",best_normalizer_u)

For Wine dataset, the best score was  0.9848484848484849  with the parameters  {'svc__C': 10, 'svc__gamma': 0.1}  and using the normalizer:  StandardScaler()
For USPS dataset, the best score was  0.9716046366726507 with the parameters: {'svc__C': 10, 'svc__gamma': 1} and using normalizer: Normalizer()


for wine dataset, it would be better if we use Standard Scalar with Parameter C=10 and gamma=0.1
and for USPS dataset, the Normalizer scaling is better with the parameter C=10 and gamma=1
although the results here are based on training set, we should also take a look at test error rate using these values.

In [8]:
import copy
import random
random.seed(2501)

#finding other Normalizer
if str(best_normalizer_w) == 'Normalizer()':
    pipeNormalizerw = copy.copy(pipeNormalizer)
    pipeNormalizerw.remove(best_normalizer_w)
    anotherNormalizerw = pipeNormalizerw[random.randrange(0,len(pipeNormalizerw))]
else:
    anotherNormalizerw = Normalizer()

# print(best_normalizer_w)
# print(anotherNormalizerw)


if str(best_normalizer_u) == 'Normalizer()':
    pipeNormalizeru = copy.copy(pipeNormalizer)
    pipeNormalizeru.remove(best_normalizer_u)
    anotherNormalizeru = pipeNormalizeru[random.randrange(0,len(pipeNormalizeru))]
else:
    anotherNormalizeru = Normalizer()

# #type(best_normalizer_u)
# print(best_normalizer_u)
# print(anotherNormalizeru)

In [9]:
#test error rate using score

anotherpipew = make_pipeline(anotherNormalizerw,SVC(C=best_param_w["svc__C"], gamma = best_param_w["svc__gamma"]))
anotherpipeu = make_pipeline(anotherNormalizeru,SVC(C=best_param_u["svc__C"], gamma = best_param_u["svc__gamma"]))

anotherpipew.fit(Xw_train,yw_train)
anotherpipeu.fit(Xu_train,yu_train)

# score_w=anotherpipew.score(Xw_test,yw_test) 
# score_u=anotherpipeu.score(Xu_test,yu_test)

# best_test_w = best_fit_w.score(Xw_test,yw_test)
# best_test_u = best_fit_u.score(Xu_test,yu_test)

# print("For Wine dataset: \n for the given set of parameters and normalizer \n Normalizer:",anotherNormalizerw,"\n Parameter:",best_param_w,"\n the test error rate is:", 1-score_w,end='\n')
# print("For Wine dataset with \n best Normalizer:",best_normalizer_w,"\n best Parameter:",best_param_w,"\n test error rate is :", 1-best_test_w, end = '\n\n')

# print("For USPS dataset: \n for the given set of parameters and normalizer \n Normalizer:",anotherNormalizeru,"\n Parameter:",best_param_u,"\n the test error rate is:", 1-score_u,end = '\n')
# print("For USPS dataset with \n best Normalizer:",best_normalizer_u,"\n best Parameter:",best_param_u,"\n test error rate is :", 1-best_test_u)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=10, gamma=1))])

In [10]:
#test error rate using the predicted values:

best_pred_w = best_fit_w.predict(Xw_test)
best_pred_u = best_fit_u.predict(Xu_test)

another_pred_w = anotherpipew.predict(Xw_test)
another_pred_u = anotherpipeu.predict(Xu_test)

testErrorw = sum(yw_test != best_pred_w)/len(yw_test)
testErroru = sum(yu_test != best_pred_u)/len(yu_test)

anotherTestErrorw = sum(yw_test != another_pred_w)/len(yw_test)
anotherTestErroru = sum(yu_test != another_pred_u)/len(yu_test)

print("For Wine dataset: \n for the given set of parameters and normalizer \n Normalizer:",anotherNormalizerw,"\n Parameter:",best_param_w,"\n the test error rate is:",anotherTestErrorw ,end='\n')
print("For Wine dataset with \n best Normalizer:",best_normalizer_w,"\n best Parameter:",best_param_w,"\n test error rate is :",testErrorw , end = '\n\n')

print("For USPS dataset: \n for the given set of parameters and normalizer \n Normalizer:",anotherNormalizeru,"\n Parameter:",best_param_u,"\n the test error rate is:",anotherTestErroru ,end = '\n')
print("For USPS dataset with \n best Normalizer:",best_normalizer_u,"\n best Parameter:",best_param_u,"\n test error rate is :",testErroru )


For Wine dataset: 
 for the given set of parameters and normalizer 
 Normalizer: Normalizer() 
 Parameter: {'svc__C': 10, 'svc__gamma': 0.1} 
 the test error rate is: 0.6222222222222222
For Wine dataset with 
 best Normalizer: StandardScaler() 
 best Parameter: {'svc__C': 10, 'svc__gamma': 0.1} 
 test error rate is : 0.022222222222222223

For USPS dataset: 
 for the given set of parameters and normalizer 
 Normalizer: StandardScaler() 
 Parameter: {'svc__C': 10, 'svc__gamma': 1} 
 the test error rate is: 0.792258064516129
For USPS dataset with 
 best Normalizer: Normalizer() 
 best Parameter: {'svc__C': 10, 'svc__gamma': 1} 
 test error rate is : 0.021075268817204302


The test error rate with the Normalizer seems too high for wine dataset but better for USPS dataset.
This is completely opposite for USPS dataset where Standard Scalar yeilds high test error rate as compared to Normalizer.

In [11]:
#cross-conformal predictor partially implemented.

from sklearn.model_selection import KFold

kf = KFold(shuffle=True, random_state=2501) # default n_splits is 5

pipew = make_pipeline(best_normalizer_w, SVC(C=best_param_w["svc__C"], gamma=best_param_w["svc__gamma"]))
pipeu = make_pipeline(best_normalizer_u, SVC(C=best_param_u["svc__C"], gamma=best_param_u["svc__gamma"]))


conformalScorew = list()
conformalScoreu = list()
rankw = 0

#for wine dataset:
for rest_index, fold_index in kf.split(Xw_train):
    Xw_rest, Xw_fold = Xw_train[rest_index], Xw_train[fold_index]
    yw_rest, yw_fold = yw_train[rest_index], yw_train[fold_index]
    pipew.fit(Xw_rest, yw_rest)
    XaugmentedFold = np.append(Xw_fold, Xw_test, axis = 0)
    #yaugmentedFold= np.append(yw_fold,yw_test, axis = 0)
    #conw = pipew.decision_function(XaugmentedFold)

    icmTestw = pipew.decision_function(Xw_test)
    icmFoldw = pipew.decision_function(Xw_fold)
    
    #to find p(y)
    #apply decision function on pipew and fold to find (a i,k) and apply the decision function on pipew to find (a^yk)
    #we then use the two values we do for each fold sum of  rank of aik < a^yk +1 /n+1
    
    #also need to figure out how to compute the rank of test sample in each fold and subtract 1 to get pythonic rank.

    #conformalScorew.extend(conw)

# print(len(conformalScorew))
# print(len(yw_test)*5)


In [12]:
#running MLPClassifier with crossValidation of cv=5(default) and with default Parameters
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier


mlpW = MLPClassifier()
scoresw = cross_val_score(mlpW, Xw_train, yw_train)

mlpU = MLPClassifier()
scoresu = cross_val_score(mlpU, Xu_train, yu_train )

print("the cross validation scores for wine training datset with 5 folds is ", scoresw)
print("the cross validation scores for USPS training datset with 5 folds is ", scoresu)

the cross validation scores for wine training datset with 5 folds is  [0.2962963  0.48148148 0.40740741 0.46153846 0.19230769]
the cross validation scores for USPS training datset with 5 folds is  [0.96057348 0.96344086 0.94982079 0.96987088 0.96628407]


In [13]:
mlpW.fit(Xw_train,yw_train)
print("the test error rate for wine dataset with default values of parameter for SVM is ",1 - mlpW.score(Xw_test,yw_test))
mlpU.fit(Xu_train,yu_train)
print("the test error rate for USPS dataset with default values of parameter for SVM is ",1 - mlpU.score(Xu_test,yu_test))

the test error rate for wine dataset with default values of parameter for SVM is  0.24444444444444446
the test error rate for USPS dataset with default values of parameter for SVM is  0.029247311827957034


We are getting warning that the optimization hasn't converged yet for few of the initial runs of iteration. The algorithm is optimizing by a stepwise convergence to a minimum and in these initial run, minimum wasn't found.

again, the cross validation score for wine dataset had most of the scores too low and hence the test error rate was expected to be high. Although USPS dataset with the cross validation score was having good score overall and hence the value for test error rate was also expected to be low. This observation is helpful to determine that it not only depends on the models use with different scaling and parameter but the dataset provided for training as well.

In [14]:
#trying different parameters and normalizers with MLPClassifier

pipeNormalizer = [StandardScaler(), MinMaxScaler(), RobustScaler(), Normalizer()]
parameters = {'mlpclassifier__learning_rate': ["constant", "invscaling", "adaptive"],
              'mlpclassifier__alpha': 10.0 ** -np.arange(1, 5)
             }

print("Using MLPClassifier for the following values and dataset: \n")
#list of fit to avoid refitting for calculating the test error rate
fitListw = []
fitListu = []

for i in pipeNormalizer:
    
    pipe =  make_pipeline(i, MLPClassifier())
    gridw =  GridSearchCV(pipe, param_grid = parameters, cv=3, n_jobs = -1)
    gridw.fit(Xw_train,yw_train)
     
    print("for wine dataset & Normalizer:",i,"\n and with best parameters:",gridw.best_params_,"the best score is:",gridw.best_score_,end = '\n')
    fitListw.append(gridw)
    
    gridu =  GridSearchCV(pipe, param_grid = parameters, cv=3, n_jobs = -1)
    gridu.fit(Xu_train,yu_train)
    print("for USPS dataset & Normalizer:",i,"\n and with best parameters:",gridu.best_params_,"the best score is:",gridu.best_score_,end = '\n')
    fitListu.append(gridu)

        

Using MLPClassifier for the following values and dataset: 





for wine dataset & Normalizer: StandardScaler() 
 and with best parameters: {'mlpclassifier__alpha': 0.01, 'mlpclassifier__learning_rate': 'constant'} the best score is: 0.9924242424242425
for USPS dataset & Normalizer: StandardScaler() 
 and with best parameters: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'invscaling'} the best score is: 0.9638604803237527




for wine dataset & Normalizer: MinMaxScaler() 
 and with best parameters: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'constant'} the best score is: 0.9848484848484849
for USPS dataset & Normalizer: MinMaxScaler() 
 and with best parameters: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'adaptive'} the best score is: 0.9648646814600954




for wine dataset & Normalizer: RobustScaler() 
 and with best parameters: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'adaptive'} the best score is: 0.9924242424242425
for USPS dataset & Normalizer: RobustScaler() 
 and with best parameters: {'mlpclassifier__alpha': 0.0001, 'mlpclassifier__learning_rate': 'adaptive'} the best score is: 0.9426349946637549




for wine dataset & Normalizer: Normalizer() 
 and with best parameters: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'invscaling'} the best score is: 0.6617845117845117
for USPS dataset & Normalizer: Normalizer() 
 and with best parameters: {'mlpclassifier__alpha': 0.001, 'mlpclassifier__learning_rate': 'adaptive'} the best score is: 0.9568335399971623




For MLPClassifier and wine dataset it seems either of the scaling method( Standard Scaler or robustScaler) is better suited with given training set.
on the other hand, for USPS dataset MinMaxScalar yeilds the best result.

In [15]:
#test error rate using predicted values

pred_w1 = fitListw[0].predict(Xw_test)
pred_w2 = fitListw[3].predict(Xw_test)

pred_u1 = fitListu[1].predict(Xu_test)
pred_u2 = fitListu[3].predict(Xu_test)

testErrorw1 = sum(yw_test != pred_w1)/len(yw_test)
testErroru1 = sum(yu_test != pred_u1)/len(yu_test)

testErrorw2 = sum(yw_test != pred_w2)/len(yw_test)
testErroru2 = sum(yu_test != pred_u2)/len(yu_test)

print("For Wine dataset and MLPClassifier : \n for the given set of parameters and normalizer \n Normalizer:",pipeNormalizer[0],"\n Parameter:",fitListw[0].best_params_,"\n the test error rate is:",testErrorw1 ,end='\n')
print("For Wine dataset and MLPClassifier: \n for the given set of parameters and normalizer \n Normalizer:",pipeNormalizer[3],"\n best Parameter:",fitListw[3].best_params_,"\n test error rate is :",testErrorw2 , end = '\n\n')

print("For USPS dataset and MLPClassifier : \n for the given set of parameters and normalizer \n Normalizer:",pipeNormalizer[1],"\n Parameter:",fitListw[1].best_params_,"\n the test error rate is:",testErroru1 ,end='\n')
print("For USPS dataset and MLPClassifier: \n for the given set of parameters and normalizer \n Normalizer:",pipeNormalizer[3],"\n best Parameter:",fitListw[3].best_params_,"\n test error rate is :",testErroru2 , end = '\n\n')


For Wine dataset and MLPClassifier : 
 for the given set of parameters and normalizer 
 Normalizer: StandardScaler() 
 Parameter: {'mlpclassifier__alpha': 0.01, 'mlpclassifier__learning_rate': 'constant'} 
 the test error rate is: 0.0
For Wine dataset and MLPClassifier: 
 for the given set of parameters and normalizer 
 Normalizer: Normalizer() 
 best Parameter: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'invscaling'} 
 test error rate is : 0.37777777777777777

For USPS dataset and MLPClassifier : 
 for the given set of parameters and normalizer 
 Normalizer: MinMaxScaler() 
 Parameter: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'constant'} 
 the test error rate is: 0.03096774193548387
For USPS dataset and MLPClassifier: 
 for the given set of parameters and normalizer 
 Normalizer: Normalizer() 
 best Parameter: {'mlpclassifier__alpha': 0.1, 'mlpclassifier__learning_rate': 'invscaling'} 
 test error rate is : 0.03354838709677419

