In [53]:
# Kaggle competition
# Group: TBA
# method: Support Vector Machine
# preprocess getdata
from getdata import CrossValidation, GetData
# for writing csv files
import pandas as pd
# for the base linear SVM learner
from sklearn.svm import LinearSVC
import sklearn.metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [35]:
class ImageGetter:
    def __init__(
        self,
        dataset_modifier='_int',
        as_image=False,
        transform=False,
    ):
        self.dataset_modifier = dataset_modifier
        self.as_image = as_image
        self.transform = transform
        
    def kaggle(self):
        data_getter = GetData()

        train_x, train_y = data_getter.load_training(
            dataset_modifier=self.dataset_modifier,
            as_image=self.as_image,
            transform=self.transform,
        )

        test_x = data_getter.load_test(
            as_image=self.as_image,
            transform=self.transform
        )

        train_x = train_x.reshape((-1, 64, 64, 1))
        train_y = train_y.flatten()
        valid_x = test_x.reshape((-1, 64, 64, 1))

        print('dataset loaded successful for kaggle')

        return train_x, train_y, valid_x
    
    def process(self):
        cv = CrossValidation(
            dataset_modifier=self.dataset_modifier,
            transform=self.transform,
            as_image=True,
        )
        train_x, train_y, valid_x, valid_y = cv.get_set()
        print('data loaded successfully')

        train_x = train_x.reshape((-1, 64, 64, 1))
        valid_x = valid_x.reshape((-1, 64, 64, 1))

        return train_x, train_y, valid_x, valid_y

In [36]:
# internal testing
train_x, train_y, valid_x, valid_y = ImageGetter(
        dataset_modifier='_int',
        transform=True,
    ).process()

beginning manual transformations for training set...
finished manual transformation for training set
beginning cross validation separation...
finished cross validation separation
data loaded successfully


In [38]:
# kaggle
k_train_x, k_train_y, k_valid_x = ImageGetter(
        as_image=True,
        transform=True,
    ).kaggle()

beginning manual transformations for training set...
finished manual transformation for training set
beginning manual transformations for test set...
finished manual transformation for test set
dataset loaded successful for kaggle


In [49]:
# internal test
TARGET_SHAPE = (64,64)
print('begin linearSVC()')
svc_model = LinearSVC()
svc_model.fit(train_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]), train_y) #fit it to default

begin linearSVC()


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [110]:
print('internal svc model score: ', svc_model.score(train_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]), train_y))

internal svc model score:  0.455875


# Predictions and Evaluations

In [54]:
predictions = svc_model.predict(valid_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]))

# write to the csv file for submission
print('writing data to file...')
dt = pd.DataFrame(data=predictions)
dt.columns = ["Label"]
dt = dt.astype(int)
dt.to_csv('../output/internal_svm_test.csv', mode='w',index=True, index_label='Id')

writing data to file...


In [108]:
print('svc f1_score: ',sklearn.metrics.f1_score(valid_y, predictions, average='micro'))

svc f1_score:  0.3454


In [75]:
print(confusion_matrix(valid_y, predictions))

[[ 80   5   7 702  20   4   0  20 133  39]
 [  3 758   2 196  17   6  11  11  83  31]
 [  0  14  40 776  28   0   2  22  75  28]
 [  0   7   6 925   8   2   0  11  27  11]
 [  2  48   4 321 363   8   3   8 134 115]
 [ 11  18   5 486  26 117   5  20 171  61]
 [  5  26   9 600  79  22  89   3 113  51]
 [  2  36  13 358  16   2   1 363  55 162]
 [  1  26   1 643  22   3   3   6 258  53]
 [  1  34   2 242  87   0   1  40  75 461]]


In [77]:
print(classification_report(valid_y, predictions))

             precision    recall  f1-score   support

          0       0.76      0.08      0.14      1010
          1       0.78      0.68      0.73      1118
          2       0.45      0.04      0.07       985
          3       0.18      0.93      0.30       997
          4       0.55      0.36      0.43      1006
          5       0.71      0.13      0.22       920
          6       0.77      0.09      0.16       997
          7       0.72      0.36      0.48      1008
          8       0.23      0.25      0.24      1016
          9       0.46      0.49      0.47       943

avg / total       0.56      0.35      0.33     10000



In [55]:
# kaggle 
TARGET_SHAPE = (64,64)
print('begin linearSVC()')
svc_model = LinearSVC()
svc_model.fit(k_train_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]), k_train_y) #fit it to default

begin linearSVC()


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [109]:
print('kaggle svc model score: ', svc_model.score(k_train_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]), k_train_y))

kaggle svc model score:  0.45764


In [61]:
# Predictions and Evaluations for kaggle
k_predictions = svc_model.predict(k_valid_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]))

# write to the csv file for submission
print('writing data to file...')
dt = pd.DataFrame(data=k_predictions)
dt.columns = ["Label"]
dt = dt.astype(int)
dt.to_csv('../output/go_svm_test.csv', mode='w',index=True, index_label='Id')

writing data to file...


In [None]:
#print(sklearn.metrics.f1_score(k_valid_y, k_predictions, average='micro'))

## Random Classifer

In [105]:
import numpy as np
classes = np.unique(train_y)
randPreds = np.random.choice(classes, len(valid_y))
print('Random Class f1_score :', sklearn.metrics.f1_score(valid_y, randPreds, average='micro'))

# write to the csv file for submission
print('writing data to file...')
dt = pd.DataFrame(data=randPreds)
dt.columns = ["Label"]
dt = dt.astype(int)
dt.to_csv('../output/random_test.csv', mode='w',index=True, index_label='Id')

Random Class f1_score : 0.1048
writing data to file...


## Majority-class classifier

In [106]:
# most common class in training set
majClass = np.argmax(train_y)
majPreds = np.multiply(np.ones(len(valid_y)), majClass)
print('Majority class f1_score :', sklearn.metrics.f1_score(valid_y, majPreds, average='micro')) 

# write to the csv file for submission
print('writing data to file...')
dt = pd.DataFrame(data=majPreds)
dt.columns = ["Label"]
dt = dt.astype(int)
dt.to_csv('../output/major_test.csv', mode='w',index=True, index_label='Id')

Majority class f1_score : 0.0985
writing data to file...


## grindsearch 
to find the right right parameters (like what C values to use)  by creating a 'grid' of parameters and trying out all the possible combinations and see what works best. 

implemented with Scikit-learn built in functionality - GridSearchCV.
GridSearchCV takes a dictionary that describes the parameters that should be tried and a model to train. The grid of parameters is defined as a dictionary, where the keys are the parameters and the values are the settings to be tested.

In [85]:
# Create a dictionary called param_grid and fill out some parameters for C
param_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000]}

In [86]:
# Create a GridSearchCV object and fit it to the training data.
#grid = GridSearchCV(LinearSVC(),param_grid,refit=True,verbose=2)
grid = GridSearchCV(LinearSVC(max_iter=20),param_grid,verbose=2) #verbose=2, n_jobs=2, ,cv=10

In [89]:
grid.fit(train_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]),train_y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=  23.8s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.6s remaining:    0.0s


[CV] ........................................... C=0.01, total=  23.5s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=  22.8s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  23.8s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  22.7s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  23.2s
[CV] C=1 .............................................................
[CV] .............................................. C=1, total=  23.9s
[CV] C=1 .............................................................
[CV] .............................................. C=1, total=  23.0s
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  7.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=20,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [90]:
grid.best_params_

{'C': 10}

In [92]:
grid_predictions = grid.predict(valid_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]))

In [102]:
print('writing data to file...')
dt = pd.DataFrame(data=grid_predictions)
dt.columns = ["Label"]
dt = dt.astype(int)
dt.to_csv('../output/internal_svm_test_with_gridsearch.csv', mode='w',index=True, index_label='Id')

print(confusion_matrix(valid_y,grid_predictions))

writing data to file...
[[622   4  61   1 152  74  27  24   4  41]
 [ 56 712  47   4 203  38  14  13  11  20]
 [ 17  18 553   4 237  61  13  39  15  28]
 [ 45  15 368 104 168 156  12  24  46  59]
 [ 16  41  44   2 767  39  32  11   7  47]
 [ 36  11  71   7 125 573  29  22  19  27]
 [ 21  42  96   3 200 102 516   1   2  14]
 [ 48  26 107   6 181  85   1 484  10  60]
 [ 60  47 143   4 420 196  18   4  56  68]
 [ 13  15  36   1 416  66   1  54   8 333]]


In [103]:
print(classification_report(valid_y, grid_predictions))

             precision    recall  f1-score   support

          0       0.67      0.62      0.64      1010
          1       0.76      0.64      0.69      1118
          2       0.36      0.56      0.44       985
          3       0.76      0.10      0.18       997
          4       0.27      0.76      0.40      1006
          5       0.41      0.62      0.50       920
          6       0.78      0.52      0.62       997
          7       0.72      0.48      0.57      1008
          8       0.31      0.06      0.09      1016
          9       0.48      0.35      0.41       943

avg / total       0.56      0.47      0.46     10000



In [95]:
# Create a GridSearchCV object and fit it to the training data.
#grid = GridSearchCV(LinearSVC(),param_grid,refit=True,verbose=2)
k_grid = GridSearchCV(LinearSVC(max_iter=20),param_grid,verbose=2) #verbose=2, n_jobs=2, ,cv=10

In [96]:
k_grid.fit(k_train_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]),k_train_y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=  31.5s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.5s remaining:    0.0s


[CV] ........................................... C=0.01, total=  32.8s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=  30.1s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  31.3s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  30.2s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  28.8s
[CV] C=1 .............................................................
[CV] .............................................. C=1, total=  29.5s
[CV] C=1 .............................................................
[CV] .............................................. C=1, total=  28.3s
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  9.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=20,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [97]:
k_grid.best_params_

{'C': 0.1}

In [98]:
k_grid_predictions = k_grid.predict(k_valid_x.reshape(-1, TARGET_SHAPE[0]*TARGET_SHAPE[1]))

In [101]:
print('writing data to file...')
dt = pd.DataFrame(data=k_grid_predictions)
dt.columns = ["Label"]
dt = dt.astype(int)
dt.to_csv('../output/go_svm_test_with_gridsearch.csv', mode='w',index=True, index_label='Id')


writing data to file...


In [None]:
# Results:
# random_test.csv scored 9.766%
# major_test.csv scored 9.266%
# internal_svm_test.csv scored 10.7%
# go_svm_test.csv scored 41%

# After gridsearch with the best parameter for C:
# internal_svm_test_with_gridsearch.csv still scored 9.066%
# go_svm_test_with_gridsearch.csv scored 41.8%


In [None]:
# f1 scores results:
# Random Class f1_score : 0.1048
# Majority class f1_score : 0.0985
# svc f1_score:  0.3454

# among the internal testing vs kaggle go time
# internal svc model score:  0.455875
# kaggle svc model score: 0.45764