# Assignment 3 on SVM
By K. Sai Somanath, 18MCMT28

In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

## Extracting data of Arcene Dataset

In [2]:
DATA_PATH = 'Data/Arcene'
TRAIN_FILE = DATA_PATH + '/train'
TRAIN_LABELS_FILE = DATA_PATH + '/train_labels'
TEST_FILE = DATA_PATH + '/test'
TEST_LABELS_FILE = DATA_PATH + '/test_labels'

In [3]:
file_handle = open(TRAIN_FILE)
train = np.array([list(map(int, file_handle.readline().strip().split(' '))) for _ in range(100)], dtype='float64')
train.shape

(100, 10000)

In [4]:
file_handle = open(TEST_FILE)
test = np.array([list(map(int, file_handle.readline().strip().split(' '))) for _ in range(100)], dtype='float64')
test.shape

(100, 10000)

In [5]:
file_handle = open(TRAIN_LABELS_FILE)
y_train = np.array([int(file_handle.readline().strip()) for _ in range(100)])

In [6]:
file_handle = open(TEST_LABELS_FILE)
y_test = np.array([int(file_handle.readline().strip()) for _ in range(100)])

## Apply PCA transformation to reduce the dimensions of the data

### K = 100

In [7]:
pca = PCA(n_components=100)
pca.fit(train)

PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [8]:
X_train = pca.transform(train)
X_test = pca.transform(test)

## Applying Grid Search to find the best parameters

In [9]:
tuned_parameters = [
  {'C': [1, 10, 100, 1000, 10000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000, 10000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
scores_list = ['precision', 'recall']
for score in scores_list:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    print("Final accuracy =", accuracy_score(y_test, clf.predict(X_test)))

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best parameters set found on development set:

{'C': 1, 'kernel': 'linear'}

Grid scores on development set:

0.911 (+/-0.106) for {'C': 1, 'kernel': 'linear'}
0.911 (+/-0.106) for {'C': 10, 'kernel': 'linear'}
0.911 (+/-0.106) for {'C': 100, 'kernel': 'linear'}
0.911 (+/-0.106) for {'C': 1000, 'kernel': 'linear'}
0.911 (+/-0.106) for {'C': 10000, 'kernel': 'linear'}
0.280 (+/-0.012) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 10000, 'gamma': 0.001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 10000, 'gamma': 

Now, that its clear the $C = 1$ and linear kernel gives the best results, we used them to train the model. Also, the accuracy on the test set is about $83\%$

In [10]:
clf_final = SVC(kernel='linear', C=1)
clf_final.fit(X_train, y_pred)
print(means)

[0.89780303 0.89780303 0.89780303 0.89780303 0.89780303 0.5
 0.5        0.5        0.5        0.5        0.5        0.5
 0.5        0.5        0.5       ]


### Support Vectors

In [11]:
print("Number of support Vectors =", len(clf_final.support_))

Number of support Vectors = 90


In [12]:
print("Number of suppport vectors for each class:", clf_final.n_support_ )
print("The margin support vectors =", clf_final.dual_coef_.shape[1])
print("The non-margin support vectors = 0")

Number of suppport vectors for each class: [52 38]
The margin support vectors = 90
The non-margin support vectors = 0


$$
E\left [ Out\_Sample\_Error \right ] \leq \frac{E\left [ Number\_of\_Support\_Vectors \right ]}{N - 1}
$$

In [13]:
mean = np.array(means).mean()
if mean < len(clf_final.support_) / 99:
    print("The condition holds true")
else:
    print("The condition is false")

The condition holds true


### Classification report

In [14]:
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

         -1       0.82      0.89      0.85        56
          1       0.85      0.75      0.80        44

avg / total       0.83      0.83      0.83       100



### K = 10

In [15]:
# k = 10
scaler = MinMaxScaler()
scaler.fit(train)
X_train_s = scaler.transform(train)
X_test_s = scaler.transform(test)
pca = PCA(n_components=10)
pca.fit(X_train_s)
X1_train = pca.transform(X_train_s)
X1_test = pca.transform(X_test_s)

## Applying grid search for the best parameters

In [16]:
tuned_parameters = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
 ]
scores_list = ['precision', 'recall']
for score in scores_list:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X1_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X1_test)
    print(classification_report(y_true, y_pred))
    print()
    print("Final accuracy =", accuracy_score(y_test, clf.predict(X1_test)))

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.724 (+/-0.127) for {'C': 1, 'kernel': 'linear'}
0.732 (+/-0.111) for {'C': 10, 'kernel': 'linear'}
0.732 (+/-0.111) for {'C': 100, 'kernel': 'linear'}
0.732 (+/-0.111) for {'C': 1000, 'kernel': 'linear'}
0.786 (+/-0.213) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.658 (+/-0.190) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.280 (+/-0.012) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.830 (+/-0.163) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.807 (+/-0.181) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.619 (+/-0.172) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.846 (+/-0.121) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.832 (+/-0.086) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.791 (+/-0.046) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.805 (+/-0.120) for {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.865 (+/-0.091) for {'C': 

In [17]:
c = 1000
clf_final_2 = SVC(kernel='rbf', C=c, gamma=0.001)
clf_final_2.fit(X1_train, y_train)
y_true, y_pred = y_test, clf_final_2.predict(X1_test)
print(classification_report(y_true, y_pred))
print("Final accuracy =", accuracy_score(y_test, clf_final_2.predict(X1_test)))

             precision    recall  f1-score   support

         -1       0.85      0.89      0.87        56
          1       0.85      0.80      0.82        44

avg / total       0.85      0.85      0.85       100

Final accuracy = 0.85


Now, that its clear the $C = 1000$,  $\gamma = 0.001$ and RBF kernel gives the best results, we used them to train the model. Also, the accuracy on the test set is about $85\%$

### Support Vectors

In [18]:
print("Number of support Vectors =", len(clf_final_2.support_))
print("Number of suppport vectors for each class:", clf_final_2.n_support_ )
alphas = np.absolute(clf_final_2.dual_coef_)
msv = np.count_nonzero(alphas == c)
print("The margin support vectors =", clf_final_2.dual_coef_.shape[1] - msv)
print("The non-margin support vectors =", msv)

Number of support Vectors = 44
Number of suppport vectors for each class: [20 24]
The margin support vectors = 31
The non-margin support vectors = 13


$$
E\left [ Out\_Sample\_Error \right ] \leq \frac{E\left [ Number\_of\_Support\_Vectors \right ]}{N - 1}
$$

In [19]:
mean = np.array(means).mean()
if mean < len(clf_final.support_) / 99:
    print("The condition holds true")
else:
    print("The condition is false")

The condition holds true


### Classification report

In [20]:
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

         -1       0.85      0.89      0.87        56
          1       0.85      0.80      0.82        44

avg / total       0.85      0.85      0.85       100



## Analysis

The training proves to be challenging as the data available for training is very less.
1. Grid search had to be used to find the best parameters. In each iteration we exhuastively search for the best parameters using the 5-fold Cross-Validation.
2. In both the cases; k=100, k=10; the formula for the generalisation error is satisfied.
3. The Number of support vectors for both k=10 and k=100 are very high as the data was reduced from a very high dimesion (10000) to a low dimension (100, 10) space.
4. Also, considering the fact that the number of training samples equals number of test sample, and also that the number is a mere 100 points, we get really bad accuracy rates (about 30%) in some folds of cross validation.
5. When k=100, there is not much effect to the accuracy when the C value is changed to 10, 100, 1000, 10000 respectively. The accuracy remains the same. 
6. When using the linear kernel, the number of margin support vectors( $\alpha = C$ ) is $0$
7. The rbf kernel seems to perform better when the data is reduced to 10 dimensional PCA space.