In [4]:
import os
import random

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from sklearn import svm

from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import MinMaxScaler

## SVM
Load the training, validation, and test datasets.

In [6]:
# Each split is stored as a sparse feature matrix (.npz) plus a CSV of
# targets with one value per line; load the pair for every split.
trainX, trainY = (
    sparse.load_npz('../data/trainsetInputVector_sparse.npz'),
    np.genfromtxt('../data/trainsetResult.csv', delimiter='\n'),
)

validX, validY = (
    sparse.load_npz('../data/validsetInputVector_sparse.npz'),
    np.genfromtxt('../data/validsetResult.csv', delimiter='\n'),
)

testX, testY = (
    sparse.load_npz('../data/testsetInputVector_sparse.npz'),
    np.genfromtxt('../data/testsetResult.csv', delimiter='\n'),
)



I use [sklearn.ensemble.BaggingClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html) here to speed up training while still drawing on the whole data set: each base classifier is trained on a small random sample of the data, and the ensemble combines the predictions of all base classifiers.

In [None]:
# Bagging of linear-kernel SVMs to speed up training: each of the
# n_estimators base classifiers is fitted on a 1/n_estimators fraction of
# the training rows, and n_jobs is set to one job per estimator.
n_estimators = 20
clf = BaggingClassifier(svm.LinearSVC(), max_samples=1.0 / n_estimators, n_estimators=n_estimators, n_jobs=n_estimators)
clf.fit(trainX, trainY)

# This cell used to also score validX_sub / validY_sub, but those are only
# created in the random-subset cell further down, so a top-to-bottom run
# failed here with NameError. Accuracy recorded on that subset: 0.6213.
valid_score = clf.score(validX, validY)  # recorded: 0.6235049863363736
valid_score

In [None]:
# Bagging of sigmoid-kernel SVMs: same ensemble layout as the linear-kernel
# run (each base classifier gets a 1/n_estimators fraction of the rows).
n_estimators = 20
clf = BaggingClassifier(
    svm.SVC(kernel='sigmoid'),
    max_samples=1.0 / n_estimators,
    n_estimators=n_estimators,
    n_jobs=n_estimators,
).fit(trainX, trainY)
# Recorded: 0.6235049863363736
# NOTE(review): this is the exact value noted for the LinearSVC ensemble;
# confirm it was re-measured for the sigmoid kernel.
valid_score = clf.score(validX, validY)

In [None]:
# Bagging of polynomial-kernel SVMs: each of the n_estimators base
# classifiers is fitted on a 1/n_estimators fraction of the training rows.
n_estimators = 20
clf = BaggingClassifier(svm.SVC(kernel='poly'), max_samples=1.0 / n_estimators, n_estimators=n_estimators, n_jobs=n_estimators)
clf.fit(trainX, trainY)

# Scored on the full validation set, like the other bagging cells. The
# earlier version used validX_sub / validY_sub, which are not defined until
# the random-subset cell further down (NameError on a fresh kernel).
# Accuracy recorded on that subset: 0.53057.
valid_score = clf.score(validX, validY)

Another way to speed up training is to randomly select a subset of the original samples.

In [None]:
# Randomly select up to n_samples rows, and the matching targets, from each
# of the train / validation / test splits.
SAMPLE_SEED = 42   # fixed seed so the subsets (and the scores below) are reproducible
n_samples = 100000


def _sample_rows(X, y, n_samples, rng):
    """Return a random row subset of X together with the matching entries of y.

    Parameters
    ----------
    X : matrix indexable by a list of row indices and exposing ``shape[0]``.
    y : array indexable by the same list of indices.
    n_samples : requested number of rows; clamped to the number of rows in X,
        because ``sample`` raises ValueError when asked for more items than
        the population holds.
    rng : ``random.Random`` instance used to draw the indices.

    Returns
    -------
    (X_sub, y_sub) : rows of X and entries of y at the same sampled indices.
    """
    n_rows = X.shape[0]
    indices = rng.sample(range(n_rows), min(n_samples, n_rows))
    return X[indices], y[indices]


# One generator shared by the three draws, so a single seed fixes all subsets.
_rng = random.Random(SAMPLE_SEED)

trainX_sub, trainY_sub = _sample_rows(trainX, trainY, n_samples, _rng)
validX_sub, validY_sub = _sample_rows(validX, validY, n_samples, _rng)
testX_sub, testY_sub = _sample_rows(testX, testY, n_samples, _rng)

In [None]:
# Linear-kernel SVM fitted on the randomly sampled training subset and
# evaluated on the randomly sampled validation subset.
clf = svm.LinearSVC().fit(trainX_sub, trainY_sub)

# Recorded accuracy: 0.62153
valid_score = clf.score(validX_sub, validY_sub)

In [None]:
# Linear-kernel SVM on the randomly sampled subset, swept over the penalty
# parameter C; validation accuracy is plotted against C and saved to disk.

# Single definition of the grid, shared by the sweep and the plot's x-axis,
# so the two cannot drift apart.
c_values = np.arange(1, 100, 10)

res = []
for c in c_values:
    print('C=%d' % c)
    clf = svm.LinearSVC(C=c)
    clf.fit(trainX_sub, trainY_sub)
    res.append(clf.score(validX_sub, validY_sub))

plt.plot(c_values, res)
plt.ylabel('Accuracy')
plt.xlabel('Penalty the error term')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('svm_pics', exist_ok=True)
# NOTE(review): the file name says "poly" although this sweep uses LinearSVC;
# the name is kept so existing references to the picture still resolve.
plt.savefig('svm_pics/svm_poly_diff_Cs.png')

In [None]:
# Bagging of polynomial-kernel SVMs on the randomly sampled subset: each of
# the n_estimators base classifiers is fitted on a 1/n_estimators fraction
# of the sampled training rows.
n_estimators = 10
clf = BaggingClassifier(svm.SVC(kernel='poly'), max_samples=1.0 / n_estimators, n_estimators=n_estimators, n_jobs=n_estimators)
# The ensemble has to be fitted before it can be scored; without this call
# score() fails on the unfitted estimator.
clf.fit(trainX_sub, trainY_sub)
clf.score(validX_sub, validY_sub)  # recorded: 0.53018

## LR

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


  (0, 10)	1.0
  (0, 17)	1.0
  (0, 29)	1.0
  (0, 61)	1.0
  (0, 102)	1.0
  (0, 113)	1.0
  (0, 140)	1.0
  (0, 188)	1.0
  (0, 213)	1.0
  (0, 216)	1.0


## Some other analysis

In [None]:
# PCA of the training features, followed by per-class distribution plots of
# the first ten principal components (one picture per component and class).

# Densify once and reuse. toarray() returns a plain ndarray, whereas
# todense() returns np.matrix, which recent scikit-learn releases refuse.
trainX_dense = trainX.toarray()

pca = PCA()
trainX_transformed = pca.fit_transform(trainX_dense)

# Min-max scale both the raw features and the PCA projection.
scaler = MinMaxScaler()
trainX_normalized = scaler.fit_transform(trainX_dense)
trainX_transformed_normalized = scaler.fit_transform(trainX_transformed)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('svm_pics', exist_ok=True)

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the generated plots do not change.
for i in range(10):
    print('processing %d' % i)
    for label in (0, 1):
        plt.clf()
        sns.distplot(trainX_transformed_normalized[trainY == label][:, i], rug=True)
        # Component number (1-based) and class label both go into the file
        # name; previously every iteration overwrote pc1_0.png / pc1_1.png.
        plt.savefig('svm_pics/pc%d_%d.png' % (i + 1, label))