### Load packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
%matplotlib inline
from sklearn import cross_validation as cv
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import preprocessing
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn import grid_search
from sklearn import decomposition
from sklearn import svm
from scipy import stats
from scipy.spatial.distance import cdist
from scipy.spatial.distance import mahalanobis

### Load csv

In [2]:
motionloc = 'C:/Users/Valued Customer/Desktop/motion_feature.csv'
nomotionloc = 'C:/Users/Valued Customer/Desktop/nomotion_feature.csv'
motion = pd.read_csv(motionloc, header=None,skip_blank_lines=True) # load whole table, row 22 is NA
nomotion = pd.read_csv(nomotionloc, header=None,skip_blank_lines=True)
motion_data = motion.ix[1:,2:]  # column 2-end are features, row 0 is column name
nomotion_data = nomotion.ix[1:,2:]
motion_class = np.empty(len(motion)-1) 
motion_class [:] = 1 #
#motion.ix[1:,7] # column 7 is class
nomotion_class = np.empty(len(nomotion)-1) 
nomotion_class [:] = 0 
#nomotion.ix[1:,7]
x=motion_data[:].append(nomotion_data[:],ignore_index = True) # motion: 0-35 Baylor, non motion: 0-50 Baylor
x=x.astype('float64') # convert string to float
y=pd.concat([pd.DataFrame(motion_class[:]),pd.DataFrame(nomotion_class[:])],ignore_index = True)
for i in range(len(y)):
    y.ix[i]=y.ix[i].astype('category') # convert string to categorical

### Split training and testing

In [3]:
x_train, x_test, y_train, y_test=cv.train_test_split(x,y,test_size=0.1)

### Data preprocessing

In [4]:
x_train_scaled = preprocessing.scale(x_train)
scaler = preprocessing.StandardScaler().fit(x_train)
x_test_scaled= scaler.transform(x_test)

### Visualize preprocessed data

In [None]:
# data_scaled=pd.DataFrame(x_train_scaled)
# data_scaled['class']=y_train

In [None]:
# data_all=pd.DataFrame(x_train_scaled).append(pd.DataFrame(x_test_scaled))
# x_scaled=pd.DataFrame(x_train_scaled).append(pd.DataFrame(x_test_scaled))
# data_all['class']=y
# _=data_all.boxplot(by='class')
# _=data_all.groupby('class').boxplot() 
# _=scatter_matrix(x, alpha=0.2, figsize=(6, 6))#, diagonal='kde')
# _=data_all.hist()

# correlations = x_scaled.corr()
# # plot correlation matrix
# # machine learning mastery with python Chap 6.
# fig = plt.figure()
# ax = fig.add_subplot(111)
# cax = ax.matshow(correlations, vmin=-1, vmax=1)
# fig.colorbar(cax)
# _=plt.show()

### Build classifier

In [25]:
names=['LDA','QDA','logistic regression','random forest','SVC','KNN']
classifiers=[LinearDiscriminantAnalysis(),
             QuadraticDiscriminantAnalysis(),
             LogisticRegression(),
             RandomForestClassifier(max_depth=5, n_estimators=3, max_features=1),
             SVC(gamma=2, C=1),
             KNeighborsClassifier(3)]
for name, clf in zip(names,classifiers):
    #clf.fit(x_train_scaled,y_train)
    #print(clf.score(x_test_scaled,y_test))
    score = cross_validation.cross_val_score(clf, x_train_scaled, y_train[0], cv=10,scoring='accuracy')
    print'{}: accuracy {}+/-{}'.format(name,score.mean(),score.std())

LDA: accuracy 0.706363636364+/-0.155068633165
QDA: accuracy 0.766363636364+/-0.100251749227
logistic regression: accuracy 0.78+/-0.0849939229719
random forest: accuracy 0.800454545455+/-0.0613366944222
SVC: accuracy 0.805+/-0.0593508269627
KNN: accuracy 0.790454545455+/-0.0516844366491


### Build Classifier with first 5 of the features

In [26]:
for name, clf in zip(names,classifiers):
    #clf.fit(x_train_scaled,y_train)
    #print(clf.score(x_test_scaled,y_test))
    score = cross_validation.cross_val_score(clf, x_train_scaled[:,:5], y_train[0], cv=10,scoring='accuracy')
    print'{}: accuracy {}+/-{}'.format(name,score.mean(),score.std())

LDA: accuracy 0.701818181818+/-0.137753522851
QDA: accuracy 0.628181818182+/-0.11609699564
logistic regression: accuracy 0.769090909091+/-0.113632727215
random forest: accuracy 0.769090909091+/-0.104876944815
SVC: accuracy 0.814090909091+/-0.0632929043064
KNN: accuracy 0.761363636364+/-0.0861300669322


### Since SVC has the best result, grid search for SVC

In [23]:
gammas = np.logspace(-4, 1, 10)
svc = SVC()
clf = grid_search.GridSearchCV(estimator=svc, param_grid=dict(gamma=gammas),n_jobs=-1)
clf.fit(x_train_scaled, y_train[0]) 
best_gamma=clf.best_estimator_.gamma
best_gamma

0.77426368268112777

In [24]:
score = cross_validation.cross_val_score(SVC(gamma=best_gamma), x_train_scaled[:,:], y_train[0], cv=10,scoring='accuracy')
score.mean()

0.83363636363636373

### Remove low variance features

In [None]:
x_train_scaled.var()

In [None]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold() #threshold=(.8 * (1 - .8))
sel.fit(x_train_scaled)
idx=sel.get_support(indices=True)
idx

### PCA

In [21]:
cor_mat = np.corrcoef(x_train_scaled.T)
eig_vals, eig_vecs = np.linalg.eig(cor_mat)
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort()
eig_pairs.reverse()
# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for i in eig_pairs[0:8]: ### show top 8
    print(i[0])

Eigenvalues in descending order:
11.2831677
2.53735688786
0.783275533127
0.191859869295
0.150228184456
0.0316179753283
0.0160958952485
0.00457959269924


In [32]:
pca = decomposition.PCA(n_components=5) # from varirance-explained above, n=3
pca.fit(x_train_scaled)
train_xPCA=pca.transform(x_train_scaled)
test_xPCA=pca.transform(y_test) #PCA both trainging and testing data
# random forest classification
clf = SVC(gamma=best_gamma)
clf.fit(train_xPCA, y_train)
output=clf.predict(test_xPCA)
scores = cv.cross_val_score(clf, train_xPCA, y_train[0], cv=10)
scores.mean()

  y_ = column_or_1d(y, warn=True)


0.83363636363636373

### Outlier removal: one class SVM

In [37]:
outliers_fraction = 0.1 # assume % are outliers
clf=svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1) # nu=0.1
clf.fit(x_train_scaled)
y_pred = clf.decision_function(x_train_scaled).ravel()
threshold = stats.scoreatpercentile(y_pred,100 * outliers_fraction)
id_OR = y_pred > threshold

In [38]:
x_OR = x_train_scaled[id_OR,:] # training data outlier removed
y_OR = y_train[id_OR]
names=['LDA','QDA','logistic regression','random forest','SVC','KNN']
classifiers=[LinearDiscriminantAnalysis(),
             QuadraticDiscriminantAnalysis(),
             LogisticRegression(),
             RandomForestClassifier(max_depth=5, n_estimators=3, max_features=1),
             SVC(gamma=2, C=1),
             KNeighborsClassifier(3)]
for name, clf in zip(names,classifiers):
    #clf.fit(x_train_scaled,y_train)
    #print(clf.score(x_test_scaled,y_test))
    score = cross_validation.cross_val_score(clf, x_OR, y_OR[0], cv=10,scoring='accuracy')
    print'{}: accuracy {}+/-{}'.format(name,score.mean(),score.std())

LDA: accuracy 0.767234262126+/-0.107402641242
QDA: accuracy 0.742810457516+/-0.0988927190105
logistic regression: accuracy 0.778706570347+/-0.076754728516
random forest: accuracy 0.784588923289+/-0.0650992956551
SVC: accuracy 0.805933952528+/-0.0527682163722
KNN: accuracy 0.783969728242+/-0.0515148943751


### PCA + one class SVM

In [33]:
outliers_fraction = 0.05 # assume % are outliers
clf=svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1) # nu=0.1
clf.fit(train_xPCA)
y_pred = clf.decision_function(train_xPCA).ravel()
threshold = stats.scoreatpercentile(y_pred,100 * outliers_fraction)
id_PCAOR = y_pred > threshold

In [34]:
x_PCAOR = train_xPCA[id_PCAOR,:] # training data outlier removed
y_PCAOR = y_train[id_PCAOR]
names=['LDA','QDA','logistic regression','random forest','SVC','KNN']
classifiers=[LinearDiscriminantAnalysis(),
             QuadraticDiscriminantAnalysis(),
             LogisticRegression(),
             RandomForestClassifier(max_depth=5, n_estimators=3, max_features=1),
             SVC(gamma=2, C=1),
             KNeighborsClassifier(3)]
for name, clf in zip(names,classifiers):
    #clf.fit(x_train_scaled,y_train)
    #print(clf.score(x_test_scaled,y_test))
    score = cross_validation.cross_val_score(clf, x_PCAOR, y_PCAOR[0], cv=10,scoring='accuracy')
    print'{}: accuracy {}+/-{}'.format(name,score.mean(),score.std())

LDA: accuracy 0.753558897243+/-0.123983890893
QDA: accuracy 0.728796992481+/-0.104012332009
logistic regression: accuracy 0.763558897243+/-0.0929882284787
random forest: accuracy 0.774110275689+/-0.0659010439565
SVC: accuracy 0.82045112782+/-0.0659973514939
KNN: accuracy 0.768847117794+/-0.063123304766


### PCA + distance bases outlier removal

In [42]:
# create a center for motion and a center for non motion
data =  pd.DataFrame(train_xPCA)
data['class'] = y_train.values
# data.columns = ['PCA1','PCA2','PCA3','PCA4','PCA5','class']
m=data.loc[data['class']== 1]
nm=data.loc[data['class']== 0]
m_center = m.ix[:,0:5].mean(axis = 0) # motoin cetner, column mean gives center
nm_center = nm.ix[:,0:5].mean(axis = 0)

In [43]:
m_L = m_center.shape[0]
nm_L = nm_center.shape[0]

#### Euclidean distance
m_dis = cdist(m.ix[:,0:5], m_center.reshape(-1,m_L)) # Euclidean distance to the center
nm_dis = cdist(nm.ix[:,0:5], nm_center.reshape(-1,nm_L))

# #### mahalonobis distance
# dis = []
# cov_x = np.cov(train_xPCA, rowvar=0)
# invcov = np.linalg.inv(cov_x)
# dis.append([mahalanobis(train_xPCA[i,:], center.reshape(-1,L), invcov) for i in range(train_xPCA.shape[0])])
# dis = np.array(dis)
# dis = dis.reshape(-1,1)

In [44]:
m_std = m_dis.std() # standard deviation of the distance
nm_std = nm_dis.std()

In [77]:
idm_PCAdOR =(m_dis < m_dis.mean()+1*m_std) * (m_dis.mean()-1*m_std <m_dis) # outliers are 2*std away from the center
print(sum(sum(idm_PCAdOR))/float(m_dis.shape[0])) # percent of outliers
idm_PCAdOR = idm_PCAdOR.ravel()

0.951219512195


In [78]:
idnm_PCAdOR =(nm_dis < nm_dis.mean()+1*nm_std) * (nm_dis.mean()-1*nm_std <nm_dis) # outliers are 2*std away from the center
print(sum(sum(idnm_PCAdOR))/float(nm_dis.shape[0])) # percent of outliers
idnm_PCAdOR = idnm_PCAdOR.ravel()

0.686746987952


In [79]:
x_PCAdOR =pd.concat([m.ix[idm_PCAdOR,0:5], nm.ix[idnm_PCAdOR,0:5]],ignore_index = True) # training data outlier removed
y_PCAdOR =pd.concat([m['class'].ix[idm_PCAdOR,],nm['class'].ix[idnm_PCAdOR]],ignore_index = True)
y_PCAdOR = y_PCAdOR.astype('category')
names=['LDA','QDA','logistic regression','random forest','SVC','KNN']
classifiers=[LinearDiscriminantAnalysis(),
             QuadraticDiscriminantAnalysis(),
             LogisticRegression(),
             RandomForestClassifier(max_depth=5, n_estimators=3, max_features=1),
             SVC(gamma=2, C=1),
             KNeighborsClassifier(3)]
for name, clf in zip(names,classifiers):
    #clf.fit(x_train_scaled,y_train)
    #print(clf.score(x_test_scaled,y_test))
    score = cross_validation.cross_val_score(clf, x_PCAdOR, y_PCAdOR, cv=10,scoring='accuracy')
    print'{}: accuracy {}+/-{}'.format(name,score.mean(),score.std())

LDA: accuracy 0.788194444444+/-0.0983687357803
QDA: accuracy 0.80625+/-0.0878657479368
logistic regression: accuracy 0.829166666667+/-0.0845512847512
random forest: accuracy 0.741666666667+/-0.085210342501
SVC: accuracy 0.850694444444+/-0.050580160374
KNN: accuracy 0.809027777778+/-0.0933892235105


### Distance based outlier removal on full set of features

In [13]:
data=pd.DataFrame(x_train_scaled[:,:])
data['class']=y_train.values
data.columns=['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','class']

In [14]:
m=data.loc[data['class']==1]
m.columns=['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','class']
nm=data.loc[data['class']==0]
nm.columns=['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','class']

In [80]:
m_center = m.ix[:,0:-1].mean(axis = 0) # motoin center, column mean gives center
nm_center = nm.ix[:,0:-1].mean(axis = 0)
m_L = m_center.shape[0]
nm_L = nm_center.shape[0]

#### Euclidean distance
m_dis = cdist(m.ix[:,0:-1], m_center.reshape(-1,m_L)) # Euclidean distance to the center
nm_dis = cdist(nm.ix[:,0:-1], nm_center.reshape(-1,nm_L))

m_std = m_dis.std() # standard deviation of the distance
nm_std = nm_dis.std()

idm_PCAdOR =(m_dis < m_dis.mean()+1*m_std) * (m_dis.mean()-1*m_std <m_dis) # outliers are 2*std away from the center
print(sum(sum(idm_PCAdOR))/float(m_dis.shape[0])) # percent of outliers
idm_PCAdOR = idm_PCAdOR.ravel()

idnm_PCAdOR =(nm_dis < nm_dis.mean()+1*nm_std) * (nm_dis.mean()-1*nm_std <nm_dis) # outliers are 2*std away from the center
print(sum(sum(idnm_PCAdOR))/float(nm_dis.shape[0])) # percent of outliers
idnm_PCAdOR = idnm_PCAdOR.ravel()

0.951219512195
0.686746987952


In [81]:
a=m.ix[idm_PCAdOR,0:-1]
b=nm.ix[idnm_PCAdOR,0:-1]
x_PCAdOR =pd.concat([a, b],ignore_index = True,axis=0) # training data outlier removed
y_PCAdOR =pd.concat([m['class'].ix[idm_PCAdOR,],nm['class'].ix[idnm_PCAdOR]],ignore_index = True)
names=['LDA','QDA','logistic regression','random forest','SVC','KNN']
classifiers=[LinearDiscriminantAnalysis(),
             QuadraticDiscriminantAnalysis(),
             LogisticRegression(),
             RandomForestClassifier(max_depth=5, n_estimators=3, max_features=1),
             SVC(gamma=2, C=1),
             KNeighborsClassifier(3)]
for name, clf in zip(names,classifiers):
    #clf.fit(x_train_scaled,y_train)
    #print(clf.score(x_test_scaled,y_test))
    score = cross_validation.cross_val_score(clf, x_PCAdOR, y_PCAdOR, cv=10,scoring='accuracy')
    print'{}: accuracy {}+/-{}'.format(name,score.mean(),score.std())

LDA: accuracy 0.788194444444+/-0.0983687357803
QDA: accuracy 0.80625+/-0.0878657479368
logistic regression: accuracy 0.829166666667+/-0.0845512847512
random forest: accuracy 0.793055555556+/-0.0440082764382
SVC: accuracy 0.850694444444+/-0.050580160374
KNN: accuracy 0.809027777778+/-0.0933892235105


### Distance based outlier removal on first 5 features

In [86]:
data=pd.DataFrame(x_train_scaled[:,0:5])
data['class']=y_train.values
data.columns=['1','2','3','4','5','class']

In [87]:
m=data.loc[data['class']==1]
m.columns=['1','2','3','4','5','class']
nm=data.loc[data['class']==0]
nm.columns=['1','2','3','4','5','class']

In [91]:
m_center = m.ix[:,0:-1].mean(axis = 0) # motoin center, column mean gives center
nm_center = nm.ix[:,0:-1].mean(axis = 0)
m_L = m_center.shape[0]
nm_L = nm_center.shape[0]

#### Euclidean distance
m_dis = cdist(m.ix[:,0:-1], m_center.reshape(-1,m_L)) # Euclidean distance to the center
nm_dis = cdist(nm.ix[:,0:-1], nm_center.reshape(-1,nm_L))

m_std = m_dis.std() # standard deviation of the distance
nm_std = nm_dis.std()

idm_PCAdOR =(m_dis < m_dis.mean()+1*m_std) * (m_dis.mean()-1*m_std <m_dis) # outliers are 2*std away from the center
print(sum(sum(idm_PCAdOR))/float(m_dis.shape[0])) # percent of outliers
idm_PCAdOR = idm_PCAdOR.ravel()

idnm_PCAdOR =(nm_dis < nm_dis.mean()+1.5*nm_std) * (nm_dis.mean()-1.5*nm_std <nm_dis) # outliers are 2*std away from the center
print(sum(sum(idnm_PCAdOR))/float(nm_dis.shape[0])) # percent of outliers
idnm_PCAdOR = idnm_PCAdOR.ravel()

0.934959349593
0.927710843373


In [92]:
a=m.ix[idm_PCAdOR,0:-1]
b=nm.ix[idnm_PCAdOR,0:-1]
x_PCAdOR =pd.concat([a, b],ignore_index = True,axis=0) # training data outlier removed
y_PCAdOR =pd.concat([m['class'].ix[idm_PCAdOR,],nm['class'].ix[idnm_PCAdOR]],ignore_index = True)
names=['LDA','QDA','logistic regression','random forest','SVC','KNN']
classifiers=[LinearDiscriminantAnalysis(),
             QuadraticDiscriminantAnalysis(),
             LogisticRegression(),
             RandomForestClassifier(max_depth=5, n_estimators=3, max_features=1),
             SVC(gamma=2, C=1),
             KNeighborsClassifier(3)]
for name, clf in zip(names,classifiers):
    #clf.fit(x_train_scaled,y_train)
    #print(clf.score(x_test_scaled,y_test))
    score = cross_validation.cross_val_score(clf, x_PCAdOR, y_PCAdOR, cv=10,scoring='accuracy')
    print'{}: accuracy {}+/-{}'.format(name,score.mean(),score.std())

LDA: accuracy 0.75014619883+/-0.0934624092811
QDA: accuracy 0.725964912281+/-0.118020244389
logistic regression: accuracy 0.787894736842+/-0.0977793866512
random forest: accuracy 0.781228070175+/-0.0516698685452
SVC: accuracy 0.828421052632+/-0.063225245657
KNN: accuracy 0.777894736842+/-0.0811351823864
