In [217]:
from sklearn import svm
from sklearn import cross_validation
from sklearn import preprocessing
import pandas as pd
import numpy as np

df = pd.read_csv('../data/DSL-StrongPasswordData.csv', sep = ',')

df = df.drop(['sessionIndex','rep'],1)

Y = df['subject'].apply(lambda x: int(x[-2:]))

X = df.drop('subject',1)
X = pd.DataFrame(preprocessing.scale(X))

X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=42)

Y.value_counts()

47    400
35    400
22    400
53    400
37    400
21    400
5     400
52    400
36    400
20    400
4     400
51    400
19    400
54    400
3     400
50    400
34    400
18    400
2     400
49    400
33    400
17    400
48    400
32    400
38    400
7     400
31    400
11    400
15    400
46    400
30    400
29    400
13    400
44    400
28    400
12    400
43    400
27    400
42    400
39    400
26    400
10    400
57    400
41    400
25    400
56    400
40    400
24    400
8     400
55    400
16    400
Name: subject, dtype: int64

In [127]:
def score(clf,X_test,Y_test):
    
    count = 0
    score = 0

    for i in X_test.index.values:
        y_hat = clf.predict(X_test.ix[i].reshape(1,-1))
        if y_hat == Y_test[i]:
            score += 1
        count += 1
    
    return round(score/count*100,2)

In [116]:
clf = svm.SVC()
clf.fit(X_train,Y_train)

print('This initial SVM classifier is ' + str(score(clf,X_test,Y_test)) + '% accurate')

This initial SVM classifier is 88.14% accurate


In [117]:
results = []

for i in [1,25,50,75,100]:
    print(i)
    results.append(score(svm.SVC(C = i).fit(X_train,Y_train),X_test,Y_test))
    
results

1
25
50
75
100


[88.14, 90.12, 90.0, 89.9, 89.95]

In [128]:
# test different kernels

for i in ['linear','rbf','poly','sigmoid']:
    print(i + ': ' + str(score(svm.SVC(kernel = i).fit(X_train,Y_train),X_test,Y_test)) + '%')

linear: 84.85
rbf: 88.14
poly: 73.73
sigmoid: 1.32


In [201]:
def score_complex(clf,X_test,Y_test):
    
    count = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    
    user = Y_test.value_counts()[1]
    imposter = Y_test.value_counts()[0]

    for i in X_test.index.values:
        
        y_hat = clf.predict(X_test.ix[i].reshape(1,-1))
        
        if Y_test[i] == 1 and y_hat == 1:
            tp += 1
        if Y_test[i] == 1 and y_hat == 0:
            fn += 1
        if Y_test[i] == 0 and y_hat == 1:
            fp += 1
        if Y_test[i] == 0 and y_hat == 0:
            tn += 1
        
        count += 1
    
    return (round(tp/user*100,2),round(tn/imposter*100,2),round(fp/imposter*100,2),round(fn/user*100,2))

In [215]:
for i in [x for x in range(2,58) if x in Y]:

    Y = df['subject'].apply(lambda x: int(x[-2:])).apply(lambda x: 1 if x == i else 0)

    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=42)

    clf = svm.SVC()
    clf.fit(X_train,Y_train)

    scores = score_complex(clf,X_test,Y_test)

    print('SUBJECT' + str(i) + ': \n True-Positive = ' + str(scores[0]) + '% \n False-Negative = ' + str(scores[3]) + '% \n True-Negative = ' + str(scores[1]) + '% \n False-Positive = ' + str(scores[2]) + '%')

SUBJECT2: 
 True-Positive = 51.39% 
 False-Negative = 48.61% 
 True-Negative = 99.95% 
 False-Positive = 0.05%
SUBJECT3: 
 True-Positive = 59.42% 
 False-Negative = 40.58% 
 True-Negative = 99.93% 
 False-Positive = 0.07%
SUBJECT4: 
 True-Positive = 67.12% 
 False-Negative = 32.88% 
 True-Negative = 100.0% 
 False-Positive = 0.0%
SUBJECT5: 
 True-Positive = 83.56% 
 False-Negative = 16.44% 
 True-Negative = 99.93% 
 False-Positive = 0.07%


ValueError: The number of classes has to be greater than one; got 1