In [1]:
## This is a first cut (Run_01) trial on the dataset for keystrok dynamics
# Date: 23 Jul 2023
# By: Allen Lee

import numpy as np
import pandas as pd
import seaborn as sns

dataset = pd.read_csv('./Keystroke Data/DSL-StrongPasswordData.csv')
print("Data shape:",dataset.shape)
dataset.head()

Data shape: (20400, 34)


Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [2]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20400 entries, 0 to 20399
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   subject          20400 non-null  object 
 1   sessionIndex     20400 non-null  int64  
 2   rep              20400 non-null  int64  
 3   H.period         20400 non-null  float64
 4   DD.period.t      20400 non-null  float64
 5   UD.period.t      20400 non-null  float64
 6   H.t              20400 non-null  float64
 7   DD.t.i           20400 non-null  float64
 8   UD.t.i           20400 non-null  float64
 9   H.i              20400 non-null  float64
 10  DD.i.e           20400 non-null  float64
 11  UD.i.e           20400 non-null  float64
 12  H.e              20400 non-null  float64
 13  DD.e.five        20400 non-null  float64
 14  UD.e.five        20400 non-null  float64
 15  H.five           20400 non-null  float64
 16  DD.five.Shift.r  20400 non-null  float64
 17  UD.five.Shif

In [3]:
# Extract the target as a separate data series
subject = dataset['subject']

# Extract the features separately, excluding the DD timings
# Just using 21 features of Hold timing and key transfer timing from up to down stroke of next key
col_features = ['H.period', 'UD.period.t', 'H.t', 'UD.t.i', 'H.i', 'UD.i.e', 'H.e', 'UD.e.five',
                'H.five','UD.five.Shift.r', 'H.Shift.r', 'UD.Shift.r.o', 'H.o', 'UD.o.a', 'H.a',
                'UD.a.n','H.n','UD.n.l','H.l','UD.l.Return','H.Return']

features = dataset[col_features]
features.head()

Unnamed: 0,H.period,UD.period.t,H.t,UD.t.i,H.i,UD.i.e,H.e,UD.e.five,H.five,UD.five.Shift.r,...,UD.Shift.r.o,H.o,UD.o.a,H.a,UD.a.n,H.n,UD.n.l,H.l,UD.l.Return,H.Return
0,0.1491,0.2488,0.1069,0.0605,0.1169,0.1043,0.1417,1.0468,0.1146,1.4909,...,0.6523,0.1016,0.112,0.1349,0.0135,0.0932,0.2583,0.1338,0.2171,0.0742
1,0.1111,0.234,0.0694,0.0589,0.0908,0.0449,0.0829,1.1141,0.0689,0.7133,...,0.6307,0.1066,0.0618,0.1412,0.1146,0.1146,0.1496,0.0839,0.1917,0.0747
2,0.1328,0.0744,0.0731,0.056,0.0821,0.0721,0.0808,0.96,0.0892,0.5311,...,0.5741,0.1365,0.1566,0.1621,0.0711,0.1172,0.1533,0.1085,0.1762,0.0945
3,0.1291,0.1224,0.1059,0.1436,0.104,0.0998,0.09,0.9656,0.0913,1.1651,...,0.6096,0.0956,0.0574,0.1457,0.0172,0.0866,0.1475,0.0845,0.2387,0.0813
4,0.1249,0.1068,0.0895,0.0781,0.0903,0.0686,0.0805,0.7824,0.0742,0.8213,...,0.6389,0.043,0.1545,0.1312,0.027,0.0884,0.1633,0.0903,0.1614,0.0818


In [4]:
# Train and test data splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, subject, test_size=0.2, random_state=42, stratify=subject)

## Trial #1: Decision Tree Classifier

In [5]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42, max_depth=15)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.835
Accuracy on test set: 0.686


## Trial #2: LogisticRegression

In [12]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=10, random_state=42, max_iter=1000).fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(clf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(clf.score(X_test, y_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy on training set: 0.855
Accuracy on test set: 0.840


## Trial #3: K-Nearest Neighbours

In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(knn.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(knn.score(X_test, y_test)))

Accuracy on training set: 0.841
Accuracy on test set: 0.758


## Trial #4: Random Forest Ensemble

In [9]:
from sklearn.ensemble import RandomForestClassifier

randforest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(randforest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(randforest.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.934


## Trial #5: AdaBoost Ensemble

In [10]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(random_state=42)
ada_clf.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(ada_clf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(ada_clf.score(X_test, y_test)))

Accuracy on training set: 0.046
Accuracy on test set: 0.045


## Trial #6: Gradient Boosting Ensemble

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
gr_clf = GradientBoostingClassifier(random_state=42)  #with default parameters
gr_clf.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(gr_clf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gr_clf.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.915


## Trial #7: Support Vector Machine

In [7]:
from sklearn.svm import SVC
svc = SVC(random_state=42)
svc.fit(X_train, y_train)

print("Accuracy on training set: {:.3f}".format(svc.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(svc.score(X_test, y_test)))

Accuracy on training set: 0.848
Accuracy on test set: 0.809


In [10]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param = {
    'C' : [0.001, 0.01, 0.1, 1, 10, 20],
    'gamma' : [0.1, 1, 5, 10],
    'kernel': ['rbf', 'poly']
}

clf = GridSearchCV(svc, param, cv=10).fit(X_train, y_train)

print("Best param combinations:",clf.best_params_)


Best param combinations: {'C': 20, 'gamma': 1, 'kernel': 'rbf'}


In [15]:
print("Accuracy on training data: ", clf.best_score_)

best_svc = SVC(random_state=42, C=20,gamma=1, kernel='rbf')
best_svc.fit(X_train, y_train)
print("Accuracy on test set: {:.3f}".format(best_svc.score(X_test, y_test)))

Accuracy on training data:  0.875980392156863
Accuracy on test set: 0.886
