# Training With Additional Dataset
An additional dataset of Allen's own keystroke dynamics was captured and added to the benchmark dataset for training and more in-depth evaluation using the Support Vector Machine algorithm.

In [3]:
# Import all required libraries
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import model selection and data preparation libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


# Import algorithm libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier

## Loading of Datasets

In [11]:
# Read Allen's recorded keystroke timings from disk
new_df = pd.read_csv('./Keystroke Data/Allen_keystroke.csv')

# Drop the 'Session' and 'Sequence' columns; every other column is kept
new_features = new_df.drop(columns=['Session', 'Sequence'])

# Last expression of the cell, so the frame is rendered as the cell output
new_features

Unnamed: 0,User,H.Period,UD.period.t,H.t,UD.t.i,H.i,UD.i.e,H.e,UD.e.five,H.five,...,UD.Shift.r.o,H.o,UD.o.a,H.a,UD.a.n,H.n,UD.n.l,H.l,UD.l.Return,H.Return
0,Allen,131,131,91,63,91,100,101,161,115,...,3,88,90,132,3,77,154,71,130,71
1,Allen,131,134,82,70,71,19,91,144,71,...,130,82,59,112,80,74,180,72,181,73
2,Allen,121,115,81,70,71,19,91,265,101,...,121,80,60,125,60,62,170,62,193,71
3,Allen,121,130,92,73,81,29,81,191,124,...,123,81,80,111,70,71,164,81,90,81
4,Allen,122,90,91,63,81,40,101,161,95,...,144,90,50,111,30,82,143,51,150,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,Allen,112,90,94,70,93,141,101,174,101,...,3,68,100,112,3,78,173,61,231,61
361,Allen,94,120,102,29,61,151,104,211,112,...,3,88,100,116,-21,81,160,71,224,71
362,Allen,114,121,121,39,82,153,91,282,125,...,3,68,80,101,29,92,163,51,232,51
363,Allen,122,141,111,33,81,120,102,244,91,...,29,92,39,112,93,71,152,61,204,41


In [10]:
# Allen's keystroke data (`new_df`, `new_features`) is already loaded by the
# previous cell, so it is not read from disk a second time here.

# Load the benchmark dataset and keep only the rows of the first 10 subjects.
# groupby() sorts its keys, so these are the first 10 subject ids in sorted
# order. The deprecated `axis=0` keyword is omitted (it was the default), and
# size() replaces count() because only the group index is needed.
benchmark_df = pd.read_csv('./Keystroke Data/DSL-StrongPasswordData.csv')
first_10subject = benchmark_df.groupby(by='subject').size().index[:10]
selected_dataset = benchmark_df[benchmark_df['subject'].isin(first_10subject)]

# The DD and UD timings of each key have been shown to be highly correlated
# with each other in a separate analysis. Hence, every feature whose name
# starts with 'DD' is dropped. Columns at positions 3..33 are the candidates.
all_features = selected_dataset.columns[3:34]
selected_features = [col for col in all_features if not col.startswith('DD')]

# Get a copy of the dataset restricted to the selected feature columns
df = selected_dataset[selected_features].copy()

# TODO: append Allen's data (`new_features`) to `df`; this is not done yet.
# The column names must be reconciled first: Allen's file uses 'H.Period'
# while the benchmark uses 'H.period'.
# NOTE(review): also confirm that both files record timings in the same unit.
df.head()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [9]:
# Show the column labels of Allen's raw frame and of the benchmark feature
# frame one after the other, so the two sets of names can be compared
print(new_df.columns, df.columns, sep='\n')

Index(['User', 'Session', 'Sequence', 'H.Period', 'UD.period.t', 'H.t',
       'UD.t.i', 'H.i', 'UD.i.e', 'H.e', 'UD.e.five', 'H.five',
       'UD.five.Shift.r', 'H.Shift.r', 'UD.Shift.r.o', 'H.o', 'UD.o.a', 'H.a',
       'UD.o.n', 'H.n', 'UD.n.l', 'H.l', 'UD.l.Return', 'H.Return'],
      dtype='object')
Index(['H.period', 'UD.period.t', 'H.t', 'UD.t.i', 'H.i', 'UD.i.e', 'H.e',
       'UD.e.five', 'H.five', 'UD.five.Shift.r', 'H.Shift.r', 'UD.Shift.r.o',
       'H.o', 'UD.o.a', 'H.a', 'UD.a.n', 'H.n', 'UD.n.l', 'H.l', 'UD.l.Return',
       'H.Return'],
      dtype='object')
