# Keystroke Dynamics Using RNNs
This notebook goes from the process of cleaning and preparing data to train and test a RNN for the job of Keystroke Dynamics. The Authentication Procedure comprises of using the timing of key strike, key hold, key change and so on to reach a conclusion whether the subject under observation is authentic or not.

## Data Preparation &amp; Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('C:\\Users\\user\\Desktop\\major_project\\keystroke-dynamics-datagen-master\\keystroke-dynamics-datagen-master\\DSL-StrongPasswordData.csv')
df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [3]:
subjects = df['subject'].unique()
subjects

array(['s002', 's003', 's004', 's005', 's007', 's008', 's010', 's011',
       's012', 's013', 's015', 's016', 's017', 's018', 's019', 's020',
       's021', 's022', 's024', 's025', 's026', 's027', 's028', 's029',
       's030', 's031', 's032', 's033', 's034', 's035', 's036', 's037',
       's038', 's039', 's040', 's041', 's042', 's043', 's044', 's046',
       's047', 's048', 's049', 's050', 's051', 's052', 's053', 's054',
       's055', 's056', 's057'], dtype=object)

In [4]:
subjects_to_int = {subject: i  for i, subject in enumerate(subjects)}
int_to_subjects = {i: subject for i, subject in enumerate(subjects)}
#print(subjects_to_int)
print(int_to_subjects)

{0: 's002', 1: 's003', 2: 's004', 3: 's005', 4: 's007', 5: 's008', 6: 's010', 7: 's011', 8: 's012', 9: 's013', 10: 's015', 11: 's016', 12: 's017', 13: 's018', 14: 's019', 15: 's020', 16: 's021', 17: 's022', 18: 's024', 19: 's025', 20: 's026', 21: 's027', 22: 's028', 23: 's029', 24: 's030', 25: 's031', 26: 's032', 27: 's033', 28: 's034', 29: 's035', 30: 's036', 31: 's037', 32: 's038', 33: 's039', 34: 's040', 35: 's041', 36: 's042', 37: 's043', 38: 's044', 39: 's046', 40: 's047', 41: 's048', 42: 's049', 43: 's050', 44: 's051', 45: 's052', 46: 's053', 47: 's054', 48: 's055', 49: 's056', 50: 's057'}


In [5]:
df = df.replace(subjects_to_int)

In [6]:
df.shape

(20400, 34)

In [7]:
data = df.as_matrix()

In [8]:
temp_d = data[data[:, 0] == 0]
temp_d.shape
20400/51
np.random.choice(400, 150)

array([347, 295,  92, 341, 166,  87, 367, 270, 128,  93, 144, 226, 314,
       157, 120, 208, 145, 221, 317, 312,  65, 384, 204, 209, 350,  59,
        28, 134, 297,  44, 222, 379, 379, 114, 376, 119, 281, 349, 197,
        66, 328, 182, 107, 179, 210, 350, 166,  26, 266,  21, 309,  46,
        36, 122, 181, 310,  22,  19, 162, 219,   9, 364, 350, 314, 206,
       310, 368, 145, 259,   0, 302, 188, 384, 156, 133, 327, 130, 316,
        19, 280, 176, 328, 262, 270, 194, 394,  68, 128,  15, 163, 242,
       301, 168,  65, 206, 376,   0, 377,  40, 379, 337,   1, 370, 253,
       177,  11, 241, 247, 112, 383, 237, 373,  96, 268, 215,  23, 162,
       224, 288, 152, 177, 248, 150, 218, 167,  67, 244, 364, 156, 167,
        80, 211, 397, 275, 382, 234, 166, 293, 189,  40, 274,  34, 285,
        38, 151, 279,  90, 283, 369, 177])

In [9]:
def generate_positives(data, n_pos_per_subject=150):
    n_subjects = np.unique(data[:, 0]).shape[0]
    poss = []
    for i in range(n_subjects):
        temp_d = data[data[:, 0] == i]
        first_half = temp_d[np.random.choice(400, n_pos_per_subject), 3:-1]
        second_half = temp_d[np.random.choice(400, n_pos_per_subject), 3:-1]
        poss.append(np.hstack([first_half, second_half]))
    return np.vstack(poss)

In [10]:
poss = generate_positives(data)
poss.shape

(7650, 60)

In [11]:
def generate_negatives(data, n_neg_per_subject=150):
    n_subjects = np.unique(data[:, 0]).shape[0]
    negs = []
    for i in range(n_subjects):
        temp_d = data[data[:, 0] == i]
        temp_not_d = data[data[:, 0] != i]
        first_half = temp_d[np.random.choice(400, n_neg_per_subject), 3:-1]
        second_half = temp_not_d[np.random.choice(400, n_neg_per_subject), 3:-1]
        negs.append(np.hstack([first_half, second_half]))
    return np.vstack(negs)

In [12]:
negs = generate_negatives(data)
negs.shape

(7650, 60)

In [13]:
labels = np.zeros(poss.shape[0] + negs.shape[0])
labels[:poss.shape[0]] = 1
labels = np.expand_dims(labels, axis=1)
labels.shape

(15300, 1)

In [15]:
all_data = np.hstack([np.vstack([poss, negs]), labels])
all_data.shape

(15300, 61)

In [16]:
np.random.shuffle(all_data)

In [17]:
all_data_t = np.zeros((all_data.shape[0], 15, 4))
ctr = 0
for i, j in zip(range(0, 30, 2), range(30, 60, 2)):
    all_data_t[:, ctr, :] = np.hstack([all_data[:, i:i+2], all_data[:, j:j+2]])
    ctr += 1

In [27]:
all_data_t.shape

(15300, 15, 4)

In [28]:
X, y = all_data_t, all_data[:, -1]
print(X.shape)
print(y.shape)
print(y[10:20])

(15300, 15, 4)
(15300,)
[0. 0. 1. 1. 0. 1. 0. 0. 0. 1.]


## Training Phase
This Phase defines parameters of the model, the model itself along with its training to produce a simple RNN that can predict whether a person is the genuine holder of account or not.

### Some Params and HyperParams

In [17]:
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector, Concatenate
from keras.initializers import glorot_uniform
from keras.utils import to_categorical
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.


In [18]:
VALIDATION_SPLIT = 0.1

INPUT_SHAPE = [None, 2]

BATCH_SIZE = 32

EPOCHS = 20

In [19]:
def train_dev_split(x, y, val_split=0.1):
    m = x.shape[0]
    val_size = int(0.1 * m)
    return x[:-val_/size], y[:, :-val_size, :], x[-val_size:], y[:, -val_size:, :]

In [20]:
y = to_categorical(y)
print(y.shape)
print(y[0])
y_ = np.zeros((15, y.shape[0], y.shape[1]))
for i in range(15):
    y_[i, :, :] = y
print(y_.shape)

(15300, 2)
[0. 1.]
(15, 15300, 2)


In [21]:
y_[0, 0, :], y[0]

(array([0., 1.]), array([0., 1.], dtype=float32))

In [22]:
x_train, y_train, x_test, y_test = train_dev_split(X, y_)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((13770, 15, 4), (15, 13770, 2), (1530, 15, 4), (15, 1530, 2))

In [23]:
n_a = 10
n_out = 2

In [24]:
reshapor = Reshape((1, 4))
LSTM_cell = LSTM(n_a, return_state = True)
densor = Dense(n_out, activation='softmax')

In [25]:
def keystroke_model(Tx, n_in, n_a, n_out):
    X = Input(shape=(Tx, n_in))
    
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0
    
    outputs = []

    for t in range(Tx):
         
        x = Lambda(lambda x: X[:, t, :])(X)
        x = reshapor(x)
        a, _, c = LSTM_cell(x, initial_state=[a, c])
        print(a.shape)
        out = densor(a)
        #print(out.shape)
        outputs.append(out)

    model = Model(inputs=[X, a0, c0], outputs=outputs)
    
    return model

In [26]:
model = keystroke_model(15, 4, n_a, n_out)

(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)
(None, 10)


In [27]:
opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [179]:
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 15, 4)        0                                            
__________________________________________________________________________________________________
lambda_118 (Lambda)             (None, 4)            0           input_12[0][0]                   
__________________________________________________________________________________________________
reshape_4 (Reshape)             (None, 1, 4)         0           lambda_118[0][0]                 
                                                                 lambda_119[0][0]                 
                                                                 lambda_120[0][0]                 
                                                                 lambda_121[0][0]           

In [180]:
m = x_train.shape[0]
a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))

In [181]:
model.fit([x_train, a0, c0], list(y_train), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1fe6340a668>

In [37]:
m_t = x_test.shape[0]
a0_t = np.zeros((m_t, n_a))
c0_t = np.zeros((m_t, n_a))

In [38]:
model.evaluate([x_test, a0_t, c0_t], list(y_test))



[8.110221078037437,
 0.6866126656532288,
 0.6632472276687622,
 0.6393974423408508,
 0.6197602152824402,
 0.6097667217254639,
 0.5514662265777588,
 0.5212035775184631,
 0.4958099126815796,
 0.4845815598964691,
 0.47372910380363464,
 0.46763792634010315,
 0.46981263160705566,
 0.4694244861602783,
 0.4710231125354767,
 0.483203649520874,
 0.5745097994804382,
 0.6503267884254456,
 0.6464052200317383,
 0.6339869499206543,
 0.6418300867080688,
 0.7627450823783875,
 0.7732025980949402,
 0.7973856329917908,
 0.7921568751335144,
 0.7960784435272217,
 0.7967320084571838,
 0.7980391979217529,
 0.800000011920929,
 0.801307201385498,
 0.7941176295280457]

In [40]:
model.save_weights('model.h5')