In [None]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from joblib import dump, load
import pickle

In [None]:
# Load the training keystroke recordings from the CSV file.
# The column cap is lifted first so that all press-*/release-* columns are
# rendered instead of being elided in the middle of the table.
pd.set_option('display.max_columns', None)
df = pd.read_csv(filepath_or_buffer='Train_keystroke.csv')

# Bare last expression: rich-display the loaded frame as the cell output
df

Unnamed: 0,user,press-0,release-0,press-1,release-1,press-2,release-2,press-3,release-3,press-4,release-4,press-5,release-5,press-6,release-6,press-7,release-7,press-8,release-8,press-9,release-9,press-10,release-10,press-11,release-11,press-12,release-12
0,1,0,120,216,312,424,496,592,664,808,856,1000,1072,1304,1400,1496,1544,1712,1760,1992,2064,2376,2448,2584,2632,2752,2824
1,1,0,95,168,265,360,455,527,599,736,807,928,999,1024,1095,1215,1271,1423,1471,1664,1711,1880,1952,2039,2111,2231,2279
2,1,0,71,143,231,783,903,1087,1159,1351,1454,1559,1631,1703,1799,1823,1902,2039,2111,2271,2343,2487,2559,2679,2751,2871,2926
3,1,0,95,144,263,353,431,760,832,1159,1207,1327,1377,1500,1591,2968,3015,3151,3223,3415,3463,3631,3703,3815,3887,3983,4055
4,1,0,70,166,238,310,406,526,598,710,758,878,950,926,1022,1094,1166,1310,1382,1543,1605,1734,1806,1926,1998,2086,2182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,110,0,80,225,306,444,465,575,672,689,784,884,1040,1000,1123,1439,1576,1638,1735,1785,1884,1928,2040,2057,2186,2284,2504
876,110,0,81,217,289,387,473,954,1057,1091,1189,1265,1419,1384,1465,1595,1698,1755,1866,1921,2019,2161,2265,2351,2457,2561,2704
877,110,0,79,216,300,393,519,692,807,807,894,1004,1204,1161,1279,1407,1512,1591,1724,1763,1866,1879,2046,2048,2192,2303,2449
878,110,0,84,200,290,384,476,782,930,943,978,1064,1240,1200,1310,1413,1484,1537,1658,1760,1848,1831,1940,1944,2082,2174,2319


In [None]:
def get_features(df):
    """Build keystroke-timing features and zero-based user labels.

    For every row (one typed sample) four timing families are derived from
    the interleaved ``press-i`` / ``release-i`` timestamp columns:

    * HT  (Hold Time):             release-i   - press-i
    * RPT (Release-Press Time):    press-(i+1) - release-i
    * PPT (Press-Press Time):      press-(i+1) - press-i
    * RRT (Release-Release Time):  release-(i+1) - release-i

    Absolute values are taken, and each family is summarised by its row-wise
    mean and sample standard deviation, giving 8 features per row.

    The input frame is NOT modified, so the function can safely be called
    more than once on the same frame.

    Input:
        df: DataFrame with a 'user' column and timestamp columns named
            'press-0', 'release-0', 'press-1', 'release-1', ... in that
            interleaved order. Any other columns are ignored.
    Output:
        X - numpy array of shape (n_rows, 8) with columns
            HT_mean, HT_std, RPT_mean, RPT_std, PPT_mean, PPT_std,
            RRT_mean, RRT_std
        y - numpy array of user labels shifted by -1
    """
    # Keep only the timestamp columns, in their original order. Selecting by
    # name (instead of "everything except 'user'") keeps unrelated columns,
    # e.g. an exported index column or previously derived features, from
    # shifting the even/odd column positions used below.
    timing_cols = [
        col for col in df.columns
        if str(col).startswith(('press-', 'release-'))
    ]
    timings = df.loc[:, timing_cols]

    # According to Figure 1 of the challenge file, HT and RPT are the
    # differences between consecutive columns. 'press-0' has no predecessor
    # (its diff is NaN), so it is dropped.
    ht_rpt = timings.diff(axis=1, periods=1).drop(columns=['press-0'])
    # Even positions (release-i minus press-i) are HT ...
    ht = ht_rpt.iloc[:, 0::2].abs()
    # ... and odd positions (press-(i+1) minus release-i) are RPT
    rpt = ht_rpt.iloc[:, 1::2].abs()

    # Differences two columns apart give PPT and RRT; the first two columns
    # have no counterpart two steps back and are dropped.
    ppt_rrt = timings.diff(axis=1, periods=2).drop(columns=['press-0', 'release-0'])
    # Even positions correspond to PPT and odd positions to RRT
    ppt = ppt_rrt.iloc[:, 0::2].abs()
    rrt = ppt_rrt.iloc[:, 1::2].abs()

    # Mean and STD of each timing family, collected in a NEW frame so the
    # caller's DataFrame is left untouched.
    features = pd.DataFrame(
        {
            'HT_mean': ht.mean(axis=1),
            'HT_std': ht.std(axis=1),
            'RPT_mean': rpt.mean(axis=1),
            'RPT_std': rpt.std(axis=1),
            'PPT_mean': ppt.mean(axis=1),
            'PPT_std': ppt.std(axis=1),
            'RRT_mean': rrt.mean(axis=1),
            'RRT_std': rrt.std(axis=1),
        },
        index=df.index,
    )

    # X is the numpy array of feature values
    X = features.values
    # y holds the corresponding user label of every row in X
    y = df['user'].values

    # User ids start from 1 instead of 0, which raised an error with xgboost
    # on some Python versions; subtracting 1 makes the classes zero-based.
    return X, y - 1

In [None]:
# Build the feature matrix and zero-based labels once from the loaded frame.
# Without this call X and y are undefined on a fresh kernel; the SVM and
# XGBoost cells below reuse the same X and y.
X, y = get_features(df)

# RandomForestClassifier
# with default parameters
rf_clf = RandomForestClassifier()
rf_clf.fit(X, y)

RandomForestClassifier()

In [None]:
# Support-vector classifier (scikit-learn SVC, default parameters),
# fitted on the same X / y as the other models
clf = svm.SVC()
clf.fit(X, y)

SVC()

In [None]:
# Gradient-boosted trees via XGBoost (default parameters),
# fitted on the same X / y as the other models
xgb_clf = XGBClassifier()
xgb_clf.fit(X, y)

XGBClassifier(objective='multi:softprob')

In [None]:
# Persist the three fitted classifiers to disk with joblib, one file each.
# The XGBoost dump stays last so its returned filename list is the cell output.
dump(value=clf, filename='svm.joblib')
dump(value=rf_clf, filename='rf_clf.joblib')
dump(value=xgb_clf, filename='xgb_clf.joblib')

['xgb_clf.joblib']