In [171]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_curve
from scipy.spatial.distance import cityblock, mahalanobis, euclidean

In [100]:
data = pd.read_csv("DSL-StrongPasswordData.csv")

In [101]:
# 51 total
subjects = data["subject"].unique()

In [102]:
def evaluateEER(user_scores, imposter_scores):
    labels = [0]*len(user_scores) + [1]*len(imposter_scores)
    fpr, tpr, thresholds = roc_curve(labels, user_scores + imposter_scores)
    missrates = 1 - tpr
    farates = fpr
    dists = missrates - farates
    idx1 = np.argmin(dists[dists >= 0])
    idx2 = np.argmax(dists[dists < 0])
    x = [missrates[idx1], farates[idx1]]
    y = [missrates[idx2], farates[idx2]]
    a = ( x[0] - x[1] ) / ( y[1] - x[1] - y[0] + x[0] )
    eer = x[0] + a * ( y[0] - x[0] )
    return eer

In [106]:
class EuclideanDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            cur_score = np.linalg.norm(self.test_genuine.iloc[i].values - self.mean_vector)
            self.user_scores.append(cur_score)
            
        for i in range(self.test_imposter.shape[0]):
            cur_score = np.linalg.norm(self.test_imposter.iloc[i].values - self.mean_vector)
            self.imposter_scores.append(cur_score)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [110]:
class EuclideanNormedDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            cur_score = np.linalg.norm(self.test_genuine.iloc[i].values - self.mean_vector)**2
            cur_score = cur_score / np.linalg.norm(self.test_genuine.iloc[i].values)
            cur_score = cur_score / np.linalg.norm(self.mean_vector)
            self.user_scores.append(cur_score)
            
        for i in range(self.test_imposter.shape[0]):
            cur_score = np.linalg.norm(self.test_imposter.iloc[i].values - self.mean_vector)**2
            cur_score = cur_score / np.linalg.norm(self.test_imposter.iloc[i].values)
            cur_score = cur_score / np.linalg.norm(self.mean_vector)
            self.imposter_scores.append(cur_score)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [114]:
class ManhattanDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            cur_score = cityblock(self.test_genuine.iloc[i].values, self.mean_vector)
            self.user_scores.append(cur_score)
            
        for i in range(self.test_imposter.shape[0]):
            cur_score = cityblock(self.test_imposter.iloc[i].values, self.mean_vector)
            self.imposter_scores.append(cur_score)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [215]:
class ManhattanFilteredDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        self.std_vector = self.train.std().values
        dropping_indices = []
        for i in range(train.shape[0]):
            cur_score = euclidean(self.train.iloc[i].values, self.mean_vector)
            if (cur_score > 3*self.std_vector).all() == True:
                dropping_indices.append(i)
        self.train = self.train.drop(self.train.index[dropping_indices])
        self.mean_vector = self.train.mean().values
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            cur_score = cityblock(self.test_genuine.iloc[i].values, self.mean_vector)
            self.user_scores.append(cur_score)
            
        for i in range(self.test_imposter.shape[0]):
            cur_score = cityblock(self.test_imposter.iloc[i].values, self.mean_vector)
            self.imposter_scores.append(cur_score)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [216]:
ManhattanFilteredDetector(subjects).evaluate()

(0.12535310570229607, 0.081299613989821939)

In [219]:
class ManhattanScaledDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        self.mad_vector = self.train.mad().values
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            cur_score = 0
            for j in range(len(self.mean_vector)):
                cur_score = cur_score + \
                            abs(self.test_genuine.iloc[i].values[j] - \
                                self.mean_vector[j]) / self.mad_vector[j]
            self.user_scores.append(cur_score)
            
        for i in range(self.test_imposter.shape[0]):
            cur_score = 0
            for j in range(len(self.mean_vector)):
                cur_score = cur_score + \
                            abs(self.test_imposter.iloc[i].values[j] - \
                                self.mean_vector[j]) / self.mad_vector[j]
            self.imposter_scores.append(cur_score)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [220]:
ManhattanScaledDetector(subjects).evaluate()

(0.094544239072754985, 0.068375435533740883)

In [136]:
class MahalanobisDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        self.covinv = np.linalg.inv(np.cov(train.T))        
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            diff = self.test_genuine.iloc[i].values - self.mean_vector
            cur_score = np.dot(np.dot(diff.T, self.covinv), diff)
            self.user_scores.append(cur_score)
            
        for i in range(self.test_imposter.shape[0]):
            diff = self.test_imposter.iloc[i].values - self.mean_vector
            cur_score = np.dot(np.dot(diff.T, self.covinv), diff)
            self.imposter_scores.append(cur_score)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [142]:
class MahalanobisNormedDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        self.covinv = np.linalg.inv(np.cov(train.T))        
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            diff = self.test_genuine.iloc[i].values - self.mean_vector
            cur_score = np.dot(np.dot(diff.T, self.covinv), diff)
            self.user_scores.append(cur_score)
            
        for i in range(self.test_imposter.shape[0]):
            diff = self.test_imposter.iloc[i].values - self.mean_vector
            cur_score = np.dot(np.dot(diff.T, self.covinv), diff)
            self.imposter_scores.append(cur_score)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [144]:
class NearestNeighbourMahalanobisDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.covinv = np.linalg.inv(np.cov(train.T))        
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            cur_scores = []
            for j in range(self.train.shape[0]):
                diff = self.test_genuine.iloc[i].values - self.train.iloc[j]
                cur_scores.append(np.dot(np.dot(diff.T, self.covinv), diff))
            self.user_scores.append(min(cur_scores))
            
        for i in range(self.test_imposter.shape[0]):
            cur_scores = []
            for j in range(self.train.shape[0]):
                diff = self.test_imposter.iloc[i].values - self.train.iloc[j]
                cur_scores.append(np.dot(np.dot(diff.T, self.covinv), diff))
            self.imposter_scores.append(min(cur_scores))
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [107]:
EuclideanDetector(subjects).evaluate()

(0.16929243070881717, 0.093142933315003756)

In [111]:
EuclideanNormedDetector(subjects).evaluate()

(0.21070527396302194, 0.11740641138703214)

In [115]:
ManhattanDetector(subjects).evaluate()

(0.15027387512717494, 0.090943572466230152)

In [137]:
MahalanobisDetector(subjects).evaluate()

(0.13370026430249218, 0.066780891150264782)

In [143]:
MahalanobisNormedDetector(subjects).evaluate()

(0.13370026430249218, 0.066780891150264782)

In [145]:
NearestNeighbourMahalanobisDetector(subjects).evaluate()

(0.10750418847396828, 0.062134969409931178)

In [235]:
class OutlierCountingDetector:
    
    def __init__(self, subjects):
        self.train = train
        self.test_genuine = test_genuine
        self.test_imposter = test_imposter
        self.user_scores = []
        self.imposter_scores = []
        self.mean_vector = []
        self.subjects = subjects
        
    def training(self):
        self.mean_vector = self.train.mean().values
        self.std_vector = self.train.std().values
        
    def testing(self):
        for i in range(self.test_genuine.shape[0]):
            cnt = 0.0
            for j in range(len(self.mean_vector)):
                cur_score = abs(self.test_genuine.iloc[i].values[j] - \
                                self.mean_vector[j]) // self.std_vector[j]
                if cur_score > 2.96:
                    cnt = cnt + 1.0
            self.user_scores.append(cnt)
            
        for i in range(self.test_imposter.shape[0]):
            cnt = 0.0
            for j in range(len(self.mean_vector)):
                cur_score = abs(self.test_imposter.iloc[i].values[j] - \
                                self.mean_vector[j]) // self.std_vector[j]
                if cur_score > 2.96:
                    cnt = cnt + 1.0     
            self.imposter_scores.append(cnt)
    
    def evaluate(self):
        eers = []
        
        for subject in subjects:
            
            self.user_scores = []
            self.imposter_scores = []
    
            # Consider current subject as genuine and rest as imposters
            genuine_user_data = data.loc[data.subject == subject, "H.period":"H.Return"]
            imposter_data = data.loc[data.subject != subject, :]
    
            # genuine user's first 200 time vectors for training
            self.train = genuine_user_data[:200]
    
            # True set (200 records)
            self.test_genuine = genuine_user_data[200:]
    
            # False set (250 records, 5 per imposter, 50 imposters in all)
            self.test_imposter = imposter_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
            
            self.training()
            
            self.testing()
    
            eers.append(evaluateEER(self.user_scores, self.imposter_scores))
        
        return np.mean(eers), np.std(eers)        

In [236]:
OutlierCountingDetector(subjects).evaluate()

(0.10316705067663712, 0.076910682379416034)