# 5. Hierarchical User Authentication System Demo

This notebook demonstrates a hierarchical approach to user authentication using behavioral biometrics. The hierarchical approach first clusters users into groups and then trains user-specific models within each cluster, which scales better than a single flat model as the number of users increases.

## 5.1. Import Libraries and Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 14})

# Import our hierarchical authenticator
import sys  
sys.path.insert(1, '../src/')
from user_hierarchical_auth import HierarchicalUserAuthenticator

import warnings
warnings.filterwarnings('ignore')

## 5.2. Load and Prepare Data

In [None]:
# Movement statistics: discard the 'Unnamed: 0' column, then replace missing values with 0.
mov_slow = (
    pd.read_csv('../data/processed/movement_slow_stat_cleaned.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna(0)
)
mov_fast = (
    pd.read_csv('../data/processed/movement_fast_stat_cleaned.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna(0)
)

# Traffic statistics: the first CSV column is used as the index; missing values become 0.
traffic_slow = pd.read_csv(
    '../data/processed/traffic_slow_stat_cleaned.csv', index_col=0
).fillna(0)
traffic_fast = pd.read_csv(
    '../data/processed/traffic_fast_stat_cleaned.csv', index_col=0
).fillna(0)

# Quick report: table sizes and the number of distinct user IDs per game speed.
print(f"Movement (Slow): {mov_slow.shape[0]} samples, {mov_slow.shape[1]} features")
print(f"Movement (Fast): {mov_fast.shape[0]} samples, {mov_fast.shape[1]} features")
print(f"Number of unique users in slow game: {mov_slow['ID'].nunique()}")
print(f"Number of unique users in fast game: {mov_fast['ID'].nunique()}")

## 5.3. Initialize the Hierarchical Authenticator System

In [None]:
# Create the authenticator. num_clusters=None presumably lets the class pick the
# number of clusters itself — TODO confirm against user_hierarchical_auth.
authenticator = HierarchicalUserAuthenticator(num_clusters=None)
# Hand over the four loaded tables; the call returns a pair that is unpacked
# into the prepared user data and the collection of user IDs.
user_data, all_user_ids = authenticator.prepare_data(mov_slow, mov_fast, traffic_slow, traffic_fast)
print(f"Prepared data for {len(all_user_ids)} unique users")

## 5.4. Cluster Users Based on Behavioral Patterns

In [None]:
# Cluster users once per game type using the 'kmeans' method.
# visualize=True presumably makes the authenticator plot the clustering — confirm
# in user_hierarchical_auth. The slow-game result is kept in `clusters`, the
# fast-game result in `fast_clusters`.
clusters = authenticator.cluster_users(game_type='slow', method='kmeans', visualize=True)
fast_clusters = authenticator.cluster_users(game_type='fast', method='kmeans', visualize=True)

## 5.6. Train Hierarchical Authentication Models

In [None]:
# Train the hierarchical models for each game type with a 20% test split and
# class balancing enabled. The returned metrics are kept for later inspection.
metrics_slow = authenticator.train_hierarchical_models(game_type='slow', test_size=0.2, balance_classes=True)
metrics_fast = authenticator.train_hierarchical_models(game_type='fast', test_size=0.2, balance_classes=True)

performance = authenticator.performance_metrics

# numeric_only=True is required here: the metrics table carries non-numeric
# columns (at least the string 'Game_Type'), and since pandas 2.0
# GroupBy.mean() raises TypeError on them instead of silently dropping them.
print("\nPerformance Metrics Summary by Game Type:")
display(performance.groupby(['Game_Type']).mean(numeric_only=True))

print("\nPerformance Metrics Summary by Cluster (Slow Game):")
display(
    performance[performance['Game_Type'] == 'Slow']
    .groupby(['Cluster_ID'])
    .mean(numeric_only=True)
)


## 5.7. Authentication System Evaluation

In [None]:
# Run the authentication test for both game types (5 samples requested each).
slow_results = authenticator.test_hierarchical_authentication(game_type='slow', num_samples=5)
fast_results = authenticator.test_hierarchical_authentication(game_type='fast', num_samples=5)

# Break the outcomes down by result type, one report per game type.
for report_header, outcome_frame in (
    ("\nSlow Game Authentication Results:", slow_results),
    ("\nFast Game Authentication Results:", fast_results),
):
    print(report_header)
    print(outcome_frame['Result_Type'].value_counts())

In [None]:
# Plot result distribution by user cluster.
#
# One panel per outcome type. For each game type we count how many distinct
# claimed users with that outcome fall into each cluster (users without a
# cluster assignment are counted under -1) and draw the slow/fast counts as
# side-by-side bars so neither series hides the other.
#
# Local names are chosen so this cell does not overwrite the `clusters`
# result produced by the clustering section.
result_types = ['True Positive', 'True Negative', 'False Positive', 'False Negative']
game_results = [('Slow', slow_results), ('Fast', fast_results)]
bar_width = 0.4

fig, axes = plt.subplots(2, 2, figsize=(14, 7))
for ax, result_type in zip(axes.flat, result_types):
    plotted_cluster_ids = set()

    for position, (game_type, results) in enumerate(game_results):
        # Centre the group of bars on the cluster ID.
        offset = (position - (len(game_results) - 1) / 2) * bar_width

        # Get claimed user IDs for this result type
        user_ids = results[results['Result_Type'] == result_type]['Claimed_User_ID'].unique()

        # Count users in each cluster
        user_to_cluster = authenticator.user_to_cluster[game_type.lower()]
        cluster_counts = {}
        for uid in user_ids:
            cluster_id = user_to_cluster.get(uid, -1)
            cluster_counts[cluster_id] = cluster_counts.get(cluster_id, 0) + 1

        if cluster_counts:
            # NOTE(review): assumes cluster IDs are numeric (the -1 fallback
            # suggests so) — confirm against user_hierarchical_auth.
            cluster_ids = sorted(cluster_counts)
            ax.bar(
                [cid + offset for cid in cluster_ids],
                [cluster_counts[cid] for cid in cluster_ids],
                width=bar_width,
                alpha=0.7,
                label=game_type,
            )
            plotted_cluster_ids.update(cluster_ids)

    ax.set_title(f'{result_type} by Cluster')
    ax.set_xlabel('Cluster ID')
    ax.set_ylabel('Number of Users')
    if plotted_cluster_ids:
        # Keep ticks on the actual cluster IDs rather than on bar offsets.
        ax.set_xticks(sorted(plotted_cluster_ids))
    # Only draw a legend when something was plotted; an empty legend just
    # triggers a "no artists with labels" warning.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()

fig.tight_layout()
plt.show()

## 5.8. Visualize Authentication Performance

In [None]:
# Delegate plotting to the authenticator; presumably this charts the metrics
# collected during training — see user_hierarchical_auth for what is drawn.
authenticator.visualize_performance()

## 5.9. Compare to Flat (Non-Hierarchical) Model

In [None]:
# Compare hierarchical vs. flat models on the slow game, then average the
# headline metrics per model type.
comparison = authenticator.compare_to_flat_model(game_type='slow')
avg_comparison = (
    comparison
    .groupby('Model_Type')[['Accuracy', 'F1', 'FAR', 'FRR']]
    .mean()
)
display(avg_comparison)

In [None]:
# Performance comparison by number of users in cluster
#
# Take an explicit copy of the slow-game rows before adding a column:
# assigning into a boolean-filtered slice is chained assignment, which raises
# SettingWithCopyWarning and may not behave as intended.
slow_metrics = authenticator.performance_metrics[
    authenticator.performance_metrics['Game_Type'] == 'Slow'
].copy()

# Map each cluster ID to the 'size' recorded for it by the authenticator.
cluster_sizes = {cid: info['size'] for cid, info in authenticator.clusters['slow'].items()}
slow_metrics['Cluster_Size'] = slow_metrics['Cluster_ID'].map(cluster_sizes)

fig, (ax_f1, ax_far) = plt.subplots(1, 2, figsize=(14, 7))

sns.scatterplot(x='Cluster_Size', y='F1', data=slow_metrics, alpha=0.7, ax=ax_f1)
ax_f1.set_title('F1 Score vs Cluster Size')
ax_f1.grid(True, alpha=0.3)

sns.scatterplot(x='Cluster_Size', y='FAR', data=slow_metrics, alpha=0.7, ax=ax_far)
ax_far.set_title('False Acceptance Rate vs Cluster Size')
ax_far.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 5.10. Evaluate Scalability

In [None]:
# Run the authenticator's scalability evaluation with max_users=100 and step=20
# (presumably user counts 20, 40, ... up to 100 — confirm in user_hierarchical_auth).
scalability = authenticator.evaluate_scalability(max_users=100, step=20)
print("\nScalability Results:")
# Show only the user/cluster counts and the two speedup columns.
display(scalability[['Num_Users', 'Num_Clusters', 'Train_Speedup', 'Auth_Speedup']])

## 5.11. Security Threshold Analysis

In [None]:
# Examine how authentication threshold affects security
#
# For every confidence threshold an attempt is accepted as genuine when its
# 'Confidence' is >= the threshold; the confusion-matrix counts and derived
# rates are then computed per game type. Rows are collected in a list and the
# DataFrame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic anyway.
thresholds = np.arange(0, 1.01, 0.05)
threshold_columns = ['Threshold', 'TPR', 'FPR', 'TNR', 'FNR', 'Accuracy', 'F1', 'Game_Type']


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


threshold_rows = []
for game_type, results in [('Slow', slow_results), ('Fast', fast_results)]:
    # Ground truth does not depend on the threshold, so compute it once.
    # Explicit comparisons (rather than ~mask) keep rows with a missing label
    # out of all four confusion-matrix cells.
    is_genuine = results['Is_Genuine_Attempt'] == True    # noqa: E712
    is_impostor = results['Is_Genuine_Attempt'] == False  # noqa: E712
    total = len(results)

    for threshold in thresholds:
        # Apply threshold
        predicted_genuine = results['Confidence'] >= threshold
        predicted_impostor = ~predicted_genuine

        # Calculate metrics
        tp = int((is_genuine & predicted_genuine).sum())
        tn = int((is_impostor & predicted_impostor).sum())
        fp = int((is_impostor & predicted_genuine).sum())
        fn = int((is_genuine & predicted_impostor).sum())

        tpr = _safe_ratio(tp, tp + fn)
        fpr = _safe_ratio(fp, fp + tn)
        tnr = _safe_ratio(tn, tn + fp)
        fnr = _safe_ratio(fn, fn + tp)
        # Guarded so an empty results frame yields 0 instead of ZeroDivisionError.
        accuracy = _safe_ratio(tp + tn, total)

        precision = _safe_ratio(tp, tp + fp)
        recall = tpr
        f1 = _safe_ratio(2 * precision * recall, precision + recall)

        threshold_rows.append({
            'Threshold': threshold,
            'TPR': tpr,
            'FPR': fpr,
            'TNR': tnr,
            'FNR': fnr,
            'Accuracy': accuracy,
            'F1': f1,
            'Game_Type': game_type,
        })

threshold_results = pd.DataFrame(threshold_rows, columns=threshold_columns)

# Plot threshold analysis: a 2x2 grid with accuracy, F1, the ROC curve and the
# TPR/FPR trade-off, each drawn once per game type.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_accuracy, ax_f1), (ax_roc, ax_rates) = axes

# Top row: a single metric plotted against the confidence threshold.
for ax, metric, y_label, panel_title in [
    (ax_accuracy, 'Accuracy', 'Accuracy', 'Accuracy vs Threshold'),
    (ax_f1, 'F1', 'F1 Score', 'F1 Score vs Threshold'),
]:
    for game_type in ['Slow', 'Fast']:
        game_data = threshold_results[threshold_results['Game_Type'] == game_type]
        ax.plot(game_data['Threshold'], game_data[metric], marker='o', label=f'{game_type} Game')
    ax.set_xlabel('Confidence Threshold')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

# Bottom left: ROC curve, with the chance diagonal for reference.
for game_type in ['Slow', 'Fast']:
    game_data = threshold_results[threshold_results['Game_Type'] == game_type]
    ax_roc.plot(game_data['FPR'], game_data['TPR'], marker='o', label=f'{game_type} Game')
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

# Bottom right: TPR and FPR side by side as the threshold moves.
for game_type in ['Slow', 'Fast']:
    game_data = threshold_results[threshold_results['Game_Type'] == game_type]
    ax_rates.plot(game_data['Threshold'], game_data['TPR'], marker='o', label=f'{game_type} TPR')
    ax_rates.plot(game_data['Threshold'], game_data['FPR'], marker='s', label=f'{game_type} FPR')
ax_rates.set_xlabel('Confidence Threshold')
ax_rates.set_ylabel('Rate')
ax_rates.set_title('TPR and FPR vs Threshold')
ax_rates.legend()
ax_rates.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()