In [1]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from itertools import groupby

In [3]:
# Step 1: Read the keystroke workbook into the working DataFrame
df = pd.read_excel(io="Data (8).xlsx")

In [4]:
# Map every distinct KeyPressed value to an integer code, stored as KeyEncoded
label_encoder = LabelEncoder()
label_encoder.fit(df['KeyPressed'])
df['KeyEncoded'] = label_encoder.transform(df['KeyPressed'])

In [5]:
# Select timing features.
# The raw timing columns are kept as they are (apart from NaN -> 0) and the
# standardized values go into separate "*_Scaled" columns. Later cells compute
# 1 / FlightTime, (std - mean) / (std + mean) and "FlightTime > 2 * mean" on the
# FlightTime column; those formulas need real, non-negative intervals, not
# z-scores (which are centred on 0 and roughly half negative).
raw_features = ['FlightTime', 'KeyHoldTime']
df[raw_features] = df[raw_features].fillna(0)  # Handle missing values

# `features` is the list of columns fed to the HMM, so it names the scaled copies.
features = [f'{col}_Scaled' for col in raw_features]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[raw_features])
for position, scaled_col in enumerate(features):
    # Column-by-column assignment creates the new columns on any pandas version.
    df[scaled_col] = scaled_values[:, position]

In [6]:


# Function to train HMM per user and extract features
def train_hmm_per_user(user_df, n_components=4):
    """Fit a Gaussian HMM to one user's keystrokes and attach HMM-derived columns.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Rows belonging to a single user/device. Must contain ``KeyEncoded``
        and every column named in the module-level ``features`` list.
    n_components : int, default 4
        Number of hidden states of the HMM. The same value sizes the
        transition table below, so the two can no longer drift apart.
        Other cells that hard-code four states (e.g. ``minlength=4`` in the
        entropy feature) are not adjusted by this parameter.

    Returns
    -------
    pandas.DataFrame
        A copy of ``user_df`` with four extra columns:
        ``HMM_LogLikelihood_PerUser`` (scalar, repeated on every row),
        ``HMM_HiddenState`` (decoded state per row),
        ``HMM_HiddenState_TransitionCount`` (scalar, repeated) and
        ``HMM_HiddenState_Duration`` (scalar, repeated).
    """
    user_df = user_df.copy()  # Ensure a new copy is modified
    X = user_df[['KeyEncoded'] + features].values
    lengths = [len(X)]  # the whole group is treated as one observation sequence

    hmm_model = hmm.GaussianHMM(
        n_components=n_components,
        covariance_type="diag",
        n_iter=1000,
        random_state=42,
    )
    hmm_model.fit(X, lengths)

    log_likelihood = hmm_model.score(X)
    hidden_states = hmm_model.predict(X)

    user_df['HMM_LogLikelihood_PerUser'] = log_likelihood
    user_df['HMM_HiddenState'] = hidden_states

    # Encode each consecutive (from_state, to_state) pair as a single integer
    # from_state * n_components + to_state and tally how often each pair occurs.
    pair_codes = hidden_states[:-1] * n_components + hidden_states[1:]
    transitions = np.bincount(pair_codes, minlength=n_components * n_components)
    # NOTE(review): this is the number of DISTINCT pairs that occur at least
    # once, self-transitions (i -> i) included, not the number of state
    # changes. Confirm that this is the intended meaning of "TransitionCount".
    user_df['HMM_HiddenState_TransitionCount'] = np.count_nonzero(transitions)

    # Mean length (in keystrokes) of the runs of identical consecutive states.
    state_durations = [sum(1 for _ in run) for _, run in groupby(hidden_states)]
    user_df['HMM_HiddenState_Duration'] = np.mean(state_durations) if state_durations else 0

    return user_df

In [7]:
# Fit one HMM per DeviceId, then stack the per-device frames under a fresh 0..n-1 index
dfs = [train_hmm_per_user(device_rows) for _, device_rows in df.groupby('DeviceId')]

df = pd.concat(dfs, ignore_index=True)

Model is not converging.  Current: 36697.19827242926 is not greater than 36697.21372707309. Delta is -0.015454643827979453
Model is not converging.  Current: 450737.19373090955 is not greater than 450743.245630542. Delta is -6.051899632439017
Model is not converging.  Current: 524987.4245966473 is not greater than 525063.6521197304. Delta is -76.2275230831001
Model is not converging.  Current: 174272.10493719982 is not greater than 174299.68980059604. Delta is -27.58486339621595
Model is not converging.  Current: 208.99705461648426 is not greater than 208.99707313683518. Delta is -1.85203509204257e-05
Model is not converging.  Current: 58109.90030096987 is not greater than 58114.19282994525. Delta is -4.2925289753766265
Model is not converging.  Current: 440.0153939696364 is not greater than 440.0388097038833. Delta is -0.023415734246896136
Model is not converging.  Current: 150949.4032550576 is not greater than 150949.43715782193. Delta is -0.033902764320373535
Model is not converging

In [8]:
# Additional derived features
def _state_entropy(states, n_states=4):
    """Shannon entropy (in bits) of how often each hidden state occurs in `states`.

    States that never occur are dropped before taking the logarithm: their
    probability is 0 and NumPy evaluates 0 * log2(0) as NaN, which would make
    the entropy NaN for every user that does not visit all `n_states` states.
    """
    probs = np.bincount(states, minlength=n_states) / len(states)
    probs = probs[probs > 0]
    return -np.sum(probs * np.log2(probs)) + 0.0  # "+ 0.0" turns -0.0 into 0.0


# NOTE(review): the reciprocal, Burstiness and the 2 * mean threshold below are
# only meaningful when FlightTime holds raw, non-negative intervals - confirm
# the column has not been standardized in place before this cell runs.
df['TypingSpeed'] = 1 / (df['FlightTime'].replace(0, np.nan))  # zero interval -> NaN instead of inf

# NOTE(review): HMM_HiddenState_TransitionCount is assigned as one scalar per
# device, so count / sum(count) reduces to 1 / (number of rows for the device).
# Confirm the intended definition of this feature.
df['KeySequenceChangeRate'] = df['HMM_HiddenState_TransitionCount'] / df.groupby('DeviceId')['HMM_HiddenState_TransitionCount'].transform('sum')

df['InterKeyInterval_Mean'] = df.groupby('DeviceId')['FlightTime'].transform('mean')
df['InterKeyInterval_Std'] = df.groupby('DeviceId')['FlightTime'].transform('std')
df['Burstiness'] = df.groupby('DeviceId')['FlightTime'].transform(lambda x: (x.std() - x.mean()) / (x.std() + x.mean()))

# Per-device entropy of the hidden-state occupancy distribution (same value on every row of a device)
df['TransitionEntropy'] = df.groupby('DeviceId')['HMM_HiddenState'].transform(_state_entropy)
df['DominantHiddenState'] = df.groupby('DeviceId')['HMM_HiddenState'].transform(lambda x: np.bincount(x).argmax())

# Running sum of the FlightTime values of all earlier rows of the same device (0 on the first row)
df['TimeSinceLastKeyPress'] = df.groupby('DeviceId')['FlightTime'].transform(lambda x: x.shift(1).fillna(0).cumsum())
# A new session id starts at every row whose FlightTime exceeds twice the device mean;
# the feature is the mean number of rows per session id.
df['AvgSessionLength'] = df.groupby('DeviceId')['FlightTime'].transform(lambda x: (x > 2 * x.mean()).cumsum().value_counts().mean())


In [9]:
# Write the feature-enriched frame to CSV (row index omitted) and confirm on stdout
df.to_csv(path_or_buf="enhanced_dataset.csv", index=False)
print("Feature extraction complete. Saved to enhanced_dataset.csv")

Feature extraction complete. Saved to enhanced_dataset.csv


# Data Dictionary

## Original Columns

| Column Name      | Description |
|-----------------|-------------|
| **DeviceId** | Unique identifier for each user/device. |
| **KeyPressed** | The key that was pressed by the user. |
| **FlightTime** | Time taken between releasing one key and pressing the next (milliseconds). |
| **KeyHoldTime** | Duration for which a key is held down before release (milliseconds). |

## Derived Columns

| Column Name | Description |
|-------------|-------------|
| **KeyEncoded** | Encoded numerical representation of `KeyPressed` using Label Encoding. |
| **HMM_LogLikelihood_PerUser** | Log-likelihood of the trained HMM model per user, representing how well the model fits the user’s typing pattern. |
| **HMM_HiddenState** | Hidden state assigned to each keystroke based on the trained HMM model. |
| **HMM_HiddenState_TransitionCount** | Number of distinct hidden-state transition pairs (from-state → to-state, self-transitions included) observed for the user, out of 16 possible; indicates typing variability. |
| **HMM_HiddenState_Duration** | Average duration (number of keystrokes) spent in a given hidden state before transitioning. |
| **TypingSpeed** | Estimated typing speed, computed as `1 / FlightTime`. (Higher values indicate faster typing.) |
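| **KeySequenceChangeRate** | `HMM_HiddenState_TransitionCount` divided by the sum of that column over all of the user's rows, intended to capture consistency in typing patterns. Because the count is constant per user, this currently equals `1 / (number of keystrokes for the user)`. |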
| **InterKeyInterval_Mean** | Mean time interval between consecutive keystrokes per user. |
| **InterKeyInterval_Std** | Standard deviation of inter-key intervals, indicating typing rhythm variability. |
| **Burstiness** | A measure of irregularity in typing speed, computed using `(std - mean) / (std + mean)`. |
| **TransitionEntropy** | Shannon entropy (in bits) of the distribution of hidden states over a user's keystrokes, measuring typing pattern complexity. |
| **DominantHiddenState** | Most frequently occurring hidden state for each user, indicating the most common typing pattern. |
| **TimeSinceLastKeyPress** | Running total of the `FlightTime` values of all earlier keystrokes of the same user (0 for the user's first keystroke). |
| **AvgSessionLength** | Estimated session length per user, measured as the average number of keystrokes per session; a new session starts at every keystroke whose `FlightTime` exceeds twice the user's mean `FlightTime`. |

