In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from hmmlearn import hmm
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Step 1: Load the dataset
df = pd.read_excel("Data (7).xlsx")

In [4]:
# Step 2: Preprocessing - Convert categorical variables to numerical
label_encoders = {}
categorical_columns = ['DeviceId']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store for inverse transformation later

In [5]:
# Step 3: Feature Engineering - Creating new inferred metrics
df['Typing_Speed'] = 1 / (df['FlightTime'] + 1e-6)  # Avoid division by zero
df['Fatigue_Index'] = df['KeyHoldTime'].rolling(window=5).mean()
df['Cognitive_Load'] = df['FlightTime'].rolling(window=5).std()

In [8]:
df.head()

Unnamed: 0,Id,StartTime,KeyPressed,FullWord,EndTime,FlightTime,KeyHoldTime,DeviceId,CreatedAt,ModifiedAt,Typing_Speed,Fatigue_Index,Cognitive_Load
0,1,-1,W,W,1737542497876,0,0,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,1000000.0,,
1,2,1737542497876,a,Wa,1737542497903,27,13,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.037037,,
2,3,1737542497903,n,Wan,1737542497911,8,4,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.125,,
3,4,1737542497911,j,Wanj,1737542497918,7,2,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.142857,,
4,5,1737542497918,i,Wanji,1737542497926,8,1,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.125,4.0,10.074721


In [11]:
# Fill NaN values caused by rolling operations
df.fillna(method='bfill', inplace=True)  # Backward fill to handle missing values

  df.fillna(method='bfill', inplace=True)  # Backward fill to handle missing values


In [12]:
df.head()

Unnamed: 0,Id,StartTime,KeyPressed,FullWord,EndTime,FlightTime,KeyHoldTime,DeviceId,CreatedAt,ModifiedAt,Typing_Speed,Fatigue_Index,Cognitive_Load,Typing_Speed_HMM
0,1,-1,W,W,1737542497876,0,0,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,1000000.0,4.0,10.074721,1
1,2,1737542497876,a,Wa,1737542497903,27,13,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.037037,4.0,10.074721,0
2,3,1737542497903,n,Wan,1737542497911,8,4,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.125,4.0,10.074721,0
3,4,1737542497911,j,Wanj,1737542497918,7,2,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.142857,4.0,10.074721,0
4,5,1737542497918,i,Wanji,1737542497926,8,1,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,0.125,4.0,10.074721,0


In [13]:
# Step 4: Train Hidden Markov Model for hidden variable classification
n_states = 3  # Defining three states per hidden variable
hidden_vars = ['Typing_Speed', 'Cognitive_Load', 'Fatigue_Index']

for var in hidden_vars:
    model = hmm.GaussianHMM(n_components=n_states, covariance_type="diag", n_iter=1000, random_state=42)
    model.fit(df[[var]])  # Fit HMM model
    df[var + '_HMM'] = model.predict(df[[var]])  # Predict hidden states

In [14]:
# Step 5: Scale numerical columns
scaler = StandardScaler()
numerical_columns = ['Typing_Speed', 'Fatigue_Index', 'Cognitive_Load']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [15]:
# Step 6: Save the cleaned dataset
df.to_csv("processed_dataset.csv", index=False)

In [16]:
# Step 7: Generate Key Table for Hidden States Interpretation
state_interpretation = {
    "Typing_Speed_HMM": {0: "Slow", 1: "Moderate", 2: "Fast"},
    "Cognitive_Load_HMM": {0: "Low", 1: "Medium", 2: "High"},
    "Fatigue_Index_HMM": {0: "Fresh", 1: "Slightly Fatigued", 2: "Highly Fatigued"}
}

In [17]:
# Display sample of the dataset
df.head()

Unnamed: 0,Id,StartTime,KeyPressed,FullWord,EndTime,FlightTime,KeyHoldTime,DeviceId,CreatedAt,ModifiedAt,Typing_Speed,Fatigue_Index,Cognitive_Load,Typing_Speed_HMM,Cognitive_Load_HMM,Fatigue_Index_HMM
0,1,-1,W,W,1737542497876,0,0,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,62.689712,-0.084682,-0.076818,1,0,0
1,2,1737542497876,a,Wa,1737542497903,27,13,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,-0.015954,-0.084682,-0.076818,0,0,0
2,3,1737542497903,n,Wan,1737542497911,8,4,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,-0.015949,-0.084682,-0.076818,0,0,0
3,4,1737542497911,j,Wanj,1737542497918,7,2,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,-0.015947,-0.084682,-0.076818,0,0,0
4,5,1737542497918,i,Wanji,1737542497926,8,1,0,2025-01-22 10:41:39.287,2025-01-22 10:41:39.287,-0.015949,-0.084682,-0.076818,0,0,0
