In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from hmmlearn import hmm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Step 1: Read the raw workbook into the working DataFrame `df`.
# The path is relative, so the file must sit in the kernel's working directory.
df = pd.read_excel(io="Data (7).xlsx")

In [None]:
# Step 2: Preprocessing - replace each categorical column with integer codes.
# One LabelEncoder per column is kept in `label_encoders` so the codes can be
# mapped back to the original values later with `inverse_transform`.
categorical_columns = ['DeviceId']
label_encoders = {col: LabelEncoder() for col in categorical_columns}

for col, le in label_encoders.items():
    # Fit on the column and overwrite it with the encoded values in one pass.
    df[col] = le.fit_transform(df[col])

In [None]:
# Step 3: Feature engineering - derive three metrics from the raw timing columns.
# NOTE(review): the rolling windows below run over the whole frame in row order;
# they are not grouped by DeviceId - confirm that this is intended.

# Reciprocal of the flight time; the 1e-6 offset prevents division by zero.
df['Typing_Speed'] = (df['FlightTime'] + 1e-6).rdiv(1)
# 5-row moving average of the key hold time (first 4 rows are NaN).
df['Fatigue_Index'] = df['KeyHoldTime'].rolling(5).agg('mean')
# 5-row moving standard deviation of the flight time (first 4 rows are NaN).
df['Cognitive_Load'] = df['FlightTime'].rolling(5).agg('std')

In [None]:
# Preview the first five rows, including the NaNs left by the rolling windows.
df.head(n=5)

In [None]:
# Fill the NaN values left at the start of the rolling-window columns by
# propagating the next valid observation backwards.
# `DataFrame.fillna(method='bfill')` is deprecated (pandas >= 2.1) and rejected
# by newer releases; `DataFrame.bfill` is the supported equivalent.
# Note: this back-fills NaNs in *every* column of `df`, not only the rolling
# features, and it cannot fill NaNs that have no valid value after them.
df.bfill(inplace=True)

In [None]:
# Preview the first five rows again to check that the leading NaNs were filled.
df.head(n=5)

In [None]:
# Step 4: Train one Gaussian Hidden Markov Model per engineered feature and
# store the decoded hidden state of every row in a new `<feature>_HMM` column.
n_states = 3  # Defining three states per hidden variable
hidden_vars = ['Typing_Speed', 'Cognitive_Load', 'Fatigue_Index']

for var in hidden_vars:
    observations = df[[var]]  # 2-D (n_rows, 1) input, as hmmlearn expects
    model = hmm.GaussianHMM(n_components=n_states, covariance_type="diag", n_iter=1000, random_state=42)
    model.fit(observations)  # Fit HMM model
    raw_states = model.predict(observations)  # Most likely state sequence

    # The state indices produced by the HMM are arbitrary: state 0 is not
    # necessarily the one with the smallest mean. Relabel them by ascending
    # emission mean so that 0 < 1 < 2 is a genuine low/medium/high ordering,
    # which is what the interpretation table in Step 7 relies on.
    order = np.argsort(model.means_[:, 0], kind="stable")
    relabel = np.empty(n_states, dtype=int)
    relabel[order] = np.arange(n_states)

    # NOTE: `model` itself still uses the raw (unsorted) state indices, so its
    # `means_` / `transmat_` rows do not line up with the relabelled column.
    df[var + '_HMM'] = relabel[raw_states]

In [None]:
# Step 5: Standardise the engineered features with a StandardScaler.
# This runs after the HMM step, so the HMMs above were trained on the unscaled
# values; only the columns stored in `df` are rescaled here.
numerical_columns = ['Typing_Speed', 'Fatigue_Index', 'Cognitive_Load']
scaler = StandardScaler()
scaler.fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

In [None]:
# Step 6: Write the processed frame to CSV in the working directory,
# leaving out the DataFrame index column.
df.to_csv(path_or_buf="processed_dataset.csv", index=False)

In [None]:
# Step 7: Key table translating each `*_HMM` state code (0, 1, 2) into a label.
# NOTE(review): the labels are only meaningful if the state codes written in
# Step 4 are ordered by increasing feature value - verify before relying on them.
state_interpretation = {
    column: dict(enumerate(labels))
    for column, labels in (
        ("Typing_Speed_HMM", ("Slow", "Moderate", "Fast")),
        ("Cognitive_Load_HMM", ("Low", "Medium", "High")),
        ("Fatigue_Index_HMM", ("Fresh", "Slightly Fatigued", "Highly Fatigued")),
    )
}

In [None]:
# Show the first five rows of the final frame (scaled features plus HMM states).
df.head(n=5)