# Hidden Markov Model for CSI dataset

This notebook uses HMM to augment the CSI dataset. It takes the evidence_sequence for training and generates new participants based on experience and the evidence_sequence.


In [None]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder

csv_path = "../data/csv/expanded_dataset.csv"

# Load your data
df = pd.read_csv(csv_path)

# Parse evidence_sequence column into lists of strings
df['evidence_sequence'] = df['evidence_sequence'].apply(lambda x: list(map(str.strip, str(x).split(','))))

# Encode all tokens to integers
all_tokens = [item for sublist in df['evidence_sequence'] for item in sublist]
le = LabelEncoder()
le.fit(all_tokens)

# Map each sequence to integer-encoded version
df['encoded_sequence'] = df['evidence_sequence'].apply(le.transform)

# Prepare result list to collect synthetic data
synthetic_rows = []

# Loop over experience groups
for exp_group, group_df in df.groupby('experience'):
    sequences = list(group_df['encoded_sequence'])
    lengths = [len(seq) for seq in sequences]

    # Concatenate sequences into single array for HMM training
    X = np.concatenate(sequences).reshape(-1, 1)

    # Train HMM for this group
    model = hmm.CategoricalHMM(n_components=5, n_iter=2000, random_state=42)
    model.fit(X, lengths)

    # Generate N new sequences for this experience group
    N = 200  # Number of synthetic participants per group
    seq_length = max(lengths)  # Use same length as originals

    for i in range(N):
        generated_seq, _ = model.sample(seq_length)
        generated_seq = generated_seq.flatten()
        decoded_seq = le.inverse_transform(generated_seq)

        synthetic_rows.append({
            'Participant': f'synthetic_{exp_group}_{i}',
            'experience': exp_group,
            'evidence_sequence': ','.join(decoded_seq)
        })

# Convert to DataFrame and append to original
df_synthetic = pd.DataFrame(synthetic_rows)
df_augmented = pd.concat([df[["Participant", "evidence_sequence", "experience"]], df_synthetic], ignore_index=True) #, "Image", "Scene"

# Save augmented dataset
df_augmented[['Participant', 'experience', 'evidence_sequence']].to_csv(
    "../data/csv/csi_data_evidence_augmented_expanded.csv",
    index=False
) #, 'Image', 'Scene'


In [None]:
# Save augmented dataset
df_augmented[['Participant', 'experience', 'evidence_sequence']].to_csv(
    "../data/csv/csi_data_evidence_augmented_expanded.csv",
    index=False
) #, 'Image', 'Scene'

In [3]:
df_augmented

Unnamed: 0,Participant,evidence_sequence,experience
0,2.0,"[no, no, no, no, yes, no, no, no, yes, no]",Control
1,2.0,"[no, no, no, no, no, no, no, no, no, no]",Control
2,2.0,"[no, no, yes, yes, no, no, no, no, no, no]",Control
3,2.0,"[no, no, no, no, no, no, no, no, no, no]",Control
4,2.0,"[no, no, no, yes, yes, no, no, no, no, no]",Control
...,...,...,...
3726,synthetic_ThirdYear_195,"no,no,no,no,yes,no,no,no,no,no",ThirdYear
3727,synthetic_ThirdYear_196,"no,no,no,no,no,no,no,no,yes,no",ThirdYear
3728,synthetic_ThirdYear_197,"yes,no,yes,no,no,yes,yes,yes,yes,yes",ThirdYear
3729,synthetic_ThirdYear_198,"yes,no,yes,yes,yes,no,yes,no,yes,yes",ThirdYear


# Hidden Markov Model for external dataset

This is the same method used on the following external eye tracking dataset: https://www.kaggle.com/datasets/priyankraval/eyet4empathy-eye-movement-and-empathy-dataset.
