In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
csv_path = "../../data/csv/expanded_dataset_roi.csv"
df = pd.read_csv(csv_path)


In [2]:
from scipy.stats import wasserstein_distance

# Example: Compare 'evidence_sequence' between two groups
group1 = df[df['experience'] == 'Control']['roi_sequence']
group2 = df[df['experience'] == 'CSI']['roi_sequence']

# Convert sequences to numerical values if necessary
def sequence_to_numeric(sequence):
    return [x for x in sequence.split(',')]

group1_numeric = group1.apply(sequence_to_numeric)
group2_numeric = group2.apply(sequence_to_numeric)


group1_train, group1_test = train_test_split(group1_numeric, test_size=0.1, random_state=42)
group2_train, group2_test = train_test_split(group2_numeric, test_size=0.1, random_state=42)


In [3]:
# Flatten the lists
group1_flat = [item for sublist in group1_train for item in sublist]
group2_flat = [item for sublist in group2_train for item in sublist]

distance = wasserstein_distance(group1_flat, group2_flat)
print(f'Wasserstein Distance: {distance}')


Wasserstein Distance: 0.458399445247582


In [4]:
group1_train

2816             [1, 1, 4, 4, 5, 3, 3, 3, 3, 3]
290           [11, 3, 13, 5, 5, 4, 7, 7, 11, 7]
54           [24, 17, 17, 0, 8, 17, 8, 3, 1, 3]
2954          [15, 12, 16, 2, 2, 7, 2, 0, 9, 0]
278            [16, 9, 9, 9, 9, 9, 9, 0, 9, 12]
                         ...                   
71      [9, 11, 19, 18, 19, 19, 19, 18, 19, 19]
106          [11, 11, 12, 1, 1, 1, 13, 1, 2, 4]
1636            [5, 2, 2, 7, 18, 0, 9, 9, 9, 9]
2648     [4, 11, 16, 0, 16, 16, 16, 16, 15, 14]
102              [2, 5, 2, 2, 2, 3, 6, 6, 1, 1]
Name: roi_sequence, Length: 711, dtype: object

In [5]:
correct = 0
incorrect = 0

for key in group1_train.keys():
    new_sample_numeric = group1_train[key]
    
    distance1 = wasserstein_distance(group1_flat, new_sample_numeric)
    distance2 = wasserstein_distance(group2_flat, new_sample_numeric)
    
    if distance1 < distance2:
        predicted_group = 'Control'
        correct += 1  # When the model predicts it is a Control
    else:
        predicted_group = 'CSI'
        incorrect += 1

# Calculate percentages
total = correct + incorrect
correct_percentage = (correct / total) * 100
incorrect_percentage = (incorrect / total) * 100

print(f'Correct classifications: {correct_percentage}%')
print(f'Incorrect classifications: {incorrect_percentage}%')


Correct classifications: 45.28832630098453%
Incorrect classifications: 54.71167369901547%


In [6]:
correct = 0
incorrect = 0

for key in group2_train.keys():
    new_sample_numeric = group2_train[key]
    
    distance1 = wasserstein_distance(group1_flat, new_sample_numeric)
    distance2 = wasserstein_distance(group2_flat, new_sample_numeric)
    
    if distance1 > distance2:
        predicted_group = 'Control'
        incorrect += 1
    else:
        predicted_group = 'CSI'
        correct += 1

# Calculate percentages
total = correct + incorrect
correct_percentage = (correct / total) * 100
incorrect_percentage = (incorrect / total) * 100

print(f'Correct classifications: {correct_percentage}%')
print(f'Incorrect classifications: {incorrect_percentage}%')


Correct classifications: 38.440111420612816%
Incorrect classifications: 61.55988857938719%
