In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
csv_path = "../../data/csv/expanded_dataset_exitentry.csv"
df = pd.read_csv(csv_path)


In [2]:
from scipy.stats import wasserstein_distance

# Example: Compare 'evidence_sequence' between two groups
group1 = df[df['experience'] == 'Control']['exitentry_sequence']
group2 = df[df['experience'] == 'CSI']['exitentry_sequence']

# Convert sequences to numerical values if necessary
def sequence_to_numeric(sequence):
    return [1 if x == 'yes' else 0 for x in sequence.split(',')]

group1_numeric = group1.apply(sequence_to_numeric)
group2_numeric = group2.apply(sequence_to_numeric)


group1_train, group1_test = train_test_split(group1_numeric, test_size=0.1, random_state=42)
group2_train, group2_test = train_test_split(group2_numeric, test_size=0.1, random_state=42)


In [3]:
# Flatten the lists
group1_flat = [item for sublist in group1_train for item in sublist]
group2_flat = [item for sublist in group2_train for item in sublist]

distance = wasserstein_distance(group1_flat, group2_flat)
print(f'Wasserstein Distance: {distance}')


Wasserstein Distance: 0.00298159836081624


In [4]:
group1_train

2816    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
290     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
54      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2954    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
278     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                     ...              
71      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
106     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1636    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2648    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
102     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: exitentry_sequence, Length: 711, dtype: object

In [8]:
correct = 0
incorrect = 0

for key in group1_train.keys():
    new_sample_numeric = group1_train[key]
    
    distance1 = wasserstein_distance(group1_flat, new_sample_numeric)
    distance2 = wasserstein_distance(group2_flat, new_sample_numeric)
    
    if distance1 < distance2:
        predicted_group = 'Control'
        correct += 1  # When the model predicts it is a Control
    else:
        predicted_group = 'CSI'
        incorrect += 1

# Calculate percentages
total = correct + incorrect
correct_percentage = (correct / total) * 100
incorrect_percentage = (incorrect / total) * 100

print(f'Correct classifications: {correct_percentage}%')
print(f'Incorrect classifications: {incorrect_percentage}%')


Correct classifications: 8.860759493670885%
Incorrect classifications: 91.13924050632912%


In [6]:
correct = 0
incorrect = 0

for key in group2_train.keys():
    new_sample_numeric = group2_train[key]
    
    distance1 = wasserstein_distance(group1_flat, new_sample_numeric)
    distance2 = wasserstein_distance(group2_flat, new_sample_numeric)
    
    if distance1 > distance2:
        predicted_group = 'Control'
        incorrect += 1
    else:
        predicted_group = 'CSI'
        correct += 1

# Calculate percentages
total = correct + incorrect
correct_percentage = (correct / total) * 100
incorrect_percentage = (incorrect / total) * 100

print(f'Correct classifications: {correct_percentage}%')
print(f'Incorrect classifications: {incorrect_percentage}%')


Correct classifications: 7.7994428969359335%
Incorrect classifications: 92.20055710306406%
