In [None]:
# apply wilcoxon test on all 12 columns of eye gaze data on to compare cluster 0 and cluster 1

In [5]:
import pandas as pd
from scipy.stats import wilcoxon
import os

# Get the current working directory
main_dir = os.getcwd()
print(main_dir)

# Load your dataset
file_path = os.path.join(main_dir, "clustering_without_acceptance.csv")
df = pd.read_csv(file_path)

# Initialize an empty dictionary to store results
results = {'Column': [], 'Statistic': [], 'P-value': [], 'Statistically Different': []}

# Filter data for cluster 0 and cluster 1
cluster_0_data = df.loc[df['cluster'] == 0]
cluster_1_data = df.loc[df['cluster'] == 1]

# Ensure both samples have the same length
min_length = min(len(cluster_0_data), len(cluster_1_data))
cluster_0_data = cluster_0_data[:min_length]
cluster_1_data = cluster_1_data[:min_length]

# Perform Wilcoxon signed-rank test for each column
for column in df.columns:
    if column != 'cluster':
        # Check for zero differences and handle accordingly
        differences = cluster_0_data[column] - cluster_1_data[column]
        if (differences == 0).all():
            print(f"All differences are zero for column '{column}', skipping this column.")
            continue

        try:
            statistic, p_value = wilcoxon(cluster_0_data[column], cluster_1_data[column], zero_method='wilcox')
        except ValueError as e:
            print(f"Error in column '{column}': {e}")
            continue
        
        # Interpret the results
        alpha = 0.05
        if p_value < alpha:
            result = "Yes"
            print(f"There is a statistically significant difference between cluster 0 and cluster 1 in column '{column}'.")
        else:
            result = "No"
        
        # Store the results in the dictionary
        results['Column'].append(column)
        results['Statistic'].append(statistic)
        results['P-value'].append(p_value)
        results['Statistically Different'].append(result)

# Convert results to DataFrame and print
results_df = pd.DataFrame(results)
print('-----------------------------------------------------------------------------------------------------------------------------')
print('Minimum length:', min_length)
print('-----------------------------------------------------------------------------------------------------------------------------')
print(results_df)


/Users/sarayabesi/Documents/research-poly/eye-tracker-project/machine_learning/dataset_1/clustering_without_acceptance
There is a statistically significant difference between cluster 0 and cluster 1 in column 'fixation_rate_AOI_A2'.
There is a statistically significant difference between cluster 0 and cluster 1 in column 'fixation_rate_AOI_A5'.
There is a statistically significant difference between cluster 0 and cluster 1 in column 'avg_fixation_time_AOI_A5'.
There is a statistically significant difference between cluster 0 and cluster 1 in column 'avg_fixation_time_AOI_A6'.
There is a statistically significant difference between cluster 0 and cluster 1 in column 'is_correct'.
-----------------------------------------------------------------------------------------------------------------------------
Minimum length: 74
-----------------------------------------------------------------------------------------------------------------------------
                      Column  Statistic   

In [None]:
Continuous Data (Fixation Time and Fixation Rate columns)
Use the Wilcoxon signed-rank test, which is suitable for comparing paired data.


Categorical Data (is_correct column)
For paired categorical data, you can use the McNemar test. This test is used for paired nominal data, specifically for 2x2 contingency tables.

In [7]:
import pandas as pd
from scipy.stats import wilcoxon, chi2_contingency
from statsmodels.stats.contingency_tables import mcnemar
import os

# Get the current working directory
main_dir = os.getcwd()
print(main_dir)

# Load your dataset
file_path = os.path.join(main_dir, "clustering_without_acceptance.csv")
df = pd.read_csv(file_path)

# Initialize a dictionary to store results
results = {'Column': [], 'Test': [], 'Statistic': [], 'P-value': [], 'Statistically Different': []}



# Continuous data: Fixation Rate columns
fixation_rate_columns = ['fixation_rate_AOI_A1', 'fixation_rate_AOI_A2', 'fixation_rate_AOI_A3', 'fixation_rate_AOI_A4', 'fixation_rate_AOI_A5', 'fixation_rate_AOI_A6']
for column in fixation_rate_columns:
    cluster_0_data = df.loc[df['cluster'] == 0, column]
    cluster_1_data = df.loc[df['cluster'] == 1, column]
    
    # Ensure both samples have the same length
    min_length = min(len(cluster_0_data), len(cluster_1_data))
    cluster_0_data = cluster_0_data[:min_length]
    cluster_1_data = cluster_1_data[:min_length]
    
    # Wilcoxon signed-rank test
    stat, p_value = wilcoxon(cluster_0_data, cluster_1_data)
    results['Column'].append(column)
    results['Test'].append('Wilcoxon Signed-Rank')
    results['Statistic'].append(stat)
    results['P-value'].append(p_value)
    results['Statistically Different'].append('Yes' if p_value < 0.05 else 'No')

# Continuous data: Fixation Time columns
fixation_time_columns = ['avg_fixation_time_AOI_A1', 'avg_fixation_time_AOI_A2', 'avg_fixation_time_AOI_A3', 'avg_fixation_time_AOI_A4', 'avg_fixation_time_AOI_A5', 'avg_fixation_time_AOI_A6']
for column in fixation_time_columns:
    cluster_0_data = df.loc[df['cluster'] == 0, column]
    cluster_1_data = df.loc[df['cluster'] == 1, column]
    
    # Ensure both samples have the same length
    min_length = min(len(cluster_0_data), len(cluster_1_data))
    cluster_0_data = cluster_0_data[:min_length]
    cluster_1_data = cluster_1_data[:min_length]
    
    # Wilcoxon signed-rank test
    stat, p_value = wilcoxon(cluster_0_data, cluster_1_data)
    results['Column'].append(column)
    results['Test'].append('Wilcoxon Signed-Rank')
    results['Statistic'].append(stat)
    results['P-value'].append(p_value)
    results['Statistically Different'].append('Yes' if p_value < 0.05 else 'No')
    
# Categorical data: Accuracy
accuracy_column = 'is_correct'
contingency_table = pd.crosstab(df['cluster'], df[accuracy_column])

# McNemar test for paired categorical data
if contingency_table.shape == (2, 2):
    result = mcnemar(contingency_table)
    stat = result.statistic
    p_value = result.pvalue
    results['Column'].append(accuracy_column)
    results['Test'].append('McNemar')
    results['Statistic'].append(stat)
    results['P-value'].append(p_value)
    results['Statistically Different'].append('Yes' if p_value < 0.05 else 'No')

# Convert results to DataFrame and print
results_df = pd.DataFrame(results)
print(results_df)

/Users/sarayabesi/Documents/research-poly/eye-tracker-project/machine_learning/dataset_1/clustering_without_acceptance
                      Column                  Test  Statistic       P-value  \
0       fixation_rate_AOI_A1  Wilcoxon Signed-Rank     1338.0  7.897247e-01   
1       fixation_rate_AOI_A2  Wilcoxon Signed-Rank     1011.0  4.253022e-02   
2       fixation_rate_AOI_A3  Wilcoxon Signed-Rank     1077.0  9.437913e-02   
3       fixation_rate_AOI_A4  Wilcoxon Signed-Rank      971.0  5.425302e-02   
4       fixation_rate_AOI_A5  Wilcoxon Signed-Rank      258.0  2.488707e-04   
5       fixation_rate_AOI_A6  Wilcoxon Signed-Rank      183.0  2.027430e-01   
6   avg_fixation_time_AOI_A1  Wilcoxon Signed-Rank     1144.0  1.895903e-01   
7   avg_fixation_time_AOI_A2  Wilcoxon Signed-Rank     1268.0  5.197215e-01   
8   avg_fixation_time_AOI_A3  Wilcoxon Signed-Rank     1106.0  1.293912e-01   
9   avg_fixation_time_AOI_A4  Wilcoxon Signed-Rank     1215.0  5.785133e-01   
10  avg_fixa