In [None]:
# Apply the Wilcoxon test to all 12 eye-gaze columns
# to compare pairs of clusters: (0 vs 1), (1 vs 2), (0 vs 2)

In [None]:
#compare pairs of clusters (0 vs 1)

In [13]:
import pandas as pd
from scipy.stats import wilcoxon
import os

def wilcoxon_between_clusters(df, cluster_a, cluster_b, alpha=0.05, cluster_col='cluster'):
    """Run a Wilcoxon signed-rank test on every feature column for two clusters.

    The rows of each cluster are taken in file order, truncated to the size of
    the smaller cluster, and paired by position.

    NOTE(review): the signed-rank test assumes *paired* observations. Pairing
    the i-th row of one cluster with the i-th row of another is arbitrary
    unless the rows are matched by design -- confirm, or consider an
    independent-samples test such as scipy.stats.mannwhitneyu.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `cluster_col` plus the numeric columns to test.
    cluster_a, cluster_b :
        The two cluster labels to compare.
    alpha : float
        Significance threshold applied to the p-value.
    cluster_col : str
        Name of the column holding the cluster label.

    Returns
    -------
    (pandas.DataFrame, int)
        One row per tested column ('Column', 'Statistic', 'P-value',
        'Statistically Different'), and the number of pairs used.
    """
    in_a = df[cluster_col] == cluster_a
    in_b = df[cluster_col] == cluster_b
    # Computed once, outside the loop, so it is defined even when the frame
    # has no feature columns.
    n_pairs = int(min(in_a.sum(), in_b.sum()))

    records = []
    for column in df.columns:
        if column == cluster_col:
            continue

        # Use plain arrays so the subtraction below is positional. Subtracting
        # the pandas Series directly aligns on the row index; the two clusters
        # share no index labels, so every difference was NaN and the all-zero
        # guard could never fire.
        values_a = df.loc[in_a, column].to_numpy(dtype=float)[:n_pairs]
        values_b = df.loc[in_b, column].to_numpy(dtype=float)[:n_pairs]

        # The 'wilcox' zero method discards zero differences, so the test is
        # undefined when nothing is left.
        if (values_a - values_b == 0).all():
            print(f"All differences are zero for column '{column}', skipping this column.")
            continue

        try:
            statistic, p_value = wilcoxon(values_a, values_b, zero_method='wilcox')
        except ValueError as e:
            print(f"Error in column '{column}': {e}")
            continue

        significant = p_value < alpha
        if significant:
            print(f"There is a statistically significant difference between cluster {cluster_a} and cluster {cluster_b} in column '{column}'.")

        records.append({
            'Column': column,
            'Statistic': statistic,
            'P-value': p_value,
            'Statistically Different': "Yes" if significant else "No",
        })

    results_df = pd.DataFrame(
        records, columns=['Column', 'Statistic', 'P-value', 'Statistically Different']
    )
    return results_df, n_pairs


# `__name__` is "__main__" inside a notebook kernel, so this always runs there;
# the guard only keeps the CSV from being read if the helper is imported.
if __name__ == "__main__":
    # Get the current working directory
    main_dir = os.getcwd()
    print(main_dir)

    # Load your dataset
    file_path = os.path.join(main_dir, "clustering_with_acceptance.csv")
    df = pd.read_csv(file_path)

    # Compare cluster 0 and cluster 1 on every column
    results_df, min_length = wilcoxon_between_clusters(df, 0, 1)

    print('-----------------------------------------------------------------------------------------------------------------------------')
    print('Minimum length:', min_length)
    print('-----------------------------------------------------------------------------------------------------------------------------')
    print(results_df)


/Users/sarayabesi/Documents/research-poly/eye-tracker-project/machine_learning/dataset_1/clustering_with_acceptance
Error in column 'is_correct': zero_method 'wilcox' and 'pratt' do not work if x - y is zero for all elements.
There is a statistically significant difference between cluster 0 and cluster 1 in column 'is_accepted'.
-----------------------------------------------------------------------------------------------------------------------------
Minimum length: 41
-----------------------------------------------------------------------------------------------------------------------------
                      Column  Statistic       P-value Statistically Different
0       fixation_rate_AOI_A1      323.0  1.636127e-01                      No
1       fixation_rate_AOI_A2      314.0  1.311331e-01                      No
2       fixation_rate_AOI_A3      359.0  4.930246e-01                      No
3       fixation_rate_AOI_A4      368.0  9.710785e-01                      No
4       

In [15]:
#compare pairs of clusters (1 vs 2)

In [14]:
import pandas as pd
from scipy.stats import wilcoxon
import os

def wilcoxon_between_clusters(df, cluster_a, cluster_b, alpha=0.05, cluster_col='cluster'):
    """Run a Wilcoxon signed-rank test on every feature column for two clusters.

    The rows of each cluster are taken in file order, truncated to the size of
    the smaller cluster, and paired by position.

    NOTE(review): the signed-rank test assumes *paired* observations. Pairing
    the i-th row of one cluster with the i-th row of another is arbitrary
    unless the rows are matched by design -- confirm, or consider an
    independent-samples test such as scipy.stats.mannwhitneyu.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `cluster_col` plus the numeric columns to test.
    cluster_a, cluster_b :
        The two cluster labels to compare.
    alpha : float
        Significance threshold applied to the p-value.
    cluster_col : str
        Name of the column holding the cluster label.

    Returns
    -------
    (pandas.DataFrame, int)
        One row per tested column ('Column', 'Statistic', 'P-value',
        'Statistically Different'), and the number of pairs used.
    """
    in_a = df[cluster_col] == cluster_a
    in_b = df[cluster_col] == cluster_b
    # Computed once, outside the loop, so it is defined even when the frame
    # has no feature columns.
    n_pairs = int(min(in_a.sum(), in_b.sum()))

    records = []
    for column in df.columns:
        if column == cluster_col:
            continue

        # Use plain arrays so the subtraction below is positional. Subtracting
        # the pandas Series directly aligns on the row index; the two clusters
        # share no index labels, so every difference was NaN and the all-zero
        # guard could never fire.
        values_a = df.loc[in_a, column].to_numpy(dtype=float)[:n_pairs]
        values_b = df.loc[in_b, column].to_numpy(dtype=float)[:n_pairs]

        # The 'wilcox' zero method discards zero differences, so the test is
        # undefined when nothing is left.
        if (values_a - values_b == 0).all():
            print(f"All differences are zero for column '{column}', skipping this column.")
            continue

        try:
            statistic, p_value = wilcoxon(values_a, values_b, zero_method='wilcox')
        except ValueError as e:
            print(f"Error in column '{column}': {e}")
            continue

        significant = p_value < alpha
        if significant:
            print(f"There is a statistically significant difference between cluster {cluster_a} and cluster {cluster_b} in column '{column}'.")

        records.append({
            'Column': column,
            'Statistic': statistic,
            'P-value': p_value,
            'Statistically Different': "Yes" if significant else "No",
        })

    results_df = pd.DataFrame(
        records, columns=['Column', 'Statistic', 'P-value', 'Statistically Different']
    )
    return results_df, n_pairs


# `__name__` is "__main__" inside a notebook kernel, so this always runs there;
# the guard only keeps the CSV from being read if the helper is imported.
if __name__ == "__main__":
    # Get the current working directory
    main_dir = os.getcwd()
    print(main_dir)

    # Load your dataset
    file_path = os.path.join(main_dir, "clustering_with_acceptance.csv")
    df = pd.read_csv(file_path)

    # Compare cluster 1 and cluster 2 on every column
    results_df, min_length = wilcoxon_between_clusters(df, 1, 2)

    print('-----------------------------------------------------------------------------------------------------------------------------')
    print('Minimum length:', min_length)
    print('-----------------------------------------------------------------------------------------------------------------------------')
    print(results_df)


/Users/sarayabesi/Documents/research-poly/eye-tracker-project/machine_learning/dataset_1/clustering_with_acceptance
There is a statistically significant difference between cluster 1 and cluster 2 in column 'fixation_rate_AOI_A2'.
There is a statistically significant difference between cluster 1 and cluster 2 in column 'avg_fixation_time_AOI_A5'.
There is a statistically significant difference between cluster 1 and cluster 2 in column 'is_correct'.
There is a statistically significant difference between cluster 1 and cluster 2 in column 'is_accepted'.
-----------------------------------------------------------------------------------------------------------------------------
Minimum length: 41
-----------------------------------------------------------------------------------------------------------------------------
                      Column  Statistic       P-value Statistically Different
0       fixation_rate_AOI_A1      303.0  9.849567e-02                      No
1       fixation



In [22]:
#compare pairs of clusters (0 vs 2)

In [15]:
import pandas as pd
from scipy.stats import wilcoxon
import os

def wilcoxon_between_clusters(df, cluster_a, cluster_b, alpha=0.05, cluster_col='cluster'):
    """Run a Wilcoxon signed-rank test on every feature column for two clusters.

    The rows of each cluster are taken in file order, truncated to the size of
    the smaller cluster, and paired by position.

    NOTE(review): the signed-rank test assumes *paired* observations. Pairing
    the i-th row of one cluster with the i-th row of another is arbitrary
    unless the rows are matched by design -- confirm, or consider an
    independent-samples test such as scipy.stats.mannwhitneyu.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `cluster_col` plus the numeric columns to test.
    cluster_a, cluster_b :
        The two cluster labels to compare.
    alpha : float
        Significance threshold applied to the p-value.
    cluster_col : str
        Name of the column holding the cluster label.

    Returns
    -------
    (pandas.DataFrame, int)
        One row per tested column ('Column', 'Statistic', 'P-value',
        'Statistically Different'), and the number of pairs used.
    """
    in_a = df[cluster_col] == cluster_a
    in_b = df[cluster_col] == cluster_b
    # Computed once, outside the loop, so it is defined even when the frame
    # has no feature columns.
    n_pairs = int(min(in_a.sum(), in_b.sum()))

    records = []
    for column in df.columns:
        if column == cluster_col:
            continue

        # Use plain arrays so the subtraction below is positional. Subtracting
        # the pandas Series directly aligns on the row index; the two clusters
        # share no index labels, so every difference was NaN and the all-zero
        # guard could never fire.
        values_a = df.loc[in_a, column].to_numpy(dtype=float)[:n_pairs]
        values_b = df.loc[in_b, column].to_numpy(dtype=float)[:n_pairs]

        # The 'wilcox' zero method discards zero differences, so the test is
        # undefined when nothing is left.
        if (values_a - values_b == 0).all():
            print(f"All differences are zero for column '{column}', skipping this column.")
            continue

        try:
            statistic, p_value = wilcoxon(values_a, values_b, zero_method='wilcox')
        except ValueError as e:
            print(f"Error in column '{column}': {e}")
            continue

        significant = p_value < alpha
        if significant:
            print(f"There is a statistically significant difference between cluster {cluster_a} and cluster {cluster_b} in column '{column}'.")

        records.append({
            'Column': column,
            'Statistic': statistic,
            'P-value': p_value,
            'Statistically Different': "Yes" if significant else "No",
        })

    results_df = pd.DataFrame(
        records, columns=['Column', 'Statistic', 'P-value', 'Statistically Different']
    )
    return results_df, n_pairs


# `__name__` is "__main__" inside a notebook kernel, so this always runs there;
# the guard only keeps the CSV from being read if the helper is imported.
if __name__ == "__main__":
    # Get the current working directory
    main_dir = os.getcwd()
    print(main_dir)

    # Load your dataset
    file_path = os.path.join(main_dir, "clustering_with_acceptance.csv")
    df = pd.read_csv(file_path)

    # Compare cluster 0 and cluster 2 on every column
    results_df, min_length = wilcoxon_between_clusters(df, 0, 2)

    print('-----------------------------------------------------------------------------------------------------------------------------')
    print('Minimum length:', min_length)
    print('-----------------------------------------------------------------------------------------------------------------------------')
    print(results_df)


/Users/sarayabesi/Documents/research-poly/eye-tracker-project/machine_learning/dataset_1/clustering_with_acceptance
There is a statistically significant difference between cluster 0 and cluster 2 in column 'fixation_rate_AOI_A4'.
There is a statistically significant difference between cluster 0 and cluster 2 in column 'fixation_rate_AOI_A5'.
There is a statistically significant difference between cluster 0 and cluster 2 in column 'avg_fixation_time_AOI_A3'.
There is a statistically significant difference between cluster 0 and cluster 2 in column 'avg_fixation_time_AOI_A5'.
There is a statistically significant difference between cluster 0 and cluster 2 in column 'is_correct'.
There is a statistically significant difference between cluster 0 and cluster 2 in column 'is_accepted'.
-----------------------------------------------------------------------------------------------------------------------------
Minimum length: 74
------------------------------------------------------------------

In [None]:
Continuous Data (Fixation Time and Fixation Rate Columns):
Use the Friedman test for comparing more than two related samples.


Categorical Data (is_correct and is_accepted):

Use Cochran's Q test for comparing more than two related samples.

In [2]:
import pandas as pd
from scipy.stats import friedmanchisquare
from statsmodels.stats.contingency_tables import cochrans_q
import os

def build_cluster_table(df, column, clusters=(0, 1, 2), cluster_col='cluster'):
    """Return one column's values per cluster as an equal-length table.

    Each cluster's values are taken in file order and truncated to the size of
    the smallest cluster. The values are converted to plain arrays first so
    the resulting frame is aligned by position: building it from the filtered
    Series directly aligns on their (disjoint) row indices and yields a
    NaN-padded table with more rows than expected.

    Returns a DataFrame with one 'Cluster_<label>' column per cluster.
    """
    samples = [df.loc[df[cluster_col] == label, column].to_numpy() for label in clusters]
    n_rows = min(len(sample) for sample in samples)
    return pd.DataFrame(
        {f'Cluster_{label}': sample[:n_rows] for label, sample in zip(clusters, samples)}
    )


def _result_record(column, test_name, statistic, p_value, alpha):
    """Build one row of the results table."""
    return {
        'Column': column,
        'Test': test_name,
        'Statistic': statistic,
        'P-value': p_value,
        'Statistically Different': 'Yes' if p_value < alpha else 'No',
    }


def run_friedman(df, columns, clusters=(0, 1, 2), alpha=0.05):
    """Run the Friedman test across clusters for each of `columns`.

    NOTE(review): the Friedman test assumes related (matched) samples; rows
    are matched here only by their position within each cluster -- confirm
    this is intended.

    Returns a list of result-row dicts, one per column.
    """
    records = []
    for column in columns:
        table = build_cluster_table(df, column, clusters)
        stat, p_value = friedmanchisquare(*(table[name].to_numpy() for name in table.columns))
        records.append(_result_record(column, 'Friedman', stat, p_value, alpha))
    return records


def run_cochrans_q(df, columns, clusters=(0, 1, 2), alpha=0.05):
    """Run Cochran's Q test across clusters for each of `columns`.

    NOTE(review): same matched-samples caveat as the Friedman test; the
    columns are presumably 0/1 coded -- verify against the dataset.

    Returns a list of result-row dicts, one per column. When a cluster is
    empty the statistic and p-value are float NaN (not the string 'NaN'), so
    the numeric result columns keep a numeric dtype.
    """
    records = []
    for column in columns:
        table = build_cluster_table(df, column, clusters)
        if len(table) == 0:
            records.append({
                'Column': column,
                'Test': 'Cochran\'s Q',
                'Statistic': float('nan'),
                'P-value': float('nan'),
                'Statistically Different': 'NaN',
            })
            continue
        outcome = cochrans_q(table)
        records.append(
            _result_record(column, 'Cochran\'s Q', outcome.statistic, outcome.pvalue, alpha)
        )
    return records


# `__name__` is "__main__" inside a notebook kernel, so this always runs there;
# the guard only keeps the CSV from being read if the helpers are imported.
if __name__ == "__main__":
    # Get the current working directory
    main_dir = os.getcwd()
    print(main_dir)

    # Load your dataset
    file_path = os.path.join(main_dir, "clustering_with_acceptance.csv")
    df = pd.read_csv(file_path)

    # Continuous data: fixation rate and average fixation time per AOI (A1..A6)
    fixation_rate_columns = [f'fixation_rate_AOI_A{i}' for i in range(1, 7)]
    avg_fixation_time_columns = [f'avg_fixation_time_AOI_A{i}' for i in range(1, 7)]

    # Categorical data: Accuracy and Acceptance
    categorical_columns = ['is_correct', 'is_accepted']

    records = run_friedman(df, fixation_rate_columns + avg_fixation_time_columns)
    records += run_cochrans_q(df, categorical_columns)

    # Convert results to DataFrame and print
    results_df = pd.DataFrame(
        records,
        columns=['Column', 'Test', 'Statistic', 'P-value', 'Statistically Different'],
    )
    print(results_df)


/Users/sarayabesi/Documents/research-poly/eye-tracker-project/machine_learning/dataset_1/clustering_with_acceptance
                      Column         Test  Statistic   P-value  \
0       fixation_rate_AOI_A1     Friedman   5.707317  0.057633   
1       fixation_rate_AOI_A2     Friedman   3.349693  0.187337   
2       fixation_rate_AOI_A3     Friedman   1.444444  0.485672   
3       fixation_rate_AOI_A4     Friedman    2.36129  0.307081   
4       fixation_rate_AOI_A5     Friedman  12.133333  0.002319   
5       fixation_rate_AOI_A6     Friedman   0.757576  0.684691   
6   avg_fixation_time_AOI_A1     Friedman   4.439024  0.108662   
7   avg_fixation_time_AOI_A2     Friedman   1.190184  0.551512   
8   avg_fixation_time_AOI_A3     Friedman   2.679012  0.261975   
9   avg_fixation_time_AOI_A4     Friedman   2.825806  0.243436   
10  avg_fixation_time_AOI_A5     Friedman   9.657143  0.007998   
11  avg_fixation_time_AOI_A6     Friedman   2.030303  0.362348   
12                is_corre