# Two-Sample Kolmogorov-Smirnov Test

In [1]:
from itertools import combinations

import pandas as pd
from scipy.stats import ks_2samp

# Read the CSV into a DataFrame. The path is relative, so it is resolved
# against the current working directory.
DATASET_PATH = 'Final_Dataset.csv'
df = pd.read_csv(DATASET_PATH)

# Column names to compare. The names follow a regular pattern, so they are
# generated instead of being spelled out one by one: each sensor contributes
# a timestamp column, then, for every summary statistic, its raw per-axis
# columns followed by the matching "zscore" columns.
def _feature_columns(sensor, axes, stats):
    """Return '<sensor>_<axis>_<stat>' names, raw axes then zscore axes, per stat."""
    names = []
    for stat in stats:
        names += [f'{sensor}_{axis}_{stat}' for axis in axes]
        names += [f'{sensor}_zscore_{axis}_{stat}' for axis in axes]
    return names


_XYZ = ('x', 'y', 'z')
_BASIC_STATS = ('max', 'min', 'sd', 'mean')

variables = [
    'time_accel',
    *_feature_columns(
        'accel',
        _XYZ + ('inclination',),
        _BASIC_STATS + ('dominant_freq', 'avg_weigh_freq'),
    ),
    'time_linaccel',
    *_feature_columns('linaccel', _XYZ, _BASIC_STATS),
    'time_mergedpost',
    *_feature_columns('gyro', _XYZ, _BASIC_STATS),
    'time_baro', 'baro_x', 'baro_zscore_x',
]

# Pairwise two-sample K-S comparison of every distinct pair of columns.
#
# Drop NaNs once per column up front. Each column takes part in
# len(variables) - 1 pairs, so doing this inside the pair loop would repeat
# the same filtering for every pair.
samples = {name: df[name].dropna() for name in variables}

# One (var1, var2, statistic, p-value) tuple per unordered pair.
results = []

# combinations() yields each unordered pair exactly once, in the same order
# as nested index loops with i < j, so the result order is unchanged.
for var1, var2 in combinations(variables, 2):
    result = ks_2samp(samples[var1], samples[var2])
    results.append((var1, var2, result.statistic, result.pvalue))

# Print the results sorted by ascending p-value (most significant first).
# sorted() is stable, so pairs with equal p-values (for example several
# reported as 0.0) keep the order in which they were generated.
results_sorted = sorted(results, key=lambda x: x[3])
for first, second, statistic, pvalue in results_sorted:
    print(f"Variables: {first} vs {second}, K-S Statistic: {statistic}, p-value: {pvalue}")


Variables: linaccel_zscore_x_min vs linaccel_z_sd, K-S Statistic: 0.715652128954629, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_zscore_x_sd, K-S Statistic: 0.7276000319613456, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_zscore_y_sd, K-S Statistic: 0.7268291995092994, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_zscore_z_sd, K-S Statistic: 0.7268574006965693, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_x_mean, K-S Statistic: 0.5052101693481296, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_y_mean, K-S Statistic: 0.5736873522375292, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_z_mean, K-S Statistic: 0.5386238760651824, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_zscore_x_mean, K-S Statistic: 0.23834703441014865, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_zscore_y_mean, K-S Statistic: 0.2833608294909216, p-value: 0.0
Variables: linaccel_zscore_x_min vs linaccel_zscore_z_mean, K-S 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5ea8f536-739f-4c50-806e-84490cb7d5e0' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>