# Data Integrity Checks

## Make sure all values are expected

In [3]:
import pandas as pd

# Load the anonymised results. The id column is forced to `str` so pandas does
# not parse it as a number.
df = pd.read_csv('./data/results_anon.csv', dtype = {'student_id_anon':str})

# --- Check 1: every student id appears exactly once -------------------------
print('❔ Verifying no duplicate values')
student_ids = df['student_id_anon']
# Rows in excess of the number of distinct ids. `nunique()` ignores missing
# ids by default, so rows with a missing id are counted as surplus as well.
duplicate_count = len(student_ids) - student_ids.nunique()
if duplicate_count != 0:
    print(f'🔴 Expected no duplicate values, found {duplicate_count} duplicate values.')
else:
    print('✅ Found no duplicate values.')

# We don't verify strings
df = df.drop(columns=['seq', 'student_id_anon'])

# --- Check 2: no missing values in the remaining columns --------------------
print('❔ Verifying no NaN values')
nan_count = int(df.isnull().sum().sum())
if nan_count > 0:
    print(f'🔴 Expected no NaN values, found {nan_count} NaN values.')
else:
    print('✅ Found no NaN values.')

# --- Check 3: no negative numbers -------------------------------------------
print('❔ Verifying no negative values in numeric columns')
# Restrict both the test and the reported count to the numeric columns so the
# comparison is never applied to a non-numeric column.
numeric_values = df.select_dtypes(include=['int64', 'float64'])
negative_count = int((numeric_values < 0).sum().sum())
if negative_count > 0:
    print(f'🔴 Expected no negative values, found {negative_count} negative values.')
else:
    print('✅ Found no negative values.')

# --- Check 4: correctness scores never exceed 1 -----------------------------
print('❔ Verifying no values > 1 for correctness')
correctness_columns = [f't{task}_correctness' for task in range(1, 10)]
if (df[correctness_columns] > 1).any().any():
    print('🔴 Expected no values > 1 for correctness, found values > 1 for correctness.')
else:
    print('✅ Found no values > 1 for correctness.')

# --- Check 5: combined task time stays within 40 minutes --------------------
print('❔ Verifying no values > 40min for combined time')
time_columns = [f't{task}_time' for task in range(1, 10)]
# Add the nine task times left to right with `+`, so a missing value makes the
# row total NaN (and therefore never "over the limit").
total_time = df[time_columns[0]]
for time_column in time_columns[1:]:
    total_time = total_time + df[time_column]

# The limit is expressed as 40 * 60 and the maximum is divided by 60 to print
# minutes, i.e. the time columns are treated as seconds.
over_limit = total_time > 40 * 60
over_limit_count = int(over_limit.sum())
overTime = over_limit.any()
if overTime:
    print(f'🔴 Expected no values > 40min for combined time, found {over_limit_count} values > 40min for combined time.')
    print(f"🔴 Maximum time: {total_time.max() / 60} minutes.")
    print('✅ Considered okay and likely related to delay after being told about running out of time.')
else:
    print('✅ Found no values > 40min for combined time.')
    

❔ Verifying no duplicate values
✅ Found no duplicate values.
❔ Verifying no NaN values
✅ Found no NaN values.
❔ Verifying no negative values in numeric columns
✅ Found no negative values.
❔ Verifying no values > 1 for correctness
✅ Found no values > 1 for correctness.
❔ Verifying no values > 40min for combined time
✅ Found no values > 40min for combined time.


## Mark outliers in times or correctness for future removal during analysis

In [4]:
import pandas as pd

# Columns screened for outliers: every time_/correctness_ measure for the
# write/read and jv/py combinations.
outlierDetectionColumns = [
    f'{measure}_{activity}_{language}'
    for measure in ('time', 'correctness')
    for activity in ('write', 'read')
    for language in ('jv', 'py')
]

df = pd.read_csv('./data/results_anon.csv', dtype = {'student_id_anon':str})

# Create the summary flag before the per-column flags: a new column is appended
# at the end, so this fixes where `has_outlier` sits in the written CSVs.
df['has_outlier'] = False

# Flag, per column, every value lying more than 1.5 IQR outside the quartiles.
for column in outlierDetectionColumns:
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    interquartile_range = third_quartile - first_quartile

    below_fence = values < first_quartile - 1.5 * interquartile_range
    above_fence = values > third_quartile + 1.5 * interquartile_range
    df[f'{column}_outlier'] = below_fence | above_fence

outlierColumns = [f"{col}_outlier" for col in outlierDetectionColumns]

# A row is an outlier as soon as any one of its per-column flags is set.
df['has_outlier'] = df[outlierColumns].any(axis=1)

# Write the clean rows, the outlier rows, and finally the fully annotated
# frame, which replaces the input file.
is_outlier_row = df['has_outlier']
rows_with_outliers = df[is_outlier_row]
df[~is_outlier_row].to_csv('./data/results_anon_without_outliers.csv', index=False)
rows_with_outliers.to_csv('./data/results_anon_only_outliers.csv', index=False)
df.to_csv('./data/results_anon.csv', index=False)

print(f'🔴 {len(rows_with_outliers)} experiments have values that are outliers according to 1.5 IQR')

🔴 24 experiments have values that are outliers according to 1.5 IQR
