# Raw Data Integrity Checks

## Verify events are of expected quantity and order

In [2]:
import sqlite3
import os
import shutil
import pandas as pd
from util import getTaskFromLocation

# The event sequence every valid experiment must contain: for each of the
# tasks 1 through 9, one TASKSTART immediately followed by one TASKCOMPLETED
# (18 events in total, in this exact order).
expectedEvents = [
    (eventType, taskNumber)
    for taskNumber in range(1, 10)
    for eventType in ("TASKSTART", "TASKCOMPLETED")
]

# Ids of experiments that fail the event-count or event-order check. The list
# is filled during validation and then used to delete those experiments from
# the working copy of the database.
experimentsWithUnexpectedEvents = []

# Copy the original data export so we do not modify it. A copy left over from
# a previous run is removed first so every run starts from the pristine export.
if os.path.exists('./data/data_anon_after_data_integrity.db'):
    os.remove('./data/data_anon_after_data_integrity.db')
shutil.copyfile('./data/data_anon.db', './data/data_anon_after_data_integrity.db')

# sqlite3's connection context manager only commits or rolls back a
# transaction; it does not close the connection. Close it explicitly so the
# database handle is released once the events have been loaded.
connection = sqlite3.connect('data/data_anon_after_data_integrity.db')
try:
    # Get relevant events, aka those that we use to compute other values in utils
    df = pd.read_sql_query("SELECT * FROM events WHERE type in ('TASKSTART', 'CODEREVEALED', 'TASKCOMPLETED')", connection)
finally:
    connection.close()

# Derive each event's task from its location via util.getTaskFromLocation.
df['task'] = df['location'].map(getTaskFromLocation)

# Validate every experiment's events against expectedEvents.
#
# An experiment is recorded in experimentsWithUnexpectedEvents (at most once)
# when either
#   * its number of events differs from len(expectedEvents), or
#   * any event's (type, task) pair differs from the expected pair at the
#     same position.
#
# NOTE(review): the events were loaded without an ORDER BY, so the order
# checked here is whatever row order SQLite returned - confirm that this
# matches the recorded event order.
for experimentid in df['experiment'].unique().tolist():
    print(f'❔ Checking {experimentid}')
    # Rows mapped to task 0 are ignored (presumably locations outside the
    # numbered tasks - confirm against getTaskFromLocation). reset_index()
    # renumbers the rows 0..n-1 so they line up with positions in expectedEvents.
    dfExperiment = df[(df['experiment'] == experimentid) & (df['task'] != 0)].reset_index()

    if len(expectedEvents) != dfExperiment.shape[0]:
        print(f'🔴 Expected {len(expectedEvents)} events, experiment {experimentid} has {dfExperiment.shape[0]} events.')
        print(dfExperiment[['type', 'task', 'time']])
        experimentsWithUnexpectedEvents.append(experimentid)
        continue

    print(f'✅ Experiment {experimentid} has the expected amount of events.')

    for i, (expectedEvent, task) in enumerate(expectedEvents):
        if (dfExperiment.loc[i, 'type'] != expectedEvent or task != dfExperiment.loc[i, 'task']):
            print(f'🔴 Expected {expectedEvent} and task {task}, experiment {experimentid} has {dfExperiment.loc[i, "type"]} and task {dfExperiment.loc[i, "task"]}.')
            print(dfExperiment[['type', 'task', 'time']])
            experimentsWithUnexpectedEvents.append(experimentid)
            break
    else:
        # Reached only when the loop finished without a mismatch, so an
        # experiment is never reported as both failing and passing.
        print(f'\t✅ All events for experiment {experimentid} are in the expected order.')

print('✅ Checked all experiments for expected amount of events and order of events.')
# Experiment ids may be non-string values (e.g. integers), which str.join
# rejects, so convert them explicitly before joining.
print(f'Found {len(experimentsWithUnexpectedEvents)} experiments with unexpected order or amount of events: {",".join(str(e) for e in experimentsWithUnexpectedEvents)}')

# Delete all events of the experiments listed in
# experimentsWithUnexpectedEvents from the working copy of the database.
connection = sqlite3.connect('data/data_anon_after_data_integrity.db')
try:
    # Bind the ids as query parameters instead of interpolating them into the
    # SQL text: this works for integer and string ids alike and needs no
    # manual quoting. An empty id list yields "IN ()", which SQLite accepts
    # and which matches no rows.
    placeholders = ",".join("?" for _ in experimentsWithUnexpectedEvents)
    # "with connection" commits on success and rolls back on error; it does
    # not close the connection, hence the explicit close() below.
    with connection:
        connection.execute(
            f'DELETE FROM events WHERE experiment IN ({placeholders})',
            list(experimentsWithUnexpectedEvents),
        )
finally:
    connection.close()

# str.join only accepts strings, so convert the ids before joining.
print(f'✅ Deleted experiments with unexpected order or amount of events: {",".join(str(e) for e in experimentsWithUnexpectedEvents)}')


❔ Checking 54
✅ Experiment 54 has the expected amount of events.
	✅ All events for experiment 54 are in the expected order.
❔ Checking 50
✅ Experiment 50 has the expected amount of events.
	✅ All events for experiment 50 are in the expected order.
❔ Checking 24
🔴 Expected 18 events, experiment 24 has 19 events.
             type  task       time
0       TASKSTART     1   747067.0
1       TASKSTART     1   862421.0
2   TASKCOMPLETED     1  1372462.0
3       TASKSTART     2  1373473.0
4   TASKCOMPLETED     2  1461485.0
5       TASKSTART     3  1462567.0
6   TASKCOMPLETED     3  1571386.0
7       TASKSTART     4  1572559.0
8   TASKCOMPLETED     4  1606628.0
9       TASKSTART     5  1608063.0
10  TASKCOMPLETED     5  1636300.0
11      TASKSTART     6  1637567.0
12  TASKCOMPLETED     6  1671772.0
13      TASKSTART     7  1672782.0
14  TASKCOMPLETED     7  1694553.0
15      TASKSTART     8  1695470.0
16  TASKCOMPLETED     8  1719916.0
17      TASKSTART     9  1720886.0
18  TASKCOMPLETED     