# Create splits
```
input: trialdata.csv
output: splits folder
```
`trialdata.csv` is created by psiturk when you download the data file. It contains the experiment data collected by the task stored in ```Experiment code```.

The splits folder will contain one named folder per subject. Test data is filtered out by inspecting the user id. Incomplete data is filtered out by inspecting the number of finished games. Each subject folder contains data that can be fitted to a model by running ```Model code/matlab wrapper/auto_fit.sh``` on it. After model fitting is complete, you can create paramsMatrix further down in this notebook.

In [None]:
# Path of the trial data CSV that load_data() reads
input_file = "../data/trialdata.csv"
# Root folder under which save_raw_splits() creates the raw/ and splits/ folders
output_folder = "../data"

In [None]:
import pandas as pd
import json

# Load trialdata and put it in a dictionary from user id to list of data
def load_data(filename):
    """Read a trial data CSV and group its decoded events per participant.

    Parameters
    ----------
    filename : str
        Path to a header-less CSV. Its four columns are read as
        'participant_id', 'i', 'ts' and 'info', where 'info' holds one
        JSON-encoded event per row.

    Returns
    -------
    dict
        Maps every kept participant id to the list of its decoded events,
        sorted by each event's 'event_time'. Participants appear in the
        order in which they first occur in the file. Ids containing
        "debug", "test", "noas" or "null" (case-insensitive) are dropped
        and reported on stdout. Rows without a participant id are ignored.
    """
    # Read ids as text so that purely numeric ids do not break the .lower() check.
    df = pd.read_csv(filename, header=None, names=['participant_id','i','ts','info'],
                     dtype={'participant_id': str})
    result = {}
    # One groupby pass instead of re-filtering the whole frame per participant;
    # sort=False keeps participants in first-appearance order.
    for participant_id, rows in df.groupby('participant_id', sort=False):
        # Skip debug data by filtering in name
        if any(part in participant_id.lower() for part in ["debug", "test", "noas", "null"]):
            print("drop " + participant_id)
            continue
        events = [json.loads(e) for e in rows['info']]
        result[participant_id] = sorted(events, key=lambda e: e['event_time'])
    return result

# Parse the trial data once; data_dict maps participant id -> event list sorted by event_time
data_dict = load_data(input_file)

In [None]:
import os
import numpy as np

# Helper functions that encode boards and moves as integers for the model fitting pipeline
def encode_board(pieces):
    """Pack a board into a single integer bit mask.

    Bit ``i`` of the result is set exactly when ``pieces[i] == '1'``.
    The value is returned as a numpy int64.
    """
    set_bits = [1 << position for position, piece in enumerate(pieces) if piece == '1']
    total = np.sum(set_bits)
    return total.astype(np.int64)

def get_events_with_type(f, event_type):
    """Return, in order, the events of ``f`` whose 'event_type' is ``event_type``.

    Underscores and spaces are treated as the same character on both sides
    of the comparison, so 'end_game' matches 'end game'.
    """
    def has_wanted_type(event):
        return event['event_type'].replace('_', ' ') == event_type.replace('_', ' ')

    return list(filter(has_wanted_type, f))

# Make data more accessible
def get_parsed_data(data):
    """Turn one participant's event list into a list of per-move tuples.

    Parameters
    ----------
    data : list of dict
        The participant's events. Each event has an 'event_type', an
        'event_time' and an 'event_info' dict.

    Returns
    -------
    list of tuple
        One ``(bp, wp, tile, user_color, response_time)`` tuple per
        'user move' event whose 'event_info' contains 'bp'. The response
        time is the difference between the move's 'event_time' and that of
        the 'your turn' event at the same position, divided by 1000
        (presumably milliseconds to seconds - confirm against the task).

    Raises
    ------
    AssertionError
        If fewer than 36 or more than 37 'end game' events are present, or
        if the numbers of 'your turn' and 'user move' events differ.
    """
    n_games = len(get_events_with_type(data, 'end game'))
    # The checks raise explicitly instead of using `assert`, so they are not
    # stripped when Python runs with -O. The exception type stays
    # AssertionError because save_raw_splits catches exactly that.
    # A full task includes 37 games, but completing almost all of them is good enough
    if n_games < 36:
        raise AssertionError(f"user only finished {n_games} games")
    if n_games > 37:
        raise AssertionError(f"user completed too many games? {n_games}")
    your_turn_events = get_events_with_type(data, 'your turn')
    user_move_events = get_events_with_type(data, 'user move')
    if len(your_turn_events) != len(user_move_events):
        raise AssertionError("user quit in the middle of a turn")
    # Events are paired by position: the n-th 'your turn' belongs to the n-th 'user move'.
    return [(e['event_info']['bp'], e['event_info']['wp'], e['event_info']['tile'],
             e['event_info']['user_color'], (e['event_time'] - e_your_turn['event_time']) / 1000)
            for e_your_turn, e in zip(your_turn_events, user_move_events) if "bp" in e["event_info"]]

def ensure_dir(dirname):
    """Create the directory ``dirname`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched. Using ``exist_ok=True`` avoids the race
    between checking for the directory and creating it.
    """
    os.makedirs(dirname, exist_ok=True)

def _to_tsv_text(df):
    """Serialize ``df`` as tab-separated text without header, index or trailing newline."""
    options = dict(index=False, header=False, sep='\t')
    try:
        text = df.to_csv(None, lineterminator='\n', **options)
    except TypeError:
        # pandas < 1.5 only knows the keyword under its old name
        text = df.to_csv(None, line_terminator='\n', **options)
    # Drop the newline that to_csv puts after the last row
    return text[:-1]


def save_raw_splits(data_dict, root):
    """Write every complete participant's moves as raw and 5-fold split files.

    Parameters
    ----------
    data_dict : dict
        Maps a username to that participant's event list.
    root : str
        Folder under which ``raw/`` and ``splits/`` are created.

    For each participant accepted by ``get_parsed_data`` this writes
    ``raw/<name>.csv``, ``splits/<name>/data.csv`` and
    ``splits/<name>/1.csv`` .. ``5.csv``, where ``<name>`` is the username
    with ":" replaced by "-". The files are tab-separated without header.
    Columns: encoded black pieces, encoded white pieces, capitalized player
    color, ``2**tile``, response time, (split files only) group number 1-5,
    and the part of the username before the first ":". Participants rejected
    with an AssertionError are reported on stdout and skipped.
    """
    ensure_dir(root + '/raw/')
    ensure_dir(root + '/splits/')
    for username, data in data_dict.items():
        print(f"writing {username} {len(data)}", end='\r')
        try:
            parsed_data = get_parsed_data(data)
        except AssertionError as e:
            print(f"Incomplete {username}: {e}")
            continue
        filename = username.replace(":", "-")
        subject = username.split(':')[0]
        # Encode each move once; the raw file and all split files share these values.
        encoded = [(encode_board(bp), encode_board(wp), c.lower().capitalize(), 2**m, rt)
                   for bp, wp, m, c, rt in parsed_data]
        # Explicit column labels keep the frames well-formed even without any moves.
        raw_df = pd.DataFrame([row + (subject,) for row in encoded], columns=range(6))
        with open(root + '/raw/' + filename + '.csv', 'w') as f:
            f.write(_to_tsv_text(raw_df))
        # Assign each move to one of five (nearly) equally sized random groups, numbered 1-5.
        group = (5 * (np.random.permutation(len(parsed_data))/len(parsed_data))).astype(int) + 1
        split_df = pd.DataFrame([row + (g, subject) for row, g in zip(encoded, group)],
                                columns=range(7))
        ensure_dir(root + '/splits/' + filename)
        with open(root + '/splits/' + filename + '/data.csv', 'w') as f:
            f.write(_to_tsv_text(split_df))
        for g in range(1,6):
            with open(root + '/splits/' + filename + '/' + str(g) + '.csv', 'w') as f:
                # Column 5 holds the group number
                f.write(_to_tsv_text(split_df[split_df[5] == g]))
    print(f"Done{' ' * 40}")

# Write the raw/ and splits/ folders under output_folder for every complete participant
save_raw_splits(data_dict, output_folder)

# Create paramsMatrix
```
input: fitted parameters
output: paramsMatrix.csv
```
The input structure is a root folder that contains folders with fitted parameters, like this:
```
data folder
 +- subject_id_1
 |  +- params1.csv
 |  +- params2.csv
 |  +- ...
 +- subject_id_2
 |  +- params1.csv
 ...
```
Use paramsMatrix.csv in the ```Calculate metrics and Elo``` notebook.

In [None]:
# Root folder that holds one sub-folder of fitted parameter files per subject
data_folder = "../data/splits"
# CSV file that collects the first line of every parameter file
output_file = "../data/paramsMatrix.csv"

import os
import re

# Matches exactly "params<fold>.csv". The dot is escaped and the pattern is
# applied with re.fullmatch, so names such as "params1.csv.bak" or
# "params1xcsv" are no longer picked up.
parampattern = r"params(\d+)\.csv"

# Each entry is [subject, fold number, first line of the parameter file]
lines = []
num_subjects = 0
for subject_id in os.listdir(data_folder):
    subdir = os.path.join(data_folder, subject_id)
    if not os.path.isdir(subdir):
        # We're looking for directories only. Skip everything else
        continue
    num_subjects += 1
    for filename in os.listdir(subdir):
        filepath = os.path.join(subdir, filename)
        match = re.fullmatch(parampattern, filename)
        if not match or not os.path.isfile(filepath):
            # Skip anything that isn't a params<fold>.csv file
            continue
        with open(filepath) as infile:
            first_line = infile.readline().strip()
        # Folder names carry "-" where the subject id had ":".
        # NOTE(review): this maps every "-" back, so it assumes ids contain no "-" of their own.
        lines.append([subject_id.replace("-", ":"), int(match.group(1)), first_line])
# Sort by subject, then by numeric fold
lines.sort()
with open(output_file, "w") as targetfile:
    # Write the header: an unnamed index column, subject, fold and ten numbered columns
    targetfile.write(",subject,fold,")
    targetfile.write(",".join(str(item) for item in range(1, 11)))
    targetfile.write("\n")
    # Write content, prefixing each row with its running index
    for row_index, line in enumerate(lines):
        targetfile.write(",".join(str(item) for item in [row_index, *line]))
        targetfile.write("\n")
print(f"Found {num_subjects} subjects. Wrote {len(lines)} lines to {output_file}")