
Shall we try the BIDS scheme for all behavioral data as well? The HED annotation scheme seems quite elaborate and rather unintuitive to me: https://bids-specification.readthedocs.io/en/stable/99-appendices/03-hed.html#annotating-events-by-categories

In [None]:
import pandas as pd
import numpy as np
import glob
import os
import itertools
import seaborn as sns
import zipfile
import matplotlib.pyplot as plt
import shutil
import multiprocessing as mp
import re

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

In [None]:
# MSIT response buttons (key = value of correct_response):
# r = 1, g = 2, y = 3

In [None]:
class prepBehavior(object):
    ''' To run, simply prepBehavior(subject_n, session_n, task_name).process()
    
    The idea here is to extract/generate two types of files: 
    - One used to do proper behavioral analyses/modelling later on (a csv with a row per trial and many additional columns)
    - One used for the fMRI data analyses, which follows the bids convention (one row per event, with columns [onset, trial_type, duration])
    
    Which events need to be extracted for the fMRI data analyses will depend on the fMRI modelling later on. 
    Currently it extracts all event types (in the behavioral files) except for pulses; 
    but it takes no effort in categorizing stimuli or anything like that (so all stimuli types appear as "stimulus"). 
    The only exception are responses, which are mapped to response_left and response_right so we can easily
    check for lateralization in M1.
    
    - For the learning tasks, I coded up some plots as well to do a brief visual quality check.  
    '''
    
    def __init__(self, sub, ses, task, plot=True, sourcedata_dir='../sourcedata'):
        
        self.sub = sub
        self.ses = ses
        self.task = task
        self.plot = plot
        
        self.save_directories = {'events': '../derivatives/behavior/sub-{}/ses-{}/func'.format(self.sub, self.ses),
                                 'behavior': '../derivatives/event_files/sub-{}/ses-{}/func'.format(self.sub, self.ses)}
    
    def make_directories(self):
        for _, directory in self.save_directories.items():
            os.makedirs(directory, exist_ok=True)
    
    def find_event_files(self):
        #self.run_fns = sorted(glob.glob('../sourcedata/sub-{sub}/ses-{ses}/task*_data/sub-{sub}_*task-{task}*_block-*_events.tsv'.format(sub=self.sub, ses=self.ses, task=self.task)))
        #print(f'../sourcedata/zipdata/sub-{self.sub}/ses-{self.ses}/sub-{self.sub}_ses-{self.ses}_data_tmpunzip/RESOURCES/task*_data/sub-{self.sub}_*task-{self.task}*_block-*_events.tsv')
        self.run_fns = sorted(glob.glob(f'../sourcedata/zipdata/sub-{self.sub}/ses-{self.ses}/sub-{self.sub}_ses-{self.ses}_data_tmpunzip/sub-{self.sub}_ses-{self.ses}_data/RESOURCES/task*_data/sub-{self.sub}_*task-{self.task}*_block-*_events.tsv'))
        
        # sub 009 rbrevl task data saved weirdly, must manually input
        if self.sub == '009' and self.ses == 'rbrevl':
            if self.task == 'ReferenceBack':
                self.run_fns = ['../sourcedata/zipdata/sub-009/ses-rbrevl/sub-009_ses-rbrevl_data_tmpunzip/sub-009_ses-rbrevl_data/RESOURCES/task_data/sub-009_ses-MRI_task-ReferenceBack_datetime-20210302-083106_events.tsv',
                               '../sourcedata/zipdata/sub-009/ses-rbrevl/sub-009_ses-rbrevl_data_tmpunzip/sub-009_ses-rbrevl_data/RESOURCES/task_data/sub-009_ses-MRI_task-ReferenceBack_datetime-20210302-084621_events.tsv']
            elif self.task == 'reversal-learning':
                self.run_fns = ['../sourcedata/zipdata/sub-009/ses-rbrevl/sub-009_ses-rbrevl_data_tmpunzip/sub-009_ses-rbrevl_data/RESOURCES/task_data/sub-009_task-reversal-learning_datetime-20210302-075736_events.tsv',
                               '../sourcedata/zipdata/sub-009/ses-rbrevl/sub-009_ses-rbrevl_data_tmpunzip/sub-009_ses-rbrevl_data/RESOURCES/task_data/sub-009_task-reversal-learning_datetime-20210302-081521_events.tsv']
        
        if len(self.run_fns) == 0:
            raise IOError("No data file found for sub {} task {}".format(self.sub, self.task))
        
        self.make_directories() # only make directories if event files are found
        
    def load_run_events(self):
        ''' loads event data of all found runs '''
        self.data = []
        for fn in self.run_fns:
            print(fn)
            data = pd.read_csv(fn, sep='\t')
            ## NB: _events-files of blocks > 1 contain ALL data, so the event file of block 3 also contains blocks 1 and 2
            # hence, we need to find the last pulse with phase = -1, which marks the beginning of the run
            if self.task == 'SST':
                
                ### SM added this ##
                last_block_start_idx = np.where(data['event_type'] == 'non_response_keypress')[0][-1]  # cut on last space
                data = data.iloc[last_block_start_idx:].reset_index(drop=True)

                first_pulse_idx = np.where((data['event_type'] == 'pulse') & pd.isnull(data.phase))[0][0]  # cut on first pulse, which marks the start of the block
                data = data.iloc[first_pulse_idx:].reset_index(drop=True)
                
                ### which used to be this, but this forgot about the dummy volumes
#                indx_last_block_start = np.where(np.isnan(data.phase))[0][-1]
#                data = data.loc[indx_last_block_start:].reset_index(drop=True)
                data['correct_response_direction'] = data.direction
                #data = data.rename(columns={'correct_response': 'correct_response_direction'})
                self.data.append(data)
            elif self.task == 'MSIT':
                
#                 last_block_start_idx = np.where(data['event_type'] == 'non_response_keypress')[0][-1]  # cut on last space
                last_block_start_idx = np.where((data['event_type'] == 'non_response_keypress') & (data['response'] == 'space'))[0][-1]
                data = data.iloc[last_block_start_idx:].reset_index(drop=True)
                first_pulse_idx = np.where((data['event_type'] == 'pulse') & pd.isnull(data.phase))[0][0]  # cut on first pulse, which marks the start of the block
                data = data.iloc[first_pulse_idx:].reset_index(drop=True)
                
                ### which used to be this, but this forgot about the dummy volumes
#                indx_last_block_start = np.where(np.isnan(data.phase))[0][-1]
#                data = data.loc[indx_last_block_start:].reset_index(drop=True)
#                 data['correct_response_direction'] = data.direction
                data['correct_response_direction'] = data.correct_response
                #data = data.rename(columns={'correct_response': 'correct_response_direction'})
                self.data.append(data)
                
#                 indx_last_block_start = np.where(np.isnan(data.phase))[0][-1]
#                 data = data.loc[indx_last_block_start:].reset_index(drop=True)
#                 data['correct_response_direction'] = data.correct_response
#                 #data = data.rename(columns={'correct_response': 'correct_response_direction'})
#                 self.data.append(data)
            else:
                indx_last_block_start = np.where(data.phase == -1)[0][-1]
                data = data.loc[indx_last_block_start:].reset_index(drop=True)
                data = data.rename(columns={'correct_response': 'correct_response_direction'})
                self.data.append(data)
    
    def process_learning_run(self, data):
        
        # collect trials: take 'response' event whenever present in that trial, otherwise take 'stimulus'
        trials_with_responses = data.loc[(data['event_type'] == 'response') & (pd.notnull(data['rt_too_slow']))]  # the latter gets rid of secondary button presses
        trials_with_responses_idx = trials_with_responses['trial_nr'].unique()

        trials_with_stimuli_idx = data.loc[data['event_type'] == 'stimulus', 'trial_nr'].unique()
        trials_without_responses_idx = trials_with_stimuli_idx[~np.in1d(trials_with_stimuli_idx, trials_with_responses_idx)]
        trials_without_responses = data.loc[(data['event_type'] == 'stimulus') & (data['trial_nr'].isin(trials_without_responses_idx))]
#         trials_without_responses = data.loc[data['trial_nr'].isin(trials_without_responses_idx)]

#         trials_without_responses = data.loc[data.trial_nr.isin(trials_without_responses_idx)]
        
        trials_stim_response = pd.concat([trials_with_responses, trials_without_responses], axis=0).sort_values('onset')

        ## useful columns
        trials_stim_response = trials_stim_response[['block_nr', 'trial_nr', 'event_type', 'onset', 'response', 'stimulus_symbol_left', 'stimulus_symbol_right', 'correct_response_direction', 'p_win_left', 'p_win_right', 'rt', 'choice_direction', 'choice_outcome', 'cue']]
        trials_stim_response.correct_response_direction = trials_stim_response.correct_response_direction.replace({0: 'left', 1: 'right'}).astype('category')
        trials_stim_response.choice_direction = trials_stim_response.choice_direction.replace({0: 'left', 1: 'right'}).astype('category')
        trials_stim_response['ease'] = np.round(np.abs(trials_stim_response['p_win_left']-trials_stim_response['p_win_right']),3) ## difficulty inverted
        trials_stim_response['accuracy'] = trials_stim_response['choice_direction'] == trials_stim_response['correct_response_direction']
        
#         responses = data.loc[data.event_type == 'response', ['block_nr', 'trial_nr', 'onset', 'response', 'stimulus_symbol_left', 'stimulus_symbol_right', 'correct_response_direction', 'p_win_left', 'p_win_right', 'rt', 'choice_direction', 'choice_outcome', 'cue']]
#         responses.correct_response_direction = responses.correct_response_direction.replace({0: 'left', 1: 'right'}).astype('category')
#         responses.choice_direction = responses.choice_direction.replace({0: 'left', 1: 'right'}).astype('category')
#         responses['ease'] = np.round(np.abs(responses['p_win_left']-responses['p_win_right']),3) ## difficulty inverted
#         responses['accuracy'] = responses['choice_direction'] == responses['correct_response_direction']
        
        return trials_stim_response
    
    def process_sst_run(self, data):

        data['onset'] = data['onset'] - data.loc[data['event_type']=='pulse','onset'].values[0]
        data = data.loc[(data.event_type!='pulse')]         # remove pulses
        data = data.loc[~pd.isnull(data['trial_nr'])]       # remove stuff without trial numbers
        data = data.loc[data.event_type.isin(['stimulus', 'response'])]  # only stim & responses
        data = data.loc[(data.null_trial == 0)]             # remove null trials
        data = data.loc[(data.phase == 1)]                  # Only include responses given in phase 1
        ## find trials with responses
        has_response = data.groupby('trial_nr')['choice_key'].apply(lambda x: np.any(pd.notnull(x)))
        has_response.name = 'has_response'
        data = pd.merge(data, has_response, left_on='trial_nr', right_index=True)  # merge back in
        # make is correct column
        data['correct'] = np.nan
        data.loc[(data.direction == 1) & (data.choice_key == 'r'), 'correct'] = 1
        data.loc[(data.direction == 0) & (data.choice_key == 'b'), 'correct'] = 1
        is_correct = data.groupby('trial_nr')['correct'].apply(lambda x: np.any(pd.notnull(x)))
        is_correct.name = 'is_correct'
        data = pd.merge(data, is_correct, left_on='trial_nr', right_index=True)  # merge back in
        # categorize fs / ss /go /gf
        data['trial_type'] = np.nan
        data.loc[(data['stopsig_trial'] == 1) & (data['has_response'] == 1) , 'trial_type'] = 'fs'
        data.loc[(data['stopsig_trial'] == 1) & (data['has_response'] == 0), 'trial_type'] = 'ss'
        data.loc[(data['stopsig_trial'] == 0), 'trial_type'] = 'go'
        data.loc[(data['stopsig_trial'] == 0) & (data['has_response'] == 0), 'trial_type'] = 'gf'
        data['correct'] = 0
        data.loc[(data.direction == 1) & (data.choice_key == 'r'), 'correct'] = 1
        data.loc[(data.direction == 0) & (data.choice_key == 'b'), 'correct'] = 1
        data.loc[(data['stopsig_trial'] == 0) & (data['has_response'] == 1) & (data['is_correct'] == False), 'trial_type'] = 'go_INC'
        # categorize response left / response right
        data = pd.merge(data, data.loc[data.event_type=='response', ['trial_nr', 'rt']], on='trial_nr', how='outer')
        data.loc[(data['event_type'] == 'response') & (data.response == 'b'), 'trial_type'] = 'response_right'
        data.loc[(data['event_type'] == 'response') & (data.response == 'r'), 'trial_type'] = 'response_left'
        responses = data
#         print(responses)
        
        return responses
        
    def process_msit_run(self, data):
#         data['accuracy'] = 0
#         data.loc[(data.correct_response == 1.0) & (data.choice_key == 'r'),'accuracy'] = 1
#         data.loc[(data.correct_response == 2.0) & (data.choice_key == 'g'),'accuracy'] = 1
#         data.loc[(data.correct_response == 3.0) & (data.choice_key == 'y'),'accuracy'] = 1
        if self.sub =='025':
            data.event_type[data.response=='b'] = 'response'
        responses = data.loc[data.event_type == 'response', ['block_nr', 'trial_nr', 'onset', 'response', 'stimuli', 'condition', 'choice_key', 'rt','correct_response']]
        responses.correct_response = responses.correct_response.replace({1: 'index', 2: 'middle', 3:'ring'}).astype('category')
        if self.sub =='025':
            print('subject 025')
            responses.choice_key[responses.response=='b'] = 'b'
            print(responses.choice_key)
            responses.choice_key = responses.choice_key.replace({'g': 'index', 'y': 'middle', 'b':'ring'}).astype('category')
        else:
            responses.choice_key = responses.choice_key.replace({'r': 'index', 'g': 'middle', 'y':'ring'}).astype('category')
        print(responses.correct_response)
        print(responses.choice_key)
        responses['accuracy'] = responses['choice_key'] == responses['correct_response']
        
        return responses
    
    def process_rb_run(self,data):
        ''' Label the stimulus and response events of one ReferenceBack run.

        data: events of one run (one row per event). NB: WMstim, stimulus, color and onset
              are overwritten on the passed frame before the rows are filtered.
        Returns the stimulus/response rows with the added columns prev_color, switch,
        choice_outcome, accuracy, trial_type and condition. Response rows that end up
        without a trial_type are dropped.

        NOTE(review): several columns below are created as float NaN or bool and then receive
        strings / NaN, and the frame is a .loc selection when it is written to; recent pandas
        versions warn about both - confirm against the pandas version in use.
        '''
            
        # carry the trial information forward onto the rows that follow (e.g. the response rows)
        data['WMstim'] = data['WMstim'].ffill()
        data['stimulus'] = data['stimulus'].ffill()
        data['color'] = data['color'].ffill()
        
        # express onsets relative to the first pulse
        data['onset'] = data['onset'] - data.loc[data['event_type']=='pulse','onset'].values[0]
   
        data = data.loc[data.event_type.isin(['stimulus','response'])]
        stimuli_idx = data.event_type == 'stimulus'

        # Previous color? (color of the preceding stimulus row; NaN for the first stimulus)
        data.loc[stimuli_idx,'prev_color'] = np.nan
        data.loc[stimuli_idx,'prev_color'] = data.loc[stimuli_idx,'color'].shift(1)
        
        # 'b' is scored as a "match" answer and 'r' as a "mismatch" answer
        data['choice_outcome'] = np.nan
        data.loc[(data['stimulus'] == data['WMstim']) & (data['choice_key'] == 'b'), 'choice_outcome'] = 1.0 #correct match
        data.loc[(data['stimulus'] != data['WMstim']) & (data['choice_key'] == 'r'), 'choice_outcome'] = 1.0 #correct mismatch
        data.loc[(data['stimulus'] != data['WMstim']) & (data['choice_key'] == 'b'), 'choice_outcome'] = 0.0 #incorrect match
        data.loc[(data['stimulus'] == data['WMstim']) & (data['choice_key'] == 'r'), 'choice_outcome'] = 0.0 #incorrect mismatch

        # Switch or repeat? (the first stimulus has no previous color and therefore counts as 'switch')
        data.loc[stimuli_idx&(data['color'] == data['prev_color']), 'switch'] = 'repeat'
        data.loc[stimuli_idx&(data['color'] != data['prev_color']), 'switch'] = 'switch'
        
        # accuracy is only defined on rows with a choice_outcome; the backward fill then copies it
        # onto the rows before them (presumably the stimulus row of the same trial - TODO confirm)
        data['accuracy'] = data['choice_outcome'] == 1.0
        data.loc[pd.isnull(data['choice_outcome']), 'accuracy'] = np.nan 
        data['accuracy'] = data['accuracy'].bfill()

        # trials 0 and 129 are excluded from all labels (presumably the first and last trial of a run - TODO confirm)
        data.loc[(data['trial_nr'] == 0) | (data['trial_nr'] == 129), 'accuracy'] = np.nan


        # trial_type = color (red: Ref, blue: Com) x switch (Rep/Swi) x stimulus equals WMstim (Same/Different)
        data['trial_type'] = np.nan
        data.loc[(data['color'] == 'red') & (data['switch'] == 'repeat') & (data['stimulus'] == data['WMstim']), 'trial_type'] = 'RefRepSame' 
        data.loc[(data['color'] == 'red') & (data['switch'] == 'repeat') & (data['stimulus'] != data['WMstim']), 'trial_type'] = 'RefRepDifferent' 
        data.loc[(data['color'] == 'red') & (data['switch'] == 'switch') & (data['stimulus'] == data['WMstim']), 'trial_type'] = 'RefSwiSame' 
        data.loc[(data['color'] == 'red') & (data['switch'] == 'switch') & (data['stimulus'] != data['WMstim']), 'trial_type'] = 'RefSwiDifferent' 
        data.loc[(data['color'] == 'blue') & (data['switch'] == 'repeat') & (data['stimulus'] == data['WMstim']), 'trial_type'] = 'ComRepSame' 
        data.loc[(data['color'] == 'blue') & (data['switch'] == 'repeat') & (data['stimulus'] != data['WMstim']), 'trial_type'] = 'ComRepDifferent' 
        data.loc[(data['color'] == 'blue') & (data['switch'] == 'switch') & (data['stimulus'] == data['WMstim']), 'trial_type'] = 'ComSwiSame' 
        data.loc[(data['color'] == 'blue') & (data['switch'] == 'switch') & (data['stimulus'] != data['WMstim']), 'trial_type'] = 'ComSwiDifferent' 
        data.loc[(data['trial_nr'] == 0) | (data['trial_nr'] == 129), 'trial_type'] = np.nan
        #copy trial_type column for behavioral analyses (before removing trial information for incorrect trials)
        data['condition'] = data['trial_type'].ffill()
        data.loc[(data['trial_nr'] == 0) | (data['trial_nr'] == 129), 'condition'] = np.nan

        # code error trials: stimulus rows of inaccurate trials lose their condition label
        data.loc[(data.event_type=='stimulus') & (data.accuracy == False), 'trial_type'] = 'error'

    
        # response rows are labelled by the side of the response
        data.loc[(data.event_type=='response') & (data.choice_direction == 0), 'trial_type'] = 'response_left'
        data.loc[(data.event_type=='response') & (data.choice_direction == 1), 'trial_type'] = 'response_right'
        
        # carry the stimulus-row values forward onto the following rows
        data['switch'] = data['switch'].ffill()
        data['prev_color'] = data['prev_color'].ffill()

        
        # code error trials rather than left/right
        #data.loc[(data.event_type=='response') & (data.choice_outcome == 0), 'trial_type'] = 'error'
        #data.loc[(data.event_type=='response') & (data.choice_outcome == 1), 'trial_type'] = 'correct'


        # drop response rows that did not get a trial_type (i.e. choice_direction was neither 0 nor 1)
        data = data.loc[~((pd.isnull(data['trial_type']))&(data.event_type=='response'))]
        data.loc[(data['trial_nr'] == 0) | (data['trial_nr'] == 129), 'choice_outcome'] = np.nan
        
        return data

    def plot_learning(self, responses, f=None, ax=None):
        ''' Draw the quick visual quality check for the learning tasks on a 2x2 grid of axes.

        responses: trial-wise table with the columns rt, accuracy, cue and ease
                   (it is copied, the passed frame is not modified)
        f, ax: existing figure and 2x2 array of axes; a new 8x5 figure is created when f is None
        Returns (f, ax).

        Left column: RT histograms, with the RTs of incorrect trials mirrored to negative values.
        Right column: RT quantiles scaled by the proportion of (in)correct trials.
        Top row: split by cue, bottom row: split by ease.
        '''
        if f is None:
            f, ax = plt.subplots(2,2, figsize=(8,5))

        for ax_ in ax.ravel():
            ax_.grid()

        # Replace missing cues by '' BEFORE collecting the cue levels: sorting a mix of NaN and
        # strings raises a TypeError. Working on a copy keeps the caller's frame (often a slice) untouched.
        responses = responses.copy()
        responses['cue'] = responses['cue'].astype(object).where(pd.notnull(responses['cue']), '')

        all_eases = sorted(responses.ease.unique())
        # keys are cue names or str(ease); any other ease value raises a KeyError below
        color_map = {'0.2': 'red',
                     '0.4': 'blue',
                     '0.6': 'green',
                     'SPD': 'darkred',
                     'ACC': 'darkgreen',
                     '': 'orange'}

        all_cues = sorted(responses.cue.unique())
        if len(all_cues) == 1:
            # a single cue level is treated as "no cue" (plotted under the '' key)
            all_cues = ['']

        ## by cue (or overall)
        for cue in all_cues:
            for accuracy in [True, False]:
                rts = responses.loc[(responses.accuracy==accuracy) & (responses.cue==cue), 'rt']
                rts = rts if accuracy else rts*-1  # mirror errors to the negative side
                # only one histogram gets a legend entry, showing the overall accuracy
                show_label = accuracy and cue in ('SPD', '')
                sns.histplot(rts, color=color_map[cue], ax=ax[0,0], kde=True, label='Accuracy = {:.2f}'.format(responses.accuracy.mean()) if show_label else None)
        ax[0,0].legend()

        for ease in all_eases:
            color = color_map[str(ease)]
            for accuracy in [True, False]:
                rts = responses.loc[(responses.ease==ease) & (responses.accuracy==accuracy), 'rt']
                rts = rts if accuracy else rts*-1
                sns.histplot(rts, color=color, ax=ax[1,0], kde=True, label='Ease = {}'.format(ease) if accuracy else None)
        ax[1,0].legend()

        ## quantile plots
        qps = np.arange(.1, .91, .2)
        for cue in all_cues:
            quantiles = responses.loc[responses.cue==cue].groupby('accuracy')['rt'].apply(lambda x: np.quantile(x, qps))
            acc = responses.loc[responses.cue==cue, 'accuracy'].mean()
            # only index the accuracy groups that can exist
            if acc > 0:
                ax[0,1].plot(quantiles.loc[True], qps*acc, linestyle='-', marker='o', color=color_map[cue], label='Accuracy = {:.2f}'.format(acc))
            if acc < 1:
                ax[0,1].plot(quantiles.loc[False], qps*(1-acc), linestyle='-', marker='o', color=color_map[cue])

        ax[0,1].legend()
        ax[0,1].set_ylim(0, .9)
        ax[0,1].set_xlabel('RT')
        ax[0,1].set_ylabel('Cum. Def. Prob.')

        for ease in all_eases:
            color = color_map[str(ease)]

            acc = responses.loc[responses.ease == ease, 'accuracy'].mean()
            quantiles = responses.loc[responses.ease == ease].groupby('accuracy')['rt'].apply(lambda x: np.quantile(x, qps))
            if acc > 0:
                ax[1,1].plot(quantiles.loc[True], qps*acc, linestyle='-', marker='o', color=color, label='Accuracy = {:.2f}'.format(acc))
            if acc < 1:
                ax[1,1].plot(quantiles.loc[False], qps*(1-acc), linestyle='-', marker='o', color=color)
            ax[1,1].set_ylim(0, .9)
            ax[1,1].set_xlabel('RT')
            ax[1,1].set_ylabel('Cum. Def. Prob.')
        ax[1,1].legend()

        return f, ax
    
    # This is where we get task-specific
    def process(self):
        ''' Process the behavioral data of this subject/session/task and write all output files.

        Per run, an events file (tsv) is written below ../derivatives/event_files; the runs are
        concatenated into a single behavior file below ../derivatives/behavior. For the learning
        tasks a quality-control pdf is written as well when self.plot is True.
        Tasks other than ReferenceBack, SST, MSIT, reversal-learning and SAT-learning fall
        through without writing anything.
        Returns self. find_event_files() raises IOError when no data file is found.
        '''

        print(f'Processing sub {self.sub} ses {self.ses} task {self.task}')

        self.find_event_files()
        self.load_run_events()

        # Both templates point into the folders that find_event_files() creates for self.ses.
        # (The ReferenceBack/SST/MSIT event files used to go to a hard-coded session folder.)
        events_fn = '../derivatives/event_files/sub-{sub}/ses-{ses}/func/sub-{sub}_ses-{ses}_task-{task}_run-{run}_events.tsv'
        behavior_fn = '../derivatives/behavior/sub-{sub}/ses-{ses}/func/sub-{sub}_ses-{ses}_task-{task}_behavior.tsv'.format(sub=self.sub, ses=self.ses, task=self.task)

        # This is where we get task-specific
        if self.task == 'ReferenceBack':
            responses = []
            for run, data in enumerate(self.data, start=1):
                ## subtract first pulse from onset
                data['onset'] -= data.loc[(data.trial_nr == -1) & (data.event_type == 'pulse'), 'onset'].values[0]

                run_responses = self.process_rb_run(data)

                ## For fmri analyses, save the labelled events only
                event_data = run_responses[['onset','trial_nr','trial_type', 'duration']]
                save_fn = events_fn.format(sub=self.sub, ses=self.ses, task='rb', run=run)
                event_data.to_csv(save_fn, sep='\t', index=False)

                # for behavioral analyses, save the full table
                responses.append(run_responses)

            ## Save to csv for behavior
            responses = pd.concat(responses)
            responses.to_csv(behavior_fn, sep='\t', index=False)

        elif self.task == 'SST':
            responses = []
            for run, data in enumerate(self.data, start=1):
                ## subtract first pulse from onset
                data['onset'] -= data.loc[(np.isnan(data.trial_nr)) & (data.event_type == 'pulse'), 'onset'].values[0]

                run_responses = self.process_sst_run(data)

                ## For fmri analyses, keep the rows that received a trial_type
                event_data = run_responses.loc[run_responses.trial_type.isin(['response_left','response_right','go','fs','ss','gf','go_INC']), ['onset', 'trial_type','trial_nr','duration']]
                save_fn = events_fn.format(sub=self.sub, ses=self.ses, task='sst', run=run)
                event_data.to_csv(save_fn, sep='\t', index=False)

                # for behavioral analyses, do some other things & save more extensive things
                responses.append(run_responses)

                if self.plot:
                    # look at staircase of behavioural data
                    plotDat = pd.concat(responses)
                    tmp = plotDat.loc[(plotDat['current_ssd']>0) & (plotDat['block_nr']==run), ['trial_nr', 'staircase_id', 'current_ssd']]
                    # collapse consecutive repetitions of the same SSD
                    tmp_ssd = [k for k,_g in itertools.groupby(tmp['current_ssd'])]
                    plt.plot(range(0,len(tmp_ssd)),tmp_ssd)
                    plt.xlabel('Trial')
                    plt.ylabel('SSD')

            ## Save to csv for behavior
            responses = pd.concat(responses)
            responses.to_csv(behavior_fn, sep='\t', index=False)

        elif self.task == 'MSIT':
            responses = []
            for run, data in enumerate(self.data, start=1):
                ## subtract first pulse from onset
                data['onset'] -= data.loc[(np.isnan(data.trial_nr)) & (data.event_type == 'pulse'), 'onset'].values[0]

                run_responses = self.process_msit_run(data)

                ## For fmri analyses, extract & save simple task info for now (index, middle, ring responses, conditions)
                # NOTE(review): accuracy and response labels here always use the r/g/y key mapping, whereas
                # process_msit_run() uses g/y/b for sub 025 - confirm that the event files of sub 025 are as intended
                data['accuracy'] = 0
                data.loc[(data.correct_response == 1.0) & (data.choice_key == 'r'),'accuracy'] = 1
                data.loc[(data.correct_response == 2.0) & (data.choice_key == 'g'),'accuracy'] = 1
                data.loc[(data.correct_response == 3.0) & (data.choice_key == 'y'),'accuracy'] = 1
                # every response is written twice: once as correct/incorrect, once as the finger used
                data_acc = data.copy()
                data_acc.loc[data.event_type == 'response', 'event_type'] = data['accuracy'].replace({1:'correct',0:'incorrect'}).dropna()

                data.loc[data.event_type == 'response', 'event_type'] = data['choice_key'].replace({'r': 'response_index', 'g': 'response_middle', 'y': 'response_ring'}).dropna() # get index, middle, ring responses
                data.loc[data.event_type == 'stimulus', 'event_type'] = data['condition'].replace({1:'con',2:'simon',3:'flanker',4:'inc'}).dropna()

                data = data.loc[data.event_type.isin(['con', 'simon','flanker','inc','response_index', 'response_middle', 'response_ring', 'timing_feedback','correct','incorrect']), ['onset', 'event_type', 'duration']].rename(columns={'event_type': 'trial_type'})
                data_acc = data_acc.loc[data_acc.event_type.isin(['correct','incorrect']), ['onset', 'event_type', 'duration']].rename(columns={'event_type': 'trial_type'})
                data = pd.concat([data,data_acc]).sort_values(by=['onset'])

                save_fn = events_fn.format(sub=self.sub, ses=self.ses, task='msit', run=run)
                data.to_csv(save_fn, sep='\t', index=False)

                # for behavioral analyses, do some other things & save more extensive things
                responses.append(run_responses)

            ## Save to csv for behavior
            responses = pd.concat(responses)
            responses.to_csv(behavior_fn, sep='\t', index=False)

        elif self.task == 'reversal-learning' or self.task == 'SAT-learning':
            task_renamed = 'rlsat' if self.task == 'SAT-learning' else 'revl'
            responses = []
            for run, data in enumerate(self.data, start=1):
                ## subtract first pulse from onset
                data['onset'] -= data.loc[(data.trial_nr == -1) & (data.event_type == 'pulse'), 'onset'].values[0]

                run_responses = self.process_learning_run(data)

                ## For fmri analyses, extract & save simple task info for now (left/right responses)
                data.loc[data.event_type == 'response', 'event_type'] = data['choice_direction'].replace({0: 'response_left', 1: 'response_right'}).dropna() # get left_right responses
                if not pd.isnull(data['cue']).all():
                    data.loc[data.event_type == 'cue', 'event_type'] = data['cue'].replace({'SPD': 'cue_SPD', 'ACC': 'cue_ACC'}).dropna() # get the cue types

                data = data.loc[data.event_type.isin(['stimulus', 'cue', 'cue_SPD', 'cue_ACC', 'response_left', 'response_right', 'feedback']), ['onset', 'event_type', 'duration', 'trial_nr']].rename(columns={'event_type': 'trial_type'})  # keep track of trial_nr!

                save_fn = events_fn.format(sub=self.sub, ses=self.ses, task=task_renamed, run=run)
                data.to_csv(save_fn, sep='\t', index=False)

                # for behavioral analyses, do some other things & save more extensive things
                responses.append(run_responses)

            ## Save to csv for behavior
            responses = pd.concat(responses)
            responses.to_csv(behavior_fn, sep='\t', index=False)

            ## plot?
            if self.plot:
                responses = responses.loc[pd.notnull(responses.rt)]

                # one group of two subplot rows for "across blocks" plus one per run,
                # separated by a thin spacer row
                n_runs = len(self.data)
                subplots_heights = [1,1,.3] * (n_runs) + [1,1]
                n_subplots_rows = len(subplots_heights)
                ## where to plot? ignore every third, so [0,1,3,4,6,7]
                populate_rows = np.delete(np.arange(n_subplots_rows), np.arange(2, n_subplots_rows, 3))
                fig = plt.figure(figsize=(8,(4+n_runs*4)))
                gs = fig.add_gridspec(n_subplots_rows, 2, height_ratios=subplots_heights, hspace=0)

                ax = np.array([[fig.add_subplot(gs[y,x]) for x in [0,1]] for y in populate_rows])
                fig, _ = self.plot_learning(responses, f=fig, ax=ax[:2,:2])
                for run in range(1, n_runs + 1):
                    fig, _ = self.plot_learning(responses.loc[responses.block_nr == run], f=fig, ax=ax[slice(run*2, run*2+2),:2])

                # Add row titles: group 0 is "across blocks", groups 1..n_runs are the runs
                # (n_runs + 1 groups in total, so that the last run gets its title too)
                for group in range(n_runs + 1):
                    ax_ = fig.add_subplot(gs[(group*3):(group*3+2),:2])
                    ax_.axis('off')
                    if group == 0:
                        ax_.set_title('Across blocks')
                    else:
                        ax_.set_title('Run {}'.format(group))
                fig.tight_layout()
                plot_save_fn = '../derivatives/quality_control_plots/sub-{sub}/ses-{ses}/func/sub-{sub}_ses-{ses}_task-{task}_desc-behavior.pdf'.format(sub=self.sub, ses=self.ses, task=task_renamed)
                os.makedirs(os.path.dirname(plot_save_fn), exist_ok=True)
                fig.savefig(plot_save_fn)

        print(f'sub {self.sub} ses {self.ses} task {self.task} processed')
        return self

In [None]:
def make_tmpunzip(sub, ses, zipdata_dir='/home/Public/trondheim/sourcedata/zipdata'):
    '''Extract the zipped data of one subject/session into a temporary folder.

    The archive ``<zipdata_dir>/sub-<sub>/ses-<ses>/sub-<sub>_ses-<ses>_data.zip``
    is extracted into the sibling folder ``sub-<sub>_ses-<ses>_data_tmpunzip``.
    If that folder already exists, nothing is done.

    Parameters
    ----------
    sub : str
        Subject label as used in the folder names (e.g. '002').
    ses : str
        Session label as used in the folder names.
    zipdata_dir : str, optional
        Root folder holding the ``sub-*/ses-*`` zip archives.

    Raises
    ------
    FileNotFoundError
        If the target folder does not exist yet and the zip archive is missing.
    '''
    session_dir = os.path.join(zipdata_dir, f'sub-{sub}', f'ses-{ses}')
    zip_path = os.path.join(session_dir, f'sub-{sub}_ses-{ses}_data.zip')
    unzip_dir = os.path.join(session_dir, f'sub-{sub}_ses-{ses}_data_tmpunzip')

    # An existing folder is taken as "already unzipped"; its contents are not checked
    if not os.path.exists(unzip_dir):
        print(f'unzipping dicoms for sub {sub} ses {ses}..')
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(unzip_dir)

def del_tmp(sub, ses, zipdata_dir='/home/Public/trondheim/sourcedata/zipdata'):
    '''Remove the temporary unzip folder of one subject/session, if present.

    Deletes ``<zipdata_dir>/sub-<sub>/ses-<ses>/sub-<sub>_ses-<ses>_data_tmpunzip``
    recursively. Does nothing when the folder does not exist.

    Parameters
    ----------
    sub : str
        Subject label as used in the folder names (e.g. '002').
    ses : str
        Session label as used in the folder names.
    zipdata_dir : str, optional
        Root folder holding the ``sub-*/ses-*`` zip archives.
    '''
    unzip_dir = os.path.join(zipdata_dir, f'sub-{sub}', f'ses-{ses}',
                             f'sub-{sub}_ses-{ses}_data_tmpunzip')

    if os.path.exists(unzip_dir):
        print(f'Deleting temporary dicom folder for sub-{sub} ses-{ses}')
        shutil.rmtree(unzip_dir)


In [None]:
def find_all_to_process(return_tuples=False, bids_dir='../derivatives/bids'):
    '''List the unique (sub, ses, task) combinations that have functional runs.

    Globs ``<bids_dir>/sub-*/ses-*/func/sub-*_ses-*_task-*_run-*.nii.gz`` and
    parses subject, session and task from every path.

    Parameters
    ----------
    return_tuples : bool, optional
        If True, return a list of ``(sub, ses, task)`` tuples instead of a
        list of dicts with keys 'sub', 'ses' and 'task'.
    bids_dir : str, optional
        Root folder that is searched for functional runs.

    Returns
    -------
    list
        Unique combinations, sorted by sub, ses and task. Empty if no file is found.
    '''
    all_files = sorted(glob.glob(os.path.join(
        bids_dir, 'sub-*', 'ses-*', 'func', 'sub-*_ses-*_task-*_run-*.nii.gz')))

    # Raw string so that \d and \S are regex escapes rather than (invalid) string escapes
    regex = re.compile(r'.*sub-(?P<sub>\d+)/ses-(?P<ses>\S+)/func/sub-.*_ses-.*_task-(?P<task>\S+)_run-.*_.*\.nii\.gz')

    all_files_dict = []
    for fn in all_files:
        match = regex.match(fn)
        if match is None:
            # A file can satisfy the glob but not the regex (e.g. non-numeric subject label)
            print(f'skipping {fn}: file name does not match the expected pattern')
            continue
        all_files_dict.append(match.groupdict())

    # sort_values would raise a KeyError on a frame without columns
    if not all_files_dict:
        return []

    all_files_df = pd.DataFrame(all_files_dict).sort_values(['sub', 'ses', 'task']).drop_duplicates()

    dict_list = all_files_df.to_dict(orient='records')
    if return_tuples:
        return [tuple(x.values()) for x in dict_list]
    return dict_list


def find_new_to_process():
    '''Return the entries of find_all_to_process() that have no events file yet.

    An entry counts as new when its run-1 events file under
    ``../derivatives/event_files`` does not exist. Entries whose session is
    'anatomical' are never returned.

    Returns
    -------
    list of tuple
        One tuple per remaining entry, holding the values of that entry.
    '''
    events_template = '../derivatives/event_files/sub-{sub}/ses-{ses}/func/sub-{sub}_ses-{ses}_task-{task}_run-1_events.tsv'

    def needs_processing(entry):
        # Anatomical sessions are excluded regardless of which files exist
        if entry['ses'] == 'anatomical':
            return False
        return not os.path.exists(events_template.format(**entry))

    return [tuple(entry.values()) for entry in find_all_to_process() if needs_processing(entry)]


In [None]:
# Everything that still lacks an events file, restricted to entries whose
# second element (x[1]) equals 'rlsat'
to_process = [x for x in find_new_to_process() if x[1] == 'rlsat']
# to_process = find_all_to_process(True)
# to_process = [x for x in to_process if 'anatomical' not in x] # remove anatomical sessions
#to_process = [x for x in to_process if ('025' and 'msit') not in x] # only msit sessions
#to_process = [x for x in to_process if ('rbrevl') not in x] # only msit sessions
to_process
# print(to_process)

In [None]:
# run

# The MRI data and the behavioral files use different task names; this maps
# the MRI name (key) to the behavioral name (value)
task_name_mapping = {
    'sst': 'SST',
    'msit': 'MSIT',
    'rlsat': 'SAT-learning',
    'rb': 'ReferenceBack',
    'revl': 'reversal-learning',
}

for sub, ses, task in to_process:

#     if sub=='009' and ses=='rbrevl': # fix this at some point
#         continue

    print(f'sub: {sub} ses: {ses} task: {task}')

    behavior_task = task_name_mapping[task]
    prepBehavior(sub=sub, ses=ses, task=behavior_task).process()
    


In [None]:
# how many responses per subject and trial type?

# Imported locally so this cell does not depend on another cell having been run first
from pathlib import Path

for sub in range(2,20):
    print('\n')
    sub_id = str(sub).zfill(3)
    print('sub-'+sub_id)

    # `sep` is an argument of pd.read_csv, not of Path
    filename = Path('../derivatives/behavior/sub-'+sub_id+'/ses-rbrevl/func/sub-'+sub_id+'_ses-rbrevl_task-ReferenceBack_behavior.tsv')

    if filename.exists():

        dat = pd.read_csv(filename, sep='\t')
        print('trials: ' +str(len(pd.unique(dat['trial_nr'])))+' \n')
        # one line with the trial type, one line with the number of rows of that type
        for trial_type in pd.unique(dat['trial_type']):
            print(trial_type)
            print(len(dat.loc[(dat['trial_type'] == trial_type)]))

    else:
        print('file not found')

In [None]:
# how many trials per subject and condition?

# if less than 64, because non-responses removed? why is number of trials still 258 then?

# Imported locally so this cell does not depend on another cell having been run first
from pathlib import Path

# NOTE(review): the subject range is derived from len(to_process), i.e. the number of
# entries left to process, and assumes consecutive subject numbers starting at 2 -- confirm
for sub in range(2, len(to_process)+2):
    print('\n')
    sub_id = str(sub).zfill(3)
    print('sub-'+sub_id)

    # `sep` is an argument of pd.read_csv, not of Path
    filename = Path('../derivatives/behavior/sub-'+sub_id+'/ses-rbrevl/func/sub-'+sub_id+'_ses-rbrevl_task-ReferenceBack_behavior.tsv')

    if filename.exists():

        dat = pd.read_csv(filename, sep='\t')
        print('trials: ' +str(len(pd.unique(dat['trial_nr'])))+' \n')
        # one line with the condition, one line with the number of rows of that condition
        for trial_type in pd.unique(dat['condition']):
            print(trial_type)
            print(len(dat.loc[(dat['condition'] == trial_type)]))

    else:
        print('file not found')

In [None]:
# how many errors per subject?

# how many trials per subject and condition?

# if less than 64, because non-responses removed? why is number of trials still 258 then?

from pathlib import Path

# NOTE(review): the subject range is derived from len(to_process), i.e. the number of
# entries left to process, and assumes consecutive subject numbers starting at 2 -- confirm
for sub in range(2, len(to_process)+2):
    #print('\n')
    sub_id = str(sub).zfill(3)
    print('sub-'+sub_id)
    # `sep` is an argument of pd.read_csv, not of Path
    filename = Path('../derivatives/behavior/sub-'+sub_id+'/ses-rbrevl/func/sub-'+sub_id+'_ses-rbrevl_task-ReferenceBack_behavior.tsv')

    if filename.exists():
        dat = pd.read_csv(filename, sep='\t')
        n_errors = int((dat['trial_type'] == 'error').sum())
        # as before, nothing is printed for subjects without any 'error' rows
        if n_errors > 0:
            print(n_errors)

    else:
        print('file not found')
    

In [None]:
# Process the ReferenceBack task of sub-003, ses-rbrevl
prepBehavior(sub='003', ses='rbrevl', task='ReferenceBack').process()


In [None]:
# Process the ReferenceBack task of sub-003, ses-rbrevl
prepBehavior(sub='003', ses='rbrevl', task='ReferenceBack').process()


In [None]:
# Process the reversal-learning task of sub-024, ses-rbrevl
prepBehavior(
    sub='024',
    ses='rbrevl',
    task='reversal-learning',
).process()

In [None]:
# Process the reversal-learning task of sub-002, ses-wmrevl
prepBehavior(
    sub='002',
    ses='wmrevl',
    task='reversal-learning',
).process()

### What about SSRTs in SST?

In [None]:
import pandas as pd
import numpy as np
import glob

#tmp = pd.read_csv('../derivatives/behavior/sub-002/ses-sstmsit/func/sub-002_ses-sstmsit_task-SST_behavior.tsv', sep='\t')

# Read every SST behavior file of ses-sstmsit and stack them into a single frame
all_behavs = sorted(glob.glob('../derivatives/behavior/sub*/ses-sstmsit/func/sub*_ses-sstmsit_task-SST_*'))
dfs = [pd.read_csv(fn, sep='\t') for fn in all_behavs]

# The RT column is stored as 'rt_y' in the files; expose it as 'rt'
df = pd.concat(dfs).rename(columns={'rt_y': 'rt'})
df.head(10)

In [None]:
# Take an explicit copy: new columns are added below, and writing them into a
# slice of df triggers pandas' SettingWithCopyWarning
responses = df.loc[df['event_type'] == 'response'].copy()

# A response is correct when its side matches correct_response_direction
# (left is coded as 1, right as 0, as in the replace() mapping below)
correct_left = (responses.trial_type == 'response_left') & (responses.correct_response_direction == 1)
correct_right = (responses.trial_type == 'response_right') & (responses.correct_response_direction == 0)
responses['correct_response'] = correct_left | correct_right
responses['response_direction'] = responses.trial_type.replace({'response_left': 1, 'response_right': 0})

# Attach the response information to every event row of the same trial and subject.
# NOTE(review): the merge key does not include block_nr; if trial_nr restarts in each
# block this would pair rows across blocks -- confirm trial_nr is unique per subject
df = pd.merge(left=df, right=responses[['trial_nr', 'subject', 'correct_response', 'response_direction']], on=['trial_nr', 'subject'], how='outer')
df

In [None]:
# Keep one row per stimulus event; copy so that later in-place edits of df
# do not write into a slice of the merged frame
df = df.loc[df.event_type == 'stimulus'].copy()

In [None]:
# Go and failed-stop trials are the trial types these checks are applied to
is_go_or_fs = df['trial_type'].isin(['go', 'fs'])

# Flag those trials when the RT is above 1.2, below .15, or missing
exclude_for_RT_idx = is_go_or_fs & ((df['rt'] > 1.2) | (df['rt'] < .15) | pd.isnull(df['rt']))
# NOTE(review): the message says "of go trials" but the denominator is the number of
# rows in df -- confirm which is intended
print('Excluding {:.3f}% of go trials based on RT'.format(exclude_for_RT_idx.sum()/df.shape[0]*100))

# How many errors?
# `== False` is deliberate: rows with a missing correct_response are not counted as errors
error_trial = is_go_or_fs & (df['correct_response']==False)
print('{:.3f}% of trials were errors'.format(error_trial.sum()/df.shape[0]*100))

In [None]:
# Negative stop-signal delays are replaced by NaN
negative_ssd = df['current_ssd'] < 0
df.loc[negative_ssd, 'current_ssd'] = np.nan

In [None]:
# First non-missing current_ssd of every subject x block combination
df.groupby(['subject', 'block_nr'])['current_ssd'].first()

In [None]:
def get_ssrt(x):
    '''Estimate the stop-signal reaction time (SSRT) for the trials in x.

    The SSRT is the go RT at the percentile given by the failed-stop rate,
    minus the mean stop-signal delay.

    Parameters
    ----------
    x : pd.DataFrame
        One row per trial, with the columns 'trial_type', 'stopsig_trial',
        'current_ssd', 'rt' and 'correct_response'.

    Returns
    -------
    float
        The SSRT, in the units of 'rt' and 'current_ssd'. NaN when x holds no
        stop trials or no correct go trial with an RT, so that one empty group
        does not abort a whole groupby().apply().
    '''
    n_stop_trials = np.sum(x.stopsig_trial == 1)
    # `== True` is deliberate: rows with a missing correct_response are left out
    go_rts = x.loc[(x.trial_type == 'go') & pd.notnull(x.rt) & (x.correct_response == True), 'rt']
    if n_stop_trials == 0 or len(go_rts) == 0:
        return np.nan

    failed_stop_rate = np.sum(x.trial_type == 'fs') / n_stop_trials
    # mean SSD (missing values are skipped)
    mean_ssd = x.current_ssd.mean(skipna=True)
    # go RT at failed_stop_rate percentile
    go_rt_at_percentile = np.percentile(go_rts, failed_stop_rate * 100)
    return go_rt_at_percentile - mean_ssd
    
def get_descriptives(x):
    '''Summarise the stop-signal task behavior for the trials in x.

    Parameters
    ----------
    x : pd.DataFrame
        One row per trial, with the columns 'trial_type', 'stopsig_trial',
        'current_ssd', 'rt', 'correct_response' and 'response_direction'.

    Returns
    -------
    pd.Series
        Median/mean go RT, median/mean failed stop RT, mean SSD, SSRT,
        go accuracy, proportion of inhibited stop trials and proportion of
        go trials without a response direction.
    '''
    is_go = x.trial_type == 'go'
    is_failed_stop = x.trial_type == 'fs'

    # Go RTs exclude trials without an RT; failed stop RTs are taken as they are
    go_rts = x.loc[is_go & pd.notnull(x.rt), 'rt']
    failed_stop_rts = x.loc[is_failed_stop, 'rt']

    # Share of stop-signal trials that were failed stops
    failed_stop_rate = np.sum(is_failed_stop) / np.sum(x.stopsig_trial == 1)

    # Go trials for which no response direction was recorded
    misses = np.sum(is_go & pd.isnull(x.response_direction)) / np.sum(is_go)

    return pd.Series({
        'Median go RT': np.median(go_rts),
        'Mean go RT': go_rts.mean(),
        'Median failed stop RT': np.median(failed_stop_rts),
        'Mean failed stop RT': failed_stop_rts.mean(),
        'SSD': x.current_ssd.mean(skipna=True),
        'SSRT': get_ssrt(x),
        'Go accuracy': x.loc[is_go, 'correct_response'].mean(),
        'Perc. inhibition': 1 - failed_stop_rate,
        'Misses (go)': misses,
    })
    
# Descriptives for every subject x block combination, displayed without truncation
per_run_groups = df.groupby(['subject', 'block_nr'])
aggregated_per_run = per_run_groups.apply(get_descriptives)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(aggregated_per_run)

In [None]:
# Descriptives per subject, with 'subject' turned back into a regular column
per_subject = df.groupby(['subject']).apply(get_descriptives)
aggregated = per_subject.reset_index()

aggregated

In [None]:
# Express error rate and inhibition in percent.
# NOTE: 'Perc. inhibition' is rescaled in place, so re-running this cell without
# recomputing `aggregated` multiplies it by 100 again.
aggregated['Error rate'] = (1-aggregated['Go accuracy'])*100
aggregated['Perc. inhibition'] *= 100

# numeric_only=True keeps a non-numeric 'subject' column from breaking mean()/std()
means = aggregated.mean(numeric_only=True)
stds = aggregated.std(numeric_only=True)

# Two rows (mean and SD across subjects), one column per measure
table1 = pd.concat([means, stds], axis=1).T
table1.index = ['mean', 'SD']

table1 = table1[['Median go RT', 'Mean go RT', 'Median failed stop RT', 'Mean failed stop RT', 'SSD', 'SSRT', 'Go accuracy', 'Perc. inhibition', 'Error rate']]
table1
# #table1 = pd.DataFrame(index=[0], columns=means.columns)
# for row in range(means.shape[0]):
#     for col in range(means.shape[1]):
#         table1.iloc[row, col] = '{:.2f} ({:.2f})'.format(means.iloc[row, col], stds.iloc[row, col])
# #table1 = table1.reindex(['se', 'me'])
# table1 = table1[['Median go RT', 'Mean failed stop RT', 'Error rate', 'SSRT', 'SSD', 'Perc. inhibition']]
# # table1.to_csv('figures/table_behavior.tsv', sep='\t')
# table1