In [1]:
import os
import pandas as pd
# import numpy as np
from tqdm import tqdm
from datetime import datetime, timedelta

In [2]:
# Load the raw event log: a UTF-16 encoded, semicolon-separated export.
input_df = pd.read_csv(
    '../sample_data/MAB_test_run_G5-G8_204080-21.11.30 - Copy.csv', sep=';', encoding="utf_16")
input_df.head()

Unnamed: 0,DateTime,IdRFID,IdLabel,unitLabel,eventDuration,sense1duration,sense1Events,senseRFIDrecords,reinforce1value,reinforce1Total,reinforce1Account,outFuncLabel,outLabel,SystemMsg,MsgValue1,MsgValue2,MsgValue3
0,#ID-Device,Sorter1.1,0.0,0.0,0.0,Sorter,,,,,,,,,,,
1,#ID-Device,Sorter1.2,0.0,0.0,0.0,Sorter,,,,,,,,,,,
2,#ID-Device,Sorter1.3,0.0,0.0,0.0,Sorter,,,,,,,,,,,
3,#Sorter,Sorter1,,,,,,,,,,,,,,,
4,#RfidCondition,CondMod1,,,,,,,,,,,,,,,


In [3]:
def initial_cleaning(input_df):
    """Sort the event log chronologically and derive time columns.

    The caller's frame is left untouched: all work happens on a copy, so the
    function can safely be given a filtered slice of another DataFrame.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Event log whose 'DateTime' column holds serial date-times that can
        be cast to float.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by time with a fresh 0..n-1 index, in which
        'Timestamp' holds POSIX seconds and 'DateTime' holds datetime values.
    """
    cleaned = input_df.copy()

    # sort the values since for some reason observations sometimes mixed in time
    cleaned['DateTime'] = cleaned['DateTime'].astype(float)
    cleaned = cleaned.sort_values(by='DateTime').reset_index(drop=True)

    # Serial day count -> naive datetime -> POSIX seconds.
    # NOTE: datetime.timestamp() treats a naive datetime as local time, so the
    # 'Timestamp' values depend on the timezone of the machine running this.
    cleaned['Timestamp'] = cleaned['DateTime'].apply(
        lambda serial: from_ordinal(serial).timestamp())
    # Converted back with the same local-time convention.
    cleaned['DateTime'] = cleaned['Timestamp'].apply(
        lambda seconds: datetime.fromtimestamp(seconds))

    return cleaned


def from_ordinal(ordinal, _epoch=datetime(1899, 12, 30)):
    """Turn a serial day count into a ``datetime``.

    Parameters
    ----------
    ordinal : float or int
        Serial date-time: the number of days elapsed since ``_epoch``;
        a fractional part is carried over as a fraction of a day.
    _epoch : datetime
        Day zero of the count.
        NOTE: the serial values are shifted two days backwards relative
        to 01-01-1900, which is why the default is 30-12-1899.

    Returns
    -------
    datetime
        ``_epoch`` moved forward by ``ordinal`` days.
    """
    elapsed = timedelta(days=ordinal)
    return _epoch + elapsed

In [4]:
# Drop the configuration rows of the export: they carry a '#'-prefixed tag
# (e.g. '#ID-Device', '#Sorter') in the 'DateTime' column instead of a time.
# .copy() hands later cells an independent frame, so assigning columns to it
# does not operate on a slice of the raw data (SettingWithCopyWarning).
is_header_row = input_df['DateTime'].astype(str).str.startswith('#')
input_df = input_df[~is_header_row].copy()

In [5]:
# Sort the log by time and derive the 'Timestamp' column (see initial_cleaning).
# NOTE(review): this rebinds input_df to the cleaned frame, whose 'DateTime' column
# now holds datetime values; re-running this cell without first re-running the load
# and filter cells presumably fails at the float cast in initial_cleaning — confirm.
input_df = initial_cleaning(input_df)

In [6]:
# Every distinct animal label present in the log (missing labels excluded), sorted.
ids = input_df['IdLabel'].dropna().unique()
ids.sort()

# Label -> RFID lookup built from the distinct, fully populated (label, RFID) pairs.
ids_dict = {
    label: rfid
    for label, rfid in input_df[['IdLabel', 'IdRFID']].drop_duplicates().dropna().values
}
ids_dict

{'G7': '041A7166B9',
 'G5': '041A717119',
 'G6': '041A7170F1',
 'G8': '041A71661F'}

In [7]:
def _event_timestamps(session_df, message):
    """Return the 'Timestamp' of every row whose 'SystemMsg' equals `message`.

    The result is re-indexed from 0 so that the n-th event of one kind lines
    up with the n-th event of another kind in element-wise arithmetic.
    """
    return session_df['Timestamp'][session_df['SystemMsg'] == message]\
        .reset_index(drop=True)


# One trial table per (animal, session) is collected here and concatenated
# once after the loop. DataFrame.append / Series.append are deprecated (and
# removed in pandas 2.0), and appending inside the loop re-copies all
# previously accumulated rows on every iteration.
session_frames = []

for animal_id in tqdm(ids):
    is_animal = input_df['IdLabel'] == animal_id
    indices_start = input_df[is_animal & (input_df['SystemMsg'] == 'start exp')].index
    indices_end = input_df[is_animal & (input_df['SystemMsg'] == 'end exp')].index

    # NOTE: assumes every 'start exp' has a matching 'end exp'; an unterminated
    # session raises IndexError below.
    for session_i in range(len(indices_start)):
        ind_start = indices_start[session_i]
        ind_end = indices_end[session_i]
        # Index labels are used as positions here; this relies on input_df
        # having a fresh 0..n-1 index (initial_cleaning resets it).
        subj_data = input_df.iloc[ind_start:ind_end+1, :].reset_index(drop=True)

        # Rows announcing the start of a trial (non-string messages never match).
        cndtn = subj_data['SystemMsg'].apply(
            lambda x: isinstance(x, str) and x.startswith('start trial'))
        # Largest value of the third space-separated token of the
        # 'start trial ...' messages (presumably the trial number — confirm).
        total_trials = subj_data['SystemMsg'][cndtn].apply(lambda x: int(x.split(' ')[2])).max()
        total_outcomes = (subj_data['SystemMsg'] == 'Reward?').sum()

        wait_poke_ts = _event_timestamps(subj_data, 'wait poke')

        trial_start_ts = subj_data['Timestamp'][cndtn]\
            .reset_index(drop=True)
        trial_start_ts.name = 'trialStart'

        trial_end_ts = _event_timestamps(subj_data, 'start iti')
        if len(trial_start_ts) != len(trial_end_ts):
            # Trial starts and 'start iti' rows do not pair up: use the
            # 'end exp' time as an additional trial end.
            trial_end_ts = pd.concat(
                [trial_end_ts, _event_timestamps(subj_data, 'end exp')],
                ignore_index=True)
        trial_end_ts.name = 'trialEnd'

        if len(wait_poke_ts) != len(trial_start_ts):
            # Drop the last 'wait poke' when the counts differ.
            wait_poke_ts = wait_poke_ts[:len(wait_poke_ts)-1]

        start_latency = trial_start_ts - wait_poke_ts
        start_latency.name = 'startLatency'

        trial_duration = trial_end_ts - trial_start_ts
        trial_duration.name = 'trialDuration'

        # The decision payload is parsed positionally: second space-separated
        # token, then the 3rd and 5th characters of the third token.
        decision = subj_data['MsgValue1'][subj_data['SystemMsg'] == 'decision:']\
            .reset_index(drop=True)
        decision_n = decision.apply(lambda x: x.split(' ')[1])
        decision_n.name = 'decisionNumber'
        decision_pos = decision.apply(lambda x: x.split(' ')[2][2])
        decision_pos.name = 'decisionPosition'
        decision_img = decision.apply(lambda x: x.split(' ')[2][4])
        decision_img.name = 'decisionImage'

        decision_ts = _event_timestamps(subj_data, 'decision:')
        decision_latency = decision_ts - trial_start_ts
        decision_latency.name = 'decisionLatency'

        # Outcome per trial as a boolean: True where the logged value is the string 'True'.
        reward = subj_data['MsgValue1'][subj_data['SystemMsg'] == 'Reward?']\
            .reset_index(drop=True)
        reward.name = 'reward'
        reward = reward == 'True'

        reward_ready_ts = _event_timestamps(subj_data, 'reward ready')
        reward_collected_ts = _event_timestamps(subj_data, 'reward collected')

        # Latencies exist only for rewarded trials; re-index them onto those
        # trials so the join below places them in the right rows.
        reward_latency = reward_collected_ts - reward_ready_ts
        reward_latency.index = reward[reward].index
        reward_latency.name = 'rewardLatency'

        session_out = pd.concat(
            [trial_start_ts, trial_end_ts, trial_duration, start_latency,
             decision_n, decision_pos, decision_img, decision_latency, reward],
            axis=1)
        session_out = session_out.join(reward_latency)

        if total_trials != total_outcomes:
            # Trial count and outcome count disagree: keep only the first
            # total_trials-1 rows (presumably drops an unfinished last trial — confirm).
            session_out = session_out.iloc[:total_trials-1, :]

        session_out['trial'] = session_out.index + 1
        session_out['animalID'] = animal_id
        session_out['session'] = session_i + 1

        session_frames.append(session_out)

# Single concatenation with a fresh 0..n-1 index; fall back to an empty frame
# when no session was found (pd.concat rejects an empty list).
final_output = (
    pd.concat(session_frames, ignore_index=True)
    if session_frames else pd.DataFrame({})
)

100%|██████████| 4/4 [00:00<00:00,  5.71it/s]


In [8]:
# Put the columns in reporting order: identifiers first, then timing,
# decision and reward fields.
final_output = final_output.loc[:, [
    'animalID', 'session', 'trial',
    'trialStart', 'trialEnd', 'trialDuration', 'startLatency',
    'decisionNumber', 'decisionPosition', 'decisionImage', 'decisionLatency',
    'reward', 'rewardLatency',
]]

In [15]:
# Sanity check: show the epoch-second trial starts as datetimes.
# NOTE(review): pd.to_datetime(unit='s') reads the values as seconds since the UTC epoch,
# while initial_cleaning produced them via datetime.timestamp() on naive datetimes
# (local-time interpretation); on a non-UTC machine the clock times shown here would be
# offset from the logged ones — confirm.
pd.to_datetime(final_output['trialStart'], unit='s')

0     2021-11-30 16:15:32.404000000
1     2021-11-30 16:16:01.759998976
2     2021-11-30 16:16:34.412002048
3     2021-11-30 16:17:09.381001984
4     2021-11-30 16:17:30.321000960
                   ...             
484   2021-11-30 20:26:33.989996800
485   2021-11-30 20:26:46.581000192
486   2021-11-30 20:27:20.651000064
487   2021-11-30 20:27:38.241003008
488   2021-11-30 20:27:53.345001984
Name: trialStart, Length: 489, dtype: datetime64[ns]

In [16]:
# Raw 'trialStart' column as stored in final_output (float seconds).
final_output['trialStart']

0      1.638289e+09
1      1.638289e+09
2      1.638289e+09
3      1.638289e+09
4      1.638289e+09
           ...     
484    1.638304e+09
485    1.638304e+09
486    1.638304e+09
487    1.638304e+09
488    1.638304e+09
Name: trialStart, Length: 489, dtype: float64