# Process events files and generate trials for sessions after 20240416

In [1]:
import os
import shutil

import session_processing_helper as helper
import utils

import pandas as pd

In [2]:
# Root directory of the behavior data (absolute, machine-specific path).
data_dir = '/Users/rebekahzhang/data/behavior_data'
# Path components that select the data set handled by this notebook.
# `exp` and `cohort` are also reused later to name the final sessions log.
period = 'post_meta_change'
exp = "exp2"
cohort = "cohort_7"
# Folder that the cells below pass to the helper/utils functions.
data_folder = os.path.join(data_dir, period, exp, cohort)
print(data_folder)

/Users/rebekahzhang/data/behavior_data/post_meta_change/exp2/cohort_7


# Quality Control

## Check session folders have both meta and events

In [3]:
# Run the file check once, then report every non-empty category in a fixed order.
missing_events, missing_meta, empty_meta, empty_events = helper.check_session_files(data_folder)

# (heading, frame) pairs in the order they should be reported.
file_checks = [
    ("\nSessions missing meta files:", missing_meta),
    ("\nSessions missing events files:", missing_events),
    ("\nSessions with empty meta files:", empty_meta),
    ("\nSessions with empty events files:", empty_events),
]
flagged_checks = [(heading, frame) for heading, frame in file_checks if not frame.empty]

if flagged_checks:
    print("\nFile check results:")
    for heading, frame in flagged_checks:
        print(heading)
        display(frame)
else:
    print("\nAll sessions have non-empty meta and events files.")


All sessions have non-empty meta and events files.


In [4]:
# Hand the folders of sessions flagged as missing a meta file to utils.delete_folders.
# NOTE(review): presumably deletes them from data_folder on disk — confirm in utils before running.
utils.delete_folders(missing_meta.dir.tolist(), data_folder)

no sessions to delete


In [5]:
# Hand the folders of sessions flagged as missing an events file to utils.delete_folders.
# NOTE(review): presumably deletes them from data_folder on disk — confirm in utils before running.
utils.delete_folders(missing_events.dir.tolist(), data_folder)

no sessions to delete


In [6]:
# Hand the folders of sessions flagged as having an empty meta file to utils.delete_folders.
# NOTE(review): presumably deletes them from data_folder on disk — confirm in utils before running.
utils.delete_folders(empty_meta.dir.tolist(), data_folder)

no sessions to delete


In [7]:
# Hand the folders of sessions flagged as having an empty events file to utils.delete_folders.
# NOTE(review): presumably deletes them from data_folder on disk — confirm in utils before running.
utils.delete_folders(empty_events.dir.tolist(), data_folder)

no sessions to delete


## Generate and save sessions log

generate session log using meta data from each session and add columns of basic info to each session

### re-run after every quality control step

In [8]:
# Build the session logs for data_folder; the helper returns two frames
# (all sessions, training sessions). The later cleaning cells read both.
sessions_all, sessions_training = helper.generate_session_logs(data_folder)
# Preview the last rows of the training log.
sessions_training.tail()

555 sessions in total


Unnamed: 0,index,date,time,mouse,exp,training,rig,trainer,record,total_reward,total_trial,avg_tw,ending_code,dir,group,session
550,468,2025-03-17,13-13-38,RZ059,2,regular,rig1,Rebekah,True,875,283,8.47,miss,2025-03-17_13-13-38_RZ059,l,50
551,166,2025-03-18,11-12-43,RZ059,2,regular,rig1,Rebekah,True,900,433,2.55,reward,2025-03-18_11-12-43_RZ059,l,51
552,291,2025-03-19,12-19-18,RZ059,2,regular,rig1,Rebekah,True,900,271,5.41,reward,2025-03-19_12-19-18_RZ059,l,52
553,431,2025-03-20,13-48-44,RZ059,2,regular,rig1,Rebekah,True,900,315,5.55,reward,2025-03-20_13-48-44_RZ059,l,53
554,42,2025-03-21,10-35-48,RZ059,2,regular,rig1,Rebekah,True,355,99,10.72,miss,2025-03-21_10-35-48_RZ059,l,54


## Remove unwanted sessions
doesn't need to run when data folder is cleaned 
<br>
sessions_all needs to be regenerated after every cleaning step

### Remove crashed sessions
remove the sessions with ending_code==nan

In [9]:
# Sessions with no ending code recorded are treated as crashed and handed to utils.remove_sessions.
ended_nan_mask = sessions_all['ending_code'].isna()
sessions_ended_nan = sessions_all.loc[ended_nan_mask]
utils.remove_sessions(sessions_ended_nan, data_folder)

no sessions to delete


remove sessions that crashed

In [10]:
# Scan every training session's events file for a clean end-of-session marker.
#   sessions_crashed     -> sessions without exactly one (key == 'session', value == 0) event
#   problematic_sessions -> sessions whose events file could not be read or checked
# Row positions are collected during the loop and the frames are built once at
# the end, instead of growing a DataFrame with pd.concat on every hit.
crashed_positions = []
problematic_positions = []

for position, (_, session_info) in enumerate(sessions_training.iterrows()):
    try:
        events_path = utils.generate_events_path(data_folder, session_info)
        events = pd.read_csv(events_path, low_memory=False)
        session_end = events.loc[(events.key=='session') & (events.value==0)]

        if len(session_end) != 1:
            crashed_positions.append(position)

    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the scan,
        # and report the reason instead of silently discarding it.
        problematic_positions.append(position)
        print(f"{session_info.get('dir')}: {type(err).__name__}: {err}")

sessions_crashed = sessions_training.iloc[crashed_positions].reset_index(drop=True)
problematic_sessions = sessions_training.iloc[problematic_positions].reset_index(drop=True)

if len(problematic_sessions) > 0:
    print("cannot open: ")
    display(problematic_sessions)

if len(sessions_crashed) > 0:
    print("crashed sessions: ")
    display(sessions_crashed)

# Only claim success when nothing crashed AND every file could be checked.
if len(problematic_sessions) == 0 and len(sessions_crashed) == 0:
    print("all sessions are perfect! woohoo!")

all sessions are perfect! woohoo!


In [11]:
# Hand the sessions flagged as crashed by the scan above to utils.remove_sessions.
# NOTE(review): presumably deletes their folders from data_folder — confirm in utils before running.
utils.remove_sessions(sessions_crashed, data_folder)

no sessions to delete


### Check for short sessions

In [12]:
# Flag sessions with fewer trials than the threshold, or with no trial count at all.
short_threshold = 20
trial_counts = sessions_all['total_trial']
too_few_trials = trial_counts < short_threshold
count_missing = trial_counts.isna()
sessions_short = sessions_all.loc[too_few_trials | count_missing]

if len(sessions_short) == 0:
    print('no short sessions to be checked!')
else:
    display(sessions_short)

no short sessions to be checked!


remove short sessions if needed

In [13]:
# Hand every session in sessions_short (below-threshold or missing trial count) to utils.remove_sessions.
# This runs unconditionally on the whole frame, so inspect sessions_short first and only run when removal is wanted.
utils.remove_sessions(sessions_short, data_folder)

no sessions to delete


# Process Events

load session log

In [14]:
# Reload the training sessions log from the sessions_training.csv file in data_folder.
sessions_training = utils.load_data(os.path.join(data_folder, 'sessions_training.csv'))

In [15]:
# Process the raw events of every training session and write the processed
# events file next to it. Sessions that fail are collected (by row position)
# and reported together with the error that stopped them.
problematic_positions = []

for position, (_, session_info) in enumerate(sessions_training.iterrows()):
    try:
        events_processed_path = utils.generate_events_processed_path(data_folder, session_info)
        # Uncomment to skip sessions that already have a processed events file:
        # if os.path.isfile(events_processed_path):
        #     continue
        events = pd.read_csv(utils.generate_events_path(data_folder, session_info), low_memory=False)
        events = helper.process_events(session_info, events)
        # add_trial_time is applied separately to the events of each trial.
        events_processed = events.groupby('session_trial_num', group_keys=False).apply(helper.add_trial_time)
        events_processed.to_csv(events_processed_path)
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still stops the loop,
        # and keep the reason visible for debugging.
        problematic_positions.append(position)
        print(f"{session_info.get('dir')}: {type(err).__name__}: {err}")

problematic_sessions = sessions_training.iloc[problematic_positions].reset_index(drop=True)

if len(problematic_sessions) > 0:
    display(problematic_sessions)
else:
    print("all sessions are perfect! woohoo!")

all sessions are perfect! woohoo!


# Data set curation 

In [16]:
# Mice expected to have one session on every training date.
# Alternatives kept for reference: derive the list from the log, or use an earlier hard-coded list.
# mouse_list = utils.generate_mouse_list(sessions_all)
# mouse_list = ['RZ047','RZ049','RZ050','RZ051','RZ052','RZ053','RZ054','RZ055','RZ056']
mouse_list = ["RZ057", "RZ058", "RZ059", "RZ061", "RZ062", "RZ063",
                     "RZ065", "RZ067", "RZ068", "RZ069", "RZ070"]
print(mouse_list)

# Group the training log by date; the missing/multiple-session checks below iterate over these groups.
sessions_by_date = sessions_training.groupby('date')

['RZ057', 'RZ058', 'RZ059', 'RZ061', 'RZ062', 'RZ063', 'RZ065', 'RZ067', 'RZ068', 'RZ069', 'RZ070']


### Deal with missing sessions

In [17]:
# Collect every (date, mouse) pair for which an expected mouse has no session that day,
# visiting dates in group order and mice in mouse_list order.
missing_pairs = [
    (date, mouse)
    for date, data in sessions_by_date
    for mouse in mouse_list
    if (data['mouse'] == mouse).sum() == 0
]

for date, mouse in missing_pairs:
    print(f"on {date}, {mouse} has missing sessions")

no_missing_sessions = not missing_pairs
if no_missing_sessions:
    print("no missing sessions!")

on 2024-11-15, RZ069 has missing sessions
on 2024-11-15, RZ070 has missing sessions
on 2024-12-12, RZ059 has missing sessions
on 2025-02-06, RZ061 has missing sessions
on 2025-02-06, RZ062 has missing sessions
on 2025-02-06, RZ063 has missing sessions
on 2025-02-11, RZ069 has missing sessions
on 2025-02-12, RZ057 has missing sessions
on 2025-02-12, RZ058 has missing sessions
on 2025-02-12, RZ059 has missing sessions
on 2025-02-12, RZ061 has missing sessions
on 2025-02-12, RZ062 has missing sessions
on 2025-02-12, RZ063 has missing sessions
on 2025-02-12, RZ065 has missing sessions
on 2025-02-12, RZ067 has missing sessions
on 2025-02-12, RZ068 has missing sessions
on 2025-02-12, RZ069 has missing sessions
on 2025-02-13, RZ069 has missing sessions
on 2025-02-14, RZ057 has missing sessions
on 2025-02-14, RZ058 has missing sessions
on 2025-02-14, RZ059 has missing sessions
on 2025-02-14, RZ061 has missing sessions
on 2025-02-14, RZ062 has missing sessions
on 2025-02-14, RZ063 has missing s

In [18]:
# Inspect all sessions recorded on one of the flagged dates, ordered by mouse.
sessions_by_date.get_group('2024-12-12').sort_values('mouse')

Unnamed: 0,index,date,time,mouse,exp,training,rig,trainer,record,total_reward,total_trial,avg_tw,ending_code,dir,group,session
168,3,2024-12-12,10-26-58,RZ057,2,regular,rig7,Rebekah,False,700,267,5.02,reward,2024-12-12_10-26-58_RZ057,l,13
169,254,2024-12-12,10-28-34,RZ058,2,regular,rig6,Rebekah,False,255,73,18.51,miss,2024-12-12_10-28-34_RZ058,l,13
173,449,2024-12-12,11-33-29,RZ061,2,regular,rig4,Rebekah,False,700,382,2.32,reward,2024-12-12_11-33-29_RZ061,s,13
174,204,2024-12-12,11-35-30,RZ062,2,regular,rig5,Rebekah,False,700,706,0.93,reward,2024-12-12_11-35-30_RZ062,s,13
175,243,2024-12-12,11-38-49,RZ063,2,regular,rig7,Rebekah,False,700,349,2.28,reward,2024-12-12_11-38-49_RZ063,s,13
176,425,2024-12-12,12-10-07,RZ064,2,regular,rig6,Rebekah,False,700,558,1.11,reward,2024-12-12_12-10-07_RZ064,s,12
170,143,2024-12-12,10-33-05,RZ065,2,regular,rig4,Rebekah,False,700,273,3.25,reward,2024-12-12_10-33-05_RZ065,l,13
171,351,2024-12-12,10-38-58,RZ067,2,regular,rig2,Rebekah,False,700,375,2.13,reward,2024-12-12_10-38-58_RZ067,l,13
172,595,2024-12-12,10-40-12,RZ068,2,regular,rig3,Rebekah,False,585,438,2.51,miss,2024-12-12_10-40-12_RZ068,l,13
177,391,2024-12-12,12-25-15,RZ069,2,regular,rig4,Rebekah,False,650,231,6.74,miss,2024-12-12_12-25-15_RZ069,s,12


#### Back up the data folder first if you are worried about having to redo this

In [None]:
# Optional safety step before the stitching cells below, which overwrite and delete files.
# NOTE(review): presumably copies data_folder to a backup location — confirm in utils.
utils.backup(data_folder)

### Deal with multiple sessions

In [19]:
# Record every (date, mouse) pair that has more than one session on the same day.
# The two lists are kept in lockstep: days_to_stitch[i] belongs to mice_to_stitch[i].
days_to_stitch, mice_to_stitch = [], []

for date, data in sessions_by_date:
    for mouse in mouse_list:
        n_sessions = int((data['mouse'] == mouse).sum())
        if n_sessions <= 1:
            continue
        days_to_stitch.append(date)
        mice_to_stitch.append(mouse)
        print(f"on {date}, {mouse} has {n_sessions} sessions")

if len(days_to_stitch) == 0:
    print("no sessions to stitch!")

no sessions to stitch!


In [20]:
# Run it if session stitching is needed; nothing happens otherwise.
# For every flagged (date, mouse) pair, all later sessions of that day are
# stitched into the first one in a single run, however many there are.
# The first session's processed events file is overwritten with the stitched
# result and the folders of the later sessions are deleted.
if not days_to_stitch:
    print("no sessions to stitch!")
else:
    for d, m in zip(days_to_stitch, mice_to_stitch):
        day = sessions_by_date.get_group(d)
        sessions_to_stitch = day[day['mouse'] == m]

        session_infos = [sessions_to_stitch.iloc[i] for i in range(len(sessions_to_stitch))]
        session_paths = [
            utils.generate_events_processed_path(data_folder, info)
            for info in session_infos
        ]

        # Check every file before touching anything, so a pair is never half-stitched.
        if not all(os.path.exists(path) for path in session_paths):
            print("one of the sessions do not exist")
            continue

        stitched_session = pd.read_csv(session_paths[0])
        for path in session_paths[1:]:
            # NOTE(review): for 3+ sessions this feeds an already-stitched frame back in as
            # the first argument — confirm helper.stitch_sessions supports that.
            stitched_session = helper.stitch_sessions(stitched_session, pd.read_csv(path))
        # TODO: rename stitch_sessions to stitch_events; to follow the nomenclature,
        # "session" should be renamed to "events" throughout.

        # Write the stitched result first, and only then delete the merged-in sessions.
        stitched_session.to_csv(session_paths[0], index=False)
        for session_number, info in enumerate(session_infos[1:], start=2):
            shutil.rmtree(os.path.join(data_folder, info['dir']))
            print(f"{d} {m} session {session_number} deleted")

no sessions to stitch!


In [21]:
# Regenerate both session logs so they reflect any sessions removed or stitched above.
sessions_all, sessions_training = helper.generate_session_logs(data_folder)

# Preview the last rows of the refreshed training log.
sessions_training.tail()

555 sessions in total


Unnamed: 0,index,date,time,mouse,exp,training,rig,trainer,record,total_reward,total_trial,avg_tw,ending_code,dir,group,session
550,468,2025-03-17,13-13-38,RZ059,2,regular,rig1,Rebekah,True,875,283,8.47,miss,2025-03-17_13-13-38_RZ059,l,50
551,166,2025-03-18,11-12-43,RZ059,2,regular,rig1,Rebekah,True,900,433,2.55,reward,2025-03-18_11-12-43_RZ059,l,51
552,291,2025-03-19,12-19-18,RZ059,2,regular,rig1,Rebekah,True,900,271,5.41,reward,2025-03-19_12-19-18_RZ059,l,52
553,431,2025-03-20,13-48-44,RZ059,2,regular,rig1,Rebekah,True,900,315,5.55,reward,2025-03-20_13-48-44_RZ059,l,53
554,42,2025-03-21,10-35-48,RZ059,2,regular,rig1,Rebekah,True,355,99,10.72,miss,2025-03-21_10-35-48_RZ059,l,54


# Finalize sessions log

In [22]:
def correct_sessions_training(data_folder, save_log=True):
    """Rebuild the training sessions log from the processed events files.

    The log generated from meta data is merged (on ``dir``) with per-session
    basics recomputed from each session's processed events; the meta-derived
    ``index``, ``total_reward`` and ``total_trial`` columns are dropped,
    session numbers are reassigned per mouse, and a ``cohort`` column is added.

    Note: relies on the notebook-level globals ``cohort`` (column value and
    filename) and ``exp`` (filename).

    Args:
        data_folder: Folder holding the session folders of one cohort.
        save_log: When true, save the result in ``data_folder`` as
            ``sessions_training_{exp}_{cohort}.csv``.

    Returns:
        The corrected training sessions DataFrame.
    """
    _, sessions_training = helper.generate_session_logs(data_folder, save_logs=False)

    # Recompute the basics of each session from its processed events file.
    session_info_list = []
    for _, session_info in sessions_training.iterrows():
        events_processed_path = utils.generate_events_processed_path(data_folder, session_info)
        events_processed = pd.read_csv(events_processed_path, low_memory=False)
        session_basics = helper.get_session_basics(events_processed)
        # 'dir' is the merge key that ties the recomputed basics back to the log row.
        session_basics['dir'] = session_info['dir']
        session_info_list.append(session_basics)
    sessions_info = pd.DataFrame(session_info_list)

    corrected_sessions_training = pd.merge(sessions_training, sessions_info, on="dir")
    # Drop the meta-derived columns (the original list named 'total_reward' twice).
    # NOTE(review): the duplicate may have been meant to be another column, e.g. 'avg_tw' — confirm.
    corrected_sessions_training = corrected_sessions_training.drop(
        columns=['index', 'total_reward', 'total_trial']
    )
    corrected_sessions_training = corrected_sessions_training.groupby(
        'mouse', group_keys=False
    ).apply(helper.assign_session_numbers)
    corrected_sessions_training['cohort'] = cohort

    if save_log:
        utils.save_as_csv(
            df=corrected_sessions_training,
            folder=data_folder,
            filename=f'sessions_training_{exp}_{cohort}.csv',
        )
    return corrected_sessions_training

In [23]:
# Build the corrected training log and, with the default save_log=True, save it in data_folder.
sessions_training = correct_sessions_training(data_folder)

555 sessions in total


# Analyze events

In [24]:
# Reload the corrected training log, using the same filename pattern correct_sessions_training saves under.
sessions_training = utils.load_data(os.path.join(data_folder, f'sessions_training_{exp}_{cohort}.csv'))

## Generate Trials

In [25]:
# Generate the trials table of every training session from its processed events
# and write it to that session's trials file. Sessions that fail are collected
# (by row position) and reported together with the error that stopped them.
problematic_positions = []

for position, (_, session_info) in enumerate(sessions_training.iterrows()):
    try:
        trials_path = utils.generate_trials_path(data_folder, session_info)
        # Uncomment to skip sessions that already have a trials file:
        # if os.path.isfile(trials_path):
        #     continue

        events_processed = pd.read_csv(utils.generate_events_processed_path(data_folder, session_info))
        trials = helper.generate_trials(session_info, events_processed)

        trials.to_csv(trials_path)
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still stops the loop,
        # and keep the reason visible for debugging.
        problematic_positions.append(position)
        print(f"{session_info.get('dir')}: {type(err).__name__}: {err}")

problematic_sessions = sessions_training.iloc[problematic_positions].reset_index(drop=True)

if len(problematic_sessions) > 0:
    display(problematic_sessions)
else:
    print("all sessions are perfect! woohoo!")

all sessions are perfect! woohoo!


## Analyze trials

In [26]:
# For every training session, merge its trials table with per-trial data derived
# from the processed events and write the result to the trials-analyzed file.
for _, session_info in sessions_training.iterrows():
    try:
        trials_analyzed_path = utils.generate_trials_analyzed_path(data_folder, session_info)
        # Uncomment to skip sessions that already have an analyzed trials file:
        # if os.path.isfile(trials_analyzed_path):
        #     continue

        events_processed = utils.load_data(utils.generate_events_processed_path(data_folder, session_info))
        session_by_trial = events_processed.groupby('session_trial_num')
        trials = utils.load_data(utils.generate_trials_path(data_folder, session_info))
        trials_data = helper.get_trial_data_df(session_by_trial)
        trials_analyzed = pd.merge(trials, trials_data, on='session_trial_num')
        trials_analyzed['group'] = session_info.group #assigning trial type manually
        trials_analyzed.to_csv(trials_analyzed_path)
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still stops the loop;
        # show the reason alongside the session that failed.
        print(f"{type(err).__name__}: {err}")
        display(session_info)

debug debug debug

In [None]:
# data_dir = '/Users/rebekahzhang/data/behavior_data'
# exp = "exp2"
# cohort = "c567"
# data_folder = os.path.join(data_dir, exp, cohort)
# sessions_training = pd.read_csv(os.path.join(data_folder, 'sessions_training.csv'), index_col=0)

# sessions_training_post_meta_change = sessions_training.loc[sessions_training.date < "2024-05-03"]

# for _, session_info in sessions_training_post_meta_change.iterrows():
#     try:
#         trials_analyzed_path = utils.generate_trials_analyzed_path(data_folder, session_info)
#         # if os.path.isfile(trials_analyzed_path):
#         #     continue
        
#         session_by_trial = utils.load_data(utils.generate_events_processed_path(data_folder, session_info)).groupby('session_trial_num')
#         trials = utils.load_data(utils.generate_trials_path(data_folder, session_info))
#         trials_data = helper.get_trial_data_df(session_by_trial)
#         trials_analyzed = pd.merge(trials, trials_data, on='session_trial_num')
#         trials_analyzed['group'] = session_info.group #assigning trial type manually
#         trials_analyzed.to_csv(trials_analyzed_path)
#     except:
#         display(session_info)