In [2]:
import os
import json
import shutil

import session_processing_helper_c5 as helper
import utils_c5 as utils

import pandas as pd

# Data cleaning

Create a folder named after the time period (e.g. '20250531-20240806').
<br>
Inside it, create a folder named 'raw' and put all session files in 'raw'.

## Back up raw

In [3]:
# Name of the period folder inside data_dir; its 'raw' subfolder is the one backed up below.
period = '20240123-20240415'
# Root folder of the behavior data (machine-specific absolute path).
data_dir = '/Users/rebekahzhang/data/behavior_data'

In [4]:
def backup(source_dir, destination_dir):
    """Copy ``source_dir`` to ``destination_dir`` unless the destination exists.

    Args:
        source_dir (str): Directory tree to copy.
        destination_dir (str): Where the copy is created.

    Returns:
        None. A one-line status message is printed in either case.
    """
    # Guard clause: never touch a destination that is already there.
    if os.path.isdir(destination_dir):
        print(f"{os.path.basename(destination_dir)} already exist")
        return

    shutil.copytree(source_dir, destination_dir)
    print(f"{os.path.basename(source_dir)} folder backed up")

In [5]:
# Copy <data_dir>/<period>/raw to <data_dir>/<period>/clean; backup() skips the copy if 'clean' already exists.
raw = os.path.join(data_dir, period, "raw")
clean = os.path.join(data_dir, period, "clean")
backup(raw, clean)

raw folder backed up


## Check if sessions have both meta and events

In [6]:
# Folder that every later cell reads session data from.
# NOTE(review): this is <data_dir>/exp2, not the <data_dir>/<period>/clean copy made above,
# and the recorded output of the missing-files check shows FileNotFoundError for this path —
# confirm which folder is intended.
data_folder = os.path.join(data_dir, 'exp2')
print(data_folder)

/Users/rebekahzhang/data/behavior_data/exp2


In [7]:
def check_session_files(data_folder):
    """Report session folders that lack an events file or a meta file.

    Every immediate subdirectory of ``data_folder`` is treated as one session.
    A session "has events" if it contains a non-hidden file whose name starts
    with ``events_`` and "has meta" if it contains one starting with ``meta_``.

    Args:
        data_folder (str): Directory whose subdirectories are session folders.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(missing_meta, missing_events)``,
        in that order. Each frame has the columns ``dir``, ``events`` and
        ``meta`` and is sorted by ``dir``. Both are empty when nothing is
        missing, including when ``data_folder`` has no subdirectories.
    """
    files_check = []
    # Use scandir as a context manager so the directory handles are closed.
    with os.scandir(data_folder) as entries:
        for entry in entries:
            if not entry.is_dir():
                continue
            with os.scandir(entry.path) as session_entries:
                filenames = [
                    f.name for f in session_entries
                    if f.is_file() and not f.name.startswith('.')
                ]
            files_check.append({
                'dir': entry.name,
                'events': any(name.startswith("events_") for name in filenames),
                'meta': any(name.startswith("meta_") for name in filenames),
            })

    # Explicit columns and dtypes keep the frame well-formed when no session
    # folder was found; otherwise sort_values("dir") raises KeyError.
    files_check_df = (
        pd.DataFrame(files_check, columns=['dir', 'events', 'meta'])
        .astype({'events': bool, 'meta': bool})
        .sort_values("dir")
    )
    missing_meta = files_check_df[~files_check_df['meta']]
    missing_events = files_check_df[~files_check_df['events']]

    return missing_meta, missing_events

In [8]:
# check_session_files returns (missing_meta, missing_events) in that order;
# unpack in the same order so each table is shown under the right heading.
missing_meta, missing_events = check_session_files(data_folder)
if len(missing_meta) == 0 and len(missing_events) == 0:
    print("no sessions with missing files!")
else:
    print("missing meta")
    display(missing_meta)
    print("missing events")
    display(missing_events)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/rebekahzhang/data/behavior_data/exp2'

## Generate and save sessions log

generate session log using meta data from each session and add columns of basic info to each session

In [205]:
def modify_total_trial(row):
    """Return the row's trial count, reduced according to its ending code.

    Args:
        row: Mapping or Series with the keys 'ending_code' and 'total_trial'.

    Returns:
        ``total_trial - 1`` when the ending code is 'pygame',
        ``total_trial - 5`` when it is 'miss', and ``total_trial`` unchanged
        for any other ending code.
    """
    ending = row['ending_code']
    recorded = row['total_trial']
    if ending == 'pygame':
        return recorded - 1
    if ending == 'miss':
        return recorded - 5
    return recorded

In [206]:
def modify_sessions_all(sessions_all):
    """Add derived columns to the raw session metadata and tidy it up.

    Args:
        sessions_all (pd.DataFrame): One row per session. Must have the
            columns 'date', 'time', 'mouse', 'exp', 'ending_code',
            'total_trial' and 'forward_file'. It is not modified.

    Returns:
        pd.DataFrame: A new frame sorted by 'dir', where
        - 'dir' is ``<date>_<time>_<mouse>``;
        - 'exp' is replaced by the digit captured from ``exp<digit>_<short|long>``;
        - 'group' is 's' for short and 'l' for long (NaN when 'exp' does not
          match the pattern);
        - 'total_trial' is adjusted per row by ``modify_total_trial``;
        - 'forward_file' is dropped.
    """
    # Work on a copy: the original added 'dir' to the caller's frame as a side effect.
    sessions = sessions_all.copy()
    sessions['dir'] = sessions['date'] + '_' + sessions['time'] + '_' + sessions['mouse']
    sessions = sessions.sort_values('dir')
    # One capture group per target column: experiment digit, then short/long.
    sessions[['exp', 'group']] = sessions['exp'].str.extract(r'exp(\d)_(short|long)')
    sessions['group'] = sessions['group'].map({'short': 's', 'long': 'l'})
    sessions['total_trial'] = sessions.apply(modify_total_trial, axis=1)
    return sessions.drop(columns=['forward_file'])

In [207]:
def generate_sessions_all(data_folder):
    """Build the session log from every ``meta_*.json`` file under a folder.

    The folder is walked recursively. From each matching file the
    'session_config' entry is collected; a file that cannot be read or has no
    such entry is reported with a printed message and skipped.

    Args:
        data_folder (str): Root directory to search for meta files.

    Returns:
        pd.DataFrame: The collected session configs after being passed
        through ``modify_sessions_all``.
    """
    records = []
    for folder, _, filenames in os.walk(data_folder):
        meta_names = (
            name for name in filenames
            if name.startswith("meta_") and name.endswith(".json")
        )
        for name in meta_names:
            meta_path = os.path.join(folder, name)
            try:
                with open(meta_path) as handle:
                    records.append(json.load(handle)['session_config'])
            except Exception as err:
                print(f"Error processing file {name}: {err}")

    return modify_sessions_all(pd.DataFrame(records))

In [208]:
def generate_sessions_training(sessions_all):
    """Select the 'regular' training sessions and number them per mouse.

    Args:
        sessions_all (pd.DataFrame): Session log with 'training' and 'mouse'
            columns.

    Returns:
        pd.DataFrame: Rows whose 'training' value is 'regular', with the
        previous index kept as a column by ``reset_index()``, after
        ``helper.assign_session_numbers`` has been applied to each mouse's rows.
    """
    is_regular = sessions_all.training == 'regular'
    regular_sessions = sessions_all.loc[is_regular].reset_index()
    per_mouse = regular_sessions.groupby('mouse', group_keys=False)
    return per_mouse.apply(helper.assign_session_numbers)

### Re-run after every quality control step

In [209]:
# Rebuild both session logs from the meta files under data_folder,
# then pass each one to utils.save_as_csv with data_folder as the target folder.
sessions_all = generate_sessions_all(data_folder)
sessions_training = generate_sessions_training(sessions_all)

utils.save_as_csv(df=sessions_all, folder=data_folder, filename='sessions_all.csv')
utils.save_as_csv(df=sessions_training, folder=data_folder, filename='sessions_training.csv')

# Last expression of the cell: display the training log.
sessions_training

Error processing file meta_2024-04-12_10-23-31_RZ036.json: 'session_config'
Error processing file meta_2024-04-05_11-21-57_RZ036.json: 'session_config'
Error processing file meta_2024-01-24_10-14-35_RZ036.json: 'session_config'
Error processing file meta_2024-04-05_12-01-59_RZ037.json: 'session_config'
Error processing file meta_2024-03-27_11-12-26_RZ038.json: 'session_config'
Error processing file meta_2024-02-01_13-11-32_RZ039.json: 'session_config'
Error processing file meta_2024-03-04_13-25-49_RZ038.json: 'session_config'
Error processing file meta_2024-04-01_12-46-49_RZ037.json: 'session_config'
Error processing file meta_2024-02-23_12-20-38_RZ038.json: 'session_config'
Error processing file meta_2024-03-20_10-07-53_RZ034.json: 'session_config'
Error processing file meta_2024-02-02_11-37-54_RZ038.json: 'session_config'
Error processing file meta_2024-03-25_11-32-23_RZ034.json: 'session_config'
Error processing file meta_2024-01-30_09-52-34_RZ034.json: 'session_config'
Error proces

Unnamed: 0,index,date,time,mouse,exp,training,rig,trainer,record,total_reward,total_trial,avg_tw,ending_code,dir,group,session
0,197,2024-05-03,12-04-42,RZ036,2,regular,rig3,Rebekah,False,700,389,1.36,reward,2024-05-03_12-04-42_RZ036,s,0
1,108,2024-05-03,12-05-32,RZ034,2,regular,rig2,Rebekah,False,700,508,1.28,reward,2024-05-03_12-05-32_RZ034,s,0
2,143,2024-05-03,12-47-24,RZ037,2,regular,rig3,Rebekah,False,700,437,1.92,reward,2024-05-03_12-47-24_RZ037,l,0
3,165,2024-05-03,12-59-12,RZ038,2,regular,rig2,Rebekah,False,630,233,11.40,miss,2024-05-03_12-59-12_RZ038,l,0
4,30,2024-05-03,14-16-43,RZ039,2,regular,rig3,Rebekah,False,700,344,2.68,reward,2024-05-03_14-16-43_RZ039,l,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,0,2024-07-18,15-07-36,RZ038,2,regular,rig1,Rebekah,True,625,201,6.60,miss,2024-07-18_15-07-36_RZ038,l,43
211,210,2024-07-18,17-22-52,RZ039,2,regular,rig1,Rebekah,True,900,401,3.01,reward,2024-07-18_17-22-52_RZ039,l,39
212,133,2024-07-19,11-50-39,RZ037,2,regular,rig1,Rebekah,True,340,101,8.24,pygame,2024-07-19_11-50-39_RZ037,l,42
213,208,2024-07-19,12-38-14,RZ038,2,regular,rig1,Rebekah,True,505,176,9.30,miss,2024-07-19_12-38-14_RZ038,l,44


## Quality control
This section does not need to be run if the data folder has already been cleaned.
<br>
`sessions_all` must be regenerated after every cleaning step, so re-run the "Generate and save sessions log" section above after each one.

### Remove test sessions

In [185]:
def remove_sessions(sessions_to_remove, data_folder):
    """Delete the folder of every session listed in ``sessions_to_remove``.

    Args:
        sessions_to_remove (pd.DataFrame): One row per session; each row's
            'dir' value is the session folder name inside ``data_folder``.
        data_folder (str): Directory that contains the session folders.

    Returns:
        None. Each ``<data_folder>/<dir>`` tree is removed with
        ``shutil.rmtree``, so the deletion is permanent.
    """
    for session in sessions_to_remove.itertuples(index=False):
        session_path = os.path.join(data_folder, session.dir)
        shutil.rmtree(session_path)

In [186]:
# Sessions recorded under the placeholder mouse name 'test' are deleted from data_folder.
is_test_session = sessions_all.mouse == 'test'
sessions_test = sessions_all.loc[is_test_session]
if len(sessions_test) == 0:
    print("no test sessions to be delted!")
else:
    remove_sessions(sessions_test, data_folder)
    print("test sessions removed")

no test sessions to be delted!


### Check for short sessions

In [187]:
# Trial-count cutoff: sessions whose total_trial is below this are listed by the short-session check.
short_threshold = 30

In [188]:
# Flag sessions with too few trials, or with no trial count at all, for manual review.
trial_counts = sessions_all['total_trial']
is_short = (trial_counts < short_threshold) | trial_counts.isna()
sessions_short = sessions_all[is_short]
if len(sessions_short) == 0:
    print('no short sessions to be checked!')
else:
    display(sessions_short)

no short sessions to be checked!


remove short sessions if needed

In [189]:
# WARNING: permanently deletes the folder of every session in sessions_short (remove_sessions uses shutil.rmtree).
# This call is unconditional, so it also runs on "Run All"; review sessions_short before running it.
remove_sessions(sessions_short, data_folder)

# Process events

load session log

In [190]:
# Reload the training session log from 'sessions_training.csv' in data_folder.
sessions_training = utils.load_data(os.path.join(data_folder, 'sessions_training.csv'))

In [191]:
def process_events(session_info, events):
    """Trim a session's events according to how the session ended.

    Args:
        session_info: Mapping or Series with the keys 'ending_code',
            'total_trial' and 'dir'.
        events (pd.DataFrame): Session events; must have a
            'session_trial_num' column when the ending code is 'pygame' or
            'miss'.

    Returns:
        pd.DataFrame: For 'pygame' / 'miss' endings, the rows whose
        'session_trial_num' lies in ``[0, total_trial]``. For 'time' /
        'reward' / 'trial' endings, all rows except the first two and the
        last one.

    Raises:
        ValueError: If the ending code is none of the above. The session's
            'dir' is printed first so the offending session can be found.
    """
    ending_to_adjust = ('pygame', 'miss')
    ending_smooth = ('time', 'reward', 'trial')
    ending_code = session_info['ending_code']

    if ending_code in ending_to_adjust:
        # Series.between is inclusive on both ends, so trial number
        # `total_trial` itself is kept.
        # NOTE(review): generate_trials iterates range(total_trial), which
        # stops one trial earlier — confirm the inclusive bound is intended.
        in_range = events['session_trial_num'].between(0, session_info['total_trial'])
        return events.loc[in_range]

    if ending_code in ending_smooth:
        # Drop the first two rows and the final row.
        return events.iloc[2:-1]

    print(session_info['dir'])
    # Raising a bare string is itself a TypeError in Python 3; raise a real exception.
    raise ValueError(f"ending code unknown: {ending_code!r}")

In [192]:
# Process the events of every training session that has no processed file yet.
# Sessions that fail are collected in a list and turned into a DataFrame once,
# after the loop, instead of growing a DataFrame with concat on every failure.
problematic_rows = []

for _, session_info in sessions_training.iterrows():
    try:
        events_processed_path = utils.generate_events_processed_path(data_folder, session_info)
        if os.path.isfile(events_processed_path):
            # A processed file already exists for this session; skip it.
            continue
        events = pd.read_csv(utils.generate_events_path(data_folder, session_info), low_memory=False)
        events = process_events(session_info, events)
        events_processed = events.groupby('session_trial_num', group_keys=False).apply(helper.add_trial_time)
        events_processed.to_csv(events_processed_path)
    except Exception as error:
        # Catch Exception rather than everything so KeyboardInterrupt still
        # stops the loop, and report the cause instead of hiding it.
        print(f"{session_info.get('dir')}: {type(error).__name__}: {error}")
        problematic_rows.append(session_info)

# Same columns as sessions_training, one row per failed session, fresh 0..n-1 index.
problematic_sessions = pd.DataFrame(problematic_rows, columns=sessions_training.columns).reset_index(drop=True)

In [193]:
# Show the sessions that failed, or confirm that none did.
if len(problematic_sessions) == 0:
    print("all sessions are perfect! woohoo!")
else:
    display(problematic_sessions)

all sessions are perfect! woohoo!


!!! TODO: problematic sessions could be set aside with a "move dir to new folder" function !!!

# Generate trials

In [194]:
def get_trial_basics(trial):
    """Summarise one trial's events as a dictionary of five basic fields.

    Args:
        trial (pd.DataFrame): Events of a single trial, with the columns
            'key', 'value', 'session_trial_num', 'block_trial_num',
            'block_num' and 'session_time'.

    Returns:
        dict: 'session_trial_num', 'block_trial_num', 'block_num' and
        'start_time' taken from the first row with key 'trial' and value 1;
        'end_time' taken from the first row with key 'trial' and value 0.

    Raises:
        IndexError: If the trial has no such start row or no such end row.
    """
    is_trial_event = trial['key'] == 'trial'
    first_start = trial.loc[is_trial_event & (trial['value'] == 1)].iloc[0]
    first_end = trial.loc[is_trial_event & (trial['value'] == 0)].iloc[0]

    # Identifying fields are copied straight from the start row.
    basics = {
        column: first_start[column]
        for column in ('session_trial_num', 'block_trial_num', 'block_num')
    }
    basics['start_time'] = first_start['session_time']
    basics['end_time'] = first_end['session_time']
    return basics

In [195]:
def generate_trials(session_info, events):
    """Build a one-row-per-trial table for a session.

    Args:
        session_info: Row with a ``total_trial`` attribute giving the number
            of trials to extract.
        events (pd.DataFrame): Session events with a 'session_trial_num' column.

    Returns:
        pd.DataFrame: One row per trial number in ``range(total_trial)``,
        holding the fields returned by ``get_trial_basics`` for that trial.
    """
    n_trials = int(session_info.total_trial)
    rows = [
        get_trial_basics(events.loc[events['session_trial_num'] == trial_num])
        for trial_num in range(n_trials)
    ]
    return pd.DataFrame(rows)

In [196]:
# Generate a trials table from the processed events of every training session
# that has no trials file yet. Sessions that fail are collected in a list and
# turned into a DataFrame once, after the loop.
problematic_rows = []
for _, session_info in sessions_training.iterrows():
    try:
        trials_path = utils.generate_trials_path(data_folder, session_info)
        if os.path.isfile(trials_path):
            # A trials file already exists for this session; skip it.
            continue

        events_processed = pd.read_csv(utils.generate_events_processed_path(data_folder, session_info))
        trials = generate_trials(session_info, events_processed)

        trials.to_csv(trials_path)
    except Exception as error:
        # Catch Exception rather than everything so KeyboardInterrupt still
        # stops the loop, and report the cause instead of hiding it.
        print(f"{session_info.get('dir')}: {type(error).__name__}: {error}")
        problematic_rows.append(session_info)

# Same columns as sessions_training, one row per failed session, fresh 0..n-1 index.
problematic_sessions = pd.DataFrame(problematic_rows, columns=sessions_training.columns).reset_index(drop=True)

In [197]:
# Show the sessions whose trials could not be generated, or confirm that none failed.
if len(problematic_sessions) == 0:
    print("all sessions are perfect! woohoo!")
else:
    display(problematic_sessions)

all sessions are perfect! woohoo!


## Analyze trials

In [198]:
# Build the analyzed-trials table for every training session that does not have one yet.
for _, session_info in sessions_training.iterrows():
    trials_analyzed_path = utils.generate_trials_analyzed_path(data_folder, session_info)
    if not os.path.isfile(trials_analyzed_path):
        # Processed events, grouped so the helper receives one group per trial number.
        events_processed = utils.load_data(utils.generate_events_processed_path(data_folder, session_info))
        session_by_trial = events_processed.groupby('session_trial_num')

        trials = utils.load_data(utils.generate_trials_path(data_folder, session_info))
        trials_data = helper.get_trial_data_df(session_by_trial)

        # Join the basic trial table with the helper's per-trial data.
        trials_analyzed = trials.merge(trials_data, on='session_trial_num')
        # The session's group label is copied onto every trial row.
        trials_analyzed['group'] = session_info.group
        trials_analyzed.to_csv(trials_analyzed_path)