This notebook analyzes missing behavioral sessions by:
1. Loading session data and cohort information
2. Cross-referencing with mouse death dates (DOD)
3. Cross-referencing with documented missing behavior reasons
4. Identifying unexplained missing sessions that need investigation

In [1]:
import os
import json
import pandas as pd
import session_processing_helper as helper
import utils


## 1. Configuration and Data Loading


In [None]:
# Configuration
# The data root defaults to the original local path but can be redirected by
# setting the BEHAVIOR_DATA_DIR environment variable, so the notebook can run
# on another machine without editing this cell.
data_dir = os.environ.get('BEHAVIOR_DATA_DIR', '/Users/rebekahzhang/data/behavior_data')
exp = "exp2"
# All session metadata for this experiment is read from <data_dir>/<exp>
data_folder = os.path.join(data_dir, exp)
print(f"Data folder: {data_folder}")

Data folder: /Users/rebekahzhang/data/behavior_data/exp2


In [None]:
# Google Sheets IDs
# Each ID is substituted into a docs.google.com CSV-export URL when the
# corresponding sheet is loaded further down in this notebook.
# Mouse log: source of the 'Mouse' and 'DOD' columns
mouse_log_sheet_id = "1oE_O-Q8froULSTef7NLFG6CdMh8UdJBSdITxBFFZ6MY"
# Missing-behavior log: source of the 'Date', 'Mouse' and 'Reason' columns
missing_log_sheet_id = "16bCQtwIla-uFT3Xx-dP1RFWksRHQBkOn3-p2tAO_orY"

## 2. Load External Data Sources


In [3]:
# Read the experiment description file (path is relative to the working
# directory) and pull out its cohort table.
with open('exp_cohort_info.json', 'r') as cohort_file:
    training_info = json.load(cohort_file)

# The 'cohorts' entry is iterated elsewhere as cohort name -> list of mice
cohort_info = training_info['cohorts']

In [4]:
# --- Mouse death dates (DOD) from the mouse log Google Sheet ---
mouse_log_url = f"https://docs.google.com/spreadsheets/d/{mouse_log_sheet_id}/export?format=csv&gid=0"
mouse_log_raw = pd.read_csv(mouse_log_url)
# Keep only mice that actually have a DOD entry, then parse it as a datetime
mice_with_dod = mouse_log_raw[['Mouse', 'DOD']].dropna(subset=['DOD'])
mouse_dod = mice_with_dod.assign(DOD=pd.to_datetime(mice_with_dod['DOD']))
print(f"Loaded {len(mouse_dod)} mice with DOD records")

# --- Documented reasons for missing behavior from the missing log Google Sheet ---
missing_log_url = f"https://docs.google.com/spreadsheets/d/{missing_log_sheet_id}/export?format=csv&gid=0"
# Rows lacking a date, a mouse or a reason cannot explain anything, so drop them
missing_log_complete = pd.read_csv(missing_log_url).dropna(subset=['Date', 'Mouse', 'Reason'])
missing_log_clean = (
    missing_log_complete
    .assign(Date=pd.to_datetime(missing_log_complete['Date']))
    .sort_values('Date')
)
print(f"Loaded {len(missing_log_clean)} documented missing behavior entries")

Loaded 47 mice with DOD records
Loaded 22 documented missing behavior entries


In [5]:
# Read the deletion record and derive per-row date / mouse columns
deletion_record_path = os.path.join(data_dir, 'raw', 'deletion_record.csv')
deletion_sorted = pd.read_csv(deletion_record_path).sort_values('session')

session_names = deletion_sorted['session']
deletion_df = deletion_sorted.assign(
    # First 10 characters of the session name are the date (YYYY-MM-DD)
    date=session_names.str[:10],
    # The mouse ID is whatever follows the last underscore
    mouse=session_names.str.split('_').str[-1],
)
print(f"Loaded {len(deletion_df)} deletion records")

Loaded 213 deletion records


## 3. Helper Functions


In [6]:
def add_cohort_column(sessions_all, cohort_info):
    """Label each session row with the cohort its mouse belongs to.

    Args:
        sessions_all: DataFrame with a 'mouse' column. It is modified in
            place (a 'cohort' column is added or overwritten).
        cohort_info: mapping of cohort name -> iterable of mouse names.

    Returns:
        The same DataFrame object that was passed in. Mice that appear in no
        cohort get a missing value in 'cohort'.
    """
    # Invert cohort -> mice into mouse -> cohort; if a mouse is listed in
    # several cohorts, the last cohort iterated wins.
    cohort_of_mouse = {
        mouse_name: cohort_name
        for cohort_name, members in cohort_info.items()
        for mouse_name in members
    }
    sessions_all['cohort'] = sessions_all['mouse'].map(cohort_of_mouse)
    return sessions_all

In [7]:
def generate_sessions_all(data_folder):
    """Build one DataFrame row per session from the ``meta_*.json`` files under ``data_folder``.

    The folder is walked recursively. Files that cannot be read or parsed are
    reported with a printed message and skipped.

    Note: this function reads the module-level ``cohort_info`` mapping to
    attach a 'cohort' column, so that variable must be defined before calling.

    Args:
        data_folder: root directory to search for session metadata files.

    Returns:
        DataFrame sorted by 'dir' (``<date>_<time>_<mouse>``), with a 'cohort'
        column added and the bookkeeping columns 'trainer', 'record',
        'forward_file' and 'pump_ul_per_turn' removed when present.

    Raises:
        ValueError: if no session metadata could be loaded from ``data_folder``.
    """
    records = []

    for root, _, files in os.walk(data_folder):
        for file in files:
            if not (file.startswith("meta_") and file.endswith(".json")):
                continue
            path = os.path.join(root, file)
            try:
                with open(path) as f:
                    session_data = json.load(f)

                # Second underscore-separated token of the file name is compared
                # as a string against the cutoff (presumably an ISO date,
                # YYYY-MM-DD — confirm against the file naming scheme).
                date_str = file.split('_')[1]
                if date_str < '2024-04-16':
                    # Files before the cutoff are used as-is
                    records.append(session_data)
                else:
                    # Later files take the nested 'session_config' dict when
                    # present, otherwise the whole document
                    records.append(session_data.get('session_config', session_data))

            except Exception as e:
                print(f"Error processing file {file}: {e}")

    # Without this guard an empty folder surfaces as an opaque KeyError: 'date'
    if not records:
        raise ValueError(
            f"No session metadata (meta_*.json) could be loaded from {data_folder}"
        )

    sessions_all = pd.DataFrame(records)
    sessions_all['dir'] = sessions_all['date'] + '_' + sessions_all['time'] + '_' + sessions_all['mouse']
    sessions_all = add_cohort_column(sessions_all, cohort_info)
    # errors='ignore': not every metadata file is guaranteed to carry all of
    # these columns, and their absence should not abort the analysis
    sessions_all = sessions_all.drop(
        columns=['trainer', 'record', 'forward_file', 'pump_ul_per_turn'],
        errors='ignore',
    )
    return sessions_all.sort_values('dir')

In [8]:
def should_ignore_mouse_due_to_dod(mouse, session_date, mouse_dod_df):
    """
    Decide whether a missing session can be explained by the mouse's death.

    Args:
        mouse: mouse name to look up in the 'Mouse' column.
        session_date: date of the session (anything pd.to_datetime accepts).
        mouse_dod_df: DataFrame with 'Mouse' and datetime 'DOD' columns.
    Returns:
        bool: True if the mouse's first recorded DOD is on or before the
        session date, False if it is later or the mouse has no DOD record.
    """
    session_dt = pd.to_datetime(session_date)

    # All DOD values recorded for this mouse (normally zero or one)
    dod_values = mouse_dod_df.loc[mouse_dod_df['Mouse'] == mouse, 'DOD']
    if dod_values.empty:
        # No death recorded: the mouse is still expected to have sessions
        return False

    return dod_values.iloc[0] <= session_dt

In [9]:
def has_documented_reason(mouse, date, missing_log_df):
    """Look up documented reasons for a mouse being missing on a given date.

    Args:
        mouse: mouse name to match against the 'Mouse' column.
        date: date to match (anything pd.to_datetime accepts).
        missing_log_df: DataFrame with 'Mouse', datetime 'Date' and 'Reason' columns.

    Returns:
        tuple: (found, reasons) where ``found`` is True when at least one log
        entry matches and ``reasons`` is the list of matching 'Reason' values
        (empty when nothing matches). Callers must unpack the tuple; the
        tuple itself is always truthy.
    """
    target_date = pd.to_datetime(date)
    same_mouse = missing_log_df['Mouse'] == mouse
    same_date = missing_log_df['Date'] == target_date
    matches = missing_log_df[same_mouse & same_date]

    if len(matches) > 0:
        return True, matches['Reason'].tolist()
    return False, []

## 4. Missing Sessions Analysis (Cohort-Based)

In [10]:
# Collect every session's metadata, list the cohort names, and index the
# sessions by calendar date for the per-day analysis.
sessions_all = generate_sessions_all(data_folder)
all_cohorts = list(cohort_info)
sessions_by_date = sessions_all.groupby('date')

In [11]:
# Find unexplained missing sessions (within cohorts)
# A mouse is reported for a date only when ALL of the following hold:
# 1. It has no session on that date
# 2. It is not dead on that date (DOD check)
# 3. It has no documented reason for being missing
# 4. At least one other mouse in the same cohort has multiple sessions that
#    day (taken as evidence that the system was working)

# The cohort roster does not depend on the date, so build it once instead of
# once per (date, cohort) pair. list() guards against a one-shot iterable.
mice_by_cohort = {
    cohort: list(utils.generate_mouse_list(sessions_all.loc[sessions_all['cohort'] == cohort]))
    for cohort in all_cohorts
}

missing_sessions_unexplained = {}

for date, data in sessions_by_date:
    missing_sessions_unexplained[date] = {}

    for cohort in all_cohorts:
        missing_mice_unexplained = []
        mice_with_multiple = []

        for mouse in mice_by_cohort[cohort]:
            mouse_by_date = data.loc[data['mouse'] == mouse]

            if len(mouse_by_date) < 1:
                # Mouse is missing - check if we should ignore it
                if should_ignore_mouse_due_to_dod(mouse, date, mouse_dod):
                    continue
                # has_documented_reason returns (found, reasons). The tuple is
                # always truthy, so the flag must be unpacked; negating the
                # tuple directly would never flag any mouse.
                is_documented, _ = has_documented_reason(mouse, date, missing_log_clean)
                if not is_documented:
                    missing_mice_unexplained.append(mouse)

            elif len(mouse_by_date) > 1:
                # Mouse has multiple sessions (indicates system was working)
                mice_with_multiple.append(mouse)

        # Only flag if there are unexplained missing mice AND other mice have multiple sessions
        if missing_mice_unexplained and mice_with_multiple:
            missing_sessions_unexplained[date][cohort] = {
                'missing_unexplained': missing_mice_unexplained,
                'multiple_sessions': mice_with_multiple
            }

# Report only the dates that ended up with at least one flagged cohort
dates_with_unexplained = [date for date, cohorts in missing_sessions_unexplained.items() if cohorts]
if dates_with_unexplained:
    print(f"🔍 Found {len(dates_with_unexplained)} days with UNEXPLAINED missing mice:")
    for date in dates_with_unexplained:
        cohorts = missing_sessions_unexplained[date]
        print(f"📅 {date}:")
        for cohort, info in cohorts.items():
            print(f"   {cohort}:")
            print(f"     ❌ missing: {', '.join(info['missing_unexplained'])}")
            print(f"     ✅ multiple: {', '.join(info['multiple_sessions'])}")
        print()
else:
    print("✅ No new missing sessions found!")


✅ No new missing sessionsfound!


In [12]:
# Date (YYYY-MM-DD string, matching the 'date' column) to inspect by hand
date_to_investigate = '2024-08-02'

In [13]:
# Show every session recorded on the date under investigation, ordered by mouse
sessions_by_date.get_group(date_to_investigate).sort_values('mouse')

Unnamed: 0,date,time,mouse,exp,training,rig,total_reward,total_trial,avg_tw,ending_code,dir,cohort
744,2024-08-02,09-53-19,RZ047,exp2_short,regular,rig2,700.0,495.0,1.34,reward,2024-08-02_09-53-19_RZ047,cohort_6
2186,2024-08-02,09-52-14,RZ049,exp2_short,regular,rig3,700.0,217.0,10.07,reward,2024-08-02_09-52-14_RZ049,cohort_6
1077,2024-08-02,10-44-54,RZ050,exp2_short,regular,rig2,700.0,364.0,3.26,reward,2024-08-02_10-44-54_RZ050,cohort_6
1161,2024-08-02,11-01-17,RZ051,exp2_short,regular,rig3,700.0,320.0,2.58,reward,2024-08-02_11-01-17_RZ051,cohort_6
1486,2024-08-02,11-42-18,RZ052,exp2_long,regular,rig2,195.0,107.0,2.13,miss,2024-08-02_11-42-18_RZ052,cohort_6
1360,2024-08-02,14-47-36,RZ052,exp2_long,regular,rig2,345.0,179.0,2.26,miss,2024-08-02_14-47-36_RZ052,cohort_6
551,2024-08-02,11-44-35,RZ053,exp2_long,regular,rig3,505.0,252.0,2.19,miss,2024-08-02_11-44-35_RZ053,cohort_6
1609,2024-08-02,14-01-42,RZ053,exp2_long,regular,rig2,330.0,109.0,4.07,miss,2024-08-02_14-01-42_RZ053,cohort_6
1193,2024-08-02,12-12-28,RZ054,exp2_long,regular,rig2,700.0,422.0,2.08,reward,2024-08-02_12-12-28_RZ054,cohort_6
2016,2024-08-02,12-51-55,RZ055,exp2_long,regular,rig3,290.0,136.0,3.07,miss,2024-08-02_12-51-55_RZ055,cohort_6


In [15]:
# List deletion-record entries whose session date matches the date under investigation
deletion_df.loc[deletion_df['date'] == date_to_investigate]

Unnamed: 0,session,reason,deleted,timestamp,date,mouse
