This notebook analyzes missing behavioral sessions by:
1. Loading session data and cohort information
2. Cross-referencing with mouse death dates (DOD)
3. Cross-referencing with documented missing behavior reasons
4. Identifying unexplained missing sessions that need investigation

In [1]:
import os
import json

import utils
import session_processing_helper as helper

import pandas as pd


## 1. Configuration and Data Loading


In [2]:
# Configuration
# Root of the behavior data tree. Setting the BEHAVIOR_DATA_DIR environment
# variable overrides it so the notebook can run on another machine; when the
# variable is unset or empty, the original local path is used.
data_dir = os.environ.get('BEHAVIOR_DATA_DIR') or '/Users/rebekahzhang/data/behavior_data'
exp = "exp2"  # experiment sub-folder analyzed by this notebook
data_folder = os.path.join(data_dir, exp)
print(f"Data folder: {data_folder}")

Data folder: /Users/rebekahzhang/data/behavior_data/exp2


In [3]:
# Google Sheets IDs
# Mouse log sheet: read further down for its 'Mouse' and 'DOD' columns.
mouse_log_sheet_id = "1oE_O-Q8froULSTef7NLFG6CdMh8UdJBSdITxBFFZ6MY"
# Missing-behavior log sheet: read further down for its 'Date', 'Mouse' and 'Reason' columns.
missing_log_sheet_id = "16bCQtwIla-uFT3Xx-dP1RFWksRHQBkOn3-p2tAO_orY"

## 2. Load External Data Sources


In [4]:
# Load cohort information
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on whatever the platform's default text encoding happens to be.
with open('exp_cohort_info.json', 'r', encoding='utf-8') as f:
    training_info = json.load(f)
# Mapping stored under the 'cohorts' key; its keys are used later as cohort names.
cohort_info = training_info['cohorts']

In [5]:
# Both sheets are fetched through the same Google Sheets CSV export URL (gid=0).
sheet_csv_url = "https://docs.google.com/spreadsheets/d/{}/export?format=csv&gid=0"

# Load mouse death dates (DOD) from Google Sheets:
# keep only the 'Mouse' and 'DOD' columns, drop mice without a DOD, parse DOD as datetime.
mouse_log_raw = pd.read_csv(sheet_csv_url.format(mouse_log_sheet_id))
mouse_dod_rows = mouse_log_raw[['Mouse', 'DOD']].dropna(subset=['DOD'])
mouse_dod = mouse_dod_rows.assign(DOD=pd.to_datetime(mouse_dod_rows['DOD']))
print(f"Loaded {len(mouse_dod)} mice with DOD records")

# Load documented missing behavior reasons from Google Sheets:
# keep only rows where 'Date', 'Mouse' and 'Reason' are all present, parse the date, sort by it.
missing_log_raw = pd.read_csv(sheet_csv_url.format(missing_log_sheet_id))
missing_log_rows = missing_log_raw.dropna(subset=['Date', 'Mouse', 'Reason'])
missing_log_clean = (
    missing_log_rows
    .assign(Date=pd.to_datetime(missing_log_rows['Date']))
    .sort_values('Date')
)
print(f"Loaded {len(missing_log_clean)} documented missing behavior entries")

Loaded 47 mice with DOD records
Loaded 22 documented missing behavior entries


In [6]:
# Load the deletion record and derive per-row date and mouse columns from 'session'.
deletion_record_path = os.path.join(data_dir, 'raw', 'deletion_record.csv')
deletion_sorted = pd.read_csv(deletion_record_path).sort_values('session')

session_names = deletion_sorted['session']
deletion_df = deletion_sorted.assign(
    # The first 10 characters of the session name hold the date (YYYY-MM-DD).
    date=session_names.str.slice(0, 10),
    # Whatever follows the last underscore is the mouse ID.
    mouse=session_names.str.rsplit('_', n=1).str[-1],
)
print(f"Loaded {len(deletion_df)} deletion records")

Loaded 217 deletion records


## 3. Helper Functions


In [7]:
def should_ignore_mouse_due_to_dod(mouse, session_date, mouse_dod_df):
    """
    Decide whether a missing session is explained by the mouse's recorded date of death.

    Args:
        mouse: Mouse identifier, matched against the 'Mouse' column.
        session_date: Date of the session; anything ``pd.to_datetime`` accepts.
        mouse_dod_df: DataFrame with 'Mouse' and 'DOD' columns.

    Returns:
        bool: True when the mouse's DOD is on or before the session date (the
        comparison is inclusive). False when the DOD is later, or when the mouse
        has no DOD row at all.
    """
    session_timestamp = pd.to_datetime(session_date)
    dod_rows = mouse_dod_df.loc[mouse_dod_df['Mouse'] == mouse]

    if not dod_rows.empty:
        # If several rows match, only the first one is consulted.
        return dod_rows['DOD'].iloc[0] <= session_timestamp

    # No DOD on record: the absence cannot be attributed to death.
    return False

In [8]:
def has_documented_reason(mouse, date, missing_log_df):
    """
    Look up documented reasons for a mouse being missing on a specific date.

    Args:
        mouse: Mouse identifier, matched against the 'Mouse' column.
        date: Date to look up; anything ``pd.to_datetime`` accepts.
        missing_log_df: DataFrame with 'Mouse', 'Date' and 'Reason' columns.

    Returns:
        tuple: ``(found, reasons)`` where ``found`` is True when at least one log
        row matches both mouse and date, and ``reasons`` is the list of matching
        'Reason' values (empty when nothing matches). Unpack the result before
        testing it: the tuple as a whole is always truthy.
    """
    target_date = pd.to_datetime(date)
    same_mouse = missing_log_df['Mouse'] == mouse
    same_date = missing_log_df['Date'] == target_date
    matches = missing_log_df[same_mouse & same_date]

    if len(matches) == 0:
        return False, []
    return True, matches['Reason'].tolist()

## 4. Missing Sessions Analysis

In [9]:
# All sessions found under the experiment folder, plus a per-date grouping of
# the same table for the day-by-day scan.
sessions_all = helper.generate_sessions_all(data_folder)
sessions_by_date = sessions_all.groupby(by='date')
# Cohort names are the keys of the cohort mapping.
all_cohorts = [*cohort_info.keys()]

In [10]:
# Mice belonging to each cohort. The list is derived from all sessions rather
# than from a single day, so it is built once here instead of once per date.
mice_by_cohort = {
    cohort: list(utils.generate_mouse_list(sessions_all.loc[sessions_all['cohort'] == cohort]))
    for cohort in all_cohorts
}

# date -> cohort -> {'missing_unexplained': [...], 'multiple_sessions': [...]}
# Every date gets an entry; it stays an empty dict when nothing is flagged.
missing_sessions_unexplained = {}

for date, data in sessions_by_date:
    missing_sessions_unexplained[date] = {}

    for cohort, cohort_mice in mice_by_cohort.items():
        missing_mice_unexplained = []
        mice_with_multiple = []

        for mouse in cohort_mice:
            mouse_by_date = data.loc[data['mouse'] == mouse]

            if len(mouse_by_date) < 1:
                # Mouse is missing - check if we should ignore it
                if should_ignore_mouse_due_to_dod(mouse, date, mouse_dod):
                    continue
                # has_documented_reason returns a (found, reasons) tuple. A tuple is
                # always truthy, so negating it directly would never flag any mouse;
                # unpack it and test the boolean.
                documented, _reasons = has_documented_reason(mouse, date, missing_log_clean)
                if not documented:
                    missing_mice_unexplained.append(mouse)

            elif len(mouse_by_date) > 1:
                # Mouse has multiple sessions (indicates system was working)
                mice_with_multiple.append(mouse)

        # Only flag if there are unexplained missing mice AND other mice have multiple sessions
        if missing_mice_unexplained and mice_with_multiple:
            missing_sessions_unexplained[date][cohort] = {
                'missing_unexplained': missing_mice_unexplained,
                'multiple_sessions': mice_with_multiple
            }

# Check if there are any dates with actual unexplained missing mice
dates_with_unexplained = [date for date, cohorts in missing_sessions_unexplained.items() if cohorts]
if dates_with_unexplained:
    print(f"🔍 Found {len(dates_with_unexplained)} days with UNEXPLAINED missing mice:")
    for date in dates_with_unexplained:
        cohorts = missing_sessions_unexplained[date]
        print(f"📅 {date}:")
        for cohort, info in cohorts.items():
            print(f"   {cohort}:")
            # map(str, ...) keeps the join working even if mouse IDs are not strings.
            print(f"     ❌ missing: {', '.join(map(str, info['missing_unexplained']))}")
            print(f"     ✅ multiple: {', '.join(map(str, info['multiple_sessions']))}")
        print()
else:
    print("✅ No new missing sessions found!")


✅ No new missing sessions found!


### Manual check: cross-validate the training log against the deletion record

In [11]:
# date_to_investigate = '2024-08-02'

In [None]:
# sessions_by_date.get_group(date_to_investigate).sort_values('mouse')

In [None]:
# deletion_df.loc[deletion_df['date'] == date_to_investigate]