# Session Mapping

Map preprocessing data files to their session groups and compute visit numbers.

In [22]:
import pandas as pd
import numpy as np
from datetime import datetime
import glob
import os

## Define Session Schedule

In [23]:
# Session schedule from experimental design
# Format: Team -> Session -> (date, time_slot)

session_groups = {
    'Team 4': {
        'Session 1': {'date': '07/27/2021', 'time': '14:00-16:00'},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': '08/02/2021', 'time': '14:00-16:00'}
    },
    'Team 6': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': '07/31/2021', 'time': '10:00-12:00'}
    },
    'Team 7': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': '08/18/2021', 'time': '16:00-18:00'}
    },
    'Team 8': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': '08/06/2021', 'time': '10:00-12:00'}
    },
    'Team 10': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': '08/13/2021', 'time': '10:00-12:00'}
    },
    'Team 12': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': '08/13/2021', 'time': '16:00-18:00'},
        'Session 3': {'date': '08/16/2021', 'time': '14:00-16:00'}
    },
    'Team 14': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': '08/17/2021', 'time': '10:00-12:00'},
        'Session 3': {'date': '08/18/2021', 'time': '10:00-12:00'}
    },
    'Team 15': {
        'Session 1': {'date': '08/19/2021', 'time': '14:00-16:00'},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': '08/23/2021', 'time': '14:00-16:00'}
    },
    'Team 16': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': '08/27/2021', 'time': '10:00-12:00'},
        'Session 3': {'date': '08/28/2021', 'time': '13:00-15:00'}
    },
    'Team 17': {
        'Session 1': {'date': '08/24/2021', 'time': '10:00-12:00'},
        'Session 2': {'date': '08/25/2021', 'time': '10:00-12:00'},
        'Session 3': {'date': '08/26/2021', 'time': '10:00-12:00'}
    },
    'Team 18': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': '08/25/2021', 'time': '13:00-15:00'},
        'Session 3': {'date': '08/26/2021', 'time': '13:00-15:00'}
    },
    'Team 19': {
        'Session 1': {'date': '08/24/2021', 'time': '16:00-18:00'},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': None, 'time': None}
    },
    'Team 20': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': '08/31/2021', 'time': '10:00-12:00'},
        'Session 3': {'date': '09/01/2021', 'time': '10:00-12:00'}
    },
    'Team 21': {
        'Session 1': {'date': '08/30/2021', 'time': '13:00-15:00'},
        'Session 2': {'date': '08/31/2021', 'time': '13:00-15:00'},
        'Session 3': {'date': '09/01/2021', 'time': '13:00-15:00'}
    },
    'Team 22': {
        'Session 1': {'date': '09/15/2021', 'time': '10:00-12:00'},
        'Session 2': {'date': '09/17/2021', 'time': '10:30-12:30'},
        'Session 3': {'date': None, 'time': None}
    },
    'Team 23': {
        'Session 1': {'date': None, 'time': None},
        'Session 2': {'date': '09/22/2021', 'time': '10:00-12:00'},
        'Session 3': {'date': '09/23/2021', 'time': '10:00-12:00'}
    },
    'Team 24': {
        'Session 1': {'date': '09/20/2021', 'time': '16:00-18:00'},
        'Session 2': {'date': '09/23/2021', 'time': '16:00-18:00'},
        'Session 3': {'date': '09/24/2021', 'time': '16:00-18:00'}
    },
    'Team 25': {
        'Session 1': {'date': '09/27/2021', 'time': '09:30-11:30'},
        'Session 2': {'date': '09/30/2021', 'time': '17:00-19:00'},
        'Session 3': {'date': None, 'time': None}
    },
    'Team 26': {
        'Session 1': {'date': '09/28/2021', 'time': '16:00-18:00'},
        'Session 2': {'date': None, 'time': None},
        'Session 3': {'date': '10/05/2021', 'time': '16:00-18:00'}
    }
}

## Parse Preprocessing Files

Extract date, time, and user ID from filename: `preprocessing_MMDD_HHMM_USERID.json`

In [24]:
def parse_filename(filename):
    """Parse preprocessing filename to extract date, time, and user ID."""
    basename = os.path.basename(filename)
    parts = basename.replace('preprocessing_', '').replace('.json', '').split('_')
    
    if len(parts) == 3:
        mmdd, hhmm, user_id = parts
        month = mmdd[:2]
        day = mmdd[2:4]
        hour = hhmm[:2]
        minute = hhmm[2:4]
        
        date_str = f"{month}/{day}/2021"
        time_str = f"{hour}:{minute}"
        
        return {
            'filename': basename,
            'date': date_str,
            'time': time_str,
            'user_id': user_id,
            'mmdd': mmdd,
            'hhmm': hhmm
        }
    return None

# Get all preprocessing files
data_dir = '../data/results/preprocessing/'
files = glob.glob(os.path.join(data_dir, 'preprocessing_*.json'))

# Parse all files
parsed_files = []
for f in files:
    parsed = parse_filename(f)
    if parsed:
        parsed_files.append(parsed)

df_files = pd.DataFrame(parsed_files)
print(f"Found {len(df_files)} preprocessing files")
df_files.head()

Found 99 preprocessing files


Unnamed: 0,filename,date,time,user_id,mmdd,hhmm
0,preprocessing_0816_1400_9M4VCHG.json,08/16/2021,14:00,9M4VCHG,816,1400
1,preprocessing_0924_1600_9M4VCHG.json,09/24/2021,16:00,9M4VCHG,924,1600
2,preprocessing_0825_1000_9M4VCHG.json,08/25/2021,10:00,9M4VCHG,825,1000
3,preprocessing_0813_1600_539136F.json,08/13/2021,16:00,539136F,813,1600
4,preprocessing_0826_1000_539136F.json,08/26/2021,10:00,539136F,826,1000


## Map Files to Sessions

In [25]:
def time_matches(file_time, session_time_range):
    """Check if file time falls within session time range."""
    if not session_time_range:
        return False
    
    file_hour = int(file_time.split(':')[0])
    file_minute = int(file_time.split(':')[1])
    file_minutes_total = file_hour * 60 + file_minute
    
    if '-' in session_time_range:
        start_time, end_time = session_time_range.split('-')
        start_hour, start_min = int(start_time.split(':')[0]), int(start_time.split(':')[1])
        end_hour, end_min = int(end_time.split(':')[0]), int(end_time.split(':')[1])
        
        start_minutes = start_hour * 60 + start_min
        end_minutes = end_hour * 60 + end_min
        
        return start_minutes <= file_minutes_total < end_minutes
    
    return False

def map_file_to_session(file_info, session_groups):
    """Map a file to its team and session based on date and time."""
    file_date = file_info['date']
    file_time = file_info['time']
    
    for team_name, sessions in session_groups.items():
        for session_name, session_info in sessions.items():
            if session_info['date'] == file_date and time_matches(file_time, session_info['time']):
                return team_name, session_name
    
    return None, None

# Map each file
df_files['team'] = None
df_files['session'] = None

for idx, row in df_files.iterrows():
    team, session = map_file_to_session(row, session_groups)
    df_files.at[idx, 'team'] = team
    df_files.at[idx, 'session'] = session

print(f"Mapped: {df_files['team'].notna().sum()} / {len(df_files)} files")
df_files.head(10)

Mapped: 93 / 99 files


Unnamed: 0,filename,date,time,user_id,mmdd,hhmm,team,session
0,preprocessing_0816_1400_9M4VCHG.json,08/16/2021,14:00,9M4VCHG,816,1400,Team 12,Session 3
1,preprocessing_0924_1600_9M4VCHG.json,09/24/2021,16:00,9M4VCHG,924,1600,Team 24,Session 3
2,preprocessing_0825_1000_9M4VCHG.json,08/25/2021,10:00,9M4VCHG,825,1000,Team 17,Session 2
3,preprocessing_0813_1600_539136F.json,08/13/2021,16:00,539136F,813,1600,Team 12,Session 2
4,preprocessing_0826_1000_539136F.json,08/26/2021,10:00,539136F,826,1000,Team 17,Session 3
5,preprocessing_0824_1600_9M4VCHG.json,08/24/2021,16:00,9M4VCHG,824,1600,Team 19,Session 1
6,preprocessing_0817_1000_539136F.json,08/17/2021,10:00,539136F,817,1000,Team 14,Session 2
7,preprocessing_0923_1000_539136F.json,09/23/2021,10:00,539136F,923,1000,Team 23,Session 3
8,preprocessing_0727_1400_A6I5HI6.json,07/27/2021,14:00,A6I5HI6,727,1400,Team 4,Session 1
9,preprocessing_0831_1000_U9TEJGM.json,08/31/2021,10:00,U9TEJGM,831,1000,Team 20,Session 2


## Compute Visit Numbers

Renumber sessions chronologically: first visit = 1, second = 2, third = 3

In [26]:
# Add visit_number column
df_files['visit_number'] = None

for team in df_files[df_files['team'].notna()]['team'].unique():
    team_data = df_files[df_files['team'] == team]
    
    # Get unique sessions sorted by date
    team_sessions = team_data.groupby('session')['date'].first().reset_index()
    team_sessions['date_parsed'] = pd.to_datetime(team_sessions['date'], format='%m/%d/%Y')
    team_sessions = team_sessions.sort_values('date_parsed')
    
    # Create session to visit number mapping
    session_to_visit = {session: idx + 1 for idx, session in enumerate(team_sessions['session'])}
    
    # Apply visit numbers
    for session, visit_num in session_to_visit.items():
        mask = (df_files['team'] == team) & (df_files['session'] == session)
        df_files.loc[mask, 'visit_number'] = visit_num

print(f"Visit number distribution:")
print(df_files[df_files['visit_number'].notna()]['visit_number'].value_counts().sort_index())
print(f"\nSample data:")
df_files[['filename', 'user_id', 'team', 'session', 'visit_number']].head(10)

Visit number distribution:
visit_number
1    51
2    34
3     8
Name: count, dtype: int64

Sample data:


Unnamed: 0,filename,user_id,team,session,visit_number
0,preprocessing_0816_1400_9M4VCHG.json,9M4VCHG,Team 12,Session 3,2
1,preprocessing_0924_1600_9M4VCHG.json,9M4VCHG,Team 24,Session 3,3
2,preprocessing_0825_1000_9M4VCHG.json,9M4VCHG,Team 17,Session 2,2
3,preprocessing_0813_1600_539136F.json,539136F,Team 12,Session 2,1
4,preprocessing_0826_1000_539136F.json,539136F,Team 17,Session 3,3
5,preprocessing_0824_1600_9M4VCHG.json,9M4VCHG,Team 19,Session 1,1
6,preprocessing_0817_1000_539136F.json,539136F,Team 14,Session 2,1
7,preprocessing_0923_1000_539136F.json,539136F,Team 23,Session 3,2
8,preprocessing_0727_1400_A6I5HI6.json,A6I5HI6,Team 4,Session 1,1
9,preprocessing_0831_1000_U9TEJGM.json,U9TEJGM,Team 20,Session 2,1


## Export Session Mapping

In [27]:
# Export to CSV
output_file = '../data/results/session_mapping.csv'
df_files.to_csv(output_file, index=False)
print(f"✓ Saved to: {output_file}")
print(f"\nColumns: {df_files.columns.tolist()}")

✓ Saved to: ../data/results/session_mapping.csv

Columns: ['filename', 'date', 'time', 'user_id', 'mmdd', 'hhmm', 'team', 'session', 'visit_number']
