# Exploratory Data Analysis - BFRB Detection

This notebook explores the dataset of the Child Mind Institute competition on detecting body-focused repetitive behaviors (BFRBs) from wrist-worn sensor data. We'll examine:

1. Subject demographics
2. A small sample of training sequences
3. Time series patterns from the IMU sensors
4. Time series patterns from the thermopile sensors
5. Visualizations of ToF data as images
6. Class distribution
7. Missing data patterns

In [None]:
# Import required libraries
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from tqdm.notebook import tqdm
import sys

# Add the parent directory to path to import our modules
sys.path.append('..')
from src.preprocessing import SensorDataProcessor

# Set plot style
# NOTE(review): the figure.figsize override below is deliberately placed after
# plt.style.use(); a style sheet may define its own figure size, in which case
# reversing the order would discard the override — TODO confirm for this style.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic (valid only inside a notebook/IPython session)
%matplotlib inline
# Default size for figures that do not pass an explicit figsize
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Path to the competition data.
# Set the CMI_ZIP_PATH environment variable to point at your own copy of the
# archive; when it is unset, the original hard-coded location is used, so
# existing setups keep working unchanged.
ZIP_PATH = os.environ.get(
    'CMI_ZIP_PATH',
    '/Users/shail/Downloads/cmi-detect-behavior-with-sensor-data.zip',
)

# Check if file exists and report its size in MB.
# This only prints a hint; later cells that open ZIP_PATH will still fail
# if the archive is missing.
if os.path.exists(ZIP_PATH):
    print(f"Zip file found: {os.path.getsize(ZIP_PATH) / (1024*1024):.2f} MB")
else:
    print("Zip file not found. Please update the path.")

## 1. Examine Demographics Data

Let's first look at the demographics data to understand who the subjects are.

In [None]:
# Build the processor on top of the competition archive
processor = SensorDataProcessor(ZIP_PATH)

# Fetch the train/test demographics tables
train_demo, test_demo = processor.load_demographics()

# Show a header, the first rows, and the shape of each table in turn
for banner, demo_df in (
    ("=== Training Demographics ===\n", train_demo),
    ("\n=== Test Demographics ===\n", test_demo),
):
    print(banner)
    display(demo_df.head())
    print(f"\nShape: {demo_df.shape}")

In [None]:
# Basic statistics for demographics data
# Kept as the last bare expression of the cell so the notebook renders the
# resulting summary table as the cell output.
train_demo.describe()

## 2. Get Sample Data

In [None]:
# Function to get sample data while being memory efficient
def get_sample_data(num_sequences=5):
    """Load a few complete training sequences plus one metadata row per sequence.

    Args:
        num_sequences: How many sequence ids (taken from the front of the id
            list returned by the processor) to extract in full.

    Returns:
        A ``(sequences, metadata)`` tuple. ``sequences`` is a list with one
        entry per sampled id, as returned by ``processor.extract_sequence``.
        ``metadata`` is a DataFrame with the columns ``sequence_id``,
        ``gesture``, ``sequence_type`` and ``behavior`` holding exactly one
        row (the first encountered) per ``sequence_id`` in ``train.csv``.
    """
    # NOTE(review): max_chunks presumably bounds how much of train.csv is
    # scanned for ids — confirm against SensorDataProcessor.
    sequence_ids = processor.get_sequence_ids('train.csv', max_chunks=10)
    sample_ids = sequence_ids[:num_sequences]

    sequences = []
    for seq_id in tqdm(sample_ids):
        seq_df = processor.extract_sequence('train.csv', seq_id)
        sequences.append(seq_df)

    # Extract metadata for all sequences for class distribution analysis
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        with zip_ref.open('train.csv') as f:
            # Read just the necessary columns, in chunks to bound memory use
            chunks = pd.read_csv(
                f,
                usecols=['sequence_id', 'gesture', 'sequence_type', 'behavior'],
                chunksize=100000
            )
            # The chunk reader is lazy, so it must be consumed while the
            # file handle is still open. De-duplicating each chunk first
            # keeps the concatenated frame small.
            metadata = pd.concat([chunk.drop_duplicates('sequence_id') for chunk in chunks])

    # A sequence whose rows straddle a chunk boundary survives the per-chunk
    # de-duplication once in each chunk; drop those repeats so every sequence
    # is counted exactly once.
    metadata = metadata.drop_duplicates('sequence_id')

    return sequences, metadata

# Get sample sequences and metadata
# sample_sequences: list of the first 5 extracted sequences;
# metadata: per-sequence label columns used by the class distribution plots.
sample_sequences, metadata = get_sample_data(num_sequences=5)

In [None]:
# Summarize the first sampled sequence: shape, identifying fields, and a preview
first_sequence = sample_sequences[0]

print("Sample sequence information:")
print(f"Shape: {first_sequence.shape}")
# Each identifying field is constant within a sequence, so read it from row 0
for field_label, field_column in (
    ("Sequence ID", 'sequence_id'),
    ("Gesture", 'gesture'),
    ("Sequence type", 'sequence_type'),
    ("Subject", 'subject'),
):
    print(f"{field_label}: {first_sequence[field_column].iloc[0]}")

print("\nFirst few rows:")
preview_columns = ['sequence_counter', 'behavior', 'acc_x', 'acc_y', 'acc_z']
display(first_sequence[preview_columns].head())

## 3. IMU Data Visualization

In [None]:
def plot_imu_data(sequence_df, title):
    """Draw the IMU channels of one sequence in two stacked panels.

    The top panel shows ``acc_x``/``acc_y``/``acc_z`` and the bottom panel
    shows ``rot_w``/``rot_x``/``rot_y``/``rot_z``, each plotted against
    ``sequence_counter``.

    Args:
        sequence_df: DataFrame holding ``sequence_counter`` and the seven
            IMU columns listed above.
        title: Text appended to the accelerometer panel title.
    """
    fig, axs = plt.subplots(2, 1, figsize=(14, 10))

    # One entry per panel: (panel title, columns drawn in that panel)
    panels = (
        (f'Accelerometer Data - {title}', ('acc_x', 'acc_y', 'acc_z')),
        ('Rotation Data', ('rot_w', 'rot_x', 'rot_y', 'rot_z')),
    )

    for ax, (panel_title, channels) in zip(axs, panels):
        for channel in channels:
            ax.plot(sequence_df['sequence_counter'], sequence_df[channel], label=channel)
        ax.set_title(panel_title)
        ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Split the sampled sequences by their sequence_type label (read from row 0)
target_sequences = []
non_target_sequences = []
for seq in sample_sequences:
    seq_type = seq['sequence_type'].iloc[0]
    if seq_type == 'Target':
        target_sequences.append(seq)
    elif seq_type == 'Non-Target':
        non_target_sequences.append(seq)

# Plot the first sequence of each kind, when the sample contains one
if target_sequences:
    bfrb_example = target_sequences[0]
    plot_imu_data(bfrb_example, f"BFRB - {bfrb_example['gesture'].iloc[0]}")

if non_target_sequences:
    non_bfrb_example = non_target_sequences[0]
    plot_imu_data(non_bfrb_example, f"Non-BFRB - {non_bfrb_example['gesture'].iloc[0]}")

## 4. Thermopile Data Visualization

In [None]:
def plot_thermopile_data(sequence_df, title):
    """Plot the five thermopile channels of one sequence on a single figure.

    Args:
        sequence_df: DataFrame holding ``sequence_counter`` and the columns
            ``thm_1`` through ``thm_5``.
        title: Text appended to the figure title.
    """
    plt.figure(figsize=(14, 6))

    # One line per thermopile sensor, numbered 1 through 5
    for sensor_idx in (1, 2, 3, 4, 5):
        readings = sequence_df[f'thm_{sensor_idx}']
        plt.plot(sequence_df['sequence_counter'], readings, label=f'Thermopile {sensor_idx}')

    plt.title(f'Thermopile Data - {title}')
    plt.xlabel('Sequence Counter')
    plt.ylabel('Temperature (°C)')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Thermopile traces for the first BFRB sequence in the sample, if there is one
if target_sequences:
    thermo_example = target_sequences[0]
    plot_thermopile_data(thermo_example, f"BFRB - {thermo_example['gesture'].iloc[0]}")

## 5. Time-of-Flight Data Visualization

In [None]:
def visualize_tof_frame(sequence_df, time_step, title):
    """Show the five ToF sensors at one time step as side-by-side 8x8 heatmaps.

    For each sensor 1..5 the columns ``tof_<sensor>_v0`` .. ``tof_<sensor>_v63``
    of the selected row are laid out row by row into an 8x8 grid. Readings
    equal to -1 are shown as NaN (blank) instead of as a value.

    Args:
        sequence_df: DataFrame containing the ``tof_*`` columns.
        time_step: Positional row index (passed to ``.iloc``) of the frame.
        title: Text used as the prefix of the figure's super-title.
    """
    fig, axs = plt.subplots(1, 5, figsize=(20, 4))

    frame = sequence_df.iloc[time_step]

    for ax, sensor in zip(axs, range(1, 6)):
        # Gather the 64 pixel readings in index order; reshape is row-major,
        # so pixel i lands at (i // 8, i % 8)
        pixels = [frame[f'tof_{sensor}_v{i}'] for i in range(64)]
        grid = np.array(pixels, dtype=float).reshape(8, 8)

        # Replace -1 with NaN for better visualization
        grid[grid == -1] = np.nan

        # Plot as heatmap with its own colorbar
        heatmap = ax.imshow(grid, cmap='viridis')
        ax.set_title(f'ToF Sensor {sensor}')
        fig.colorbar(heatmap, ax=ax)

    plt.suptitle(f'{title} - Time step {time_step}')
    plt.tight_layout()
    plt.show()

In [None]:
# Show ToF frame 10 of the first BFRB sequence, provided that frame exists
if target_sequences:
    tof_example = target_sequences[0]
    if len(tof_example) > 10:
        visualize_tof_frame(tof_example, 10, f"BFRB - {tof_example['gesture'].iloc[0]}")

## 6. Class Distribution Analysis

In [None]:
# Count sequences per gesture and per sequence type
gesture_counts = metadata['gesture'].value_counts()
seq_type_counts = metadata['sequence_type'].value_counts()

# One bar chart per distribution:
# (counts, figure size, title, x-axis label, rotate tick labels?)
for counts, fig_size, heading, x_label, rotate_ticks in (
    (gesture_counts, (12, 8), 'Gesture Distribution', 'Gesture', True),
    (seq_type_counts, (8, 6), 'Target vs Non-target Distribution', 'Sequence Type', False),
):
    plt.figure(figsize=fig_size)
    sns.barplot(x=counts.index, y=counts.values)
    plt.title(heading)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    if rotate_ticks:
        # Only the gesture chart rotates its tick labels and re-fits the layout
        plt.xticks(rotation=90)
        plt.tight_layout()
    plt.show()

## 7. Missing Data Analysis

In [None]:
# Missingness matrix for the IMU and thermopile columns of the first sampled sequence
if sample_sequences:
    sensor_cols = (
        [f'acc_{axis}' for axis in 'xyz']
        + [f'rot_{axis}' for axis in 'wxyz']
        + [f'thm_{idx}' for idx in range(1, 6)]
    )

    first_sequence_sensors = sample_sequences[0][sensor_cols]
    msno.matrix(first_sequence_sensors)
    plt.title('Missing Data Pattern')
    plt.show()