# Child Mind Institute - BFRB Detection
## Final Submission with Parquet Output

In [None]:
# Import necessary libraries
import os
import zipfile
import pandas as pd
import numpy as np
import gc
from tqdm.notebook import tqdm

# For parquet support
try:
    import pyarrow
    import pyarrow.parquet as pq
    PARQUET_AVAILABLE = True
except ImportError:
    print("Installing pyarrow for parquet support...")
    !pip install pyarrow --quiet
    import pyarrow
    import pyarrow.parquet as pq
    PARQUET_AVAILABLE = True

# Path to the competition data archive (.zip); when no file exists at this path, mock data is used instead
ZIP_PATH = '/path/to/cmi-detect-behavior-with-sensor-data.zip'

In [None]:
class SensorProcessor:
    """Read sensor sequences from the competition zip and predict gestures.

    When no file exists at ``zip_path``, the data-access methods fall back to
    small mock values so the notebook still runs end to end. Predictions are
    placeholders derived from the sequence ID, not the output of a trained
    model.
    """

    # Gesture labels the placeholder predictor can assign to a target sequence.
    TARGET_GESTURES = (
        'hair_pull_scalp',
        'hair_pull_eyebrow',
        'skin_pick_face',
        'skin_pick_cuticle',
        'hair_pull_eyelash',
    )
    # Label returned for every sequence that is not treated as a target.
    NON_TARGET = 'non_target'

    def __init__(self, zip_path, chunksize=100_000):
        """Store the archive location and the CSV read granularity.

        Args:
            zip_path: Path to the zip archive holding the CSV files.
            chunksize: Number of CSV rows read per chunk when streaming.
        """
        self.zip_path = zip_path
        self.chunksize = chunksize

    @staticmethod
    def _sequence_number(sequence_id):
        """Return the integer found after the first ``_`` of the ID, or 0.

        IDs without an underscore, with a non-numeric second component, or
        that are not strings at all map to 0 instead of raising.
        """
        parts = str(sequence_id).split('_')
        if len(parts) < 2:
            return 0
        try:
            return int(parts[1])
        except ValueError:
            return 0

    def get_sequence_ids(self, file_path, max_chunks=None):
        """Return the sorted unique sequence IDs found in a CSV in the archive.

        Args:
            file_path: Name of the CSV member inside the zip archive.
            max_chunks: If given, only IDs from the first ``max_chunks``
                chunks are collected; ``None`` scans the whole file.

        Returns:
            A sorted list of unique IDs, or ten mock IDs (``SEQ_0001`` to
            ``SEQ_0010``) when the archive does not exist.
        """
        # Handle case where data isn't available
        if not os.path.exists(self.zip_path):
            print("Using mock sequence IDs.")
            return [f'SEQ_{i:04d}' for i in range(1, 11)]

        # Accumulate into a set so duplicates across chunks are never stored.
        sequence_ids = set()
        with zipfile.ZipFile(self.zip_path, 'r') as zip_ref:
            with zip_ref.open(file_path) as f:
                chunks = pd.read_csv(f, chunksize=self.chunksize)
                for i, chunk in enumerate(chunks):
                    if max_chunks is not None and i >= max_chunks:
                        break
                    sequence_ids.update(chunk['sequence_id'].unique())

        return sorted(sequence_ids)

    def extract_sequence(self, file_path, sequence_id):
        """Return all rows belonging to one sequence.

        Args:
            file_path: Name of the CSV member inside the zip archive.
            sequence_id: Value to match against the ``sequence_id`` column.

        Returns:
            A DataFrame with the matching rows (original row index kept),
            ``None`` when no row matches, or a mock ``{'is_target': bool}``
            dict when the archive does not exist.
        """
        # Handle case where data isn't available
        if not os.path.exists(self.zip_path):
            # Mock sequence: odd sequence numbers are treated as targets.
            seq_num = self._sequence_number(sequence_id)
            return {'is_target': seq_num % 2 == 1}

        # Collect the matching slice of every chunk and concatenate once at
        # the end; concatenating inside the loop re-copies earlier rows.
        matches = []
        with zipfile.ZipFile(self.zip_path, 'r') as zip_ref:
            with zip_ref.open(file_path) as f:
                for chunk in pd.read_csv(f, chunksize=self.chunksize):
                    seq_data = chunk[chunk['sequence_id'] == sequence_id]
                    if len(seq_data) > 0:
                        matches.append(seq_data)

        if not matches:
            return None
        return pd.concat(matches)

    def predict_gesture(self, sequence_data):
        """Return a placeholder gesture label for a sequence.

        Args:
            sequence_data: A mock dict with an ``is_target`` key, a DataFrame
                with a ``sequence_id`` column, or anything else (e.g.
                ``None``).

        Returns:
            For a mock dict: a randomly chosen target gesture when
            ``is_target`` is truthy, otherwise ``'non_target'``. For a
            DataFrame: a gesture picked deterministically from the sequence
            number when that number is odd, otherwise ``'non_target'``.
            Any other input, and an empty DataFrame, yields ``'non_target'``.
        """
        # For real implementation, use a trained model
        # This is just a placeholder implementation

        # If we have a mock sequence
        if isinstance(sequence_data, dict) and 'is_target' in sequence_data:
            if sequence_data['is_target']:
                return np.random.choice(self.TARGET_GESTURES)
            return self.NON_TARGET

        # If we have real sequence data
        if isinstance(sequence_data, pd.DataFrame):
            # An empty frame has no first row to read the ID from.
            if sequence_data.empty:
                return self.NON_TARGET

            seq_num = self._sequence_number(sequence_data['sequence_id'].iloc[0])

            # Simple logic for demonstration: odd numbers are targets.
            # In a real implementation, use extracted features and a model.
            if seq_num % 2 == 1:
                return self.TARGET_GESTURES[seq_num % len(self.TARGET_GESTURES)]
            return self.NON_TARGET

        # Default fallback
        return self.NON_TARGET

In [None]:
def generate_submission(max_sequences=10, max_chunks=1,
                        output_file='submission.parquet'):
    """Generate a submission file in the required parquet format.

    Builds a ``SensorProcessor`` for ``ZIP_PATH``, predicts a gesture for
    each selected sequence in ``test.csv``, writes the predictions to
    ``output_file`` and reads the file back as a sanity check.

    Args:
        max_sequences: Upper bound on the number of sequences processed;
            ``None`` processes every ID found. The default of 10 keeps the
            run short for demonstration.
        max_chunks: Number of CSV chunks scanned for sequence IDs; ``None``
            scans the whole file.
        output_file: Path of the parquet file to write.

    Returns:
        The submission DataFrame with columns ``sequence_id`` and
        ``gesture``.
    """
    processor = SensorProcessor(ZIP_PATH)

    # Get test sequence IDs (limited by default for demonstration)
    test_ids = processor.get_sequence_ids('test.csv', max_chunks=max_chunks)
    test_ids = test_ids[:max_sequences]
    print(f"Processing {len(test_ids)} test sequences...")

    # Process each sequence
    results = []
    for seq_id in tqdm(test_ids):
        # Extract sequence
        sequence_data = processor.extract_sequence('test.csv', seq_id)

        # Make prediction
        predicted_gesture = processor.predict_gesture(sequence_data)

        # Store result
        results.append({
            'sequence_id': seq_id,
            'gesture': predicted_gesture
        })

        # Free memory
        del sequence_data
        gc.collect()

    # Name the columns explicitly so the frame (and the parquet schema) keeps
    # both columns even when no sequences were processed.
    submission_df = pd.DataFrame(results, columns=['sequence_id', 'gesture'])

    # Display preview
    print("Preview of submission:")
    display(submission_df.head())

    # Save as parquet
    submission_df.to_parquet(output_file, index=False)

    print(f"Submission saved to {output_file}")

    # Verify parquet file was created
    if os.path.exists(output_file):
        file_size = os.path.getsize(output_file) / 1024  # Size in KB
        print(f"Verified: {output_file} exists ({file_size:.2f} KB)")

        # Read back to verify the file is a loadable parquet table
        round_trip_df = pd.read_parquet(output_file)
        print(f"Successfully read back parquet file with {len(round_trip_df)} rows")

    return submission_df

In [None]:
# Main execution cell: run this to build the predictions and write
# submission.parquet (a relative path, so it lands in the working directory).
submission_df = generate_submission()

# Show the final result again: info() prints the column summary, and the bare
# trailing expression makes the notebook render the frame as the cell output.
print("\nFinal submission format:")
submission_df.info()
submission_df