In [7]:
import os
import pandas as pd
import numpy as np
import multiprocessing
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from pyts.image import GramianAngularField
import shutil
import logging
from tqdm import tqdm

# Configure logging: INFO level, "<timestamp> - <level> - <message>" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration used by the setup code below.
# NOTE(review): in the visible notebook CONFIG['BASE_FOLDER'] is only logged and
# CONFIG['BATCH_SIZE'] is never read; the data cells define their own BASE_FOLDER.
CONFIG = {
    'RANDOM_SEED': 42,
    'GPU_ID': 0,
    'USE_GPU': True,
    'BASE_FOLDER': os.path.join("..", "input", "tlvmc-parkinsons-freezing-gait-prediction"),
    'BATCH_SIZE': 32
}

# Set random seeds for reproducibility (Python, NumPy and TensorFlow generators).
random.seed(CONFIG['RANDOM_SEED'])
np.random.seed(CONFIG['RANDOM_SEED'])
tf.random.set_seed(CONFIG['RANDOM_SEED'])
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably only affects child processes — confirm this is intended.
os.environ['PYTHONHASHSEED'] = str(CONFIG['RANDOM_SEED'])

# GPU Setup: pick '/GPU:<GPU_ID>' when a GPU is visible and enabled, else '/CPU:0'.
try:
    if tf.config.list_physical_devices('GPU') and CONFIG['USE_GPU']:
        gpus = tf.config.list_physical_devices('GPU')
        
        # Configure GPU memory growth on every visible GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        # Indexing raises IndexError when GPU_ID exceeds the number of GPUs;
        # that case is handled by the CPU fallback in the except branch.
        gpu_name = gpus[CONFIG['GPU_ID']].name
        device = f'/GPU:{CONFIG["GPU_ID"]}'
        logger.info(f"Successfully configured GPU - {gpu_name}")
    else:
        device = '/CPU:0'
        logger.info("No GPU available or disabled. Using CPU.")
except Exception as e:
    # Any failure during GPU configuration falls back to the CPU.
    device = '/CPU:0'
    logger.warning(f"GPU initialization failed: {str(e)}")
    logger.info("Falling back to CPU")

# Set global device variable
DEVICE = device

# Logical CPU count as reported by multiprocessing.
N_CPU_CORES = multiprocessing.cpu_count()

logger.info(f"Using device: {DEVICE}")
logger.info(f"Number of CPU cores available: {N_CPU_CORES}")
logger.info(f"Base folder: {CONFIG['BASE_FOLDER']}")

from IPython.display import display, HTML

# Render a small styled HTML table summarising the chosen device, the CPU core
# count and the random seed. The values are spliced into the markup by string
# concatenation.
display(HTML("""
<style>
    .config-table {
        border-collapse: collapse;
        margin: 10px 0;
        font-size: 0.9em;
        font-family: sans-serif;
        min-width: 400px;
        box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
    }
    .config-table thead tr {
        background-color: #009879;
        color: #ffffff;
        text-align: left;
    }
    .config-table th,
    .config-table td {
        padding: 12px 15px;
    }
    .config-table tbody tr {
        border-bottom: 1px solid #dddddd;
    }
</style>
<table class="config-table">
    <thead>
        <tr>
            <th>Configuration</th>
            <th>Value</th>
        </tr>
    </thead>
    <tbody>
        <tr><td>Device</td><td>""" + DEVICE + """</td></tr>
        <tr><td>CPU Cores</td><td>""" + str(N_CPU_CORES) + """</td></tr>
        <tr><td>Random Seed</td><td>""" + str(CONFIG['RANDOM_SEED']) + """</td></tr>
    </tbody>
</table>
"""))

# The body of this device scope is empty (`pass`), so no operations are created
# inside it in this cell.
with tf.device(DEVICE):
    pass

2024-11-13 04:26:52,919 - INFO - Successfully configured GPU - /physical_device:GPU:0
2024-11-13 04:26:52,921 - INFO - Using device: /GPU:0
2024-11-13 04:26:52,921 - INFO - Number of CPU cores available: 12
2024-11-13 04:26:52,922 - INFO - Base folder: ..\input\tlvmc-parkinsons-freezing-gait-prediction


Configuration,Value
Device,/GPU:0
CPU Cores,12
Random Seed,42


In [8]:
def reduce_memory_usage(df, use_float16=True):
    """
    Reduce memory usage of a pandas DataFrame by optimizing data types.
    Reference: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 @ARJANGROEN

    The frame is modified in place and also returned. Per column:

    * object / pandas string columns are converted to ``category``;
    * signed and unsigned integer columns are narrowed to the smallest
      integer type of the same signedness that holds their min and max
      (bounds are inclusive);
    * float columns are narrowed to float16 (optional) or float32 when their
      range fits;
    * every other dtype (bool, datetime, timedelta, category, nullable
      extension types, ...) is left untouched.

    Columns are never widened.

    Args:
        df (pd.DataFrame): Input DataFrame (mutated in place)
        use_float16 (bool): Allow narrowing floats to float16. float16 keeps
            only about three significant decimal digits, so pass False when
            that precision loss is not acceptable. Defaults to True, which
            matches the previous behaviour.

    Returns:
        pd.DataFrame: Memory-optimized DataFrame (the same object as ``df``)
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_dtype = df[col].dtype

        # Text-like columns become categoricals.
        if col_dtype.name == 'object' or isinstance(col_dtype, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / unsigned / float columns are narrowed.
        # Anything else would either break on min()/max() comparisons or be
        # silently turned into floats.
        if not isinstance(col_dtype, np.dtype) or col_dtype.kind not in 'iuf':
            continue

        if col_dtype.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif col_dtype.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        else:
            candidates, limits_of = float_types, np.finfo

        # min()/max() return NaN for empty or all-NaN columns; every comparison
        # below is then False and the column keeps its dtype.
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            candidate = np.dtype(candidate)
            if candidate.itemsize >= col_dtype.itemsize:
                break  # never widen, and skip no-op casts
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: {:.2f} MB".format(mem_usg))
    # Guard against frames that report zero bytes.
    reduction = 100 * (start_mem - mem_usg) / start_mem if start_mem > 0 else 0.0
    print("Memory reduced by {:.1f}%".format(reduction))

    return df

In [None]:
DOWN_SAMPLE = True
FROM_FREQ = 128  # Original sampling frequency
TO_FREQ = 64    # Target sampling frequency
FREQ_RATIO = FROM_FREQ // TO_FREQ  # Consecutive rows collapsed into one when down-sampling

def load_tdcsfog_data(base_folder):
    """
    Load and downsample TDCSFOG dataset.

    Every file found under ``<base_folder>/tdcsfog`` (recursively) is read
    with ``pd.read_csv``. When ``DOWN_SAMPLE`` is true, each run of
    ``FREQ_RATIO`` consecutive rows is collapsed into one row holding the
    column-wise maximum. A ``file`` column with the file name up to its first
    dot is added, and all frames are concatenated once at the end.

    Files are processed in sorted path order so the row order of the result
    is reproducible. A file that fails to load is logged and skipped.

    Args:
        base_folder (str): Base path to the dataset

    Returns:
        pd.DataFrame: Concatenated and processed dataset; an empty DataFrame
        when no file could be loaded. The per-file row index is kept as is.
    """
    DATA_ROOT_TDCSFOG = os.path.join(base_folder, 'tdcsfog')

    files_list = sorted(
        os.path.join(root, name)
        for root, _, files in os.walk(DATA_ROOT_TDCSFOG)
        for name in files
    )

    logger.info(f"Found {len(files_list)} files to process")

    # Collect the per-file frames and concatenate once: concatenating inside
    # the loop copies the accumulated frame on every iteration.
    frames = []
    for file_path in tqdm(files_list, desc='Loading TDCSFOG data'):
        try:
            recording = pd.read_csv(file_path)
            file_name = os.path.basename(file_path).split('.')[0]

            if DOWN_SAMPLE:
                recording = recording.groupby(np.arange(len(recording)) // FREQ_RATIO).max()

            recording['file'] = file_name
            frames.append(recording)

        except Exception as e:
            logger.error(f"Error processing file {file_path}: {str(e)}")

    tdcsfog = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    logger.info(f"Loaded dataset with shape: {tdcsfog.shape}")
    return tdcsfog

# Root folder of the CSV inputs. This module-level BASE_FOLDER is the one read by
# the loader call below and by the metadata merge; CONFIG['BASE_FOLDER'] from the
# setup cell is not used for loading.
BASE_FOLDER = 'data/csv'
# Load the data
tdcsfog = load_tdcsfog_data(BASE_FOLDER)

In [12]:
# Preview the first rows of the loaded frame.
display(tdcsfog.head())

Unnamed: 0,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking,file
0,1,-9.533939,0.566322,-1.413525,0,0,0,003f117e14
1,3,-9.529345,0.564227,-1.41549,0,0,0,003f117e14
2,5,-9.536585,0.561854,-1.413949,0,0,0,003f117e14
3,7,-9.524494,0.552772,-1.413802,0,0,0,003f117e14
4,9,-9.529338,0.55296,-1.415914,0,0,0,003f117e14


In [None]:
# Downcast column dtypes; reduce_memory_usage modifies the frame in place and returns it.
tdcsfog = reduce_memory_usage(tdcsfog)

In [13]:
def get_merged_tdcsfog_with_meta_data(tdcsfog: pd.DataFrame) -> pd.DataFrame:
    """
    Join the recordings with ``tdcsfog_metadata.csv`` and order the result.

    The metadata file is read from the module-level ``BASE_FOLDER``. Metadata
    rows are inner-joined to the recordings on ``Id`` == ``file`` (validated
    as one-to-many), the now redundant ``file`` column is dropped, and the
    rows are sorted by ``Id``, ``Subject`` and ``Time`` with a fresh index.
    A warning is logged when the merged row count differs from the input.

    Args:
        tdcsfog (pd.DataFrame): The main TDCSFOG dataset

    Returns:
        pd.DataFrame: Merged and sorted dataset

    Raises:
        FileNotFoundError: If the metadata file is missing (logged, re-raised).
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        metadata_path = os.path.join(BASE_FOLDER, "tdcsfog_metadata.csv")
        tdcsfog_metadata = pd.read_csv(metadata_path)

        logger.info(f"Metadata shape before merge: {tdcsfog_metadata.shape}")
        logger.info(f"TDCSFOG shape before merge: {tdcsfog.shape}")

        # Merge, drop the duplicate key column and sort in one chained pass.
        tdcsfog_merged = (
            pd.merge(
                tdcsfog_metadata,
                tdcsfog,
                how='inner',
                left_on='Id',
                right_on='file',
                validate='1:m',
            )
            .drop(columns=['file'])
            .sort_values(by=['Id', 'Subject', 'Time'], ignore_index=True)
        )

        logger.info(f"Final merged shape: {tdcsfog_merged.shape}")

        # An inner join silently drops recordings without metadata; surface that.
        expected_rows = len(tdcsfog)
        if expected_rows != len(tdcsfog_merged):
            logger.warning(
                f"Row count mismatch! Expected {expected_rows}, got {len(tdcsfog_merged)}"
            )

        return tdcsfog_merged

    except FileNotFoundError:
        logger.error(f"Metadata file not found at {metadata_path}")
        raise
    except Exception as e:
        logger.error(f"Error during merge operation: {str(e)}")
        raise

# Run the merge and preview the result.
# Note: a failure is only logged here, so `tdcsfog_merged` stays undefined (or keeps
# the value from an earlier run) and the cells that use it afterwards will fail.
try:
    tdcsfog_merged = get_merged_tdcsfog_with_meta_data(tdcsfog)
    display(tdcsfog_merged.head())
except Exception as e:
    logger.error(f"Failed to merge data: {str(e)}")

2024-11-13 04:29:21,184 - INFO - Metadata shape before merge: (833, 5)
2024-11-13 04:29:21,184 - INFO - TDCSFOG shape before merge: (3531552, 8)
2024-11-13 04:29:22,996 - INFO - Final merged shape: (3531552, 12)


Unnamed: 0,Id,Subject,Visit,Test,Medication,Time,AccV,AccML,AccAP,StartHesitation,Turn,Walking
0,003f117e14,4dc2f8,3,2,on,1,-9.533939,0.566322,-1.413525,0,0,0
1,003f117e14,4dc2f8,3,2,on,3,-9.529345,0.564227,-1.41549,0,0,0
2,003f117e14,4dc2f8,3,2,on,5,-9.536585,0.561854,-1.413949,0,0,0
3,003f117e14,4dc2f8,3,2,on,7,-9.524494,0.552772,-1.413802,0,0,0
4,003f117e14,4dc2f8,3,2,on,9,-9.529338,0.55296,-1.415914,0,0,0


In [14]:
# Alias (not a copy) of the merged frame.
# NOTE(review): `tdcsfog_m` is not referenced again in the visible notebook.
tdcsfog_m = tdcsfog_merged

In [15]:
def preprocess_event_and_label(df_input: pd.DataFrame, drop_unnecessary: bool = False) -> pd.DataFrame:
    """
    Derive numeric ``Event``, ``Label`` and ``Medication`` columns.

    Works on a copy of ``df_input``. Each row is named after the first of
    ``StartHesitation``, ``Turn``, ``Walking`` that equals 1, or ``Normal``
    when none does. That name is encoded twice: ``Label`` as 0-3
    (Normal, StartHesitation, Turn, Walking) and ``Event`` as 0 for Normal
    and 1 otherwise. ``Medication`` is mapped from 'off'/'on' to 0/1.
    The value counts of ``Label`` and ``Event`` are logged and displayed.

    Note: the pandas option ``future.no_silent_downcasting`` is switched on
    globally as a side effect.

    Args:
        df_input (pd.DataFrame): Input dataframe
        drop_unnecessary (bool): Whether to drop original event columns

    Returns:
        pd.DataFrame: Processed dataframe with encoded events and labels

    Raises:
        AssertionError: If any encoded column holds an unexpected value
            (for example a medication string other than 'off'/'on').
    """
    pd.set_option('future.no_silent_downcasting', True)

    event_columns = ['StartHesitation', 'Turn', 'Walking']
    medication_codes = {'off': 0, 'on': 1}
    label_codes = {'Normal': 0, 'StartHesitation': 1, 'Turn': 2, 'Walking': 3}
    event_codes = {'Normal': 0, 'StartHesitation': 1, 'Turn': 1, 'Walking': 1}

    df = df_input.copy()

    # First matching indicator wins; rows with no indicator set are 'Normal'.
    df['Event'] = np.select(
        [df[column] == 1 for column in event_columns],
        event_columns,
        default='Normal',
    )
    df['Label'] = df['Event'].copy()

    df['Medication'] = df['Medication'].map(medication_codes)
    df['Label'] = df['Label'].map(label_codes)
    df['Event'] = df['Event'].map(event_codes)

    # Display value distributions with gradient
    for column in ('Label', 'Event'):
        logger.info(f"{column} distribution:")
        display(df[column].value_counts().to_frame().style.background_gradient())

    # Optionally remove the raw indicator columns now that they are encoded.
    if drop_unnecessary:
        cols_to_drop = event_columns
        df = df.drop(columns=cols_to_drop)
        logger.info(f"Dropped columns: {cols_to_drop}")

    # Verify data integrity: unmapped inputs show up as NaN and fail here.
    allowed_values = (
        ('Event', [0, 1]),
        ('Label', [0, 1, 2, 3]),
        ('Medication', [0, 1]),
    )
    for column, allowed in allowed_values:
        assert df[column].isin(allowed).all(), f"Invalid values in {column} column"

    return df

# Encode events/labels and drop the raw indicator columns.
# Note: a failure is only logged, so `processed_df` may be undefined (or stale)
# for the cells that follow.
try:
    processed_df = preprocess_event_and_label(df_input=tdcsfog_merged, drop_unnecessary=True)
    logger.info(f"Processed dataframe shape: {processed_df.shape}")
except Exception as e:
    logger.error(f"Error during preprocessing: {str(e)}")

2024-11-13 04:29:33,354 - INFO - Label distribution:


Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,2435287
2,839866
1,152442
3,103957


2024-11-13 04:29:33,401 - INFO - Event distribution:


Unnamed: 0_level_0,count
Event,Unnamed: 1_level_1
0,2435287
1,1096265


2024-11-13 04:29:33,495 - INFO - Dropped columns: ['StartHesitation', 'Turn', 'Walking']
2024-11-13 04:29:33,573 - INFO - Processed dataframe shape: (3531552, 11)


In [16]:
# Alias (not a copy) of the preprocessed frame.
df_tdcs_meta_combined = processed_df

In [17]:
# Work on a copy so that changes to `df` do not affect df_tdcs_meta_combined.
df = df_tdcs_meta_combined.copy()

In [18]:
# Renumber the rows 0..n-1 in place, discarding the previous index.
df.reset_index(drop=True, inplace=True)

In [19]:
# Mean-subtracted accelerometer columns; one GAF image is produced per entry.
titles = ['AccV_mean', 'AccML_mean', 'AccAP_mean']
default_path = 'mean_subtract/gaf_images'

def create_image(df, window_no, majority_event, save_dir=default_path, no_image_creation=False):
    """
    Create Gramian Angular Field images for accelerometer data.

    One JPEG per column listed in ``titles`` is written to ``save_dir`` as
    ``img_<title>_<window_no>_<majority_event>.jpg``. The GAF image size
    equals the number of rows in ``df``.

    Args:
        df: DataFrame containing the columns named in ``titles``
        window_no: Window number identifier
        majority_event: Event type for the window
        save_dir: Directory to save images (created when missing)
        no_image_creation: If True, only returns filenames without creating images

    Returns:
        list: Image filenames (without directory), in the order of ``titles``
    """
    # The filenames are the same whether or not images are rendered.
    file_names = [f"img_{title}_{window_no}_{majority_event}.jpg" for title in titles]
    if no_image_creation:
        return file_names

    data = np.array([df[title].values for title in titles])

    # Create GAF transformation; one square image per input row of `data`.
    gasf = GramianAngularField(image_size=data.shape[1])
    gasf_transformed = gasf.fit_transform(data)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    for title, file_name, image in zip(titles, file_names, gasf_transformed):
        fig, ax = plt.subplots(figsize=(5, 5))
        try:
            ax.set_title(title)
            ax.imshow(image, cmap='viridis')
            ax.axis('off')
            fig.savefig(
                os.path.join(save_dir, file_name),
                format='jpg', dpi=200, bbox_inches='tight', pad_inches=0
            )
        finally:
            # Always release the figure: this function is called once per
            # window, so a leaked figure per failure adds up quickly.
            plt.close(fig)

    return file_names

In [21]:
def create_windowed_data(df, window_size=4*64):
    """
    Create windows from time series data with dynamic overlap.

    Rows are grouped by (Subject, Id, Visit) and each group is cut into
    windows of ``window_size`` rows; a trailing partial window is discarded.
    After a kept window whose majority ``Event`` is 1 the cursor advances by
    half a window (50% overlap); otherwise it advances by a full window.
    Windows in which event rows are present but in the minority are skipped.

    The strides are derived from ``window_size`` (previously they were fixed
    at 2*64 and 4*64 rows, which only matches the default window size).

    Args:
        df: Input DataFrame with Subject, Id, Visit, Event and Label columns
        window_size: Size of each window in samples (positive integer)

    Returns:
        tuple: (windowed_df, label_df, window_stats)
            windowed_df: rows of all kept windows plus ``Window`` (majority
                event) and ``Window_No`` (running window id)
            label_df: same rows, with ``Label`` overwritten by the window's
                majority label
            window_stats: counts of kept windows per event and per label
        When no window is kept, both frames are empty (with the expected
        columns) instead of raising.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    normal_stride = window_size              # no overlap after a non-event window
    event_stride = max(1, window_size // 2)  # 50% overlap after an event window
    label_stat_keys = {0: 'normal', 1: 'start_hesitation', 2: 'turn', 3: 'walking'}

    subgroups = []
    majority_label_subgroups = []

    window_stats = {
        'event_0': 0,
        'event_1': 0,
        'normal': 0,
        'start_hesitation': 0,
        'turn': 0,
        'walking': 0
    }

    window_no = 0

    for _, group in df.groupby(['Subject', 'Id', 'Visit']):
        group_len = len(group)
        i = 0

        # Stop as soon as a full window no longer fits in the group.
        while i + window_size <= group_len:
            subgroup = group.iloc[i:i + window_size].copy()

            event_counts = subgroup['Event'].value_counts()
            count_1s = event_counts.get(1, 0)
            count_0s = event_counts.get(0, 0)

            # Skip mixed windows with minority FOG events
            if 0 < count_1s < count_0s:
                i += window_size
                continue

            majority_event = event_counts.idxmax()
            majority_label = subgroup['Label'].value_counts().idxmax()

            label_subgroup = subgroup.copy()

            subgroup['Window'] = majority_event
            label_subgroup['Window'] = majority_event
            subgroup['Window_No'] = window_no
            label_subgroup['Window_No'] = window_no
            label_subgroup['Label'] = majority_label

            if majority_event == 0:
                window_stats['event_0'] += 1
            else:
                window_stats['event_1'] += 1

            stat_key = label_stat_keys.get(majority_label)
            if stat_key is not None:
                window_stats[stat_key] += 1

            subgroups.append(subgroup)
            majority_label_subgroups.append(label_subgroup)

            window_no += 1
            i += event_stride if majority_event == 1 else normal_stride

    logger.info(f"Created {window_no} windows")
    logger.info(f"Window statistics: {window_stats}")

    if not subgroups:
        # pd.concat([]) raises; return well-formed empty frames instead.
        empty_columns = [*df.columns, 'Window', 'Window_No']
        return pd.DataFrame(columns=empty_columns), pd.DataFrame(columns=empty_columns), window_stats

    windowed_df = pd.concat(subgroups).reset_index(drop=True)
    label_df = pd.concat(majority_label_subgroups).reset_index(drop=True)

    return windowed_df, label_df, window_stats

# Window the full frame with the default window size.
# new_df keeps the per-row labels; new_df_subgroups carries the majority label per window.
new_df, new_df_subgroups, stats = create_windowed_data(df)

# Display window statistics
for key, value in stats.items():
    print(f"{key}: {value}")

2024-11-13 04:33:15,903 - INFO - Created 16126 windows
2024-11-13 04:33:15,903 - INFO - Window statistics: {'event_0': 8035, 'event_1': 8091, 'normal': 8040, 'start_hesitation': 1135, 'turn': 6175, 'walking': 776}


event_0: 8035
event_1: 8091
normal: 8040
start_hesitation: 1135
turn: 6175
walking: 776


In [31]:
# When True, only the image filenames are computed; no image is rendered.
NO_IMAGE_CREATION = True
GAF_PATH = 'data/gaf_images'

def create_per_window_image_ranged(
    df,
    chunk_size=500,
    range_no=1,
    file_path=GAF_PATH,
    no_image_creation=None
):
    """
    Create GAF images for windows in specified range.

    The unique ``Window_No`` values are split into three consecutive ranges
    (the third takes the remainder). For every window of the selected range
    ``create_image`` is called, and the returned filenames are written into
    the ``GAF_<title>`` columns of that window's rows. ``df`` is modified in
    place and also returned.

    Rows are located through a single group-by index, and filenames are
    written once per chunk, instead of scanning the whole frame several
    times per window.

    Args:
        df: DataFrame containing window data (``Window_No`` and ``Window``)
        chunk_size: Number of windows to process at once
        range_no: Range number (1, 2, or 3)
        file_path: Directory to save images
        no_image_creation: If True only filenames are produced; if None
            (default) the module-level ``NO_IMAGE_CREATION`` flag is used.

    Returns:
        The same DataFrame, with the ``GAF_<title>`` columns filled for the
        processed windows.

    Raises:
        ValueError: If ``range_no`` is not 1, 2 or 3.
    """
    if no_image_creation is None:
        no_image_creation = NO_IMAGE_CREATION

    # Get unique windows
    window_col = df['Window_No']
    unique_windows = window_col.unique()
    total_windows = len(unique_windows)
    range_size = total_windows // 3

    # Calculate range boundaries
    range_mapping = {
        1: (0, range_size),
        2: (range_size, 2 * range_size),
        3: (2 * range_size, total_windows)
    }

    if range_no not in range_mapping:
        raise ValueError("Invalid range number. Must be 1, 2, or 3.")

    start_idx, end_idx = range_mapping[range_no]
    logger.info(f"Processing range {range_no}: {start_idx} to {end_idx}")

    # Get windows for the specified range
    range_windows = unique_windows[start_idx:end_idx]

    # Window id -> positional row numbers, computed in one pass over the frame.
    row_positions = df.groupby('Window_No', sort=False).indices

    # Process windows in chunks
    for chunk_start in tqdm(range(0, len(range_windows), chunk_size),
                          desc=f"Processing range {range_no}"):
        # Get current chunk
        chunk_windows = range_windows[chunk_start:chunk_start + chunk_size]

        # Window id -> filenames (ordered like `titles`) for this chunk.
        chunk_names = {}
        for window_id in chunk_windows:
            window_df = df.iloc[row_positions[window_id]]
            event_type = window_df['Window'].values[0]

            chunk_names[window_id] = create_image(
                window_df,
                window_id,
                event_type,
                no_image_creation=no_image_creation,
                save_dir=file_path
            )

        # Update DataFrame with image paths: one vectorised write per column.
        in_chunk = window_col.isin(chunk_windows)
        chunk_keys = window_col[in_chunk]
        for position, title in enumerate(titles):
            column = f'GAF_{title}'
            if column not in df.columns:
                # Object column so the filenames can be stored; untouched rows stay NaN.
                df[column] = pd.Series(np.nan, index=df.index, dtype=object)
            names_for_title = {
                window_id: names[position] for window_id, names in chunk_names.items()
            }
            df.loc[in_chunk, column] = chunk_keys.map(names_for_title).to_numpy()

    logger.info(f"Completed processing range {range_no}")
    return df

# Fill the GAF_* filename columns of new_df, one range at a time.
# Note: the loop (and the success message) only runs when NO_IMAGE_CREATION is
# True; when the flag is False this cell does nothing.
try:
    if NO_IMAGE_CREATION:
        # This will only set the image file names to the dataframe
        for i in range(1, 4):
            new_df = create_per_window_image_ranged(new_df, range_no=i)
        logger.info("Image creation completed successfully")
except Exception as e:
    logger.error(f"Error during image creation: {str(e)}")

2024-11-13 04:43:29,520 - INFO - Processing range 3: 10750 to 16126
Processing range 3: 100%|██████████| 11/11 [01:24<00:00,  7.70s/it]
2024-11-13 04:44:54,239 - INFO - Completed processing range 3
2024-11-13 04:44:54,239 - INFO - Image creation completed successfully


In [23]:
def save_df(df: pd.DataFrame, name: str, output_dir: str = "data/csv") -> None:
    """
    Save DataFrame to CSV file.

    The file is written to ``<output_dir>/<name>.csv`` without the index.
    The target directory is created when it does not exist yet.

    Args:
        df: DataFrame to save
        name: Name of output file (without .csv extension)
        output_dir: Directory to write into; defaults to "data/csv"

    Raises:
        Exception: Any failure while creating the directory or writing the
            file is logged with the real target path and re-raised.
    """
    output_path = os.path.join(output_dir, f"{name}.csv")
    try:
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_path, index=False)
        logger.info(f"Successfully saved DataFrame to {output_path}")
        logger.info(f"Shape: {df.shape}")
    except Exception as e:
        # Report the path that was actually attempted, not just "<name>.csv".
        logger.error(f"Error saving DataFrame to {output_path}: {str(e)}")
        raise

# Checkpoint 1 => Save what we computed so far

In [None]:
# Checkpoint: save_df writes this to data/csv/windowed_tdcsfog.csv.
save_df(new_df, 'windowed_tdcsfog')

In [None]:
# Checkpoint: the majority-label variant goes to data/csv/subgrouped_windowed_tdcsfog.csv.
save_df(new_df_subgroups, 'subgrouped_windowed_tdcsfog')

In [None]:
# Reload the checkpoint from the folder save_df() writes to ("data/csv");
# reading it from the working directory would not find the saved file.
new_df = pd.read_csv(os.path.join("data/csv", "windowed_tdcsfog.csv"), low_memory=False)

In [24]:
# Separate copy for the mean-subtraction step, which adds columns in place.
new_df_copy = new_df.copy()

In [25]:
def add_mean_subtraction_columns(
    df: pd.DataFrame,
    group_col: str = 'Window_No',
    target_cols: list = ['AccV', 'AccML', 'AccAP']
) -> pd.DataFrame:
    """
    Add per-group centred versions of the target columns.

    For each column ``c`` in ``target_cols`` a column ``c_mean`` is added that
    holds ``c`` minus the mean of ``c`` within the row's ``group_col`` group.
    Despite the suffix, the new columns contain the centred values, not the
    means. ``df`` is modified in place and returned.

    Args:
        df: Input DataFrame (mutated)
        group_col: Column to group by
        target_cols: List of columns to perform mean subtraction on

    Returns:
        The same DataFrame with the added mean-subtracted columns

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        # (source column, name of its centred counterpart)
        column_pairs = [(source, f"{source}_mean") for source in target_cols]

        for source, centred in column_pairs:
            group_means = df.groupby(group_col)[source].transform('mean')
            df[centred] = df[source].sub(group_means)

        new_columns = [centred for _, centred in column_pairs]
        logger.info(f"Added mean-subtracted columns: {new_columns}")
        return df

    except Exception as e:
        logger.error(f"Error in mean subtraction: {str(e)}")
        raise

In [26]:
# Adds AccV_mean / AccML_mean / AccAP_mean (values centred per Window_No) to new_df_copy.
new_df_copy = add_mean_subtraction_columns(new_df_copy)

2024-11-13 04:36:12,861 - INFO - Added mean-subtracted columns: ['AccV_mean', 'AccML_mean', 'AccAP_mean']


# Checkpoint 2 => Save for future use

In [None]:
# Checkpoint: written to data/csv/windowed_mean_subtracted_tdcsfog.csv.
save_df(new_df_copy, 'windowed_mean_subtracted_tdcsfog')

## Checkpoint 3 => Start from here to generate the images for each range after a restart

In [None]:
# Reload the mean-subtracted checkpoint written by save_df.
new_df_copy = pd.read_csv("data/csv/windowed_mean_subtracted_tdcsfog.csv", low_memory=False)

## Create images for ranges 1 to 3
Kaggle RAM will overflow and end the current session if we process all windows at once, so we process them one range at a time.
First run with range_no = 1; once it has finished, restart the session and start again from **Checkpoint 3**.
Then create the images for range_no = 2 and restart the session after completion.
Finally, do the same for range_no = 3.

In [None]:
# Process the first third of the windows. The function writes the GAF_* filename
# columns into new_df_copy in place; its return value is not assigned here.
# NOTE(review): images are only rendered when the module-level NO_IMAGE_CREATION
# flag is False — confirm the flag is set as intended before running this cell.
create_per_window_image_ranged(new_df_copy, range_no=1)

In [32]:
SHOW_UNIQUE_SUBJECTS = True

# Get unique subjects.
# The list is always computed because the split cell below needs
# `unique_subjects`; the flag only controls whether it is displayed.
try:
    unique_subjects = new_df['Subject'].unique()
    logger.info(f"Found {len(unique_subjects)} unique subjects")
    if SHOW_UNIQUE_SUBJECTS:
        display(unique_subjects)

except Exception as e:
    logger.error(f"Error getting unique subjects: {str(e)}")

def split_list(data: list, test_size: float = 0.2, seed=None) -> tuple[list, list]:
    """
    Split a list into training and test sets.

    The items are copied into a new list, shuffled, and cut after
    ``int((1 - test_size) * len(data))`` elements. The input is never
    modified. Any iterable is accepted (for example a NumPy array); both
    parts are always returned as plain lists.

    Args:
        data: Items to split
        test_size: Fraction of data to use for test set (between 0 and 1)
        seed: Optional integer seed. When given, a private random generator
            is used, so the result is reproducible and the module-level
            ``random`` state is left untouched. When None (default), the
            module-level ``random`` generator is used as before.

    Returns:
        tuple: (train_data, test_data)

    Raises:
        ValueError: If ``test_size`` is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    try:
        # Make copy to avoid modifying original data
        data_copy = list(data)

        # Shuffle the data
        shuffler = random.Random(seed) if seed is not None else random
        shuffler.shuffle(data_copy)

        # Calculate split point (truncates towards zero)
        train_size = int((1 - test_size) * len(data_copy))

        # Split data
        train_data = data_copy[:train_size]
        test_data = data_copy[train_size:]

        logger.info(f"Split sizes - Train: {len(train_data)}, Test: {len(test_data)}")
        return train_data, test_data

    except Exception as e:
        logger.error(f"Error splitting data: {str(e)}")
        raise

# Perform splits at subject level.
# Note: split_list is called without a seed argument and shuffles with the
# module-level `random` generator, so the outcome depends on that generator's
# state at the time this cell runs (i.e. on execution order).
# Note: any failure is only logged; the subject lists may then be undefined or stale.
try:
    # First split: 70/30
    train_subjects, test_subjects = split_list(unique_subjects, test_size=0.3)
    
    # Second split: Split test into 20/10
    # split_list returns (first part, remainder): the first ~67% of the held-out
    # subjects stay in the test set, the remainder becomes the validation set.
    test_subjects, valid_subjects = split_list(test_subjects, test_size=0.33)
    
    print("\nData split results:")
    print(f"Training subjects ({len(train_subjects)}): {sorted(train_subjects)}")
    print(f"Testing subjects ({len(test_subjects)}): {sorted(test_subjects)}")
    print(f"Validation subjects ({len(valid_subjects)}): {sorted(valid_subjects)}")
    
    # Shares are computed over subjects, not over rows.
    total = len(unique_subjects)
    print(f"\nSplit percentages based on user:")
    print(f"Train: {len(train_subjects)/total:.1%}")
    print(f"Test: {len(test_subjects)/total:.1%}")
    print(f"Valid: {len(valid_subjects)/total:.1%}")
    
except Exception as e:
    logger.error(f"Error in split process: {str(e)}")

print(f"\nTotal records in dataset: {len(new_df)}")

2024-11-13 04:44:54,406 - INFO - Found 62 unique subjects


array(['02bc69', '07285e', '082f01', '194d1d', '19ea47', '220a17',
       '231c3b', '242a3e', '24a59d', '251738', '2a39f8', '2c98f7',
       '2d57c2', '301ada', '312788', '31d269', '364459', '3b2403',
       '3b2b7a', '48fd62', '4b39ac', '4ba1d3', '4bb5d0', '4ca9b3',
       '4dc2f8', '4f13b4', '51574c', '516a67', '54ee6e', '59f492',
       '5c0b8a', '66341b', '69cc45', '6a3e93', '743f4e', '7688c1',
       '79011a', '7eb666', '7fcee9', '87174c', '8db7dd', '93f49f',
       '9f85da', 'a03db7', 'a80ae4', 'af82b2', 'b19f77', 'bc3908',
       'c7fee4', 'c85fdf', 'c8e721', 'c95ab0', 'd8836b', 'd9312a',
       'e39bc5', 'e8919c', 'e9fc55', 'eeaff0', 'f2c8aa', 'f62eec',
       'f686f0', 'fa8764'], dtype=object)

2024-11-13 04:44:54,422 - INFO - Split sizes - Train: 43, Test: 19
2024-11-13 04:44:54,422 - INFO - Split sizes - Train: 12, Test: 7



Data split results:
Training subjects (43): ['02bc69', '194d1d', '19ea47', '251738', '2a39f8', '2c98f7', '2d57c2', '301ada', '364459', '3b2b7a', '48fd62', '4b39ac', '4ba1d3', '4bb5d0', '4ca9b3', '4dc2f8', '4f13b4', '51574c', '54ee6e', '59f492', '5c0b8a', '66341b', '69cc45', '6a3e93', '7688c1', '79011a', '7fcee9', '87174c', '93f49f', '9f85da', 'a80ae4', 'af82b2', 'b19f77', 'c85fdf', 'c8e721', 'c95ab0', 'd8836b', 'e39bc5', 'e8919c', 'e9fc55', 'eeaff0', 'f686f0', 'fa8764']
Testing subjects (12): ['07285e', '220a17', '231c3b', '242a3e', '24a59d', '31d269', '3b2403', '7eb666', 'a03db7', 'bc3908', 'c7fee4', 'd9312a']
Validation subjects (7): ['082f01', '312788', '516a67', '743f4e', '8db7dd', 'f2c8aa', 'f62eec']

Split percentages:
Train: 69.4%
Test: 19.4%
Valid: 11.3%

Total records in dataset: 4128256


In [33]:
def create_data_splits(df: pd.DataFrame, train_subjects: list, valid_subjects: list, test_subjects: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Partition the rows of ``df`` by subject membership.

    A row goes to the train / validation / test frame when its ``Subject``
    value is contained in the corresponding subject list. The sizes are
    logged, and the three resulting frames are checked for shared subjects.

    Args:
        df: Input DataFrame with a ``Subject`` column
        train_subjects: List of subjects for training
        valid_subjects: List of subjects for validation
        test_subjects: List of subjects for testing

    Returns:
        tuple: (train_df, valid_df, test_df)

    Raises:
        AssertionError: If two of the resulting frames share a subject.
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        subject_column = df['Subject']
        df_train = df[subject_column.isin(train_subjects)]
        df_valid = df[subject_column.isin(valid_subjects)]
        df_test = df[subject_column.isin(test_subjects)]

        logger.info(f"\nDataset split sizes:")
        for split_label, part in (("Training", df_train), ("Validation", df_valid), ("Test", df_test)):
            logger.info(f"{split_label} set: {len(part):,} records")

        # Verify no overlap between the subjects that actually ended up in each frame.
        train_ids = set(df_train['Subject'])
        valid_ids = set(df_valid['Subject'])
        test_ids = set(df_test['Subject'])
        assert not (train_ids & valid_ids), "Train-Valid subject overlap found"
        assert not (train_ids & test_ids), "Train-Test subject overlap found"
        assert not (valid_ids & test_ids), "Valid-Test subject overlap found"

        return df_train, df_valid, df_test

    except Exception as e:
        logger.error(f"Error creating data splits: {str(e)}")
        raise

# Split the windowed rows by subject and report each split's share of all rows.
# Note: a failure is only logged, so df_train / df_valid / df_test may be
# undefined (or stale) for the cells that follow.
try:
   df_train, df_valid, df_test = create_data_splits(
       new_df, 
       train_subjects, 
       valid_subjects, 
       test_subjects
   )
   
   # Shares below are computed over rows, not over subjects.
   total_records = len(new_df)
   print(f"\nSplit proportions based on sensor data:")
   print(f"Train: {len(df_train)/total_records:.1%}")
   print(f"Valid: {len(df_valid)/total_records:.1%}")
   print(f"Test: {len(df_test)/total_records:.1%}")
   
except Exception as e:
   logger.error(f"Failed to create splits: {str(e)}")

2024-11-13 04:45:46,289 - INFO - 
Dataset split sizes:
2024-11-13 04:45:46,289 - INFO - Training set: 3,049,984 records
2024-11-13 04:45:46,289 - INFO - Validation set: 306,944 records
2024-11-13 04:45:46,289 - INFO - Test set: 771,328 records



Split proportions:
Train: 73.9%
Valid: 7.4%
Test: 18.7%


# Checkpoint 4 => We are storing this so that we won't be computing this in the future.

In [None]:
# Persist the three splits; save_df writes each one to data/csv/<name>.csv.
save_df(df_train, "df_train")
save_df(df_test, "df_test")
save_df(df_valid, "df_valid")

# Checkpoint 5 => Read already saved df_train, df_valid, df_test from folder so we won't need to recompute

In [None]:
# Reload the splits from the folder save_df() writes to ("data/csv");
# reading them from the working directory would not find the saved files.
df_train = pd.read_csv(os.path.join("data/csv", "df_train.csv"), low_memory=False)
df_test = pd.read_csv(os.path.join("data/csv", "df_test.csv"), low_memory=False)
df_valid = pd.read_csv(os.path.join("data/csv", "df_valid.csv"), low_memory=False)

# Copy our generated images into split-specific folders based on the df_train, df_test and df_valid datasets.
For AccV the images will be copied to AccV/train, AccV/valid, and AccV/test (the originals stay in place).

In [None]:
# Folder holding the generated images of each accelerometer axis.
SOURCE_DIRECTORIES = {
    "AccV": "gaf_images/AccV",
    "AccAP": "gaf_images/AccAP",
    "AccML": "gaf_images/AccML"
}

def _index_images_by_window(source_dir: str, folder_name: str) -> dict:
    """Map window number (as text) to the filenames in source_dir named '...<folder_name>_mean_<window>_...'."""
    marker = f"{folder_name}_mean_"
    images_by_window = {}
    for filename in os.listdir(source_dir):
        _, found, tail = filename.partition(marker)
        # The window number must be followed by '_' (then the class suffix).
        if not found or '_' not in tail:
            continue
        window_token = tail.split('_', 1)[0]
        images_by_window.setdefault(window_token, []).append(filename)
    return images_by_window

def move_to_specific_dir(df_local: pd.DataFrame, directories: dict, source_directories=None) -> None:
    """
    Copy GAF images to their respective directories based on acceleration type.

    For every unique ``Window_No`` in ``df_local`` and every axis, the files
    whose name contains ``<axis>_mean_<window>_`` are copied (not moved;
    the originals stay in place) from the axis' source directory into
    ``directories[axis]``.

    Each source directory is listed once and indexed by window number,
    instead of being re-listed for every window.

    Args:
        df_local: DataFrame containing window information (``Window_No``)
        directories: Dictionary mapping acceleration types to target directories
        source_directories: Dictionary mapping acceleration types to the
            directories holding the generated images; defaults to the
            module-level ``SOURCE_DIRECTORIES``.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    if source_directories is None:
        source_directories = SOURCE_DIRECTORIES

    try:
        # Create target directories
        for folder_name, path in directories.items():
            os.makedirs(path, exist_ok=True)
            logger.info(f"Created directory: {path}")

        # axis -> (source directory, {window text -> filenames})
        image_index = {}
        for folder_name, source_dir in source_directories.items():
            if not os.path.exists(source_dir):
                logger.warning(f"Source directory not found: {source_dir}")
                continue
            image_index[folder_name] = (source_dir, _index_images_by_window(source_dir, folder_name))

        total_copied = 0
        for cur_window_no in tqdm(df_local['Window_No'].unique(), desc="Processing windows"):
            window_token = str(cur_window_no)

            for folder_name, (source_dir, images_by_window) in image_index.items():
                for filename in images_by_window.get(window_token, ()):
                    source_file = os.path.join(source_dir, filename)
                    destination_file = os.path.join(directories[folder_name], filename)

                    shutil.copy2(source_file, destination_file)
                    total_copied += 1

        logger.info(f"Successfully copied {total_copied} files to their respective directories")

    except Exception as e:
        logger.error(f"Error during file movement: {str(e)}")
        raise

# Target folders per split, one entry per accelerometer axis.
# NOTE(review): these targets live under 'gaf_images/<axis>/<split>', whereas
# analyze_directories reads from 'data/<axis>/<split>' — confirm which root is intended.
train_directories = {
   "AccV": "gaf_images/AccV/train",
   "AccAP": "gaf_images/AccAP/train",
   "AccML": "gaf_images/AccML/train"
}

valid_directories = {
   "AccV": "gaf_images/AccV/valid",
   "AccAP": "gaf_images/AccAP/valid",
   "AccML": "gaf_images/AccML/valid"
}

test_directories = {
   "AccV": "gaf_images/AccV/test",
   "AccAP": "gaf_images/AccAP/test",
   "AccML": "gaf_images/AccML/test"
}

# Note: a failure in any split is only logged and stops the remaining splits.
try:
   # Move files to their respective directories
   for split_name, df_split, dirs in [
       ("train", df_train, train_directories),
       ("valid", df_valid, valid_directories),
       ("test", df_test, test_directories)
   ]:
       logger.info(f"\nProcessing {split_name} split...")
       move_to_specific_dir(df_split, dirs)
       
except Exception as e:
   logger.error(f"Failed to move files: {str(e)}")

In [34]:
def analyze_directories(base_path='data'):
    """
    Analyze distribution of images across train/test/valid splits and classes.

    For each axis folder (AccV, AccAP, AccML) under ``base_path`` the entries
    of the train / test / valid sub-folders are counted and the class of each
    entry is read from the number between the last underscore and the first
    following dot of its name (``..._<class>.jpg``). Entries whose class
    cannot be parsed, or is not 0 or 1, still count towards the split total
    but are reported with a warning. Results are printed; nothing is
    returned.

    Args:
        base_path: Folder that contains the axis folders; defaults to 'data'.
    """
    parent_dirs = ['AccV', 'AccAP', 'AccML']
    splits = ['train', 'test', 'valid']

    for parent in parent_dirs:
        print(f"\n=== Analysis for {parent} ===")

        total_images = 0
        split_counts = {}
        class_counts = {split: {0: 0, 1: 0} for split in splits}

        for split in splits:
            path = os.path.join(base_path, parent, split)
            if not os.path.exists(path):
                print(f"Warning: Path does not exist: {path}")
                continue

            # Count images in this split
            files = os.listdir(path)
            split_counts[split] = len(files)
            total_images += len(files)

            # Count classes
            for file in files:
                # Extract class from filename (last digit before .jpg)
                try:
                    class_label = int(file.split('_')[-1].split('.')[0])
                except ValueError:
                    print(f"Warning: Could not parse class from filename: {file}")
                    continue
                if class_label not in class_counts[split]:
                    # Numeric suffix that is not one of the known classes.
                    print(f"Warning: Could not parse class from filename: {file}")
                    continue
                class_counts[split][class_label] += 1

        # Print split proportions
        print("\nSplit Proportions:")
        for split, count in split_counts.items():
            proportion = (count / total_images * 100) if total_images > 0 else 0
            print(f"{split}: {count} images ({proportion:.1f}%)")

        # Print class distributions
        print("\nClass Distributions:")
        for split in splits:
            total_split = sum(class_counts[split].values())
            print(f"\n{split}:")
            for class_label, count in class_counts[split].items():
                proportion = (count / total_split * 100) if total_split > 0 else 0
                print(f"Class {class_label}: {count} images ({proportion:.1f}%)")

# Run analysis
try:
    analyze_directories()
except Exception as e:
    print(f"Error during analysis: {str(e)}")


=== Analysis for AccV ===

Split Proportions:
train: 8288 images (48.9%)
test: 6255 images (36.9%)
valid: 2410 images (14.2%)

Class Distributions:

train:
Class 0: 5106 images (61.6%)
Class 1: 3182 images (38.4%)

test:
Class 0: 2052 images (32.8%)
Class 1: 4203 images (67.2%)

valid:
Class 0: 1700 images (70.5%)
Class 1: 710 images (29.5%)

=== Analysis for AccAP ===

Split Proportions:
train: 8288 images (48.9%)
test: 6255 images (36.9%)
valid: 2410 images (14.2%)

Class Distributions:

train:
Class 0: 5106 images (61.6%)
Class 1: 3182 images (38.4%)

test:
Class 0: 2052 images (32.8%)
Class 1: 4203 images (67.2%)

valid:
Class 0: 1700 images (70.5%)
Class 1: 710 images (29.5%)

=== Analysis for AccML ===

Split Proportions:
train: 8288 images (48.9%)
test: 6255 images (36.9%)
valid: 2410 images (14.2%)

Class Distributions:

train:
Class 0: 5106 images (61.6%)
Class 1: 3182 images (38.4%)

test:
Class 0: 2052 images (32.8%)
Class 1: 4203 images (67.2%)

valid:
Class 0: 1700 images

# AFTER THIS IS DONE.... WE CAN MOVE TO TRAINING OUR MULTI CHANNEL CNN MODEL