In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa
from datetime import timedelta
import subprocess

# Convert raw e-obs acceleration bursts (one row per burst, samples stored as a
# space-separated "x y z x y z ..." string) into one long-format Parquet file
# per animal with one row per (X, Y, Z) sample.

# Column holding the raw burst string after the rename step below.
RAW_ACC_COLUMN = 'eobs_accelerations_raw_str'
# Only the first 120 values of a burst are kept (120 // 3 = 40 xyz samples).
MAX_BURST_VALUES = 120
# Seconds between consecutive samples inside one burst.
SAMPLE_INTERVAL_S = 0.05


def drop_empty_bursts(frame, column=RAW_ACC_COLUMN):
    """Return the rows of *frame* whose burst string is neither missing nor ''.

    ``pd.read_csv`` loads empty cells as NaN, and ``NaN != ''`` is True, so a
    plain ``!= ''`` comparison keeps the empty bursts; both cases are removed
    here.
    """
    raw = frame[column]
    return frame[raw.notna() & (raw != '')]


def first_burst_per_minute(tag_rows):
    """Keep only the earliest burst of every calendar minute.

    Used to discard high-resolution data (more than one burst per minute).
    Rows without a timestamp are dropped. The sort is stable, so rows sharing
    an identical timestamp keep their input order, and a whole row is selected
    (timestamp and burst string always come from the same record).

    Args:
        tag_rows: DataFrame with a datetime ``timestamp`` column.

    Returns:
        DataFrame sorted by timestamp with a fresh 0..n-1 index.
    """
    ordered = tag_rows.dropna(subset=['timestamp']).sort_values('timestamp', kind='stable')
    minute = ordered['timestamp'].dt.floor('min')
    keep = ~minute.duplicated().to_numpy()
    return ordered[keep].reset_index(drop=True)


def bursts_to_long(bursts, tag, max_values=MAX_BURST_VALUES,
                   sample_interval_s=SAMPLE_INTERVAL_S):
    """Expand burst strings into one row per (X, Y, Z) sample.

    Args:
        bursts: DataFrame with ``timestamp`` and ``eobs_accelerations_raw_str``
            columns, one row per burst.
        tag: Tag identifier written to the ``tag`` column of every row.
        max_values: Maximum number of raw values used per burst.
        sample_interval_s: Seconds between consecutive samples of a burst.

    Returns:
        DataFrame with columns ``burst_timestamp, tag, X, Y, Z, index,
        timestamp`` sorted by those keys, or ``None`` when no complete xyz
        triplet is available. ``index`` is the 1-based sample number and
        ``timestamp = burst_timestamp + (index - 1) * sample_interval_s``.
        Bursts shorter than the longest one are padded with NaN samples.
    """
    if bursts.empty:
        return None

    # One column per raw value; unparsable or missing entries become NaN.
    wide = bursts[RAW_ACC_COLUMN].str.split(' ', expand=True)
    wide = wide.iloc[:, :max_values]
    wide = wide.apply(pd.to_numeric, errors='coerce')

    # Only complete x/y/z triplets are usable; trailing leftovers are dropped
    # instead of breaking the column naming.
    n_samples = wide.shape[1] // 3
    if n_samples == 0:
        return None
    wide = wide.iloc[:, :n_samples * 3]
    n_bursts = len(wide)

    # NOTE: `.values` yields timezone-naive datetimes (UTC wall time for
    # tz-aware input), which is what the output files have always contained.
    burst_ts = bursts['timestamp'].values

    # Column-major flattening: sample 1 of every burst, then sample 2, ...
    # This lines up with the tiled timestamps and repeated sample numbers.
    long_df = pd.DataFrame({
        'burst_timestamp': np.tile(burst_ts, n_samples),
        'tag': tag,
        'X': wide.iloc[:, 0::3].to_numpy().ravel(order='F'),
        'Y': wide.iloc[:, 1::3].to_numpy().ravel(order='F'),
        'Z': wide.iloc[:, 2::3].to_numpy().ravel(order='F'),
        'index': np.repeat(np.arange(1, n_samples + 1, dtype=np.int64), n_bursts),
    })

    # Calculate timestamp: burst_timestamp + (index - 1) * sample interval
    long_df['timestamp'] = long_df['burst_timestamp'] + \
        pd.to_timedelta((long_df['index'] - 1) * sample_interval_s, unit='s')

    return long_df.sort_values(['timestamp', 'burst_timestamp', 'index', 'X', 'Y', 'Z'])


# The pipeline itself only runs when this file is executed (notebook cell or
# script), not when the helpers above are imported.
if __name__ == "__main__":
    # Define parameters for data retrieval
    study_id = 3445611111

    # Read acceleration data using pandas
    acc_data = pd.read_csv("/mnt/EAS_shared/baboon/working/data/raw/2025/acc/Baboons MBRP Mpala Kenya_acc_20240701_20251110.csv")

    # Replace hyphens with underscores in column names
    acc_data.columns = acc_data.columns.str.replace('-', '_')

    # Change working directory (relative paths below resolve against it)
    os.chdir("/mnt/EAS_shared/baboon/working/data/processed/2025/acc")

    # metadata = pd.read_csv('movebank_metadata.csv')
    # acc_data = acc_data.merge(metadata[['individual_local_identifier', 'group_id', 'sex']],
    #                           on='individual_local_identifier', how='left')

    # Clean and process the data - rename columns for clarity
    acc_data = acc_data.rename(columns={
        'individual_local_identifier': 'animal',
        'tag_local_identifier': 'tag',
        'eobs:accelerations_raw': 'eobs_accelerations_raw_str'
    })

    # Select necessary columns (as an independent copy, so the assignment
    # below does not write into a slice) and parse timestamps as UTC
    acc_data = acc_data[['animal', 'tag', 'timestamp', 'eobs_accelerations_raw_str']].copy()
    acc_data['timestamp'] = pd.to_datetime(acc_data['timestamp'], utc=True)

    # Filter out rows with empty acceleration data (NaN or '')
    acc_data = drop_empty_bursts(acc_data)

    # Create output directory
    output_dir = "acc_v1"
    Path(output_dir).mkdir(exist_ok=True)

    # Read calibration data upfront
    acc_calib = pd.read_csv('acc_calib.csv')
    acc_calib['tag'] = acc_calib['tag'].astype(str)
    for calib_col in ['x0', 'y0', 'z0', 'Sx', 'Sy', 'Sz']:
        acc_calib[calib_col] = pd.to_numeric(acc_calib[calib_col])

    # Loop over each unique animal with progress bar
    unique_animals = acc_data['animal'].unique()

    for current_animal in tqdm(unique_animals, desc="Processing animals"):
        # Filter data for the current animal (may have multiple tags)
        animal_acc_data = acc_data[acc_data['animal'] == current_animal]

        # Get all tags for this animal
        animal_tags = animal_acc_data['tag'].unique()
        print(f"Processing animal: {current_animal} with tags: {animal_tags}")

        # Store data from all tags for this animal
        all_tag_data = []

        # Process each tag separately
        for current_tag in animal_tags:
            tag_rows = animal_acc_data[animal_acc_data['tag'] == current_tag]
            tag_long_data = bursts_to_long(first_burst_per_minute(tag_rows), current_tag)

            # Tags without any complete xyz sample contribute nothing
            if tag_long_data is None:
                continue

            # NOTE: calibration is intentionally not applied (raw values are
            # stored). The disabled formula, per tag row of acc_calib, is:
            #   X = (X - x0) * Sx *  9.81
            #   Y = (Y - y0) * Sy * -9.81
            #   Z = (Z - z0) * Sz *  9.81

            all_tag_data.append(tag_long_data)

        # Skip if no valid data for any tag
        if not all_tag_data:
            continue

        # Combine data from all tags for this animal
        animal_combined_data = pd.concat(all_tag_data, ignore_index=True)

        # Sort the combined data by timestamp; a stable sort keeps the
        # per-tag ordering for identical timestamps, so output is reproducible
        animal_combined_data = animal_combined_data.sort_values(['timestamp'], kind='stable')

        # Save the processed data for the current animal to a Parquet file
        output_filename = os.path.join(output_dir, f"{current_animal}.parquet")
        animal_combined_data.to_parquet(output_filename, index=False)

    print("Processing complete!")


Processing animals:   0%|          | 0/198 [00:00<?, ?it/s]

Processing animal: 24AE08_4T5U with tags: [10335]


Processing animals:   1%|          | 1/198 [00:15<51:55, 15.81s/it]

Processing animal: 24AC18_9L0M with tags: [10338]


Processing animals:   1%|          | 2/198 [01:57<3:36:33, 66.29s/it]

Processing animal: 24AA16_9Q8P with tags: [10337]


Processing animals:   2%|▏         | 3/198 [03:48<4:42:36, 86.96s/it]

Processing animal: 24AA05_4I0L with tags: [10370]


Processing animals:   2%|▏         | 4/198 [04:14<3:22:20, 62.58s/it]

Processing animal: 24AA01_5O8B with tags: [10346]


Processing animals:   3%|▎         | 5/198 [05:58<4:09:49, 77.66s/it]

Processing animal: 24AB04_0V2Z with tags: [10362]


Processing animals:   3%|▎         | 6/198 [07:30<4:23:48, 82.44s/it]

Processing animal: 24AA14_4N0F with tags: [10371]


Processing animals:   4%|▎         | 7/198 [09:15<4:46:19, 89.95s/it]

Processing animal: 24AA03_2A1P with tags: [10340]


Processing animals:   4%|▍         | 8/198 [10:57<4:56:25, 93.61s/it]

Processing animal: 24AA06_5I8Y with tags: [10358]


Processing animals:   5%|▍         | 9/198 [12:28<4:52:40, 92.91s/it]

Processing animal: 24AA12_6P8Q with tags: [10372]


Processing animals:   5%|▌         | 10/198 [13:55<4:45:19, 91.06s/it]

Processing animal: 24AD08_0F1G with tags: [10339]


Processing animals:   6%|▌         | 11/198 [15:21<4:38:56, 89.50s/it]

Processing animal: 24AC14_1D2E with tags: [10363]


Processing animals:   6%|▌         | 12/198 [17:02<4:48:13, 92.98s/it]

Processing animal: 24AC19_2N30 with tags: [10345]


Processing animals:   7%|▋         | 13/198 [17:44<3:59:15, 77.59s/it]

Processing animal: 24AB03_4D7N with tags: [10373]


Processing animals:   7%|▋         | 14/198 [19:32<4:26:17, 86.84s/it]

Processing animal: 24AA10_4R7W with tags: [10341]


Processing animals:   8%|▊         | 15/198 [21:14<4:38:28, 91.30s/it]

Processing animal: 24AA11_9A7D with tags: [10336]


Processing animals:   8%|▊         | 16/198 [22:35<4:27:47, 88.28s/it]

Processing animal: 24AB06_3I2H with tags: [10360]


Processing animals:   9%|▊         | 17/198 [24:25<4:46:10, 94.87s/it]

Processing animal: 24AB02_0Y5R with tags: [10374]


Processing animals:   9%|▉         | 18/198 [25:51<4:36:06, 92.03s/it]

Processing animal: 24AB07_5J8U with tags: [10359]


Processing animals:  10%|▉         | 19/198 [27:37<4:47:07, 96.25s/it]

Processing animal: 24AC04_7G8H with tags: [14560]


Processing animals:  10%|█         | 20/198 [29:21<4:52:31, 98.61s/it]

Processing animal: 24AC02_3C4D with tags: [14544]


Processing animals:  11%|█         | 21/198 [31:00<4:51:32, 98.83s/it]

Processing animal: 24AD01_6R76 with tags: [10366]


Processing animals:  11%|█         | 22/198 [32:29<4:41:13, 95.87s/it]

Processing animal: 24AC03_5E6F with tags: [14540]


Processing animals:  12%|█▏        | 23/198 [34:15<4:47:58, 98.73s/it]

Processing animal: 24AC06_2K3L with tags: [14546]


Processing animals:  12%|█▏        | 24/198 [36:02<4:53:35, 101.24s/it]

Processing animal: 24AC11_5W6X with tags: [10342]
