In [2]:
# ---- Auto-install required packages if missing ----
import subprocess
import sys

required = ['pyreadr', 'numpy', 'pandas', 'tqdm', 'pyarrow']

for package in required:
    try:
        __import__(package)
    except ImportError:
        print(f"Package '{package}' not found. Installing...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        
import pyreadr
import numpy as np
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import as_completed
import ast

import os
import pyarrow.parquet as pq

# ---- Helper function: dy_acc ----
def dy_acc(vect, win_size=7):
    """
    Calculate the dynamic acceleration (dy_acc) for a given vector.
    """
    if vect is None or len(vect) == 0:
        raise ValueError("Input vector is empty or invalid.")
    
    pad_size = int(win_size / 2 - 0.5)
    padded = np.pad(vect, (pad_size, pad_size), constant_values=np.nan)
    acc_vec = np.empty(len(vect))
    acc_vec[:] = np.nan

    for i in range(len(vect)):
        window = padded[i : i + (2 * pad_size + 1)]
        m_ave = np.nanmean(window)
        acc_vec[i] = vect[i] - m_ave
    
    return acc_vec

# ---- Vector Calculation Function ----
def process_row(row):
    """
    Process a single row to calculate dynamic acceleration components and derived metrics.
    """
    x_component = np.abs(dy_acc(row['x_cal_array']))
    y_component = np.abs(dy_acc(row['y_cal_array']))
    z_component = np.abs(dy_acc(row['z_cal_array']))

    vectorial_sum = np.sqrt(x_component**2 + y_component**2 + z_component**2)
    ave_vedba_value = np.nanmean(vectorial_sum)

    pitch = np.arctan2(x_component, np.sqrt(y_component**2 + z_component**2))
    ave_pitch = np.nanmean(pitch)

    return ave_vedba_value, ave_pitch


In [8]:
# ---- Main Processing ----
# Load the data
os.chdir('/mnt/EAS_ind/rharel/analysis/JK_ch1/')
acc_data_trim = pd.read_parquet('data/acc_v1_char_sep2025_leopard.parquet')
acc_data_trim.head()




Unnamed: 0,event.id,visible,timestamp,data.decoding.software,eobs.acceleration.axes,eobs.acceleration.sampling.frequency.per.axis,eobs.accelerations.raw,eobs.key.bin.checksum,eobs.start.timestamp,import.marked.outlier,...,individual.taxon.canonical.name,tag.local.identifier,individual.local.identifier,study.name,local_timestamp,tag_local_identifier,group_id,x_cal,y_cal,z_cal
0,42446680000.0,True,2025-09-05 00:00:00+00:00,21,XYZ,20.0,2319 2360 1744 2318 2360 1744 2318 2360 1744 2...,1993922000.0,2025-09-05 00:00:00.000,False,...,Papio anubis,10362,24AB04_0V2Z,Baboons MBRP Mpala Kenya,2025-09-05 03:00:00+03:00,10362,Periwinkle,"{'x1': 2319.0, 'x2': 2318.0, 'x3': 2318.0, 'x4...","{'y1': 2360.0, 'y2': 2360.0, 'y3': 2360.0, 'y4...","{'z1': 1744.0, 'z2': 1744.0, 'z3': 1744.0, 'z4..."
1,42446680000.0,True,2025-09-05 00:01:00+00:00,21,XYZ,20.0,2530 2123 1899 2532 2124 1900 2532 2122 1898 2...,3087902000.0,2025-09-05 00:01:00.000,False,...,Papio anubis,10362,24AB04_0V2Z,Baboons MBRP Mpala Kenya,2025-09-05 03:01:00+03:00,10362,Periwinkle,"{'x1': 2530.0, 'x2': 2532.0, 'x3': 2532.0, 'x4...","{'y1': 2123.0, 'y2': 2124.0, 'y3': 2122.0, 'y4...","{'z1': 1899.0, 'z2': 1900.0, 'z3': 1898.0, 'z4..."
2,42446680000.0,True,2025-09-05 00:02:00+00:00,21,XYZ,20.0,2524 2179 1897 2528 2162 1900 2529 2145 1903 2...,2130966000.0,2025-09-05 00:02:00.000,False,...,Papio anubis,10362,24AB04_0V2Z,Baboons MBRP Mpala Kenya,2025-09-05 03:02:00+03:00,10362,Periwinkle,"{'x1': 2524.0, 'x2': 2528.0, 'x3': 2529.0, 'x4...","{'y1': 2179.0, 'y2': 2162.0, 'y3': 2145.0, 'y4...","{'z1': 1897.0, 'z2': 1900.0, 'z3': 1903.0, 'z4..."
3,42446680000.0,True,2025-09-05 00:03:00+00:00,21,XYZ,20.0,2298 2411 1787 2300 2411 1788 2299 2411 1785 2...,1468880000.0,2025-09-05 00:03:00.000,False,...,Papio anubis,10362,24AB04_0V2Z,Baboons MBRP Mpala Kenya,2025-09-05 03:03:00+03:00,10362,Periwinkle,"{'x1': 2298.0, 'x2': 2300.0, 'x3': 2299.0, 'x4...","{'y1': 2411.0, 'y2': 2411.0, 'y3': 2411.0, 'y4...","{'z1': 1787.0, 'z2': 1788.0, 'z3': 1785.0, 'z4..."
4,42446680000.0,True,2025-09-05 00:04:00+00:00,21,XYZ,20.0,2286 2416 1782 2288 2415 1783 2288 2415 1782 2...,1544969000.0,2025-09-05 00:04:00.000,False,...,Papio anubis,10362,24AB04_0V2Z,Baboons MBRP Mpala Kenya,2025-09-05 03:04:00+03:00,10362,Periwinkle,"{'x1': 2286.0, 'x2': 2288.0, 'x3': 2288.0, 'x4...","{'y1': 2416.0, 'y2': 2415.0, 'y3': 2415.0, 'y4...","{'z1': 1782.0, 'z2': 1783.0, 'z3': 1782.0, 'z4..."


In [11]:
# ---- Main Processing ----
# Load the data
acc_data_trim = pd.read_parquet('data/acc_v0/24AA11_9A7D.parquet')
acc_data_trim.head()

Unnamed: 0,timestamp,tag,index,X,Y,Z
0,2024-07-01 00:10:00+00:00,24AA11_9A7D,1,2.027727,-9.786848,1.172884
1,2024-07-01 00:10:00+00:00,24AA11_9A7D,2,1.970339,-9.979124,0.96138
2,2024-07-01 00:10:00+00:00,24AA11_9A7D,3,1.587749,-10.421359,0.40378
3,2024-07-01 00:10:00+00:00,24AA11_9A7D,4,1.128641,-6.845026,-0.269186
4,2024-07-01 00:10:00+00:00,24AA11_9A7D,5,2.429447,-9.806076,1.019063


In [13]:
acc_data_trim.columns


Index(['event.id', 'visible', 'timestamp', 'data.decoding.software',
       'eobs.acceleration.axes',
       'eobs.acceleration.sampling.frequency.per.axis',
       'eobs.accelerations.raw', 'eobs.key.bin.checksum',
       'eobs.start.timestamp', 'import.marked.outlier', 'sensor.type',
       'individual.taxon.canonical.name', 'tag.local.identifier',
       'individual.local.identifier', 'study.name', 'local_timestamp',
       'tag_local_identifier', 'group_id', 'x_cal', 'y_cal', 'z_cal'],
      dtype='object')

In [10]:
# ---- Long Format Processing (Optimized) ----

# Create grouping column based on tag and timestamp
acc_data_trim['group_id'] = acc_data_trim['tag'] + '_' + acc_data_trim['timestamp'].astype(str)

print(f"Total number of groups (unique tag-timestamp combinations): {acc_data_trim['group_id'].nunique()}")
print(f"Total number of rows: {len(acc_data_trim)}")

# Function to process each group (burst of accelerometer data)
def process_group(group_data):
    """
    Process a group (burst) of accelerometer data to calculate VEDBA metrics.
    Each group represents all accelerometer readings for a specific tag-timestamp combination.
    """
    # Extract X, Y, Z arrays for this group and convert to float
    x_values = group_data['X'].values.astype(float)
    y_values = group_data['Y'].values.astype(float)
    z_values = group_data['Z'].values.astype(float)
    
    # Calculate dynamic acceleration components
    x_component = np.abs(dy_acc(x_values))
    y_component = np.abs(dy_acc(y_values))
    z_component = np.abs(dy_acc(z_values))
    
    # Calculate vectorial sum (VEDBA for each sample)
    vectorial_sum = np.sqrt(x_component**2 + y_component**2 + z_component**2)
    ave_vedba_value = np.nanmean(vectorial_sum)
    
    # Calculate pitch
    pitch = np.arctan2(x_component, np.sqrt(y_component**2 + z_component**2))
    ave_pitch = np.nanmean(pitch)
    
    # Return summary metrics for this group
    return pd.Series({
        'tag': group_data['tag'].iloc[0],
        'timestamp': group_data['timestamp'].iloc[0], 
        'ave_vedba': ave_vedba_value,
        'ave_pitch': ave_pitch,
        'n_samples': len(group_data)
    })

# Process groups with minimal output to avoid freezing
print("Processing groups... (this will run silently to avoid output overflow)")

# Process all groups at once (no chunking for cleaner output)
results_df = acc_data_trim.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)

# Log-transform like in R
results_df['log_vedba'] = np.log(results_df['ave_vedba'])

# Add individual identifier columns
results_df['individual_local_identifier'] = results_df['tag']
results_df['tag_local_identifier'] = results_df['tag']
results_df['local_timestamp'] = results_df['timestamp']

# Select relevant columns
filtered_data = results_df[['individual_local_identifier', 
                           'local_timestamp', 
                           'tag_local_identifier', 
                           'log_vedba', 
                           'ave_pitch',
                           'n_samples']]


KeyError: 'tag'

In [11]:
print(f"‚úì Successfully processed {len(filtered_data)} groups")
print(f"Sample size per group stats:")
print(f"  Mean: {results_df['n_samples'].mean():.1f}")
print(f"  Min: {results_df['n_samples'].min()}")
print(f"  Max: {results_df['n_samples'].max()}")

# Save the processed data
filtered_data.to_parquet('data/acc_vedba_long_format.parquet', index=False)
print("‚úì Data saved to 'data/acc_vedba_long_format.parquet'")

# Show just first few rows
print("\nFirst 3 rows of results:")
print(filtered_data.head(3))

‚úì Successfully processed 466470 groups
Sample size per group stats:
  Mean: 40.0
  Min: 40
  Max: 40
‚úì Data saved to 'data/acc_vedba_long_format.parquet'

First 3 rows of results:
  individual_local_identifier           local_timestamp tag_local_identifier  \
0                 24AA11_9A7D 2024-07-01 00:10:00+00:00          24AA11_9A7D   
1                 24AA11_9A7D 2024-07-01 00:11:00+00:00          24AA11_9A7D   
2                 24AA11_9A7D 2024-07-01 00:32:00+00:00          24AA11_9A7D   

   log_vedba  ave_pitch  n_samples  
0   2.718889   0.672855         40  
1   0.094771   0.631980         40  
2   0.050509   0.477942         40  


In [12]:
# ---- Process ALL Files and Combine ----

import glob
import os

# Get all parquet files in the acc_v0 directory
data_folder = 'data/acc_v0'
parquet_files = glob.glob(os.path.join(data_folder, '*.parquet'))

print(f"Found {len(parquet_files)} parquet files to process:")
for file in parquet_files:
    print(f"  - {os.path.basename(file)}")

# Function to process a single file
def process_single_file(file_path):
    """
    Process a single parquet file and return VEDBA results
    """
    print(f"Processing: {os.path.basename(file_path)}")
    
    # Load the data
    acc_data = pd.read_parquet(file_path)
    
    # Create grouping column based on tag and timestamp
    acc_data['group_id'] = acc_data['tag'] + '_' + acc_data['timestamp'].astype(str)
    
    # Process all groups
    results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)
    
    # Log-transform
    results_df['log_vedba'] = np.log(results_df['ave_vedba'])
    
    # Add individual identifier columns
    results_df['individual_local_identifier'] = results_df['tag']
    results_df['tag_local_identifier'] = results_df['tag']
    results_df['local_timestamp'] = results_df['timestamp']
    
    # Select relevant columns
    filtered_data = results_df[['individual_local_identifier', 
                               'local_timestamp', 
                               'tag_local_identifier', 
                               'log_vedba', 
                               'ave_pitch',
                               'n_samples']]
    
    print(f"  ‚úì Processed {len(filtered_data)} groups from {os.path.basename(file_path)}")
    return filtered_data

# Process all files and combine
print("\n" + "="*50)
print("PROCESSING ALL FILES")
print("="*50)

all_results = []
total_groups = 0

for file_path in parquet_files:
    try:
        file_results = process_single_file(file_path)
        all_results.append(file_results)
        total_groups += len(file_results)
    except Exception as e:
        print(f"  ‚ùå Error processing {os.path.basename(file_path)}: {str(e)}")

# Combine all results into one large dataframe
if all_results:
    combined_data = pd.concat(all_results, ignore_index=True)
    
    print("\n" + "="*50)
    print("SUMMARY")
    print("="*50)
    print(f"‚úì Successfully processed {len(parquet_files)} files")
    print(f"‚úì Total groups processed: {total_groups}")
    print(f"‚úì Combined dataset shape: {combined_data.shape}")
    print(f"‚úì Unique individuals: {combined_data['individual_local_identifier'].nunique()}")
    print(f"‚úì Date range: {combined_data['local_timestamp'].min()} to {combined_data['local_timestamp'].max()}")
    
    # Save the combined data
    output_file = 'data/all_acc_vedba_combined.parquet'
    combined_data.to_parquet(output_file, index=False)
    print(f"‚úì Combined data saved to: {output_file}")
    
    # Show summary statistics
    print(f"\nSample size per group stats:")
    print(f"  Mean: {combined_data['n_samples'].mean():.1f}")
    print(f"  Min: {combined_data['n_samples'].min()}")
    print(f"  Max: {combined_data['n_samples'].max()}")
    
    print(f"\nVEDBA statistics:")
    print(f"  Mean log_vedba: {combined_data['log_vedba'].mean():.3f}")
    print(f"  Std log_vedba: {combined_data['log_vedba'].std():.3f}")
    
    # Show first few rows
    print(f"\nFirst 5 rows of combined data:")
    print(combined_data.head())
    
else:
    print("‚ùå No files were successfully processed!")

Found 19 parquet files to process:
  - 24AA01_5O8B.parquet
  - 24AA03_2A1P.parquet
  - 24AA05_4I0L.parquet
  - 24AA06_5I8Y.parquet
  - 24AA10_4R7W.parquet
  - 24AA11_9A7D.parquet
  - 24AA12_6P8Q.parquet
  - 24AA14_4N0F.parquet
  - 24AA16_9Q8P.parquet
  - 24AB02_0Y5R.parquet
  - 24AB03_4D7N.parquet
  - 24AB04_0V2Z.parquet
  - 24AB06_3I2H.parquet
  - 24AB07_5J8U.parquet
  - 24AC14_1D2E.parquet
  - 24AC18_9L0M.parquet
  - 24AC19_2N30.parquet
  - 24AD08_0F1G.parquet
  - 24AE08_4T5U.parquet

PROCESSING ALL FILES
Processing: 24AA01_5O8B.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 443232 groups from 24AA01_5O8B.parquet
Processing: 24AA03_2A1P.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 435904 groups from 24AA03_2A1P.parquet
Processing: 24AA05_4I0L.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 161671 groups from 24AA05_4I0L.parquet
Processing: 24AA06_5I8Y.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 463650 groups from 24AA06_5I8Y.parquet
Processing: 24AA10_4R7W.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 469463 groups from 24AA10_4R7W.parquet
Processing: 24AA11_9A7D.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 466470 groups from 24AA11_9A7D.parquet
Processing: 24AA12_6P8Q.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 464010 groups from 24AA12_6P8Q.parquet
Processing: 24AA14_4N0F.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 459197 groups from 24AA14_4N0F.parquet
Processing: 24AA16_9Q8P.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 473707 groups from 24AA16_9Q8P.parquet
Processing: 24AB02_0Y5R.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 445905 groups from 24AB02_0Y5R.parquet
Processing: 24AB03_4D7N.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 483582 groups from 24AB03_4D7N.parquet
Processing: 24AB04_0V2Z.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 393064 groups from 24AB04_0V2Z.parquet
Processing: 24AB06_3I2H.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 475211 groups from 24AB06_3I2H.parquet
Processing: 24AB07_5J8U.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 464678 groups from 24AB07_5J8U.parquet
Processing: 24AC14_1D2E.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 438257 groups from 24AC14_1D2E.parquet
Processing: 24AC18_9L0M.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 444351 groups from 24AC18_9L0M.parquet
Processing: 24AC19_2N30.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 279214 groups from 24AC19_2N30.parquet
Processing: 24AD08_0F1G.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 350438 groups from 24AD08_0F1G.parquet
Processing: 24AE08_4T5U.parquet


  results_df = acc_data.groupby('group_id', group_keys=False).apply(process_group).reset_index(drop=True)


  ‚úì Processed 85873 groups from 24AE08_4T5U.parquet

SUMMARY
‚úì Successfully processed 19 files
‚úì Total groups processed: 7697877
‚úì Combined dataset shape: (7697877, 6)

SUMMARY
‚úì Successfully processed 19 files
‚úì Total groups processed: 7697877
‚úì Combined dataset shape: (7697877, 6)
‚úì Unique individuals: 19
‚úì Date range: 2024-07-01 00:00:00+00:00 to 2025-06-02 00:00:00+00:00
‚úì Unique individuals: 19
‚úì Date range: 2024-07-01 00:00:00+00:00 to 2025-06-02 00:00:00+00:00
‚úì Combined data saved to: data/all_acc_vedba_combined.parquet

Sample size per group stats:
  Mean: 40.0
  Min: 40
  Max: 40

VEDBA statistics:
  Mean log_vedba: 2.096
‚úì Combined data saved to: data/all_acc_vedba_combined.parquet

Sample size per group stats:
  Mean: 40.0
  Min: 40
  Max: 40

VEDBA statistics:
  Mean log_vedba: 2.096
  Std log_vedba: 1.638

First 5 rows of combined data:
  individual_local_identifier           local_timestamp tag_local_identifier  \
0                 24AA01_5O8B 2

In [13]:
# ---- SIMPLIFIED: Process ALL Files - VEDBA Only ----

import glob
import os

# Simplified function to process each group (VEDBA only)
def process_group_simple(group_data):
    """
    Simplified processing: only calculate VEDBA metrics
    """
    # Convert to float and calculate dynamic acceleration
    x_values = group_data['X'].values.astype(float)
    y_values = group_data['Y'].values.astype(float)
    z_values = group_data['Z'].values.astype(float)
    
    # Calculate VEDBA components
    x_component = np.abs(dy_acc(x_values))
    y_component = np.abs(dy_acc(y_values))
    z_component = np.abs(dy_acc(z_values))
    
    # VEDBA calculation
    vectorial_sum = np.sqrt(x_component**2 + y_component**2 + z_component**2)
    ave_vedba = np.nanmean(vectorial_sum)
    
    return pd.Series({
        'tag': group_data['tag'].iloc[0],
        'timestamp': group_data['timestamp'].iloc[0], 
        'ave_vedba': ave_vedba,
        'n_samples': len(group_data)
    })

# Process all files
data_folder = 'data/acc_v0'
parquet_files = glob.glob(os.path.join(data_folder, '*.parquet'))

print(f"Processing {len(parquet_files)} files (VEDBA only)...")

all_results = []
for i, file_path in enumerate(parquet_files, 1):
    print(f"[{i}/{len(parquet_files)}] {os.path.basename(file_path)}")
    
    # Load and process
    acc_data = pd.read_parquet(file_path)
    acc_data['group_id'] = acc_data['tag'] + '_' + acc_data['timestamp'].astype(str)
    
    # Process groups
    results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple).reset_index(drop=True)
    all_results.append(results)

# Combine and finalize
combined_data = pd.concat(all_results, ignore_index=True)

# Add log transformation
combined_data['log_vedba'] = np.log(combined_data['ave_vedba'])

# Final dataset with only essential columns
final_data = combined_data[['tag', 'timestamp', 'ave_vedba', 'log_vedba', 'n_samples']]

print(f"\n‚úì Processed {len(final_data)} total groups from {len(parquet_files)} files")
print(f"‚úì Dataset shape: {final_data.shape}")
print(f"‚úì Unique tags: {final_data['tag'].nunique()}")

# Save simplified result
final_data.to_parquet('data/vedba_simplified_all.parquet', index=False)
print(f"‚úì Saved to: data/vedba_simplified_all.parquet")

print(f"\nFirst 5 rows:")
print(final_data.head())

Processing 96 files (VEDBA only)...
[1/96] 24AC20_4M5N.parquet


KeyboardInterrupt: 

In [16]:
# ---- Create Separate VEDBA Files by Animal ----

import os

# Create the vedba directory if it doesn't exist
vedba_dir = 'data/vedba'
os.makedirs(vedba_dir, exist_ok=True)

# Load the combined VEDBA data (assuming it exists from previous cell)
if 'final_data' in locals():
    vedba_data = final_data.copy()
else:
    # If final_data doesn't exist, load from saved file
    vedba_data = pd.read_parquet('data/vedba_simplified_all.parquet')

print(f"Creating separate files for each animal...")
print(f"Total data shape: {vedba_data.shape}")
print(f"Unique animals: {vedba_data['tag'].nunique()}")

# Get unique tags (animal IDs)
unique_tags = vedba_data['tag'].unique()

# Create separate file for each animal
for i, tag in enumerate(unique_tags, 1):
    # Filter data for this animal
    animal_data = vedba_data[vedba_data['tag'] == tag].copy()
    
    # Sort by timestamp for chronological order
    animal_data = animal_data.sort_values('timestamp').reset_index(drop=True)
    
    # Create filename
    filename = f"{tag}.parquet"
    filepath = os.path.join(vedba_dir, filename)
    
    # Save the file
    animal_data.to_parquet(filepath, index=False)
    
    print(f"[{i}/{len(unique_tags)}] {tag}: {len(animal_data)} records ‚Üí {filename}")

print(f"\n‚úì Created {len(unique_tags)} individual VEDBA files in '{vedba_dir}/' folder")

# Show summary of what was created
print(f"\nFile summary:")
for tag in sorted(unique_tags):
    animal_data = vedba_data[vedba_data['tag'] == tag]
    date_range = f"{animal_data['timestamp'].min()} to {animal_data['timestamp'].max()}"
    print(f"  {tag}.parquet: {len(animal_data)} records, {date_range}")

print(f"\nFolder contents:")
vedba_files = os.listdir(vedba_dir)
print(f"  {len(vedba_files)} files created in data/vedba/")
for file in sorted(vedba_files)[:5]:  # Show first 5 files
    print(f"    - {file}")
if len(vedba_files) > 5:
    print(f"    ... and {len(vedba_files) - 5} more files")

FileNotFoundError: [Errno 2] No such file or directory: 'data/vedba_simplified_all.parquet'

In [15]:
# ---- Process Each Animal File Individually ----

import glob
import os

# Create the vedba directory if it doesn't exist
vedba_dir = 'data/vedba'
os.makedirs(vedba_dir, exist_ok=True)

# Get all input parquet files (each represents one animal)
data_folder = 'data/acc_v0'
parquet_files = glob.glob(os.path.join(data_folder, '*.parquet'))

print(f"Processing {len(parquet_files)} animal files individually...")
print("Each input file = 1 animal ‚Üí 1 output VEDBA file")

# Simplified function to process each group (VEDBA only)
def process_group_simple(group_data):
    """
    Simplified processing: only calculate VEDBA metrics
    """
    # Convert to float and calculate dynamic acceleration
    x_values = group_data['X'].values.astype(float)
    y_values = group_data['Y'].values.astype(float)
    z_values = group_data['Z'].values.astype(float)
    
    # Calculate VEDBA components
    x_component = np.abs(dy_acc(x_values))
    y_component = np.abs(dy_acc(y_values))
    z_component = np.abs(dy_acc(z_values))
    
    # VEDBA calculation
    vectorial_sum = np.sqrt(x_component**2 + y_component**2 + z_component**2)
    ave_vedba = np.nanmean(vectorial_sum)
    
    return pd.Series({
        'tag': group_data['tag'].iloc[0],
        'timestamp': group_data['timestamp'].iloc[0], 
        'ave_vedba': ave_vedba,
        'n_samples': len(group_data)
    })

def process_animal_file(file_path):
    """
    Process a single animal file and create individual VEDBA output
    """
    filename = os.path.basename(file_path)
    animal_id = filename.replace('.parquet', '')  # Extract animal ID from filename
    
    print(f"Processing: {animal_id}")
    
    # Load the animal's accelerometer data
    acc_data = pd.read_parquet(file_path)
    
    # Create grouping column for this animal's data
    acc_data['group_id'] = acc_data['tag'] + '_' + acc_data['timestamp'].astype(str)
    
    # Process all groups for this animal
    results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple).reset_index(drop=True)
    
    # Add log transformation
    results['log_vedba'] = np.log(results['ave_vedba'])
    
    # Sort by timestamp for chronological order
    results = results.sort_values('timestamp').reset_index(drop=True)
    
    # Select final columns
    vedba_data = results[['tag', 'timestamp', 'ave_vedba', 'log_vedba', 'n_samples']]
    
    # Create output filename using the same animal ID
    output_file = f"{animal_id}.parquet"
    output_path = os.path.join(vedba_dir, output_file)
    
    # Save individual VEDBA file
    vedba_data.to_parquet(output_path, index=False)
    
    return {
        'animal_id': animal_id,
        'input_file': filename,
        'output_file': output_file,
        'records': len(vedba_data),
        'date_range': f"{vedba_data['timestamp'].min()} to {vedba_data['timestamp'].max()}"
    }

# Process each animal file
print("\n" + "="*60)
print("PROCESSING INDIVIDUAL ANIMALS")
print("="*60)

summary_list = []
for i, file_path in enumerate(parquet_files, 1):
    try:
        result_summary = process_animal_file(file_path)
        summary_list.append(result_summary)
        
        print(f"[{i}/{len(parquet_files)}] {result_summary['animal_id']}: "
              f"{result_summary['records']} VEDBA records ‚Üí {result_summary['output_file']}")
        
    except Exception as e:
        print(f"[{i}/{len(parquet_files)}] ‚ùå Error processing {os.path.basename(file_path)}: {str(e)}")

print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"‚úì Successfully processed {len(summary_list)} animals")
print(f"‚úì Created {len(summary_list)} individual VEDBA files in '{vedba_dir}/'")

# Show detailed summary
print(f"\nDetailed summary:")
for summary in summary_list:
    print(f"  {summary['animal_id']}: {summary['records']} records")
    print(f"    Date range: {summary['date_range']}")
    print(f"    File: {summary['output_file']}")
    print()

# Verify folder contents
vedba_files = [f for f in os.listdir(vedba_dir) if f.endswith('.parquet')]
print(f"Folder verification:")
print(f"  {len(vedba_files)} VEDBA files created in data/vedba/")
print(f"  Files: {', '.join(sorted(vedba_files))}")

print(f"\n‚úì Individual animal processing complete!")
print(f"Each animal now has its own VEDBA file: data/vedba/[AnimalID].parquet")

Processing 96 animal files individually...
Each input file = 1 animal ‚Üí 1 output VEDBA file

PROCESSING INDIVIDUAL ANIMALS
Processing: 24AA01_5O8B


  results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple).reset_index(drop=True)


[1/96] 24AA01_5O8B: 443232 VEDBA records ‚Üí 24AA01_5O8B.parquet
Processing: 24AA03_2A1P


KeyboardInterrupt: 

In [None]:
# ---- PARALLEL Processing: Process ALL Files Much Faster ----

import glob
import os
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, as_completed
import time

# Create the vedba directory if it doesn't exist
vedba_dir = 'data/vedba'
os.makedirs(vedba_dir, exist_ok=True)

def process_animal_file_parallel(file_path):
    """
    Process a single animal file and create individual VEDBA output (parallel version)
    This function will be run in separate processes
    """
    import pandas as pd
    import numpy as np
    import os
    
    # Re-define dy_acc function for parallel processing (each process needs its own copy)
    def dy_acc(vect, win_size=7):
        if vect is None or len(vect) == 0:
            raise ValueError("Input vector is empty or invalid.")
        
        pad_size = int(win_size / 2 - 0.5)
        padded = np.pad(vect, (pad_size, pad_size), constant_values=np.nan)
        acc_vec = np.empty(len(vect))
        acc_vec[:] = np.nan

        for i in range(len(vect)):
            window = padded[i : i + (2 * pad_size + 1)]
            m_ave = np.nanmean(window)
            acc_vec[i] = vect[i] - m_ave
        
        return acc_vec
    
    def process_group_simple_parallel(group_data):
        """Simplified processing for parallel execution"""
        x_values = group_data['X'].values.astype(float)
        y_values = group_data['Y'].values.astype(float)
        z_values = group_data['Z'].values.astype(float)
        
        x_component = np.abs(dy_acc(x_values))
        y_component = np.abs(dy_acc(y_values))
        z_component = np.abs(dy_acc(z_values))
        
        vectorial_sum = np.sqrt(x_component**2 + y_component**2 + z_component**2)
        ave_vedba = np.nanmean(vectorial_sum)
        
        return pd.Series({
            'tag': group_data['tag'].iloc[0],
            'timestamp': group_data['timestamp'].iloc[0], 
            'ave_vedba': ave_vedba,
            'n_samples': len(group_data)
        })
    
    try:
        filename = os.path.basename(file_path)
        animal_id = filename.replace('.parquet', '')
        
        # Load the animal's accelerometer data
        acc_data = pd.read_parquet(file_path)
        
        # Create grouping column
        acc_data['group_id'] = acc_data['tag'] + '_' + acc_data['timestamp'].astype(str)
        
        # Process all groups for this animal
        results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple_parallel).reset_index(drop=True)
        
        # Add log transformation
        results['log_vedba'] = np.log(results['ave_vedba'])
        
        # Sort by timestamp
        results = results.sort_values('timestamp').reset_index(drop=True)
        
        # Select final columns
        vedba_data = results[['tag', 'timestamp', 'ave_vedba', 'log_vedba', 'n_samples']]
        
        # Save individual VEDBA file
        output_file = f"{animal_id}.parquet"
        output_path = os.path.join(vedba_dir, output_file)
        vedba_data.to_parquet(output_path, index=False)
        
        return {
            'animal_id': animal_id,
            'input_file': filename,
            'output_file': output_file,
            'records': len(vedba_data),
            'date_range': f"{vedba_data['timestamp'].min()} to {vedba_data['timestamp'].max()}",
            'status': 'success'
        }
        
    except Exception as e:
        return {
            'animal_id': os.path.basename(file_path).replace('.parquet', ''),
            'input_file': os.path.basename(file_path),
            'error': str(e),
            'status': 'error'
        }

# Get all input files
data_folder = 'data/acc_v0'
parquet_files = glob.glob(os.path.join(data_folder, '*.parquet'))

# Determine optimal number of workers (use 75% of available cores)
n_cores = mp.cpu_count()
n_workers = max(1, int(n_cores * 0.75))  # Use 75% of cores, minimum 1

print(f"üöÄ PARALLEL PROCESSING SETUP")
print(f"üìä Found {len(parquet_files)} animal files to process")
print(f"üíª Available CPU cores: {n_cores}")
print(f"‚ö° Using {n_workers} parallel workers")
print(f"üìÅ Output directory: {vedba_dir}")

print(f"\n" + "="*60)
print("STARTING PARALLEL PROCESSING")
print("="*60)

start_time = time.time()
summary_list = []
failed_files = []

# Process files in parallel
with ProcessPoolExecutor(max_workers=n_workers) as executor:
    # Submit all jobs
    future_to_file = {executor.submit(process_animal_file_parallel, file_path): file_path 
                     for file_path in parquet_files}
    
    # Collect results as they complete
    for i, future in enumerate(as_completed(future_to_file), 1):
        file_path = future_to_file[future]
        result = future.result()
        
        if result['status'] == 'success':
            summary_list.append(result)
            print(f"‚úÖ [{i}/{len(parquet_files)}] {result['animal_id']}: {result['records']} records")
        else:
            failed_files.append(result)
            print(f"‚ùå [{i}/{len(parquet_files)}] {result['animal_id']}: ERROR - {result['error']}")

end_time = time.time()
processing_time = end_time - start_time

print(f"\n" + "="*60)
print("PARALLEL PROCESSING COMPLETE")
print("="*60)
print(f"‚è±Ô∏è  Total processing time: {processing_time:.2f} seconds ({processing_time/60:.1f} minutes)")
print(f"‚úÖ Successfully processed: {len(summary_list)} animals")
print(f"‚ùå Failed: {len(failed_files)} animals")
print(f"üìä Average time per animal: {processing_time/len(parquet_files):.2f} seconds")

if summary_list:
    total_records = sum(s['records'] for s in summary_list)
    print(f"üìà Total VEDBA records created: {total_records:,}")
    print(f"‚ö° Processing speed: {total_records/processing_time:.0f} records/second")

# Show successful processing summary
if summary_list:
    print(f"\nüìã Successfully processed animals:")
    for summary in summary_list[:10]:  # Show first 10
        print(f"   {summary['animal_id']}: {summary['records']:,} records")
    if len(summary_list) > 10:
        print(f"   ... and {len(summary_list) - 10} more animals")

# Show any failures
if failed_files:
    print(f"\n‚ö†Ô∏è  Failed processing:")
    for failed in failed_files:
        print(f"   {failed['animal_id']}: {failed['error']}")

# Verify output
vedba_files = [f for f in os.listdir(vedba_dir) if f.endswith('.parquet')]
print(f"\nüìÅ Output verification:")
print(f"   {len(vedba_files)} VEDBA files created in {vedba_dir}/")

print(f"\nüéâ Parallel processing complete! Each animal's VEDBA data is in: {vedba_dir}/[AnimalID].parquet")

üöÄ PARALLEL PROCESSING SETUP
üìä Found 96 animal files to process
üíª Available CPU cores: 104
‚ö° Using 78 parallel workers
üìÅ Output directory: data/vedba

STARTING PARALLEL PROCESSING


  results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple_parallel).reset_index(drop=True)


‚úÖ [1/96] 24AE47_3G6U: 6033 records


  results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple_parallel).reset_index(drop=True)


‚úÖ [2/96] 24AE53_5D3D: 40262 records


  results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple_parallel).reset_index(drop=True)


‚úÖ [3/96] 24AE40_0E0K: 48591 records


  results = acc_data.groupby('group_id', group_keys=False).apply(process_group_simple_parallel).reset_index(drop=True)


‚úÖ [4/96] 24AE08_4T5U: 85873 records
