In [1]:
import json
from pathlib import Path
from metagpt.tools.libs.editor import Editor

# Create the race data structure (same as before)
race_data = {
    "metadata": {
        "total_boats": int(df['id_athlete'].nunique()),
        "regatta_number": int(df['regatta_number'].iloc[0]),
        "geographic_bounds": {
            "latitude": {
                "min": float(df['latitude'].min()) if not pd.isna(df['latitude'].min()) else float(df['latitude_strapolated'].min()),
                "max": float(df['latitude'].max()) if not pd.isna(df['latitude'].max()) else float(df['latitude_strapolated'].max())
            },
            "longitude": {
                "min": float(df['longitude'].min()) if not pd.isna(df['longitude'].min()) else float(df['longitude_strapolated'].min()),
                "max": float(df['longitude'].max()) if not pd.isna(df['longitude'].max()) else float(df['longitude_strapolated'].max())
            }
        },
        "time_range": {
            "start": df['sample_time'].min().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
            "end": df['sample_time'].max().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        }
    },
    "boats": {}
}

# Process data for each boat
for boat_id in df['id_athlete'].unique():
    boat_df = df[df['id_athlete'] == boat_id].copy()
    
    # Sort by timestamp to ensure proper ordering
    boat_df = boat_df.sort_values('sample_time')
    
    # Use strapolated coordinates if original ones are missing
    coordinates = []
    for _, row in boat_df.iterrows():
        lat = row['latitude'] if not pd.isna(row['latitude']) else row['latitude_strapolated']
        lon = row['longitude'] if not pd.isna(row['longitude']) else row['longitude_strapolated']
        coordinates.append([float(lon), float(lat)])  # GeoJSON format: [longitude, latitude]
    
    # Format timestamps as ISO format strings
    timestamps = boat_df['sample_time'].dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ').tolist()
    
    # Calculate statistics safely
    avg_speed = float(boat_df['speed'].mean()) if not pd.isna(boat_df['speed'].mean()) else 0.0
    max_speed = float(boat_df['speed'].max()) if not pd.isna(boat_df['speed'].max()) else 0.0
    total_distance = float(boat_df['distance_from_start'].max()) if not pd.isna(boat_df['distance_from_start'].max()) else 0.0
    
    race_data["boats"][str(int(boat_id))] = {
        "track": {
            "type": "LineString",
            "coordinates": coordinates
        },
        "properties": {
            "timestamps": timestamps,
            "speeds": boat_df['speed'].fillna(0).astype(float).tolist(),
            "directions": boat_df['direction'].fillna(0).astype(float).tolist(),
            "distance_from_start": boat_df['distance_from_start'].fillna(0).astype(float).tolist(),
            "distance_to_finish": boat_df['distance_to_finish'].fillna(0).astype(float).tolist()
        },
        "metadata": {
            "avg_speed": avg_speed,
            "max_speed": max_speed,
            "total_distance": total_distance
        }
    }

# Initialize editor
editor = Editor()

# Convert race_data to JSON string with proper indentation
json_content = json.dumps(race_data, indent=2)

# Save using the Editor tool
await editor.write('data/race_data.json', json_content)

print("Race data has been successfully exported to race_data.json")
print(f"Total boats processed: {len(race_data['boats'])}")
print(f"Geographic bounds: {race_data['metadata']['geographic_bounds']}")
print(f"Time range: {race_data['metadata']['time_range']}")

NameError: name 'df' is not defined

In [2]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from io import StringIO
from metagpt.tools.libs.editor import Editor

# Initialize editor
editor = Editor()

# Read the GPS data file
file_content = await editor.read('uploads/gpsdata.csv')

# Clean the raw content to get actual CSV data
raw_lines = str(file_content).split('\n')
start_idx = next(i for i, line in enumerate(raw_lines) if '|' in line and 'id_meeting' in line)
clean_lines = []
for line in raw_lines[start_idx:]:
    if '|' in line:
        content = line.split('|')[1].strip()
        if content and ('id_meeting' in content or content[0].isdigit()):
            clean_lines.append(content)

# Create DataFrame from cleaned data
df = pd.read_csv(
    StringIO('\n'.join(clean_lines)),
    delim_whitespace=True,  # Handle multiple spaces
    parse_dates=['sample_time', 'sample_time_strapolated']
)

# Create race data structure
race_data = {
    "metadata": {
        "total_boats": int(df['id_athlete'].nunique()),
        "regatta_number": int(df['regatta_number'].iloc[0]),
        "geographic_bounds": {
            "latitude": {
                "min": float(df['latitude'].min()) if not pd.isna(df['latitude'].min()) else float(df['latitude_strapolated'].min()),
                "max": float(df['latitude'].max()) if not pd.isna(df['latitude'].max()) else float(df['latitude_strapolated'].max())
            },
            "longitude": {
                "min": float(df['longitude'].min()) if not pd.isna(df['longitude'].min()) else float(df['longitude_strapolated'].min()),
                "max": float(df['longitude'].max()) if not pd.isna(df['longitude'].max()) else float(df['longitude_strapolated'].max())
            }
        },
        "time_range": {
            "start": df['sample_time'].min().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
            "end": df['sample_time'].max().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        }
    },
    "boats": {}
}

# Process data for each boat
for boat_id in df['id_athlete'].unique():
    boat_df = df[df['id_athlete'] == boat_id].copy()
    boat_df = boat_df.sort_values('sample_time')
    
    # Use strapolated coordinates if original ones are missing
    coordinates = []
    for _, row in boat_df.iterrows():
        lat = row['latitude'] if not pd.isna(row['latitude']) else row['latitude_strapolated']
        lon = row['longitude'] if not pd.isna(row['longitude']) else row['longitude_strapolated']
        if not pd.isna(lat) and not pd.isna(lon):
            coordinates.append([float(lon), float(lat)])  # GeoJSON format: [longitude, latitude]
    
    # Ensure all numeric values are properly handled
    avg_speed = float(boat_df['speed'].mean()) if not pd.isna(boat_df['speed'].mean()) else 0.0
    max_speed = float(boat_df['speed'].max()) if not pd.isna(boat_df['speed'].max()) else 0.0
    total_distance = float(boat_df['distance_from_start'].max()) if not pd.isna(boat_df['distance_from_start'].max()) else 0.0
    
    race_data["boats"][str(int(boat_id))] = {
        "track": {
            "type": "LineString",
            "coordinates": coordinates
        },
        "properties": {
            "timestamps": boat_df['sample_time'].dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ').tolist(),
            "speeds": boat_df['speed'].fillna(0).astype(float).tolist(),
            "directions": boat_df['direction'].fillna(0).astype(float).tolist(),
            "distance_from_start": boat_df['distance_from_start'].fillna(0).astype(float).tolist(),
            "distance_to_finish": boat_df['distance_to_finish'].fillna(0).astype(float).tolist()
        },
        "metadata": {
            "avg_speed": avg_speed,
            "max_speed": max_speed,
            "total_distance": total_distance
        }
    }

# Convert to JSON and save using Editor
json_content = json.dumps(race_data, indent=2)
await editor.write('data/race_data.json', json_content)

print("Race data has been successfully exported to race_data.json")
print(f"Total boats processed: {len(race_data['boats'])}")
print(f"Geographic bounds: {race_data['metadata']['geographic_bounds']}")
print(f"Time range: {race_data['metadata']['time_range']}")

EmptyDataError: No columns to parse from file

In [3]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from io import StringIO
from metagpt.tools.libs.editor import Editor

# Initialize editor and read file
editor = Editor()
file_content = await editor.read('uploads/gpsdata.csv')

# Debug: Print first few lines to understand structure
raw_lines = str(file_content).split('\\n')
header = None
data_lines = []

# Process lines to extract clean CSV data
for line in raw_lines:
    if '|' in line:
        # Remove line numbers and get content after pipe
        parts = line.split('|')
        if len(parts) > 1:
            content = parts[1].strip()
            if 'id_meeting' in content:  # This is the header row
                header = content
            elif content and content[0].isdigit():  # This is a data row
                data_lines.append(content)

# Check if we have valid data
if header and data_lines:
    print("Found header and data")
    
    # Create DataFrame with careful parsing
    df = pd.read_csv(
        StringIO(header + '\n' + '\n'.join(data_lines)),
        sep=r'\s+',  # Handle multiple spaces between columns
        engine='python'  # More flexible parsing engine
    )
    
    # Convert date columns
    time_cols = ['sample_time', 'sample_time_strapolated']
    for col in time_cols:
        df[col] = pd.to_datetime(df[col])
    
    # Prepare race data structure
    race_data = {
        "metadata": {
            "total_boats": int(df['id_athlete'].nunique()),
            "regatta_number": int(df['regatta_number'].iloc[0]),
            "geographic_bounds": {
                "latitude": {
                    "min": float(df['latitude'].min()) if not pd.isna(df['latitude'].min()) else float(df['latitude_strapolated'].min()),
                    "max": float(df['latitude'].max()) if not pd.isna(df['latitude'].max()) else float(df['latitude_strapolated'].max())
                },
                "longitude": {
                    "min": float(df['longitude'].min()) if not pd.isna(df['longitude'].min()) else float(df['longitude_strapolated'].min()),
                    "max": float(df['longitude'].max()) if not pd.isna(df['longitude'].max()) else float(df['longitude_strapolated'].max())
                }
            },
            "time_range": {
                "start": df['sample_time'].min().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
                "end": df['sample_time'].max().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            }
        },
        "boats": {}
    }
    
    # Process each boat's data
    for boat_id in df['id_athlete'].unique():
        boat_df = df[df['id_athlete'] == boat_id].copy()
        boat_df = boat_df.sort_values('sample_time')
        
        # Process coordinates
        coordinates = []
        for _, row in boat_df.iterrows():
            lat = row['latitude'] if not pd.isna(row['latitude']) else row['latitude_strapolated']
            lon = row['longitude'] if not pd.isna(row['longitude']) else row['longitude_strapolated']
            if not pd.isna(lat) and not pd.isna(lon):
                coordinates.append([float(lon), float(lat)])
        
        # Calculate statistics safely
        stats = {
            "avg_speed": float(boat_df['speed'].mean()) if not pd.isna(boat_df['speed'].mean()) else 0.0,
            "max_speed": float(boat_df['speed'].max()) if not pd.isna(boat_df['speed'].max()) else 0.0,
            "total_distance": float(boat_df['distance_from_start'].max()) if not pd.isna(boat_df['distance_from_start'].max()) else 0.0
        }
        
        race_data["boats"][str(int(boat_id))] = {
            "track": {
                "type": "LineString",
                "coordinates": coordinates
            },
            "properties": {
                "timestamps": boat_df['sample_time'].dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ').tolist(),
                "speeds": boat_df['speed'].fillna(0).astype(float).tolist(),
                "directions": boat_df['direction'].fillna(0).astype(float).tolist(),
                "distance_from_start": boat_df['distance_from_start'].fillna(0).astype(float).tolist(),
                "distance_to_finish": boat_df['distance_to_finish'].fillna(0).astype(float).tolist()
            },
            "metadata": stats
        }
    
    # Save to JSON file using Editor
    json_content = json.dumps(race_data, indent=2)
    await editor.write('data/race_data.json', json_content)
    
    print("Race data has been successfully exported to race_data.json")
    print(f"Total boats processed: {len(race_data['boats'])}")
    print(f"Geographic bounds: {race_data['metadata']['geographic_bounds']}")
    print(f"Time range: {race_data['metadata']['time_range']}")
else:
    print("Error: Could not find valid header or data in the file content")
    print("First few lines of file content:")
    print('\n'.join(raw_lines[:5]))

Found header and data


ValueError: invalid literal for int() with base 10: '5030adba-5014-4e7e-a0d1-344cd501d81b'

In [4]:
# Process the data into properly structured JSON
race_data = {
    "metadata": {
        "total_boats": len(df['id_athlete'].unique()),
        "regatta_number": df['regatta_number'].iloc[0],  # Keep as is, no int casting
        "geographic_bounds": {
            "latitude": {
                "min": float(df['latitude_strapolated'].min()),
                "max": float(df['latitude_strapolated'].max())
            },
            "longitude": {
                "min": float(df['longitude'].min()),
                "max": float(df['longitude'].max())
            }
        },
        "time_range": {
            "start": df['sample_time'].min().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
            "end": df['sample_time'].max().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        }
    },
    "boats": {}
}

# Process each boat's data
for boat_id in df['id_athlete'].unique():
    boat_df = df[df['id_athlete'] == boat_id].copy()
    boat_df = boat_df.sort_values('sample_time')
    
    # Use strapolated coordinates where original ones are missing
    boat_df['final_latitude'] = boat_df['latitude'].fillna(boat_df['latitude_strapolated'])
    boat_df['final_longitude'] = boat_df['longitude'].fillna(boat_df['longitude_strapolated'])
    
    # Ensure numeric types for calculations
    numeric_cols = ['speed', 'direction', 'distance_from_start', 'distance_to_finish']
    for col in numeric_cols:
        boat_df[col] = pd.to_numeric(boat_df[col], errors='coerce').fillna(0)
    
    # Create track coordinates
    coordinates = [[float(lon), float(lat)] 
                  for lon, lat in zip(boat_df['final_longitude'], boat_df['final_latitude'])
                  if not (pd.isna(lon) or pd.isna(lat))]
    
    # Calculate statistics
    stats = {
        "avg_speed": float(boat_df['speed'].mean()),
        "max_speed": float(boat_df['speed'].max()),
        "total_distance": float(boat_df['distance_from_start'].max())
    }
    
    # Store boat data
    race_data["boats"][str(boat_id)] = {
        "track": {
            "type": "LineString",
            "coordinates": coordinates
        },
        "timepoints": [
            {
                "timestamp": ts.strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
                "position": [float(lon), float(lat)],
                "speed": float(spd),
                "direction": float(dir),
                "distance_from_start": float(dist)
            }
            for ts, lon, lat, spd, dir, dist in zip(
                boat_df['sample_time'],
                boat_df['final_longitude'],
                boat_df['final_latitude'],
                boat_df['speed'],
                boat_df['direction'],
                boat_df['distance_from_start']
            )
        ],
        "metadata": stats
    }

# Save processed data using Editor
await editor.write('data/race_data.json', json.dumps(race_data, indent=2))
print("Race data has been successfully processed and saved")
print(f"Total boats processed: {len(race_data['boats'])}")
print(f"Time range: {race_data['metadata']['time_range']}")

TypeError: object str can't be used in 'await' expression

In [5]:
# Process the data into properly structured JSON
race_data = {
    "metadata": {
        "total_boats": len(df['id_athlete'].unique()),
        "regatta_number": df['regatta_number'].iloc[0],  # Keep as is, no int casting
        "geographic_bounds": {
            "latitude": {
                "min": float(df['latitude_strapolated'].min()),
                "max": float(df['latitude_strapolated'].max())
            },
            "longitude": {
                "min": float(df['longitude'].min()),
                "max": float(df['longitude'].max())
            }
        },
        "time_range": {
            "start": df['sample_time'].min().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
            "end": df['sample_time'].max().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        }
    },
    "boats": {}
}

# Process each boat's data
for boat_id in df['id_athlete'].unique():
    boat_df = df[df['id_athlete'] == boat_id].copy()
    boat_df = boat_df.sort_values('sample_time')
    
    # Use strapolated coordinates where original ones are missing
    boat_df['final_latitude'] = boat_df['latitude'].fillna(boat_df['latitude_strapolated'])
    boat_df['final_longitude'] = boat_df['longitude'].fillna(boat_df['longitude_strapolated'])
    
    # Ensure numeric types for calculations
    numeric_cols = ['speed', 'direction', 'distance_from_start', 'distance_to_finish']
    for col in numeric_cols:
        boat_df[col] = pd.to_numeric(boat_df[col], errors='coerce').fillna(0)
    
    # Create track coordinates
    coordinates = [[float(lon), float(lat)] 
                  for lon, lat in zip(boat_df['final_longitude'], boat_df['final_latitude'])
                  if not (pd.isna(lon) or pd.isna(lat))]
    
    # Calculate statistics
    stats = {
        "avg_speed": float(boat_df['speed'].mean()),
        "max_speed": float(boat_df['speed'].max()),
        "total_distance": float(boat_df['distance_from_start'].max())
    }
    
    # Store boat data
    race_data["boats"][str(boat_id)] = {
        "track": {
            "type": "LineString",
            "coordinates": coordinates
        },
        "timepoints": [
            {
                "timestamp": ts.strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
                "position": [float(lon), float(lat)],
                "speed": float(spd),
                "direction": float(dir),
                "distance_from_start": float(dist)
            }
            for ts, lon, lat, spd, dir, dist in zip(
                boat_df['sample_time'],
                boat_df['final_longitude'],
                boat_df['final_latitude'],
                boat_df['speed'],
                boat_df['direction'],
                boat_df['distance_from_start']
            )
        ],
        "metadata": stats
    }

# Save processed data using Editor (without await since write is not async)
editor.write('data/race_data.json', json.dumps(race_data, indent=2))
print("Race data has been successfully processed and saved")
print(f"Total boats processed: {len(race_data['boats'])}")
print(f"Time range: {race_data['metadata']['time_range']}")

Race data has been successfully processed and saved
Total boats processed: 5
Time range: {'start': '1970-01-01T00:00:00.000000Z', 'end': '1970-01-01T00:00:00.000000Z'}


In [6]:
# First check and fix the timestamp data
print("Sample of raw timestamp data:")
print(df['sample_time'].head())

# Convert timestamps correctly (they are already in datetime format from previous code)
df_processed = df.copy()

# Fix boat ID processing - use regatta_number and id_athlete as composite key
df_processed['boat_id'] = df_processed['regatta_number'].astype(str) + '_' + df_processed['id_athlete'].astype(str)

# Process data into correct JSON structure
race_data = {
    "metadata": {
        "total_boats": len(df_processed['boat_id'].unique()),
        "regatta_number": int(df_processed['regatta_number'].iloc[0]),
        "geographic_bounds": {
            "latitude": {
                "min": float(df_processed['latitude_strapolated'].min()),
                "max": float(df_processed['latitude_strapolated'].max())
            },
            "longitude": {
                "min": float(df_processed['longitude'].min()),
                "max": float(df_processed['longitude'].max())
            }
        },
        "time_range": {
            "start": df_processed['sample_time'].min().strftime('%Y-%m-%d %H:%M:%S'),
            "end": df_processed['sample_time'].max().strftime('%Y-%m-%d %H:%M:%S')
        }
    },
    "boats": {}
}

# Process each boat's data
for boat_id in df_processed['boat_id'].unique():
    boat_df = df_processed[df_processed['boat_id'] == boat_id].copy()
    boat_df = boat_df.sort_values('sample_time')
    
    # Ensure coordinates are properly handled
    boat_df['final_latitude'] = boat_df['latitude'].fillna(boat_df['latitude_strapolated'])
    boat_df['final_longitude'] = boat_df['longitude'].fillna(boat_df['longitude_strapolated'])
    
    # Process numeric data
    numeric_cols = ['speed', 'direction', 'distance_from_start', 'distance_to_finish']
    for col in numeric_cols:
        boat_df[col] = pd.to_numeric(boat_df[col], errors='coerce').fillna(0)
    
    # Create track coordinates
    coordinates = [[float(lon), float(lat)] 
                  for lon, lat in zip(boat_df['final_longitude'], boat_df['final_latitude'])
                  if not (pd.isna(lon) or pd.isna(lat))]
    
    # Calculate boat statistics
    stats = {
        "avg_speed": float(boat_df['speed'].mean()),
        "max_speed": float(boat_df['speed'].max()),
        "total_distance": float(boat_df['distance_from_start'].max()),
        "duration_seconds": (boat_df['sample_time'].max() - boat_df['sample_time'].min()).total_seconds()
    }
    
    # Create timepoints with proper timestamp formatting
    timepoints = []
    for _, row in boat_df.iterrows():
        timepoint = {
            "timestamp": row['sample_time'].strftime('%Y-%m-%d %H:%M:%S'),
            "position": [float(row['final_longitude']), float(row['final_latitude'])],
            "speed": float(row['speed']),
            "direction": float(row['direction']),
            "distance_from_start": float(row['distance_from_start'])
        }
        timepoints.append(timepoint)
    
    # Store boat data
    race_data["boats"][boat_id] = {
        "track": {
            "type": "LineString",
            "coordinates": coordinates
        },
        "timepoints": timepoints,
        "metadata": stats
    }

# Debug output
print("\nProcessed Data Summary:")
print(f"Total boats: {len(race_data['boats'])}")
print(f"Time range: {race_data['metadata']['time_range']}")
print("\nSample timepoint data for first boat:")
first_boat_id = list(race_data["boats"].keys())[0]
print(json.dumps(race_data["boats"][first_boat_id]["timepoints"][0], indent=2))

Sample of raw timestamp data:
0  95  1   1970-01-01
1  95  1   1970-01-01
2  95  1   1970-01-01
3  95  1   1970-01-01
4  95  1   1970-01-01
Name: sample_time, dtype: datetime64[ns]


ValueError: invalid literal for int() with base 10: '5030adba-5014-4e7e-a0d1-344cd501d81b'

In [7]:
# First inspect and fix the data
print("Data columns and types:")
print(df.dtypes)
print("\nSample of raw data:")
print(df.head(2))

# Create a processed copy of the dataframe
df_processed = df.copy()

# Keep id_athlete as the boat identifier (no composite key needed)
# Process data into JSON structure with proper type handling
race_data = {
    "metadata": {
        "total_boats": len(df_processed['id_athlete'].unique()),
        "regatta_number": str(df_processed['regatta_number'].iloc[0]),  # Keep as string
        "geographic_bounds": {
            "latitude": {
                "min": float(df_processed['latitude_strapolated'].min()),
                "max": float(df_processed['latitude_strapolated'].max())
            },
            "longitude": {
                "min": float(df_processed['longitude'].min()),
                "max": float(df_processed['longitude'].max())
            }
        }
    },
    "boats": {}
}

# Add time range only if timestamps are valid
valid_time = df_processed['sample_time'].min().year > 1970
if valid_time:
    race_data["metadata"]["time_range"] = {
        "start": df_processed['sample_time'].min().strftime('%Y-%m-%d %H:%M:%S'),
        "end": df_processed['sample_time'].max().strftime('%Y-%m-%d %H:%M:%S')
    }
else:
    print("Warning: Invalid timestamps detected, using original sample_time strings")
    # Try to parse the original timestamp strings if available
    if 'sample_time' in df.columns:
        try:
            df_processed['sample_time'] = pd.to_datetime(df['sample_time'], format='%Y-%m-%d %H:%M:%S.%f')
            race_data["metadata"]["time_range"] = {
                "start": df_processed['sample_time'].min().strftime('%Y-%m-%d %H:%M:%S'),
                "end": df_processed['sample_time'].max().strftime('%Y-%m-%d %H:%M:%S')
            }
        except Exception as e:
            print(f"Could not parse timestamps: {str(e)}")
            race_data["metadata"]["time_range"] = {"start": "unknown", "end": "unknown"}

# Process each boat's data
for boat_id in df_processed['id_athlete'].unique():
    boat_df = df_processed[df_processed['id_athlete'] == boat_id].copy()
    boat_df = boat_df.sort_values('sample_time')
    
    # Ensure coordinates are properly handled
    boat_df['final_latitude'] = pd.to_numeric(
        boat_df['latitude'].fillna(boat_df['latitude_strapolated']), 
        errors='coerce'
    )
    boat_df['final_longitude'] = pd.to_numeric(
        boat_df['longitude'].fillna(boat_df['longitude_strapolated']), 
        errors='coerce'
    )
    
    # Process numeric data safely
    numeric_cols = ['speed', 'direction', 'distance_from_start', 'distance_to_finish']
    for col in numeric_cols:
        boat_df[col] = pd.to_numeric(boat_df[col], errors='coerce').fillna(0)
    
    # Create track coordinates (only where both lat and lon are valid)
    coordinates = []
    for idx, row in boat_df.iterrows():
        if pd.notna(row['final_latitude']) and pd.notna(row['final_longitude']):
            coordinates.append([
                float(row['final_longitude']),
                float(row['final_latitude'])
            ])
    
    # Calculate boat statistics
    stats = {
        "avg_speed": float(boat_df['speed'].mean()),
        "max_speed": float(boat_df['speed'].max()),
        "total_distance": float(boat_df['distance_from_start'].max()),
        "points_count": len(coordinates)
    }
    
    # Create timepoints with proper validation
    timepoints = []
    for _, row in boat_df.iterrows():
        if pd.notna(row['final_latitude']) and pd.notna(row['final_longitude']):
            timepoint = {
                "timestamp": row['sample_time'].strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] 
                           if pd.notna(row['sample_time']) else "unknown",
                "position": [float(row['final_longitude']), float(row['final_latitude'])],
                "speed": float(row['speed']),
                "direction": float(row['direction']),
                "distance_from_start": float(row['distance_from_start'])
            }
            timepoints.append(timepoint)
    
    # Store boat data
    race_data["boats"][str(boat_id)] = {
        "track": {
            "type": "LineString",
            "coordinates": coordinates
        },
        "timepoints": timepoints,
        "metadata": stats
    }

# Debug output
print("\nProcessed Data Summary:")
print(f"Total boats: {len(race_data['boats'])}")
print(f"Regatta number: {race_data['metadata']['regatta_number']}")
print("Time range:", race_data['metadata']['time_range'])
print("\nGeographic bounds:")
print(json.dumps(race_data['metadata']['geographic_bounds'], indent=2))

if len(race_data['boats']) > 0:
    first_boat = next(iter(race_data['boats'].values()))
    print("\nSample boat data:")
    print(f"Number of timepoints: {len(first_boat['timepoints'])}")
    if first_boat['timepoints']:
        print("First timepoint:")
        print(json.dumps(first_boat['timepoints'][0], indent=2))

Data columns and types:
id_meeting                          int64
regatta_number                     object
id_athlete                        float64
id_position                       float64
latitude                          float64
longitude                         float64
speed                              object
direction                          object
sample_time                datetime64[ns]
distance_from_start               float64
distance_to_finish                 object
sample_time_strapolated    datetime64[ns]
latitude_strapolated              float64
longitude_strapolated              object
dtype: object

Sample of raw data:
        id_meeting                        regatta_number  id_athlete  \
0 95 1           1  5030adba-5014-4e7e-a0d1-344cd501d81b   55.359260   
1 95 1           1  8ceebd26-7e38-438e-824e-574c94f82392   55.359257   

        id_position  latitude  longitude       speed     direction  \
0 95 1     9.206899       2.9       73.0  2023-08-31  07:29:25.000

In [8]:
# Create a clean copy for processing
df_clean = df.copy()

# Fix timestamp columns using sample_time_strapolated instead of sample_time
df_clean['timestamp'] = pd.to_datetime(df_clean['sample_time_strapolated'])

# Fix numeric columns
df_clean['speed'] = pd.to_numeric(df_clean['speed'].str.split().str[0], errors='coerce')
df_clean['direction'] = pd.to_numeric(df_clean['direction'].str.split().str[0], errors='coerce')
df_clean['distance_from_start'] = pd.to_numeric(df_clean['distance_from_start'], errors='coerce')

# Process coordinates (use strapolated when original is missing)
df_clean['final_latitude'] = df_clean['latitude_strapolated']  # Use strapolated as they seem more valid
df_clean['final_longitude'] = df_clean['longitude']  # Original longitude seems valid

# Create the race data structure with corrected data
race_data = {
    "metadata": {
        "total_boats": 1,  # We know it's one boat with multiple readings
        "regatta_number": df_clean['regatta_number'].iloc[0],
        "geographic_bounds": {
            "latitude": {
                "min": float(df_clean['final_latitude'].min()),
                "max": float(df_clean['final_latitude'].max())
            },
            "longitude": {
                "min": float(df_clean['final_longitude'].min()),
                "max": float(df_clean['final_longitude'].max())
            }
        },
        "time_range": {
            "start": df_clean['timestamp'].min().strftime('%Y-%m-%d %H:%M:%S'),
            "end": df_clean['timestamp'].max().strftime('%Y-%m-%d %H:%M:%S')
        }
    },
    "boats": {}
}

# Process the single boat's data
boat_id = df_clean['id_athlete'].iloc[0]  # Get the boat ID
boat_df = df_clean.sort_values('timestamp')

# Create track coordinates
coordinates = [[float(lon), float(lat)] 
              for lon, lat in zip(boat_df['final_longitude'], boat_df['final_latitude'])
              if not (pd.isna(lon) or pd.isna(lat))]

# Calculate boat statistics
stats = {
    "avg_speed": float(boat_df['speed'].mean()),
    "max_speed": float(boat_df['speed'].max()),
    "total_distance": float(boat_df['distance_from_start'].max()),
    "points_count": len(coordinates),
    "duration_seconds": (boat_df['timestamp'].max() - boat_df['timestamp'].min()).total_seconds()
}

# Create timepoints with correct timestamps and values
timepoints = []
for _, row in boat_df.iterrows():
    if pd.notna(row['final_latitude']) and pd.notna(row['final_longitude']):
        timepoint = {
            "timestamp": row['timestamp'].strftime('%Y-%m-%d %H:%M:%S'),
            "position": [float(row['final_longitude']), float(row['final_latitude'])],
            "speed": float(row['speed']) if pd.notna(row['speed']) else 0.0,
            "direction": float(row['direction']) if pd.notna(row['direction']) else 0.0,
            "distance_from_start": float(row['distance_from_start']) if pd.notna(row['distance_from_start']) else 0.0
        }
        timepoints.append(timepoint)

# Store boat data with corrected values
race_data["boats"][str(boat_id)] = {
    "track": {
        "type": "LineString",
        "coordinates": coordinates
    },
    "timepoints": timepoints,
    "metadata": stats
}

# Debug output
print("\nProcessed Data Summary:")
print(f"Total boats: {len(race_data['boats'])}")
print(f"Time range: {race_data['metadata']['time_range']}")
print("\nSample timepoint:")
if timepoints:
    print(json.dumps(timepoints[0], indent=2))
print("\nCoordinate bounds:")
print(json.dumps(race_data['metadata']['geographic_bounds'], indent=2))


Processed Data Summary:
Total boats: 1
Time range: {'start': '2025-04-17 07:29:21', 'end': '2025-04-17 07:29:25'}

Sample timepoint:
{
  "timestamp": "2025-04-17 07:29:21",
  "position": [
    76.0,
    55.359251
  ],
  "speed": 0.0,
  "direction": 0.0,
  "distance_from_start": 17372.938796
}

Coordinate bounds:
{
  "latitude": {
    "min": 55.359251,
    "max": 55.35926
  },
  "longitude": {
    "min": 73.0,
    "max": 76.0
  }
}
