# Fishing Vessel Extended Loitering Prediction

In [1]:
import pandas as pd
import numpy as np
import json
import ast
import requests
from pandas import json_normalize
from geopy.distance import geodesic

## Data Loading
Data Loading and Processing Workflow:
1. Load and clean loitering data
2. Create loitering-event-related features using the loitering and anchorage data (loitering is our main dataset).
3. At this point, we should have our feature added loitering dataset
4. Then use each unique combination of timestamp and lat/lon (can be general area of port), to pull relevant weather data
5. Join weather data back to loitering dataset using the same timestamp and location variables
### Data loading & Initial Cleaning - Loitering and Ports Dataset

In [None]:
def flatten_json_in_column(df, column_name):
    """
    Flatten JSON strings in a column into separate columns.

    Each value of ``column_name`` is parsed with ``json.loads`` and expanded
    with ``pd.json_normalize``. Every resulting column is prefixed with
    ``"<column_name>_"``.

    Parameters:
    df (pandas.DataFrame): Input DataFrame (left unmodified)
    column_name (str): Name of column containing JSON strings

    Returns:
    pandas.DataFrame: New DataFrame in which the JSON column is replaced by
        the flattened columns; the input's index is preserved
    """
    # Parse into a separate Series so the caller's DataFrame is not mutated
    parsed = df[column_name].apply(json.loads)

    # Flatten the parsed dictionaries
    flattened = pd.json_normalize(parsed.tolist())

    # Add prefix to avoid column name conflicts
    flattened.columns = [f"{column_name}_{col}" for col in flattened.columns]

    # The flattened rows are in the same order as df; give them df's index so
    # the label-based join below pairs each row with its own JSON fields even
    # when df does not have a default 0..n-1 index
    flattened.index = df.index

    # Drop the original JSON column and join with flattened data
    return df.drop(columns=[column_name]).join(flattened)


def process_vessel_events(df):
    """
    Expand the nested JSON columns of a vessel events DataFrame.

    The first entry of each ``event_vessels`` list is turned into
    ``vessel_info_*`` columns, and ``event_info`` is flattened into
    ``event_info_*`` columns.

    Parameters:
    df (pandas.DataFrame): Input DataFrame with vessel events

    Returns:
    pandas.DataFrame: Processed DataFrame with flattened columns and a fresh
        0..n-1 index
    """
    # Work on a re-indexed copy so the caller's frame is untouched and the
    # positional concat below lines up
    events = df.copy().reset_index(drop=True)

    # event_vessels holds a JSON list; only its first entry is used
    # (assuming one vessel per event)
    vessel_lists = events['event_vessels'].apply(json.loads)
    first_vessels = [vessel_list[0] for vessel_list in vessel_lists]

    # Prefix the generic vessel keys so they cannot clash with event columns
    vessel_info = (
        pd.json_normalize(first_vessels)
        .rename(columns={
            'id': 'vessel_info_id',
            'ssvid': 'vessel_info_ssvid',
            'name': 'vessel_info_name',
            'flag': 'vessel_info_flag',
        })
        .reset_index(drop=True)
    )

    # Swap the raw JSON column for the extracted vessel columns
    combined = pd.concat(
        [events.drop(columns=['event_vessels']), vessel_info],
        axis=1,
    )

    # Finally expand the event_info JSON column
    return flatten_json_in_column(combined, 'event_info')


def load_and_process_data(loitering_path):
    """
    Load the loitering events CSV and return a cleaned DataFrame.

    Steps: read the CSV, flatten its nested JSON columns, parse the event
    timestamps, convert the numeric event_info columns, drop rows with any
    missing value, and drop the columns that are not needed downstream.

    Parameters:
    loitering_path (str): Path to the loitering events CSV file

    Returns:
    pandas.DataFrame: Cleaned loitering events
    """
    # Read the file and flatten the nested JSON columns
    events = process_vessel_events(pd.read_csv(loitering_path))

    # Parse event timestamps
    for timestamp_col in ('event_start', 'event_end'):
        events[timestamp_col] = pd.to_datetime(events[timestamp_col])

    # Convert the numeric event_info fields; unparseable values raise
    for metric_col in (
        'event_info_median_speed_knots',
        'event_info_total_distance_km',
        'event_info_loitering_hours',
    ):
        events[metric_col] = pd.to_numeric(events[metric_col], errors='raise')

    # Remove incomplete rows, then the irrelevant columns
    return drop_irrelevant_columns(events.dropna())


def drop_irrelevant_columns(loitering_df):
    """
    Return the loitering dataset without its irrelevant or redundant columns.

    Rationale for what is removed:
    - lat/lon min and max: lat_mean and lon_mean are sufficient
    - vessel_info id, ssvid and name: only the vessel id and flag are needed
    - event_geography and event_info_elevation_m: seem irrelevant
    - origin/destination port anchorage_id: port_id (iso and label combined
      with a dash) already identifies each port
    - regions (eez, fao, rfmo): coded values without meaningful information

    Raises KeyError if any of these columns is missing from the input.
    """
    # event_info_origin_port.label and event_info_destination_port.label can
    # probably be dropped as well
    bounding_box = ['lat_min', 'lat_max', 'lon_min', 'lon_max']
    vessel_identity = ['vessel_info_id', 'vessel_info_ssvid', 'vessel_info_name']
    event_extras = ['event_geography', 'event_info_elevation_m']
    anchorage_ids = [
        'event_info_origin_port.anchorage_id',
        'event_info_destination_port.anchorage_id',
    ]
    region_codes = [
        'event_info_regions.eez',
        'event_info_regions.fao',
        'event_info_regions.rfmo',
    ]

    unwanted = bounding_box + vessel_identity + event_extras + anchorage_ids + region_codes
    return loitering_df.drop(columns=unwanted)

# Location of the loitering events CSV
loitering_path = 'data/CVP_loitering_202301.csv'

# Load, flatten and clean the loitering events
loitering_df = load_and_process_data(loitering_path)

### Data loading & Initial Cleaning - Anchorage Dataset

In [None]:
# Define dtype mapping for loading the CSV. 'dock' is read as raw text
# (object), so its values arrive as strings and blank cells as NaN.
dtype_mapping = {
    's2id': 'str',
    'lat': 'float',
    'lon': 'float',
    'label': 'str',
    'sublabel': 'str',
    'label_source': 'str',
    'iso3': 'str',
    'distance_from_shore_m': 'float',
    'drift_radius': 'float',
    'dock': 'object'
}

# Load CSV with explicit types
anchorage_df = pd.read_csv('data/named_anchorages_v2_20221206.csv', dtype=dtype_mapping)

# Filter to keep only rows where dock is true. The flag is parsed from its
# text form explicitly: casting strings with astype(bool) would turn every
# non-empty string, including "false", into True.
# NOTE(review): assumes true values are spelled true/t/1 in any letter case;
# verify against the CSV.
anchorage_df['dock'] = (
    anchorage_df['dock']
    .fillna('')
    .astype(str)
    .str.strip()
    .str.lower()
    .isin({'true', 't', '1'})
)

# Copy the filtered rows so later column assignments act on an independent
# frame rather than on a slice of the full table
anchorage_df = anchorage_df[anchorage_df['dock']].copy()

# Normalise a value for use in an identifier: spaces removed, lowercased
def clean_string(x):
    """Return str(x) without space characters and in lowercase; '' for missing values."""
    return '' if pd.isna(x) else str(x).replace(' ', '').lower()

# Normalised copies of the country code and anchorage label
anchorage_df['iso3_clean'] = anchorage_df['iso3'].map(clean_string)
anchorage_df['label_clean'] = anchorage_df['label'].map(clean_string)

# port_id is "<iso3>-<label>" built from the normalised values
anchorage_df['port_id'] = anchorage_df['iso3_clean'] + '-' + anchorage_df['label_clean']

# One row per port: mean coordinates of its anchorage points (rounded to
# 6 decimal places for reasonable precision) and the number of distinct
# anchorage points (s2id), reported as port_capacity
port_locations_df = (
    anchorage_df
    .groupby('port_id', as_index=False)
    .agg(
        lat=('lat', 'mean'),
        lon=('lon', 'mean'),
        port_capacity=('s2id', 'nunique'),
    )
    .round({'lat': 6, 'lon': 6})
)

### Initial Feature Engineering - Loitering and Anchorage Dataset
Port Location Features:
- Mapped origin and destination port locations (lat/lon)
- Added port capacity for both origin and destination

Distance Features (using geopy):
- Port-to-port distance (origin to destination)

Temporal Features:
- Start day of week, month, hour
- Time since last loitering event

Loitering Classification:
- Target variable: extended_loitering (>24 hours)
- Location type: port_loiter (<50km from shore) vs deepsea_loiter
- Loitering transition patterns (e.g., port_to_deepsea)

Historical Statistics per Vessel:
- Frequencies over different windows (30D, 90D, 365D):
- Port-adjacent loitering count
- Deep-sea loitering count

Average durations by type:
- Port-adjacent loitering
- Deep-sea loitering

- Speed variability (std dev of median_speed_knots)

Port Congestion:
- Daily count of port-adjacent loitering vessels

In [None]:
def add_port_location_features(loitering_df, port_locations_df):
    """
    Attach origin and destination port attributes to the loitering events.

    Every column of ``port_locations_df`` is added twice, prefixed with
    ``origin_port_`` and ``destination_port_``, by left-joining on the
    event's origin and destination port ids. Events whose port id has no
    match keep NaN in the added columns. The inputs are not modified.
    """
    enriched = loitering_df

    for role in ('origin', 'destination'):
        prefix = f'{role}_port_'
        # Left join keeps every event, matched or not
        enriched = enriched.merge(
            port_locations_df.add_prefix(prefix),
            left_on=f'event_info_{role}_port.port_id',
            right_on=f'{prefix}port_id',
            how='left',
        )

    return enriched


def create_target_variable(df):
    """Add the binary target extended_loitering (1 when loitering hours exceed 24) in place and return df."""
    exceeds_one_day = df['event_info_loitering_hours'].gt(24)
    df['extended_loitering'] = exceeds_one_day.astype(int)
    return df


def classify_loitering_location(df):
    """Add loitering_type in place: 1 (port-adjacent) when under 50000 m from shore, else 0 (deep-sea); return df."""
    near_shore = df['event_info_distance_from_shore_m'].lt(50000)
    df['loitering_type'] = np.where(near_shore, 1, 0)
    return df


def calculate_distances(df):
    """
    Add the geodesic origin-to-destination port distance in kilometres.

    Rows missing any of the four port coordinates are removed first. The
    result is a new DataFrame with an ``origin_destination_distance`` column;
    the input is not modified.
    """
    port_coord_cols = [
        'origin_port_lat', 'origin_port_lon',
        'destination_port_lat', 'destination_port_lon',
    ]

    # Remove rows with missing port locations; copy so the new column is
    # written to an independent frame
    df = df.dropna(subset=port_coord_cols).copy()

    # Iterate over the coordinate columns directly instead of a row-wise
    # DataFrame.apply: when no rows remain, apply hands back a DataFrame
    # rather than a Series, which cannot be stored as a single column
    distances_km = [
        geodesic((origin_lat, origin_lon), (dest_lat, dest_lon)).kilometers
        for origin_lat, origin_lon, dest_lat, dest_lon in zip(
            *(df[col] for col in port_coord_cols)
        )
    ]

    # Explicit float dtype keeps the column numeric even when it is empty
    df['origin_destination_distance'] = pd.Series(
        distances_km, index=df.index, dtype=float
    )

    return df


def extract_temporal_features(df):
    """Add start_dayofweek, start_month and start_hour from event_start in place and return df."""
    start_times = df['event_start'].dt

    # (new column, datetime component of event_start)
    components = (
        ('start_dayofweek', 'dayofweek'),
        ('start_month', 'month'),
        ('start_hour', 'hour'),
    )
    for feature_name, component in components:
        df[feature_name] = getattr(start_times, component)

    return df


def calculate_historical_features(df):
    """
    Calculate historical statistics for each vessel.

    Adds, per event:
    - ``time_since_last_loiter``: time since the same vessel's previous event
      start (NaT for a vessel's first event)
    - ``port_loiter_freq_<W>`` / ``deepsea_loiter_freq_<W>`` for W in 30D,
      90D, 365D: number of the vessel's port-adjacent (loitering_type == 1)
      and deep-sea (loitering_type == 0) events whose start falls in the
      time window ending at this event's start. The current event is
      included in its own count.

    Returns a new DataFrame sorted by vessel_id and event_start with a fresh
    0..n-1 index; the input is not modified.
    """
    # Sort first and only then renumber the index: the rolling results below
    # come back in this same vessel/time order and are written by position,
    # so the row order of df must match it exactly
    df = df.sort_values(['vessel_id', 'event_start']).reset_index(drop=True)

    # Time since last loitering event
    df['time_since_last_loiter'] = df.groupby('vessel_id')['event_start'].diff()

    # Indicator frame indexed by event start, as required for time-based
    # rolling windows
    indicators = pd.DataFrame(
        {
            'vessel_id': df['vessel_id'].to_numpy(),
            'is_port_loiter': (df['loitering_type'] == 1).astype(int).to_numpy(),
            'is_deepsea_loiter': (df['loitering_type'] == 0).astype(int).to_numpy(),
        },
        index=pd.DatetimeIndex(df['event_start']),
    )
    per_vessel = indicators.groupby('vessel_id')[['is_port_loiter', 'is_deepsea_loiter']]

    # Historical frequencies
    for window in ['30D', '90D', '365D']:
        window_counts = per_vessel.rolling(window).sum()
        df[f'port_loiter_freq_{window}'] = window_counts['is_port_loiter'].to_numpy()
        df[f'deepsea_loiter_freq_{window}'] = window_counts['is_deepsea_loiter'].to_numpy()

    return df


def calculate_averages_and_stats(df):
    """
    Calculate vessel-level averages and statistics.

    Adds these columns to ``df`` in place and returns it; every row of a
    vessel carries that vessel's value:
    - ``avg_distance_travelled``: mean origin-to-destination port distance
    - ``avg_port_loiter_duration``: mean loitering hours over the vessel's
      port-adjacent events (loitering_type == 1)
    - ``avg_deepsea_loiter_duration``: mean loitering hours over the vessel's
      deep-sea events (loitering_type == 0)
    - ``speed_variability``: standard deviation of the median speed

    A per-type average is NaN for vessels with no event of that type, and
    speed_variability is NaN for vessels with a single event.
    """
    # Average distance between ports
    df['avg_distance_travelled'] = (
        df.groupby('vessel_id')['origin_destination_distance'].transform('mean')
    )

    # Average loitering durations by type. loitering_type is encoded as
    # 1 (port-adjacent) / 0 (deep-sea). The per-vessel mean of each subset is
    # mapped back through vessel_id so that all of a vessel's rows receive
    # it, not only the rows of the matching type.
    duration_columns = (
        ('avg_port_loiter_duration', 1),
        ('avg_deepsea_loiter_duration', 0),
    )
    for column, type_code in duration_columns:
        mean_hours_by_vessel = (
            df.loc[df['loitering_type'] == type_code]
            .groupby('vessel_id')['event_info_loitering_hours']
            .mean()
        )
        df[column] = df['vessel_id'].map(mean_hours_by_vessel)

    # Speed variability
    df['speed_variability'] = (
        df.groupby('vessel_id')['event_info_median_speed_knots'].transform('std')
    )

    return df


def calculate_port_congestion(df):
    """
    Calculate daily port congestion.

    Adds ``event_date`` (calendar date of event_start) to ``df`` and returns
    a merged DataFrame with ``port_congestion``: the number of port-adjacent
    loitering events (loitering_type == 1) that started on that date, 0 when
    there were none.
    """
    df['event_date'] = df['event_start'].dt.date

    # loitering_type is encoded as 1 (port-adjacent) / 0 (deep-sea)
    port_congestion = (
        df[df['loitering_type'] == 1]
        .groupby(['event_date'])
        .size()
        .reset_index(name='port_congestion')
    )

    df = df.merge(port_congestion, on='event_date', how='left')

    # Dates without any port-adjacent event have no match in the left join;
    # their count is zero rather than missing
    df['port_congestion'] = df['port_congestion'].fillna(0).astype(int)
    return df


def process_features(loitering_df, port_locations_df):
    """
    Run every feature engineering step on the cleaned loitering events.

    Works on a copy of ``loitering_df``. Timezone-naive event timestamps are
    localized to UTC, port attributes are joined in, and the remaining steps
    are applied in order.
    """
    features = loitering_df.copy()

    # Ensure datetime columns are in UTC (only naive columns are localized)
    for timestamp_col in ('event_start', 'event_end'):
        if features[timestamp_col].dt.tz is None:
            features[timestamp_col] = features[timestamp_col].dt.tz_localize('UTC')

    # Port locations are the only step that needs the second table
    features = add_port_location_features(features, port_locations_df)

    # Remaining steps each take and return the feature frame; order matters
    # because later steps read columns created by earlier ones
    steps = (
        create_target_variable,
        classify_loitering_location,
        calculate_distances,
        extract_temporal_features,
        calculate_historical_features,
        calculate_averages_and_stats,
        calculate_port_congestion,
    )
    for step in steps:
        features = step(features)

    return features

# Build the feature-engineered loitering dataset from the cleaned events and the port locations
loitering_w_features_df = process_features(loitering_df, port_locations_df)

### Data Loading - Weather
Weather data is pulled after the loitering feature engineering so that we only request the dates and locations we actually need.

In [None]:
# Get unique date and port location combinations
unique_date_ports = loitering_w_features_df[
    ['event_date', 'destination_port_lat', 'destination_port_lon']
].drop_duplicates()

# Make sample for testing. The size is capped at the number of available
# combinations because DataFrame.sample raises ValueError when asked for
# more rows than exist (without replacement).
unique_date_ports = unique_date_ports.sample(n=min(5, len(unique_date_ports)))

# Convert event_date to the required format for the API
unique_date_ports["event_date"] = pd.to_datetime(unique_date_ports["event_date"]).dt.strftime('%Y-%m-%d')

# WeatherAPI key, read from the WEATHERAPI_KEY environment variable so the
# secret is not stored in the notebook. Falls back to an empty string when
# the variable is unset, in which case the API requests will be rejected.
import os

API_KEY = os.environ.get("WEATHERAPI_KEY", "")

# Base URL for the WeatherAPI History API. HTTPS is used because the key is
# sent as a query parameter.
BASE_URL = "https://api.weatherapi.com/v1/history.json"

# Function to fetch weather data
def fetch_weather_data(lat, lon, date, api_key):
    """
    Fetch weather history for one location and day from the WeatherAPI History API.

    Parameters:
    lat: Latitude, sent together with lon as the "lat,lon" query
    lon: Longitude
    date (str): Day to fetch, sent as the ``dt`` parameter
    api_key (str): WeatherAPI key

    Returns:
    dict or None: Parsed JSON response, or None when the request fails, the
        server answers with an error status, or the body is not valid JSON
        (a message is printed in those cases)
    """
    try:
        # Let requests build and URL-encode the query string; the timeout
        # keeps an unresponsive server from blocking the run indefinitely
        response = requests.get(
            BASE_URL,
            params={"key": api_key, "q": f"{lat},{lon}", "dt": date},
            timeout=30,
        )
        response.raise_for_status()

        # Parse the JSON response
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: report the failure and let the caller skip this entry
        print(f"Error fetching data for {lat}, {lon} on {date}: {e}")
        return None

# List to store the results
weather_results = []

# Fetch one daily weather summary per sampled date/port combination
for row in unique_date_ports.itertuples(index=False):
    lat = row.destination_port_lat
    lon = row.destination_port_lon
    date = row.event_date

    # Fetch weather data for the given row; failed requests are skipped
    weather_data = fetch_weather_data(lat, lon, date, API_KEY)
    if not weather_data:
        continue

    # Extract the first 'forecastday' entry, tolerating a missing or empty
    # list (indexing an empty list would raise IndexError)
    forecast_days = (weather_data.get('forecast') or {}).get('forecastday') or [{}]

    # Copy the daily summary and tag it with the request's location and date
    day_data = dict(forecast_days[0].get('day') or {})
    day_data.update(latitude=lat, longitude=lon, date=date)

    # Append the extracted data to the results
    weather_results.append(day_data)

# Select only the required columns
required_columns = [
    "avgtemp_c", "maxwind_kph", "totalprecip_mm", "totalsnow_cm", "avgvis_km",
    "daily_chance_of_rain", "daily_chance_of_snow", "latitude", "longitude", "date"
]

# Convert results into a DataFrame restricted to the required columns.
# Passing them to the constructor yields a frame with these columns even when
# no request succeeded; fields absent from a response become NaN.
weather_df = pd.DataFrame(weather_results, columns=required_columns)

### Dropping irrelevant columns
Some columns that were needed for data processing can be dropped, such as: ID/name columns, lat/lon columns

In [None]:
# Remove the identifier, timestamp and coordinate columns that were only
# needed while building features.
# NOTE(review): loitering_weather_df is not defined in the cells visible
# here; presumably it is the loitering features joined with the weather
# data. Confirm that the join exists before this cell runs.
final_df = loitering_weather_df.drop(
    columns=[
        # event and vessel identifiers, raw timestamps, event position
        'event_id', 'event_type', 'vessel_id', 'event_start', 'event_end',
        'lat_mean', 'lon_mean',
        # port identifiers and labels from the event info
        'event_info_origin_port.port_id', 'event_info_origin_port.label',
        'event_info_destination_port.port_id', 'event_info_destination_port.label',
        # joined port ids and coordinates
        'origin_port_port_id', 'origin_port_lat', 'origin_port_lon',
        'destination_port_port_id', 'destination_port_lat', 'destination_port_lon',
    ]
)

## Exploratory Data Analysis

## Model Development

## Model Evaluation

# EDA Checks

In [None]:
# Events closer than 50,000 m (50 km) to shore
near_port = loitering_df[loitering_df['event_info_distance_from_shore_m'] < 50000]

In [None]:
# Number of near-shore events that loitered for more than 24 hours
near_port[near_port['event_info_loitering_hours'] > 24].shape[0]
