In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.neighbors import BallTree

In [2]:
def progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Draw a single-line text progress bar; intended to be called once per loop step.

    Parameters:
    - iteration: Number of steps finished so far (Int).
    - total: Total number of steps (Int). Used as a divisor, so it must be non-zero.
    - prefix: Text shown before the bar (Str).
    - suffix: Text shown after the percentage (Str).
    - decimals: Digits printed after the decimal point of the percentage (Int).
    - length: Width of the bar in characters (Int).
    - fill: Character used for the completed part of the bar (Str).
    - printEnd: String handed to print() as `end`; the default carriage return
      makes the next call overwrite the same line (Str).
    """
    completed_share = iteration / float(total)
    percent = "{:.{places}f}".format(100 * completed_share, places=decimals)

    n_filled = int(length * iteration // total)
    bar = ''.join([fill * n_filled, '-' * (length - n_filled)])

    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)

    # Once the final step is reached, move to a fresh line so later output is not overwritten
    if iteration == total:
        print()

## LSTM

In [44]:
def add_port_distance_features(ais_train, ports_df):
    """
    Attach the great-circle distance (nautical miles) from every AIS position
    to its three nearest ports.

    Parameters:
    - ais_train: DataFrame of AIS positions with 'latitude' and 'longitude' columns.
      Values are passed through np.radians, i.e. they are read as degrees.
    - ports_df: DataFrame of ports with 'latitude' and 'longitude' columns.

    Returns:
    - ais_train: The same DataFrame object, with 'port_1', 'port_2', 'port_3'
      written into it (columns 0, 1 and 2 of the k=3 tree query).

    Raises:
    - ValueError: if either frame lacks a 'latitude' or 'longitude' column.
    """
    coord_cols = ['latitude', 'longitude']

    # Fail early with a clear message when a coordinate column is missing
    if not all(name in ais_train.columns for name in coord_cols):
        raise ValueError("ais_train must contain 'latitude' and 'longitude' columns.")
    if not all(name in ports_df.columns for name in coord_cols):
        raise ValueError("ports_df must contain 'latitude' and 'longitude' columns.")

    # The haversine metric works on [lat, lon] pairs expressed in radians
    ports_rad = np.radians(ports_df[coord_cols].values)
    tree = BallTree(ports_rad, metric='haversine')

    positions_rad = np.radians(ais_train[coord_cols].values)
    nearest_rad, _ = tree.query(positions_rad, k=3)

    # Angular distance -> km on a sphere of radius 6371 km -> nautical miles (1 NM = 1.852 km)
    nearest_nm = nearest_rad * 6371.0 / 1.852

    for rank, column in enumerate(('port_1', 'port_2', 'port_3')):
        ais_train[column] = nearest_nm[:, rank]

    return ais_train


In [47]:
def preprocess_data(ais_train, vessels, ports_df, seq_len=30):
    """
    Turn raw AIS rows into a numeric, per-vessel time-ordered feature table.

    Steps: drop short vessel histories, parse and sort timestamps, derive the
    time to the next observation and calendar features, integer-encode the
    vessel id, join static vessel attributes, fill gaps, and append distances
    to the three nearest ports.

    Parameters:
    - ais_train: DataFrame containing AIS data. Must provide 'vesselId', 'time',
      'latitude', 'longitude' and the columns dropped at the end
      ('rot', 'heading', 'etaRaw', 'portId').
    - vessels: DataFrame containing vessel data with 'vesselId', 'CEU',
      'length' and 'vesselType'.
    - ports_df: DataFrame containing port data with 'latitude' and 'longitude'.
    - seq_len: Vessels are kept only when they have strictly more than
      seq_len observations.

    Returns:
    - ais_train: Preprocessed DataFrame with added features.
    - vesselId_dict: Dictionary mapping vesselId to unique integer identifiers.
    """
    # Keep only vessels with strictly more than seq_len observations
    ais_train = ais_train.groupby('vesselId').filter(lambda x: len(x) > seq_len)

    # Parse timestamps BEFORE sorting: sorting the raw strings orders rows
    # lexicographically, which is only chronological for zero-padded ISO text.
    ais_train = (
        ais_train
        .assign(time=lambda frame: pd.to_datetime(frame['time']))
        .sort_values(by=['vesselId', 'time'])
    )

    # Seconds since the previous observation of the same vessel ...
    ais_train['time_delta'] = ais_train.groupby('vesselId')['time'].diff().dt.total_seconds()
    # ... shifted up one row, so each row holds the gap to the NEXT observation.
    # The last row of every vessel has no successor and is NaN here (0 after fillna below).
    ais_train['time_delta'] = ais_train.groupby('vesselId')['time_delta'].shift(-1)

    # Calendar features and seconds elapsed since 2024-01-01 00:00:00
    newyear = pd.to_datetime('2024-01-01 00:00:00')
    ais_train['time_numeric'] = (ais_train['time'] - newyear).dt.total_seconds()
    ais_train['month'] = ais_train['time'].dt.month
    ais_train['hour'] = ais_train['time'].dt.hour
    ais_train['weekday'] = ais_train['time'].dt.weekday

    # Map vesselId to unique integer identifiers (order of first appearance after sorting)
    vesselId_dict = {
        vessel_id: code for code, vessel_id in enumerate(ais_train['vesselId'].unique())
    }
    ais_train['vessel_embedding'] = ais_train['vesselId'].map(vesselId_dict)

    # Merge vessel data (on vesselId) to add CEU, length, vesselType
    ais_train = ais_train.merge(vessels[['vesselId', 'CEU', 'length', 'vesselType']], on='vesselId', how='left')

    # Unknown vessel types get their own sentinel; every other gap becomes 0
    ais_train['vesselType'] = ais_train['vesselType'].fillna(-1)
    ais_train = ais_train.fillna(0)

    # Add port distance features
    ais_train = add_port_distance_features(ais_train, ports_df)

    # Drop columns that are not needed
    ais_train = ais_train.drop(columns=['time', 'rot', 'heading', 'etaRaw', 'vesselId', 'portId'])

    return ais_train, vesselId_dict


In [48]:
# Load the raw pipe-delimited AIS, vessel and port tables
X_train = pd.read_csv('../data/ais_train.csv', sep='|')
vessels = pd.read_csv('../data/vessels.csv', sep='|')
ports = pd.read_csv('../data/ports.csv', sep='|')

# X_train is rebound from the raw frame to the preprocessed one;
# vesselId_dict maps each original vesselId to its integer code
X_train, vesselId_dict = preprocess_data(X_train, vessels, ports, seq_len=30)
X_train.tail()

Unnamed: 0,cog,sog,navstat,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,CEU,length,vesselType,port_1,port_2,port_3
1522059,324.1,13.5,0,59.63337,21.43237,1249.0,11054176.0,5,22,1,686,200,191.0,83.0,40.023794,53.714779,60.592449
1522060,324.2,13.3,0,59.69588,21.34225,1249.0,11055425.0,5,22,1,686,200,191.0,83.0,40.325926,51.671139,65.119362
1522061,356.5,12.2,0,59.76388,21.35317,1219.0,11056674.0,5,23,1,686,200,191.0,83.0,38.206025,48.004839,67.659127
1522062,52.6,17.3,0,59.83316,21.38489,1248.0,11057893.0,5,23,1,686,200,191.0,83.0,35.806334,43.996936,69.976067
1522063,53.6,17.7,0,59.89167,21.54685,0.0,11059141.0,5,23,1,686,200,191.0,83.0,30.119141,38.406226,69.515054


In [50]:
# Show the last rows of the vessel whose vessel_embedding is 322
X_train[X_train['vessel_embedding'] == 322].tail()

Unnamed: 0,cog,sog,navstat,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,CEU,length,vesselType,port_1,port_2,port_3
584572,338.3,17.8,0,-33.76641,17.73318,1212.0,10748270.0,5,9,5,322,6312,199.99,83.0,36.101636,43.296135,394.094053
584573,338.5,17.9,0,-33.67195,17.69076,1242.0,10749482.0,5,9,5,322,6312,199.99,83.0,39.838342,48.46844,396.632956
584574,340.8,18.0,0,-33.57465,17.64701,1098.0,10750724.0,5,10,5,322,6312,199.99,83.0,44.233409,53.99321,399.334418
584575,339.4,18.4,0,-33.48759,17.60807,785.0,10751822.0,5,10,5,322,6312,199.99,83.0,48.503943,59.052253,401.811811
584576,337.1,18.3,0,-33.42449,17.57977,0.0,10752607.0,5,10,5,322,6312,199.99,83.0,51.75993,62.777972,403.652108


## Add Kalman linear predictions

In [51]:
import math
from geopy.distance import distance, Point

def calculate_initial_compass_bearing(pointA, pointB):
    """
    Return the initial great-circle bearing when travelling from pointA to pointB.

    Both points are indexed as (latitude, longitude) and converted with
    math.radians, i.e. they are read as degrees. The result is a compass
    bearing in degrees wrapped onto the 0-360 range.
    """
    phi_a, phi_b = math.radians(pointA[0]), math.radians(pointB[0])
    delta_lambda = math.radians(pointB[1] - pointA[1])

    east = math.sin(delta_lambda) * math.cos(phi_b)
    north = (math.cos(phi_a) * math.sin(phi_b)
             - math.sin(phi_a) * math.cos(phi_b) * math.cos(delta_lambda))

    # atan2 gives a signed angle; add a full turn and wrap to get a compass bearing
    return (math.degrees(math.atan2(east, north)) + 360) % 360

def compute_course_and_speed(lat1, lon1, lat2, lon2, time_delta):
    """
    Derive course over ground and speed over ground for the leg from
    (lat1, lon1) to (lat2, lon2), covered in time_delta seconds.

    Returns (cog, sog): cog is the initial compass bearing in degrees, sog is
    in knots (geodesic metres / 1852 per hour). sog is NaN whenever time_delta
    is not strictly positive.
    """
    origin, target = (lat1, lon1), (lat2, lon2)

    # Geodesic leg length, converted from metres to nautical miles
    leg_nm = geodesic(origin, target).meters / 1852
    cog = calculate_initial_compass_bearing(origin, target)

    elapsed_hours = time_delta / 3600.0  # seconds -> hours
    sog = leg_nm / elapsed_hours if elapsed_hours > 0 else np.nan

    return cog, sog

def compute_destination(lat, lon, cog, sog, time_delta):
    """
    Project a position forward along a fixed course.

    Starting at (lat, lon), travel on bearing `cog` (degrees) at `sog` knots for
    `time_delta` seconds and return the resulting (latitude, longitude).
    """
    # knots -> m/s (1 knot = 0.514444 m/s), then multiply by the elapsed seconds
    travelled_m = (sog * 0.514444) * time_delta

    arrival = distance(meters=travelled_m).destination(point=Point(lat, lon), bearing=cog)

    return arrival.latitude, arrival.longitude


In [54]:
def add_features(df):
    """
    Add dead-reckoning features computed per vessel, row by row.

    Expects 'vessel_embedding', 'time_numeric', 'time_delta', 'latitude',
    'longitude', 'cog' and 'sog' columns. For every row the function adds:

    - cog_lin, sog_lin: course and speed derived from the previous and current
      position of the same vessel (NaN on a vessel's first row or when the
      time gap to the previous row is not positive).
    - latitude_lin, longitude_lin: position obtained by projecting the current
      position along cog_lin/sog_lin for 'time_delta' seconds.
    - latitude_pred, longitude_pred: the same projection using the row's own
      'cog' and 'sog' values.

    Projections are NaN when any input is NaN or 'time_delta' is not > 0.

    Returns a new DataFrame sorted by ('vessel_embedding', 'time_numeric') with
    a fresh 0..n-1 index and the six columns above. Prints a progress bar that
    advances once per vessel.
    """
    # Ensure the DataFrame is sorted by vessel and time
    df = df.sort_values(['vessel_embedding', 'time_numeric']).reset_index(drop=True)

    # Initialize lists to store the new features
    # (filled vessel by vessel; they are assigned positionally to df at the end,
    #  so their order must match the row order of the sorted frame)
    cog_lin_list = []
    sog_lin_list = []
    latitude_lin_list = []
    longitude_lin_list = []
    latitude_pred_list = []
    longitude_pred_list = []

    # Group by vessel_id
    grouped = df.groupby('vessel_embedding')

    # Number of vessels, used as the progress bar total
    n_iter = len(grouped)
    k = 0

    # For each group
    for vessel_id, group in grouped:
        # Re-index to 0..n-1 so that group.loc[i, ...] below addresses the i-th row
        group = group.sort_values('time_numeric').reset_index(drop=True)
        n = len(group)

        # Initialize lists for this group
        cog_lin = []
        sog_lin = []
        latitude_lin = []
        longitude_lin = []
        latitude_pred = []
        longitude_pred = []

        # Compute time_delta between previous and current observation
        time_deltas = group['time_numeric'].diff()
        time_deltas.iloc[0] = 0  # First entry

        for i in range(n):
            # Get time_delta to next observation from 'time_delta' column
            time_delta_next_i = group.loc[i, 'time_delta']

            if i == 0:
                # First row, no previous point
                cog_lin.append(np.nan)
                sog_lin.append(np.nan)
            else:
                # Compute cog_lin and sog_lin using previous and current lat/lon
                lat1 = group.loc[i-1, 'latitude']
                lon1 = group.loc[i-1, 'longitude']
                lat2 = group.loc[i, 'latitude']
                lon2 = group.loc[i, 'longitude']
                time_delta = time_deltas.iloc[i]

                # A non-positive gap (e.g. duplicate timestamps) gives no usable speed
                if time_delta > 0:
                    cog_lin_i, sog_lin_i = compute_course_and_speed(lat1, lon1, lat2, lon2, time_delta)
                else:
                    cog_lin_i, sog_lin_i = (np.nan, np.nan)
                cog_lin.append(cog_lin_i)
                sog_lin.append(sog_lin_i)

            # Using cog_lin and sog_lin to predict next position
            if (not np.isnan(cog_lin[i]) and not np.isnan(sog_lin[i]) and
                not np.isnan(time_delta_next_i) and time_delta_next_i > 0):
                lat_lin, lon_lin = compute_destination(group.loc[i, 'latitude'], group.loc[i, 'longitude'],
                                                       cog_lin[i], sog_lin[i], time_delta_next_i)
            else:
                lat_lin, lon_lin = (np.nan, np.nan)
            latitude_lin.append(lat_lin)
            longitude_lin.append(lon_lin)

            # Using current measured cog, sog to predict next position
            if (not np.isnan(group.loc[i, 'cog']) and not np.isnan(group.loc[i, 'sog']) and
                not np.isnan(time_delta_next_i) and time_delta_next_i > 0):
                lat_pred, lon_pred = compute_destination(group.loc[i, 'latitude'], group.loc[i, 'longitude'],
                                                         group.loc[i, 'cog'], group.loc[i, 'sog'], time_delta_next_i)
            else:
                lat_pred, lon_pred = (np.nan, np.nan)
            latitude_pred.append(lat_pred)
            longitude_pred.append(lon_pred)

        # Append to the main lists
        cog_lin_list.extend(cog_lin)
        sog_lin_list.extend(sog_lin)
        latitude_lin_list.extend(latitude_lin)
        longitude_lin_list.extend(longitude_lin)
        latitude_pred_list.extend(latitude_pred)
        longitude_pred_list.extend(longitude_pred)

        # One progress step per vessel processed
        k += 1
        progress_bar(k, n_iter, prefix='Progress:', suffix='Complete', length=50)

    # Add new columns to the DataFrame
    df['cog_lin'] = cog_lin_list
    df['sog_lin'] = sog_lin_list
    df['latitude_lin'] = latitude_lin_list
    df['longitude_lin'] = longitude_lin_list
    df['latitude_pred'] = latitude_pred_list
    df['longitude_pred'] = longitude_pred_list

    return df

In [35]:
def create_visual_df(df):
    """
    Stack the observed track and the two predicted tracks into one long frame
    suitable for plotting.

    Parameters:
    - df: DataFrame with 'vessel_embedding', 'time_numeric', 'latitude',
      'longitude', 'latitude_pred', 'longitude_pred', 'latitude_lin' and
      'longitude_lin'. Rows of each vessel must already be in time order.

    Returns:
    - df_vis: DataFrame with columns 'vesselId', 'latitude', 'longitude', 'time'.
      'vesselId' is the vessel_embedding as text plus a '_true', '_pred' or
      '_lin' suffix. Predicted rows carry the time of the vessel's NEXT
      observation (the moment the prediction refers to); the last row of each
      vessel has no next observation, so its predicted time is NaN.
    """
    def _track(lat_col, lon_col, tag, refers_to_next_step):
        # Build one track in the common (vesselId, latitude, longitude, time) layout
        track = df[['vessel_embedding', lat_col, lon_col, 'time_numeric']].rename(columns={
            'vessel_embedding': 'vesselId',
            'time_numeric': 'time',
            lat_col: 'latitude',
            lon_col: 'longitude',
        })
        if refers_to_next_step:
            # Shift within each vessel: a frame-wide shift would hand the last
            # row of one vessel the first timestamp of the following vessel.
            track['time'] = track.groupby('vesselId')['time'].shift(-1)
        track['vesselId'] = track['vesselId'].astype(str) + tag
        return track

    df_true = _track('latitude', 'longitude', '_true', False)
    df_pred = _track('latitude_pred', 'longitude_pred', '_pred', True)
    df_lin = _track('latitude_lin', 'longitude_lin', '_lin', True)

    # Stack the DataFrames on top of each other
    df_vis = pd.concat([df_true, df_pred, df_lin], axis=0)

    return df_vis

In [55]:
# Append the cog_lin/sog_lin and projected-position columns; add_features loops
# over every row in Python, so expect this cell to take a while on the full data
X_train = add_features(X_train)
X_train.head()

Progress: |██████████████████████████████████████████████████| 100.0% Complete


Unnamed: 0,cog,sog,navstat,latitude,longitude,time_delta,time_numeric,month,hour,weekday,...,vesselType,port_1,port_2,port_3,cog_lin,sog_lin,latitude_lin,longitude_lin,latitude_pred,longitude_pred
0,308.1,17.1,0,7.50361,77.5834,1393.0,1001267.0,1,14,4,...,83.0,83.495695,139.033483,167.576546,,,,,7.571971,77.496016
1,307.6,17.3,0,7.57302,77.49505,1583.0,1002660.0,1,14,4,...,83.0,82.258218,145.154128,161.45444,308.401713,17.316942,7.652217,77.394885,7.650734,77.393884
2,306.8,16.9,0,7.65043,77.39404,1285.0,1004243.0,1,14,4,...,83.0,81.592756,152.179054,154.619766,307.716894,17.258008,7.71353,77.312235,7.710934,77.312951
3,307.9,16.9,0,7.71275,77.31394,1259.0,1005528.0,1,15,4,...,83.0,81.520837,149.201198,157.802419,308.139833,16.953187,7.774059,77.235646,7.773541,77.235636
4,307.0,16.3,0,7.77191,77.23585,901.0,1006787.0,1,15,4,...,83.0,81.951135,144.055268,163.284982,307.405233,16.701365,7.814426,77.180095,7.813019,77.181142


In [56]:
# Persist the processed frame (without the index) so it can be reloaded from disk later
X_train.to_csv('../data/xgb_linear/ais_train_processed.csv', index=False)

## Post-Kalman data preparation

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [4]:
def process_navstat(df):
    """
    Collapse the 'navstat' column into three codes, in place:
    - 8 becomes 0 and 5 becomes 1 (existing 0 and 1 values are kept as they are).
    - Every value that is neither 0 nor 1 after that step becomes 15.

    Returns the same DataFrame object that was passed in.
    """
    merged = df['navstat'].replace({8: 0, 5: 1})
    # Anything outside the two retained codes is lumped into 15
    df['navstat'] = merged.where(merged.isin([0, 1]), 15)
    return df


In [4]:
def create_lag_features(df, lagged_vars, n_lag, group_col='vessel_embedding'):
    """
    Add per-group lagged copies of the given columns to df, in place.

    For every variable in lagged_vars and every lag 1..n_lag a column named
    '<var>_lag<lag>' is written, holding the value from `lag` rows earlier
    within the same group_col group (NaN where no such row exists).

    Parameters:
    - df: DataFrame containing the data; rows of each group should already be in time order.
    - lagged_vars: List of variable names to create lag features for.
    - n_lag: Number of lag steps to create.
    - group_col: Column name to group by.

    Returns:
    - df: The same DataFrame object, with the lag columns added.
    """
    pairs = [(var, lag) for var in lagged_vars for lag in range(1, n_lag + 1)]
    for var, lag in pairs:
        shifted = df.groupby(group_col)[var].shift(periods=lag)
        df[f'{var}_lag{lag}'] = shifted
    return df


In [5]:
def adjust_features_for_prediction_time(df, time_vars, group_col='vessel_embedding'):
    """
    Replace each time feature with the value from the NEXT row of the same group,
    so the feature describes the moment being predicted rather than the current row.

    Parameters:
    - df: DataFrame containing the data; modified in place.
    - time_vars: List of time-related feature names to adjust (e.g., ['month', 'hour', 'weekday']).
    - group_col: Column name to group by.

    Returns:
    - df: The same DataFrame object. The last row of every group has no
      successor and therefore becomes NaN in the adjusted columns.
    """
    for feature in time_vars:
        next_values = df.groupby(group_col)[feature].shift(periods=-1)
        df[feature] = next_values
    return df

In [6]:
def create_target_variables(df, target_vars, group_col='vessel_embedding'):
    """
    Add '<var>_next' columns holding the following row's value within each group.

    Parameters:
    - df: DataFrame containing the data; modified in place.
    - target_vars: List of variable names to build targets from (e.g., ['latitude', 'longitude']).
    - group_col: Column name to group by.

    Returns:
    - df: The same DataFrame object. The last row of every group gets NaN
      targets because it has no following row.
    """
    for var in target_vars:
        upcoming = df.groupby(group_col)[var].shift(periods=-1)
        df[f'{var}_next'] = upcoming
    return df

In [7]:
def encode_categorical_variables(df, categorical_cols):
    """
    Encode categorical variables using Label Encoding.

    Each listed column is cast to str and replaced, in df itself, by the
    integer codes produced by a freshly fitted LabelEncoder. Because of the
    str cast, missing values are encoded as their own class ('nan').

    Parameters:
    - df: DataFrame containing the data; modified in place.
    - categorical_cols: List of categorical column names to encode.

    Returns:
    - df: DataFrame with encoded categorical variables.
    - label_encoders: Dictionary of LabelEncoders used for each column.
    """
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        # Assign by column name so the column is replaced and takes the integer
        # dtype of the codes. `df.loc[:, col] = codes` writes into the existing
        # column instead, which leaves a string (object) column as object dtype.
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le
    return df, label_encoders

In [8]:
def prepare_features_and_targets(df, lagged_vars, n_lag):
    """
    Select the model inputs and outputs from a fully engineered frame.

    Features are every '<var>_lag<k>' column (var in lagged_vars, k in 1..n_lag,
    grouped by variable) followed by the fixed columns 'month', 'hour',
    'weekday', 'vessel_embedding', 'CEU', 'length' and 'vesselType'.
    Targets are 'latitude_next', 'longitude_next', 'sog_next' and 'cog_next'.

    Parameters:
    - df: DataFrame containing the data.
    - lagged_vars: List of variable names for which lag features were created.
    - n_lag: Number of lag steps used.

    Returns:
    - X: Feature matrix.
    - y: Target frame.
    - feature_cols: List of feature column names.
    - target_cols: List of target column names.
    """
    lag_names = []
    for var in lagged_vars:
        for lag in range(1, n_lag + 1):
            lag_names.append(f'{var}_lag{lag}')

    static_names = ['month', 'hour', 'weekday', 'vessel_embedding', 'CEU', 'length', 'vesselType']
    feature_cols = lag_names + static_names
    target_cols = ['latitude_next', 'longitude_next', 'sog_next', 'cog_next']

    return df[feature_cols], df[target_cols], feature_cols, target_cols


In [10]:
def split_data(X, y, df, split_ratio=0.9):
    """
    Chronological train/test split performed separately for every vessel.

    Within each 'vessel_embedding' group the rows are ordered by 'time_numeric';
    the first int(split_ratio * n) rows go to training and the rest to testing.
    Rows are selected from X and y by index label, so X, y and df must share
    the same index.

    Parameters:
    - X: Feature matrix.
    - y: Target frame.
    - df: DataFrame containing 'vessel_embedding' and 'time_numeric'.
    - split_ratio: Share of each vessel's rows used for training (default is 0.9).

    Returns:
    - X_train, X_test, y_train, y_test: Split data.
    """
    train_labels, test_labels = [], []

    for _, vessel_rows in df.groupby('vessel_embedding'):
        ordered = vessel_rows.sort_values(by='time_numeric').index
        cut = int(split_ratio * len(ordered))

        # Earlier observations train the model, later ones evaluate it
        train_labels.extend(ordered[:cut])
        test_labels.extend(ordered[cut:])

    return X.loc[train_labels], X.loc[test_labels], y.loc[train_labels], y.loc[test_labels]


In [11]:
def split_reg_clf_test(X_train, y_train, X_test, y_test, target_vars_reg, target_vars_clf):
    """
    Produce separate regression and classification data sets from one split.

    The feature matrices are copied unchanged for both tasks; only the target
    frames differ. Target names are given without suffix and looked up as
    '<name>_next' in y_train / y_test.

    Parameters:
    - X_train: Training feature matrix.
    - y_train: Training target frame.
    - X_test: Testing feature matrix.
    - y_test: Testing target frame.
    - target_vars_reg: List of target variables for regression.
    - target_vars_clf: List of target variables for classification.

    Returns:
    - X_train_reg, y_train_reg, X_test_reg, y_test_reg: Regression data.
    - X_train_clf, y_train_clf, X_test_clf, y_test_clf: Classification data.
    """
    def _task_split(base_names):
        # Append the '_next' suffix used by the target columns, then copy everything
        columns = [f'{var}_next' for var in base_names]
        return (
            X_train.copy(),
            y_train[columns].copy(),
            X_test.copy(),
            y_test[columns].copy(),
        )

    regression_part = _task_split(target_vars_reg)
    classification_part = _task_split(target_vars_clf)

    return regression_part + classification_part

In [12]:
def train_xgboost_model(X_train, y_train, X_val=None, y_val=None):
    """
    Fit an XGBoost regressor with fixed hyper-parameters.

    Parameters:
    - X_train: Training feature matrix.
    - y_train: Training target frame.
    - X_val, y_val: Optional evaluation set reported during fitting. When X_val
      is None, the last 10% of the training rows are used instead; note that
      these rows remain part of the training data, so the reported metric is
      then not an out-of-sample score.

    Returns:
    - xgb_regressor: Trained XGBoost regressor model.
    """
    hyper_params = dict(
        objective='reg:squarederror',
        n_estimators=50,
        max_depth=9,
        learning_rate=0.1,
        n_jobs=-1,
        tree_method='hist',
    )
    xgb_regressor = xgb.XGBRegressor(**hyper_params)

    # Fall back to the tail of the training data when no evaluation set is supplied
    if X_val is None:
        tail_rows = int(0.1 * len(X_train))
        X_val, y_val = X_train.iloc[-tail_rows:], y_train.iloc[-tail_rows:]

    xgb_regressor.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=1)
    return xgb_regressor


In [13]:
def evaluate_model_reg(model, X_test, y_test):
    """
    Print the RMSE of the model's latitude and longitude predictions.

    Parameters:
    - model: Trained model; its predict() output is indexed as a 2-D array
      whose column 0 is compared with 'latitude_next' and column 1 with
      'longitude_next'.
    - X_test: Testing feature matrix.
    - y_test: Testing target frame containing 'latitude_next' and 'longitude_next'.
    """
    predictions = model.predict(X_test)

    def _rmse(target_col, pred_idx):
        # Root of the mean squared error for one output column
        return np.sqrt(mean_squared_error(y_test[target_col], predictions[:, pred_idx]))

    rmse_lat = _rmse('latitude_next', 0)
    rmse_lon = _rmse('longitude_next', 1)

    print(f'RMSE Latitude: {rmse_lat:.5f}')
    print(f'RMSE Longitude: {rmse_lon:.5f}')


In [14]:
# Reload the processed AIS frame from the CSV under ../data/xgb_linear
df = pd.read_csv('../data/xgb_linear/ais_train_processed.csv')
df

Unnamed: 0,cog,sog,navstat,latitude,longitude,time_delta,time_numeric,month,hour,weekday,...,vesselType,port_1,port_2,port_3,cog_lin,sog_lin,latitude_lin,longitude_lin,latitude_pred,longitude_pred
0,308.1,17.1,0,7.50361,77.58340,1393.0,1001267.0,1,14,4,...,83.0,83.495695,139.033483,167.576546,,,,,7.571971,77.496016
1,307.6,17.3,0,7.57302,77.49505,1583.0,1002660.0,1,14,4,...,83.0,82.258218,145.154128,161.454440,308.401713,17.316942,7.652217,77.394885,7.650734,77.393884
2,306.8,16.9,0,7.65043,77.39404,1285.0,1004243.0,1,14,4,...,83.0,81.592756,152.179054,154.619766,307.716894,17.258008,7.713530,77.312235,7.710934,77.312951
3,307.9,16.9,0,7.71275,77.31394,1259.0,1005528.0,1,15,4,...,83.0,81.520837,149.201198,157.802419,308.139833,16.953187,7.774059,77.235646,7.773541,77.235636
4,307.0,16.3,0,7.77191,77.23585,901.0,1006787.0,1,15,4,...,83.0,81.951135,144.055268,163.284982,307.405233,16.701365,7.814426,77.180095,7.813019,77.181142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522059,324.1,13.5,0,59.63337,21.43237,1249.0,11054176.0,5,22,1,...,83.0,40.023794,53.714779,60.592449,315.690959,13.521431,59.689131,21.324618,59.696410,21.342043
1522060,324.2,13.3,0,59.69588,21.34225,1249.0,11055425.0,5,22,1,...,83.0,40.325926,51.671139,65.119362,323.979005,13.414995,59.758427,21.252063,59.758065,21.253312
1522061,356.5,12.2,0,59.76388,21.35317,1219.0,11056674.0,5,23,1,...,83.0,38.206025,48.004839,67.659127,4.623223,11.828951,59.830247,21.363831,59.832424,21.344842
1522062,52.6,17.3,0,59.83316,21.38489,1248.0,11057893.0,5,23,1,...,83.0,35.806334,43.996936,69.976067,12.956417,12.631257,59.904093,21.417381,59.893618,21.542513


In [15]:
# Remove navstat from the frame. NOTE: re-running this cell on the same df
# raises KeyError because the column is already gone.
df = df.drop(columns=['navstat'])
df

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,vesselType,port_1,port_2,port_3,cog_lin,sog_lin,latitude_lin,longitude_lin,latitude_pred,longitude_pred
0,308.1,17.1,7.50361,77.58340,1393.0,1001267.0,1,14,4,0,...,83.0,83.495695,139.033483,167.576546,,,,,7.571971,77.496016
1,307.6,17.3,7.57302,77.49505,1583.0,1002660.0,1,14,4,0,...,83.0,82.258218,145.154128,161.454440,308.401713,17.316942,7.652217,77.394885,7.650734,77.393884
2,306.8,16.9,7.65043,77.39404,1285.0,1004243.0,1,14,4,0,...,83.0,81.592756,152.179054,154.619766,307.716894,17.258008,7.713530,77.312235,7.710934,77.312951
3,307.9,16.9,7.71275,77.31394,1259.0,1005528.0,1,15,4,0,...,83.0,81.520837,149.201198,157.802419,308.139833,16.953187,7.774059,77.235646,7.773541,77.235636
4,307.0,16.3,7.77191,77.23585,901.0,1006787.0,1,15,4,0,...,83.0,81.951135,144.055268,163.284982,307.405233,16.701365,7.814426,77.180095,7.813019,77.181142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522059,324.1,13.5,59.63337,21.43237,1249.0,11054176.0,5,22,1,686,...,83.0,40.023794,53.714779,60.592449,315.690959,13.521431,59.689131,21.324618,59.696410,21.342043
1522060,324.2,13.3,59.69588,21.34225,1249.0,11055425.0,5,22,1,686,...,83.0,40.325926,51.671139,65.119362,323.979005,13.414995,59.758427,21.252063,59.758065,21.253312
1522061,356.5,12.2,59.76388,21.35317,1219.0,11056674.0,5,23,1,686,...,83.0,38.206025,48.004839,67.659127,4.623223,11.828951,59.830247,21.363831,59.832424,21.344842
1522062,52.6,17.3,59.83316,21.38489,1248.0,11057893.0,5,23,1,686,...,83.0,35.806334,43.996936,69.976067,12.956417,12.631257,59.904093,21.417381,59.893618,21.542513


In [16]:
from warnings import simplefilter

# Silence pandas' PerformanceWarning from here on (presumably raised while the
# lag columns are added below -- TODO confirm against create_lag_features).
simplefilter("ignore", pd.errors.PerformanceWarning)

# Number of past steps kept per variable; adjust as needed.
n_lag = 30

# Variables that get `<name>_lag1 .. <name>_lag{n_lag}` columns.
# The order is kept as-is because it determines the order of the lag columns.
lagged_vars = [
    'cog',
    'sog',
    'latitude',
    'longitude',
    'time_delta',
    'port_1',
    'port_2',
    'port_3',
    'cog_lin',
    'sog_lin',
    'latitude_lin',
    'longitude_lin',
    'latitude_pred',
    'longitude_pred',
]

df = create_lag_features(df, lagged_vars, n_lag)
df

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,longitude_pred_lag21,longitude_pred_lag22,longitude_pred_lag23,longitude_pred_lag24,longitude_pred_lag25,longitude_pred_lag26,longitude_pred_lag27,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30
0,308.1,17.1,7.50361,77.58340,1393.0,1001267.0,1,14,4,0,...,,,,,,,,,,
1,307.6,17.3,7.57302,77.49505,1583.0,1002660.0,1,14,4,0,...,,,,,,,,,,
2,306.8,16.9,7.65043,77.39404,1285.0,1004243.0,1,14,4,0,...,,,,,,,,,,
3,307.9,16.9,7.71275,77.31394,1259.0,1005528.0,1,15,4,0,...,,,,,,,,,,
4,307.0,16.3,7.77191,77.23585,901.0,1006787.0,1,15,4,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522059,324.1,13.5,59.63337,21.43237,1249.0,11054176.0,5,22,1,686,...,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746,24.07746,24.07743,24.07747
1522060,324.2,13.3,59.69588,21.34225,1249.0,11055425.0,5,22,1,686,...,24.07749,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746,24.07746,24.07743
1522061,356.5,12.2,59.76388,21.35317,1219.0,11056674.0,5,23,1,686,...,24.07748,24.07749,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746,24.07746
1522062,52.6,17.3,59.83316,21.38489,1248.0,11057893.0,5,23,1,686,...,24.07749,24.07748,24.07749,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746


In [17]:
# Calendar columns handed to adjust_features_for_prediction_time.
# Judging by the recorded output, each row ends up with the calendar values of
# the following observation, and the last row of a vessel becomes NaN
# (see the vessel 322 check below) -- TODO confirm against the helper.
time_vars = ['month', 'hour', 'weekday']
df = adjust_features_for_prediction_time(df, time_vars)
df

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,longitude_pred_lag21,longitude_pred_lag22,longitude_pred_lag23,longitude_pred_lag24,longitude_pred_lag25,longitude_pred_lag26,longitude_pred_lag27,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30
0,308.1,17.1,7.50361,77.58340,1393.0,1001267.0,1.0,14.0,4.0,0,...,,,,,,,,,,
1,307.6,17.3,7.57302,77.49505,1583.0,1002660.0,1.0,14.0,4.0,0,...,,,,,,,,,,
2,306.8,16.9,7.65043,77.39404,1285.0,1004243.0,1.0,15.0,4.0,0,...,,,,,,,,,,
3,307.9,16.9,7.71275,77.31394,1259.0,1005528.0,1.0,15.0,4.0,0,...,,,,,,,,,,
4,307.0,16.3,7.77191,77.23585,901.0,1006787.0,1.0,15.0,4.0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522059,324.1,13.5,59.63337,21.43237,1249.0,11054176.0,5.0,22.0,1.0,686,...,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746,24.07746,24.07743,24.07747
1522060,324.2,13.3,59.69588,21.34225,1249.0,11055425.0,5.0,23.0,1.0,686,...,24.07749,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746,24.07746,24.07743
1522061,356.5,12.2,59.76388,21.35317,1219.0,11056674.0,5.0,23.0,1.0,686,...,24.07748,24.07749,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746,24.07746
1522062,52.6,17.3,59.83316,21.38489,1248.0,11057893.0,5.0,23.0,1.0,686,...,24.07749,24.07748,24.07749,24.07744,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746


In [20]:
# Spot check: final rows of one vessel (embedding 322) after the time-feature adjustment.
df.loc[df['vessel_embedding'] == 322].tail()

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,longitude_pred_lag21,longitude_pred_lag22,longitude_pred_lag23,longitude_pred_lag24,longitude_pred_lag25,longitude_pred_lag26,longitude_pred_lag27,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30
584572,338.3,17.8,-33.76641,17.73318,1212.0,10748270.0,5.0,9.0,5.0,322,...,31.169932,31.19451,31.085519,22.431346,31.056427,31.091184,31.059003,31.11318,31.186823,31.208485
584573,338.5,17.9,-33.67195,17.69076,1242.0,10749482.0,5.0,10.0,5.0,322,...,27.011743,31.169932,31.19451,31.085519,22.431346,31.056427,31.091184,31.059003,31.11318,31.186823
584574,340.8,18.0,-33.57465,17.64701,1098.0,10750724.0,5.0,10.0,5.0,322,...,19.095128,27.011743,31.169932,31.19451,31.085519,22.431346,31.056427,31.091184,31.059003,31.11318
584575,339.4,18.4,-33.48759,17.60807,785.0,10751822.0,5.0,10.0,5.0,322,...,19.010964,19.095128,27.011743,31.169932,31.19451,31.085519,22.431346,31.056427,31.091184,31.059003
584576,337.1,18.3,-33.42449,17.57977,0.0,10752607.0,,,,322,...,18.908099,19.010964,19.095128,27.011743,31.169932,31.19451,31.085519,22.431346,31.056427,31.091184


In [21]:
# Store the last observed row of every vessel in a dict keyed by vessel_embedding.
#
# keep='last' selects the final occurrence of each embedding, which is the row the
# previous neighbour-by-neighbour scan ended up storing for that vessel.
# All access below is positional (.iloc / NumPy), so the cell no longer depends on df
# having a default 0..n-1 index (the old `df['vessel_embedding'][i]` lookup was
# label-based and fails once rows have been dropped), and it avoids a Python-level
# loop over every row of df.
last_rows = df.drop_duplicates(subset='vessel_embedding', keep='last')
embedding_keys = last_rows['vessel_embedding'].to_numpy()
last_obs_dict = {
    embedding_keys[pos]: last_rows.iloc[pos]
    for pos in range(len(last_rows))
}

# store the dict with the last row of each vessel in a pickle file
with open('../data/xgb_linear/last_obs.pkl', 'wb') as f:
    pickle.dump(last_obs_dict, f)

In [18]:
# Variables to predict. The call adds one `<var>_next` column per entry; in the
# recorded output each `<var>_next` equals the value of `<var>` on the following row.
target_vars = ['latitude', 'longitude', 'cog', 'sog']
df = create_target_variables(df, target_vars)
df

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,longitude_pred_lag25,longitude_pred_lag26,longitude_pred_lag27,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30,latitude_next,longitude_next,cog_next,sog_next
0,308.1,17.1,7.50361,77.58340,1393.0,1001267.0,1.0,14.0,4.0,0,...,,,,,,,7.57302,77.49505,307.6,17.3
1,307.6,17.3,7.57302,77.49505,1583.0,1002660.0,1.0,14.0,4.0,0,...,,,,,,,7.65043,77.39404,306.8,16.9
2,306.8,16.9,7.65043,77.39404,1285.0,1004243.0,1.0,15.0,4.0,0,...,,,,,,,7.71275,77.31394,307.9,16.9
3,307.9,16.9,7.71275,77.31394,1259.0,1005528.0,1.0,15.0,4.0,0,...,,,,,,,7.77191,77.23585,307.0,16.3
4,307.0,16.3,7.77191,77.23585,901.0,1006787.0,1.0,15.0,4.0,0,...,,,,,,,7.81285,77.18147,307.6,16.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522059,324.1,13.5,59.63337,21.43237,1249.0,11054176.0,5.0,22.0,1.0,686,...,24.07745,24.07744,24.07746,24.07746,24.07743,24.07747,59.69588,21.34225,324.2,13.3
1522060,324.2,13.3,59.69588,21.34225,1249.0,11055425.0,5.0,23.0,1.0,686,...,24.07746,24.07745,24.07744,24.07746,24.07746,24.07743,59.76388,21.35317,356.5,12.2
1522061,356.5,12.2,59.76388,21.35317,1219.0,11056674.0,5.0,23.0,1.0,686,...,24.07746,24.07746,24.07745,24.07744,24.07746,24.07746,59.83316,21.38489,52.6,17.3
1522062,52.6,17.3,59.83316,21.38489,1248.0,11057893.0,5.0,23.0,1.0,686,...,24.07746,24.07746,24.07746,24.07745,24.07744,24.07746,59.89167,21.54685,53.6,17.7


In [19]:
# A row is usable for training only if every lag column, every time/static
# feature and every next-step target is present.
lagged_feature_cols = [
    f'{var}_lag{lag}'
    for var in lagged_vars
    for lag in range(1, n_lag + 1)
]
required_cols = [
    *lagged_feature_cols,
    # time and static features
    'month', 'hour', 'weekday', 'vessel_embedding', 'CEU', 'length', 'vesselType',
    # next-step targets
    'latitude_next', 'longitude_next', 'cog_next', 'sog_next',
]
df = df.dropna(subset=required_cols)
df

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,longitude_pred_lag25,longitude_pred_lag26,longitude_pred_lag27,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30,latitude_next,longitude_next,cog_next,sog_next
31,301.2,15.7,-34.51449,18.32131,1777.0,2457212.0,1.0,11.0,0.0,0,...,77.039317,77.109409,77.181142,77.235636,77.312951,77.393884,-34.44986,18.18804,300.1,15.4
32,300.1,15.4,-34.44986,18.18804,1043.0,2458989.0,1.0,11.0,0.0,0,...,76.968190,77.039317,77.109409,77.181142,77.235636,77.312951,-34.41189,18.11114,302.3,15.1
33,302.3,15.1,-34.41189,18.11114,1693.0,2460032.0,1.0,11.0,0.0,0,...,76.903293,76.968190,77.039317,77.109409,77.181142,77.235636,-34.34598,17.99001,304.1,15.2
34,304.1,15.2,-34.34598,17.99001,2499832.0,2461725.0,2.0,10.0,1.0,0,...,76.822254,76.903293,76.968190,77.039317,77.109409,77.181142,9.50785,-79.89928,205.2,6.8
35,205.2,6.8,9.50785,-79.89928,1283.0,4961557.0,2.0,10.0,1.0,0,...,76.772374,76.822254,76.903293,76.968190,77.039317,77.109409,9.47286,-79.91497,202.0,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522058,296.3,14.7,59.57721,21.54090,1259.0,11052917.0,5.0,22.0,1.0,686,...,24.077440,24.077460,24.077460,24.077430,24.077470,24.077450,59.63337,21.43237,324.1,13.5
1522059,324.1,13.5,59.63337,21.43237,1249.0,11054176.0,5.0,22.0,1.0,686,...,24.077450,24.077440,24.077460,24.077460,24.077430,24.077470,59.69588,21.34225,324.2,13.3
1522060,324.2,13.3,59.69588,21.34225,1249.0,11055425.0,5.0,23.0,1.0,686,...,24.077460,24.077450,24.077440,24.077460,24.077460,24.077430,59.76388,21.35317,356.5,12.2
1522061,356.5,12.2,59.76388,21.35317,1219.0,11056674.0,5.0,23.0,1.0,686,...,24.077460,24.077460,24.077450,24.077440,24.077460,24.077460,59.83316,21.38489,52.6,17.3


In [23]:
# Columns to re-encode as categorical codes. The helper returns the transformed
# frame plus `label_encoders` (presumably one fitted encoder per column, which
# would be needed to apply the same mapping at prediction time -- TODO confirm).
# In the recorded output the values of these columns change (e.g. vessel_embedding
# 686 -> 615), so downstream code must use the encoded values.
categorical_cols = ['vesselType', 'vessel_embedding', 'month', 'hour', 'weekday']
df, label_encoders = encode_categorical_variables(df, categorical_cols)
df

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,longitude_pred_lag25,longitude_pred_lag26,longitude_pred_lag27,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30,latitude_next,longitude_next,cog_next,sog_next
31,301.2,15.7,-34.51449,18.32131,1777.0,2457212.0,0.0,17.0,0.0,0,...,77.039317,77.109409,77.181142,77.235636,77.312951,77.393884,-34.44986,18.18804,300.1,15.4
32,300.1,15.4,-34.44986,18.18804,1043.0,2458989.0,0.0,17.0,0.0,0,...,76.968190,77.039317,77.109409,77.181142,77.235636,77.312951,-34.41189,18.11114,302.3,15.1
33,302.3,15.1,-34.41189,18.11114,1693.0,2460032.0,0.0,17.0,0.0,0,...,76.903293,76.968190,77.039317,77.109409,77.181142,77.235636,-34.34598,17.99001,304.1,15.2
34,304.1,15.2,-34.34598,17.99001,2499832.0,2461725.0,1.0,12.0,1.0,0,...,76.822254,76.903293,76.968190,77.039317,77.109409,77.181142,9.50785,-79.89928,205.2,6.8
35,205.2,6.8,9.50785,-79.89928,1283.0,4961557.0,1.0,12.0,1.0,0,...,76.772374,76.822254,76.903293,76.968190,77.039317,77.109409,9.47286,-79.91497,202.0,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522058,296.3,14.7,59.57721,21.54090,1259.0,11052917.0,4.0,7.0,1.0,615,...,24.077440,24.077460,24.077460,24.077430,24.077470,24.077450,59.63337,21.43237,324.1,13.5
1522059,324.1,13.5,59.63337,21.43237,1249.0,11054176.0,4.0,7.0,1.0,615,...,24.077450,24.077440,24.077460,24.077460,24.077430,24.077470,59.69588,21.34225,324.2,13.3
1522060,324.2,13.3,59.69588,21.34225,1249.0,11055425.0,4.0,8.0,1.0,615,...,24.077460,24.077450,24.077440,24.077460,24.077460,24.077430,59.76388,21.35317,356.5,12.2
1522061,356.5,12.2,59.76388,21.35317,1219.0,11056674.0,4.0,8.0,1.0,615,...,24.077460,24.077460,24.077450,24.077440,24.077460,24.077460,59.83316,21.38489,52.6,17.3


In [21]:
# Split df into the model inputs X and targets y; the helper also returns the
# column names it used for each (feature_cols, target_cols).
X, y, feature_cols, target_cols = prepare_features_and_targets(df, lagged_vars, n_lag)

In [22]:
# Train/test split with split_ratio=0.85 (presumably the share of rows used for
# training; how df is used to decide the split is defined in split_data -- TODO confirm).
X_train, X_test, y_train, y_test = split_data(X, y, df, split_ratio=0.85)

In [98]:
# Train the XGBoost model. The test split is passed in as well (presumably as the
# evaluation set behind the logged `validation_0-rmse` values -- TODO confirm).
xgb_model = train_xgboost_model(X_train, y_train, X_test, y_test)

[0]	validation_0-rmse:87.98003
[1]	validation_0-rmse:80.81637
[2]	validation_0-rmse:74.50599
[3]	validation_0-rmse:68.96122
[4]	validation_0-rmse:64.09246
[5]	validation_0-rmse:59.84645
[6]	validation_0-rmse:56.17843
[7]	validation_0-rmse:53.00667
[8]	validation_0-rmse:50.28635
[9]	validation_0-rmse:47.95034
[10]	validation_0-rmse:45.96028
[11]	validation_0-rmse:44.27472
[12]	validation_0-rmse:42.85309
[13]	validation_0-rmse:41.65632
[14]	validation_0-rmse:40.64041
[15]	validation_0-rmse:39.80300
[16]	validation_0-rmse:39.10987
[17]	validation_0-rmse:38.52469
[18]	validation_0-rmse:38.03902
[19]	validation_0-rmse:37.62387
[20]	validation_0-rmse:37.27844
[21]	validation_0-rmse:36.99146
[22]	validation_0-rmse:36.74633
[23]	validation_0-rmse:36.53916
[24]	validation_0-rmse:36.36345
[25]	validation_0-rmse:36.21500
[26]	validation_0-rmse:36.09447
[27]	validation_0-rmse:35.98734
[28]	validation_0-rmse:35.89290
[29]	validation_0-rmse:35.81835
[30]	validation_0-rmse:35.75279
[31]	validation_0-

In [26]:
# Serialise the fitted `xgb_model` to ../models/xgb_linear.pkl.
import pickle

with open('../models/xgb_linear.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

NameError: name 'xgb_model' is not defined

## Predict on test data

In [65]:
def initialize_lags(last_known_data, lag_vars, n_lags):
    """
    Collect the starting lag values for a vessel from its last known row.

    Parameters:
    - last_known_data: Row of last known data for the vessel; labels are looked up
      through `.index`, so a pd.Series is expected.
    - lag_vars: List of variables to read lags for.
    - n_lags: Number of lags per variable.

    Returns:
    - lags: Dictionary mapping '<var>_lag<k>' (k = 1..n_lags) to its value.

    Raises:
    - ValueError: at the first '<var>_lag<k>' label that is missing.
    """
    wanted_labels = (
        f'{var}_lag{lag}'
        for var in lag_vars
        for lag in range(1, n_lags + 1)
    )
    lags = {}
    for col_name in wanted_labels:
        # Fail loudly: a missing lag would silently corrupt the model input.
        if col_name not in last_known_data.index:
            raise ValueError(f'Column {col_name} not found in last known data.')
        lags[col_name] = last_known_data[col_name]
    return lags


In [66]:
def get_static_features(last_known_data):
    """
    Pick the per-vessel features that do not change between prediction steps.

    Parameters:
    - last_known_data: Row with the last known data for the vessel; must support
      lookup by the four keys below.

    Returns:
    - static_features: Dictionary with 'vessel_embedding', 'CEU', 'length' and
      'vesselType'.
    """
    static_keys = ('vessel_embedding', 'CEU', 'length', 'vesselType')
    return {key: last_known_data[key] for key in static_keys}


In [58]:
def prepare_features(lags, static_features, month, hour, weekday):
    """
    Merge lag values, static features and time features into one feature dictionary.

    Later sources win on key clashes: static features override lags, and the
    explicit month/hour/weekday arguments override both. The inputs are not modified.

    Parameters:
    - lags: Dictionary of lagged features.
    - static_features: Dictionary of static features.
    - month, hour, weekday: Time features.

    Returns:
    - features: New dictionary ready for model input.
    """
    return {
        **lags,
        **static_features,
        'month': month,
        'hour': hour,
        'weekday': weekday,
    }

In [82]:
def update_lags(lags, predictions, time_delta, n_lags):
    """
    Age the lag values by one step and insert the newest values at lag 1.

    Only the variables named in `predictions`, plus 'time_delta', are shifted;
    any other lag entries in `lags` are left as they are. `lags` is modified in
    place and also returned.

    Parameters:
    - lags: Dictionary of current lagged features.
    - predictions: Dictionary of current predictions (variable name -> value).
    - time_delta: Value written to 'time_delta_lag1'.
    - n_lags: Number of lags.

    Returns:
    - lags: The same dictionary, updated.
    """
    shifted_vars = [*predictions, 'time_delta']

    # Walk from the oldest lag towards lag 2 so no value is overwritten before it moved.
    for var in shifted_vars:
        for lag in reversed(range(2, n_lags + 1)):
            lags[f'{var}_lag{lag}'] = lags[f'{var}_lag{lag - 1}']

    # Newest values go to lag 1.
    lags.update({f'{var}_lag1': value for var, value in predictions.items()})
    lags['time_delta_lag1'] = time_delta
    return lags


In [90]:
def predict_for_vessel(
    vessel_embedding, vessel_test_times, xgb_reg, xgb_clf, kalman_filters_dict,
    get_last_known_data, expected_feature_columns, last_obs, n_lags=20
):
    """
    Predict latitude and longitude for a single vessel, one test timestamp at a time.

    Each step's predictions are written back into the lag features, so later
    timestamps are predicted from earlier predictions rather than observations.

    Parameters:
    - vessel_embedding: Identifier of the vessel; used in the log message and copied
      into the 'vesselId' field of every output record.
    - vessel_test_times: DataFrame with 'ID' and 'time' for the vessel; rows are
      processed in the order given.
    - xgb_reg: Trained regression model; row 0 of its output is read as
      [latitude, longitude, cog, sog].
    - xgb_clf: Trained classification model; its output is used as 'navstat'.
    - kalman_filters_dict: Dictionary of Kalman filters for each vessel
      (accepted but not used inside this function).
    - get_last_known_data: Function called with `last_obs` to produce the starting row.
    - expected_feature_columns: List of expected feature column names, in the order
      the models expect them.
    - last_obs: pd.Series with the last observation for this vessel.
    - n_lags: Number of lags used in the model.

    Returns:
    - predictions: List of dictionaries with keys 'ID', 'vesselId', 'time',
      'latitude_pred' and 'longitude_pred' (one per timestamp), or None when
      no last known data is available.
    """
    # Get the last known data for this vessel
    last_known_data = get_last_known_data(last_obs)
    if last_known_data is None or last_known_data.empty:
        print(f"No last known data for vessel {vessel_embedding}")
        return None
    
    # Initialize lags and static features
    # NOTE(review): update_lags only shifts the variables present in predictions_dict
    # (latitude, longitude, cog, sog, navstat) plus time_delta, so the port_*, *_lin
    # and *_pred lags keep their initial values for every step -- confirm this is intended.
    lag_vars = [
        'cog', 'sog', 'latitude', 'longitude', 'navstat',
        'port_1', 'port_2', 'port_3', 'cog_lin', 'sog_lin', 'latitude_lin', 'longitude_lin',
        'latitude_pred', 'longitude_pred'
    ]
    lags = initialize_lags(last_known_data, lag_vars + ['time_delta'], n_lags)
    static_features = get_static_features(last_known_data)
    
    # Prepare to store predictions
    predictions = []
    previous_time = None  # only used to detect the first iteration
    previous_time_numeric = None
    # Reference instant: numeric times are seconds elapsed since this timestamp.
    time_newyear = pd.to_datetime('2024-01-01 00:00:00')
    
    # Iterate over each timestamp
    for idx, row in vessel_test_times.iterrows():
        current_time = pd.to_datetime(row['time'])
        current_time_numeric = (pd.to_datetime(row['time']) - time_newyear).total_seconds()
        
        # Extract time features
        # NOTE(review): these are passed to the models as raw calendar values. The training
        # cells visible in this notebook run month/hour/weekday through
        # encode_categorical_variables -- confirm the models loaded here expect raw values.
        month = current_time.month
        hour = current_time.hour
        weekday = current_time.weekday()
        
        # Compute time_delta: gap to the previous predicted timestamp, or to the
        # last observation ('time_numeric') on the first iteration.
        if previous_time is not None:
            time_delta = current_time_numeric - previous_time_numeric
        else:
            last_known_time = last_known_data['time_numeric']
            time_delta = current_time_numeric - last_known_time
        
        # Overwrite the newest time_delta lag with the gap to the timestamp being
        # predicted; update_lags below shifts it to lag2 and writes it to lag1 again.
        lags['time_delta_lag1'] = time_delta
        
        # Prepare input features
        features = prepare_features(lags, static_features, month, hour, weekday)
        
        # Convert features to DataFrame and ensure correct column order
        # (raises KeyError if an expected column is missing from `features`)
        X_input = pd.DataFrame([features])
        X_input = X_input[expected_feature_columns]
        
        # Predict with models
        y_pred_reg = xgb_reg.predict(X_input)
        y_pred_clf = xgb_clf.predict(X_input)
        
        # Extract predictions (single input row, hence index 0)
        predictions_dict = {
            'latitude': y_pred_reg[0][0],
            'longitude': y_pred_reg[0][1],
            'cog': y_pred_reg[0][2],
            'sog': y_pred_reg[0][3],
            'navstat': y_pred_clf[0]
        }
        
        # Store the prediction
        prediction = {
            'ID': row['ID'],
            'vesselId': vessel_embedding,
            'time': current_time,
            'latitude_pred': predictions_dict['latitude'],
            'longitude_pred': predictions_dict['longitude']
        }
        predictions.append(prediction)
        
        # Update lags so the next timestamp is predicted from these predictions
        lags = update_lags(lags, predictions_dict, time_delta, n_lags)
        
        # Update previous_time
        previous_time = current_time
        previous_time_numeric = current_time_numeric
    
    return predictions


In [38]:
def get_last_known_data(last_obs, n_lags=20, lag_vars=None):
    """
    Age a vessel's last observed row by one step so it can seed the next prediction.

    Every `<var>_lag{i}` label becomes `<var>_lag{i+1}`, the label that falls off the
    end (`<var>_lag{n_lags+1}`) is dropped, and the current value `<var>` becomes
    `<var>_lag1`. Labels that do not belong to `lag_vars` (e.g. 'time_numeric') are
    kept unchanged.

    Parameters:
    - last_obs: pd.Series holding one row: current values plus lags 1..n_lags.
    - n_lags: Number of lags stored per variable. Defaults to 20, the value that
      used to be hard-coded here.
    - lag_vars: Iterable of variable names to shift. Defaults to the variable set
      that used to be hard-coded here (raw + Kalman features and navstat); pass the
      list used for training when working with a different feature set.

    Returns:
    - pd.Series with the shifted labels. The input Series is not modified.

    Raises:
    - KeyError: if some `<var>_lag{n_lags}` label is missing from `last_obs`, because
      there is then no `<var>_lag{n_lags+1}` to drop after the shift (same strictness
      as before).
    """
    if lag_vars is None:
        lag_vars = (
            'latitude', 'sog_kalman', 'sog', 'time_delta', 'cog_kalman', 'cog',
            'longitude_kalman', 'latitude_kalman', 'longitude', 'navstat',
        )
    lag_vars = list(lag_vars)

    # lag{i} -> lag{i+1} for all variables in a single rename. The mapping is applied
    # to the original labels simultaneously, so the result equals renaming from the
    # highest lag downwards, without copying the Series once per lag.
    shift_map = {
        f'{var}_lag{lag}': f'{var}_lag{lag + 1}'
        for var in lag_vars
        for lag in range(1, n_lags + 1)
    }
    last_obs = last_obs.rename(index=shift_map)

    # The oldest lag has been pushed past the window; discard it.
    last_obs = last_obs.drop([f'{var}_lag{n_lags + 1}' for var in lag_vars])

    # The current observation becomes the most recent lag.
    last_obs = last_obs.rename(index={var: f'{var}_lag1' for var in lag_vars})

    return last_obs

In [91]:
# Define the expected feature columns (the column order the models were trained on)
# NOTE(review): X_train_reg is not defined in the cells visible around this one (the
# training cells shown here create X_train) -- confirm it exists on a fresh kernel.
expected_feature_columns = X_train_reg.columns.tolist()

# Load the test set: the vessel/time rows that need a predicted position
test_data = pd.read_csv('../data/ais_test.csv', sep=',')

# Get the vesselId_dict (used below to map a raw vesselId to its vessel_embedding)
with open('../data/xgb_kalman/vesselId_dict.pkl', 'rb') as f:
    vesselId_dict = pickle.load(f)

##################### - DEFINITIONS FOR KALMAN FILTER - #####################

from geopy import Point
from geopy.distance import distance

def compute_distance(sog_knots, delta_t_seconds):
    """
    Distance in metres covered at a constant speed over ground.

    Parameters:
    - sog_knots: Speed over ground in knots.
    - delta_t_seconds: Elapsed time in seconds.

    Returns:
    - Distance travelled in metres.
    """
    metres_per_second_per_knot = 0.514444  # 1 knot = 0.514444 m/s
    return sog_knots * metres_per_second_per_knot * delta_t_seconds

# Define the hx function
def hx(x):
    """Measurement function: the state is observed directly, so return it unchanged."""
    observed_state = x
    return observed_state

# Define the fx function
def fx(x, dt):
    """
    State transition: move the vessel along its course for `dt` seconds.

    Parameters:
    - x: State indexed as [latitude, longitude, sog, cog].
    - dt: Time step, passed to compute_distance as the elapsed seconds.

    Returns:
    - np.ndarray [lat_new, lon_new, sog, cog]; speed and course are carried over unchanged.
    """
    lat = x[0]
    lon = x[1]
    sog = x[2]
    cog = x[3]

    # Ensure SOG and COG are valid
    # NOTE(review): both guards are currently no-ops -- they re-assign the very value
    # that was just read from x, so out-of-range or NaN speed/course flows straight
    # into the geodesic computation below. No "previous valid" value is available in
    # this function; decide on a real fallback (presumably 1022+/360+ are AIS
    # "not available" sentinels -- confirm) before relying on these checks.
    if sog <= 0 or sog >= 1022 or np.isnan(sog):
        sog = x[2]  # no effect: same value as read above
    if cog < 0 or cog >= 360 or np.isnan(cog):
        cog = x[3]  # no effect: same value as read above

    # Compute distance traveled
    distance_m = compute_distance(sog, dt)

    # Use geopy to compute new position: start at (lat, lon), travel distance_m
    # along bearing `cog`
    start_point = Point(lat, lon)
    destination = distance(meters=distance_m).destination(point=start_point, bearing=cog)

    lat_new = destination.latitude
    lon_new = destination.longitude

    # Assume SOG and COG remain the same
    sog_new = sog
    cog_new = cog

    return np.array([lat_new, lon_new, sog_new, cog_new])

# Load the pickled Kalman filters.
# Presumably hx/fx are defined above so that the pickled filter objects can resolve
# them while loading -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open('../data/xgb_kalman/kalman_filters_dict.pkl', 'rb') as f:
    kalman_filters_dict = pickle.load(f)

##################### - END DEFINITIONS FOR KALMAN FILTER - #####################

# Get the last obs dict (vessel_embedding -> last observed row)
with open('../data/xgb_kalman/last_obs.pkl', 'rb') as f:
    last_obs_dict = pickle.load(f)

# Get the models.
# Opened with context managers so each file handle is closed as soon as the model
# is loaded, instead of staying open until the garbage collector reclaims it.
with open('../models/xgb_kalman/xgb_reg.pkl', 'rb') as f:
    xgb_reg = pickle.load(f)
with open('../models/xgb_kalman/xgb_clf.pkl', 'rb') as f:
    xgb_clf = pickle.load(f)

# Run the predictions
all_predictions = []

# Vessels are processed in order of first appearance in the test set.
vessel_ids = test_data['vesselId'].unique()
n_iter = len(vessel_ids)
k = 0

for vessel_id in vessel_ids:
    # Test timestamps of this vessel, oldest first: every prediction builds on the previous one.
    belongs_to_vessel = test_data['vesselId'] == vessel_id
    vessel_test_times = test_data[belongs_to_vessel].sort_values('time')

    vessel_embedding = vesselId_dict[vessel_id]
    last_obs = last_obs_dict[vessel_embedding]

    vessel_predictions = predict_for_vessel(
        vessel_embedding=vessel_embedding,
        vessel_test_times=vessel_test_times,
        xgb_reg=xgb_reg,
        xgb_clf=xgb_clf,
        kalman_filters_dict=kalman_filters_dict,
        get_last_known_data=get_last_known_data,
        expected_feature_columns=expected_feature_columns,
        last_obs=last_obs,
        n_lags=20,
    )
    # None (no last known data) and empty lists contribute nothing.
    all_predictions.extend(vessel_predictions or [])

    k += 1
    progress_bar(k, n_iter, prefix='Progress:', suffix='Complete', length=50)

# One row per test timestamp, with the predicted latitude and longitude.
predictions_df = pd.DataFrame(all_predictions)

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [92]:
# Write the raw prediction table to disk, without the DataFrame index.
predictions_df.to_csv(
    '../data/xgb_kalman/predictions.csv',
    index=False,
)

In [42]:
# Sanity check for one vessel (embedding 13): compare the labels of the shifted
# last observation with the columns the model expects.
last_obs = get_last_known_data(last_obs_dict[13])
# Labels present in the row but not used by the model
print(set(last_obs.index) - set(expected_feature_columns))
# Model inputs missing from the row (should be empty)
print(set(expected_feature_columns) - set(last_obs.index))

{'time_numeric'}
set()


In [94]:

# Submission from the model predictions: keep ID + coordinates and rename
# `<coord>_pred` to `<coord>_predicted`.
predictions_final = (
    predictions_df[['ID', 'latitude_pred', 'longitude_pred']]
    .rename(columns={'latitude_pred': 'latitude_predicted', 'longitude_pred': 'longitude_predicted'})
)
predictions_final.to_csv('../data/xgb_kalman/predictions_final.csv', index=False)

# Same layout for the Kalman coordinates.
# NOTE(review): predict_for_vessel as defined in this notebook does not emit
# 'latitude_kalman' / 'longitude_kalman', so this selection needs a predictions_df
# produced by a version that does -- confirm before re-running.
predictions_final_kalman = (
    predictions_df[['ID', 'latitude_kalman', 'longitude_kalman']]
    .rename(columns={'latitude_kalman': 'latitude_predicted', 'longitude_kalman': 'longitude_predicted'})
)
predictions_final_kalman.to_csv('../data/xgb_kalman/predictions_final_kalman.csv', index=False)

In [95]:
# Blend the two submissions: the element-wise mean of predictions_final and
# predictions_final_kalman, row-aligned on the shared index.
predictions_final_avg = pd.DataFrame({
    'ID': predictions_final['ID'],
    'latitude_predicted': (
        predictions_final['latitude_predicted'] + predictions_final_kalman['latitude_predicted']
    ) / 2,
    'longitude_predicted': (
        predictions_final['longitude_predicted'] + predictions_final_kalman['longitude_predicted']
    ) / 2,
})

predictions_final_avg.to_csv('../data/xgb_kalman/predictions_final_avg.csv', index=False)

In [98]:
# Put features and targets back into one frame, joined row by row on the index.
# Note that this rebinds `df`, replacing the frame built by the earlier cells.
df = X.merge(y, left_index=True, right_index=True)
df

Unnamed: 0,cog_lag1,cog_lag2,cog_lag3,cog_lag4,cog_lag5,cog_lag6,cog_lag7,cog_lag8,cog_lag9,cog_lag10,...,weekday,vessel_embedding,CEU,length,vesselType,latitude_next,longitude_next,sog_next,cog_next,navstat_next
20,202.2,218.4,265.1,243.4,234.5,222.5,227.6,308.1,297.3,294.2,...,4.0,0,6500,199.0,3.0,-29.82024,31.11275,3.3,314.0,0.0
21,297.9,202.2,218.4,265.1,243.4,234.5,222.5,227.6,308.1,297.3,...,4.0,0,6500,199.0,3.0,-29.81090,31.12018,2.7,29.6,0.0
22,314.0,297.9,202.2,218.4,265.1,243.4,234.5,222.5,227.6,308.1,...,4.0,0,6500,199.0,3.0,-29.83246,31.08940,10.5,219.6,0.0
23,29.6,314.0,297.9,202.2,218.4,265.1,243.4,234.5,222.5,227.6,...,4.0,0,6500,199.0,3.0,-29.87551,31.05145,7.6,227.9,0.0
24,219.6,29.6,314.0,297.9,202.2,218.4,265.1,243.4,234.5,222.5,...,5.0,0,6500,199.0,3.0,-29.87645,31.05018,7.4,60.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521371,292.6,291.3,288.5,288.7,247.4,247.9,247.0,266.5,278.8,278.7,...,1.0,686,200,191.0,3.0,59.63337,21.43237,13.5,324.1,0.0
1521372,296.3,292.6,291.3,288.5,288.7,247.4,247.9,247.0,266.5,278.8,...,1.0,686,200,191.0,3.0,59.69588,21.34225,13.3,324.2,0.0
1521373,324.1,296.3,292.6,291.3,288.5,288.7,247.4,247.9,247.0,266.5,...,1.0,686,200,191.0,3.0,59.76388,21.35317,12.2,356.5,0.0
1521374,324.2,324.1,296.3,292.6,291.3,288.5,288.7,247.4,247.9,247.0,...,1.0,686,200,191.0,3.0,59.83316,21.38489,17.3,52.6,0.0


In [101]:
# Inspect the current df.
# NOTE(review): the recorded output shows the lag/target frame, not the X/y merge
# from the cell above, so df was rebound between the two executions -- the cells
# were not run top to bottom.
df

Unnamed: 0,cog,sog,latitude,longitude,time_delta,time_numeric,month,hour,weekday,vessel_embedding,...,longitude_pred_lag25,longitude_pred_lag26,longitude_pred_lag27,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30,latitude_next,longitude_next,cog_next,sog_next
31,301.2,15.7,-34.51449,18.32131,1777.0,2457212.0,0.0,3.0,0.0,0,...,77.039317,77.109409,77.181142,77.235636,77.312951,77.393884,-34.44986,18.18804,300.1,15.4
32,300.1,15.4,-34.44986,18.18804,1043.0,2458989.0,0.0,3.0,0.0,0,...,76.968190,77.039317,77.109409,77.181142,77.235636,77.312951,-34.41189,18.11114,302.3,15.1
33,302.3,15.1,-34.41189,18.11114,1693.0,2460032.0,0.0,3.0,0.0,0,...,76.903293,76.968190,77.039317,77.109409,77.181142,77.235636,-34.34598,17.99001,304.1,15.2
34,304.1,15.2,-34.34598,17.99001,2499832.0,2461725.0,1.0,2.0,1.0,0,...,76.822254,76.903293,76.968190,77.039317,77.109409,77.181142,9.50785,-79.89928,205.2,6.8
35,205.2,6.8,9.50785,-79.89928,1283.0,4961557.0,1.0,2.0,1.0,0,...,76.772374,76.822254,76.903293,76.968190,77.039317,77.109409,9.47286,-79.91497,202.0,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522058,296.3,14.7,59.57721,21.54090,1259.0,11052917.0,4.0,15.0,1.0,652,...,24.077440,24.077460,24.077460,24.077430,24.077470,24.077450,59.63337,21.43237,324.1,13.5
1522059,324.1,13.5,59.63337,21.43237,1249.0,11054176.0,4.0,15.0,1.0,652,...,24.077450,24.077440,24.077460,24.077460,24.077430,24.077470,59.69588,21.34225,324.2,13.3
1522060,324.2,13.3,59.69588,21.34225,1249.0,11055425.0,4.0,16.0,1.0,652,...,24.077460,24.077450,24.077440,24.077460,24.077460,24.077430,59.76388,21.35317,356.5,12.2
1522061,356.5,12.2,59.76388,21.35317,1219.0,11056674.0,4.0,16.0,1.0,652,...,24.077460,24.077460,24.077450,24.077440,24.077460,24.077460,59.83316,21.38489,52.6,17.3


In [1]:
# NOTE(review): unpinned install; pin a version for reproducibility. The recorded
# run was interrupted (^C), and the next cell fails with ModuleNotFoundError.
%pip install autogluon.tabular[all]

^C
Note: you may need to restart the kernel to use updated packages.


In [2]:
from autogluon.tabular import TabularPredictor

# Model inputs: every lag column plus the time and static features.
lagged_feature_cols = [f'{var}_lag{lag}' for var in lagged_vars for lag in range(1, n_lag + 1)]
feature_cols = lagged_feature_cols + [
    'month', 'hour', 'weekday', 'vessel_embedding', 'CEU', 'length', 'vesselType',
]

# Target column
labels = 'latitude_next'

# Define the problem type
problem_type = 'regression'

# Define the hyperparameters
# NOTE(review): this dictionary is not passed to fit() below, so AutoGluon trains with
# its default model configuration. If it is meant to be used, pass
# `hyperparameters=hyperparameters` and check the model keys against the installed
# AutoGluon version first.
hyperparameters = {
    'GBM': {},
    'CAT': {},
    'RF': {},
    'XT': {},
    'KNN': {},
    'NN': {},
    'FASTAI': {},
    'XGB': {}
}

# Create a TabularPredictor
predictor = TabularPredictor(
    label=labels,
    problem_type=problem_type,
    eval_metric='root_mean_squared_error',
    path='autogluon_ais'
)

# Fit on the declared features and the label only. Passing the whole df made every
# other column a feature, including the remaining next-step targets
# (longitude_next, cog_next, sog_next), which are not known at prediction time.
predictor.fit(df[feature_cols + [labels]])

# Save the predictor. save() writes to the `path` given to the constructor; its only
# argument is the `silent` flag, so the directory name must not be passed here.
predictor.save()

ModuleNotFoundError: No module named 'autogluon'

In [25]:
# Load the model


Unnamed: 0,cog_lag1,cog_lag2,cog_lag3,cog_lag4,cog_lag5,cog_lag6,cog_lag7,cog_lag8,cog_lag9,cog_lag10,...,longitude_pred_lag28,longitude_pred_lag29,longitude_pred_lag30,month,hour,weekday,vessel_embedding,CEU,length,vesselType
31,290.2,214.1,158.2,91.8,60.1,227.9,219.6,29.6,314.0,297.9,...,77.235636,77.312951,77.393884,0.0,3.0,0.0,0,6500,199.0,3.0
32,301.2,290.2,214.1,158.2,91.8,60.1,227.9,219.6,29.6,314.0,...,77.181142,77.235636,77.312951,0.0,3.0,0.0,0,6500,199.0,3.0
33,300.1,301.2,290.2,214.1,158.2,91.8,60.1,227.9,219.6,29.6,...,77.109409,77.181142,77.235636,0.0,3.0,0.0,0,6500,199.0,3.0
34,302.3,300.1,301.2,290.2,214.1,158.2,91.8,60.1,227.9,219.6,...,77.039317,77.109409,77.181142,1.0,2.0,1.0,0,6500,199.0,3.0
35,304.1,302.3,300.1,301.2,290.2,214.1,158.2,91.8,60.1,227.9,...,76.968190,77.039317,77.109409,1.0,2.0,1.0,0,6500,199.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182401,165.9,179.1,245.8,244.4,133.1,131.6,131.7,142.6,240.7,237.3,...,-122.921630,-122.921610,-122.921620,3.0,3.0,5.0,685,6459,199.0,3.0
182402,174.8,165.9,179.1,245.8,244.4,133.1,131.6,131.7,142.6,240.7,...,-122.921620,-122.921630,-122.921610,3.0,3.0,5.0,685,6459,199.0,3.0
182403,243.4,174.8,165.9,179.1,245.8,244.4,133.1,131.6,131.7,142.6,...,-122.921640,-122.921620,-122.921630,3.0,3.0,5.0,685,6459,199.0,3.0
182404,287.2,243.4,174.8,165.9,179.1,245.8,244.4,133.1,131.6,131.7,...,-122.921600,-122.921640,-122.921620,3.0,4.0,5.0,685,6459,199.0,3.0
