# AIS vessel prediction

Henrik Månum, student id: 591864

Lars Talian Stangebye-Hansen, student id: 593617

Emil Skogheim, student id: 593632

Kaggle team: "[128] - ;)"

### Kaggle Score
This notebook scored **101.05505** on the public leaderboard on Kaggle, demonstrating strong performance in predicting vessel positions.

To run the following cells, you need to install these packages:

In [None]:
!pip install pandas numpy scikit-learn xgboost joblib tqdm

## Preprocessing

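The AIS message guidelines define the range of expected values for each field, which makes it straightforward to detect invalid values. Invalid numeric values are replaced with the mean of the valid values in the same column, and invalid `navstat` values with the most frequent value.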

In [None]:
import pandas as pd
import numpy as np

# Load the raw AIS training messages; the file is pipe-delimited.
ais_data = pd.read_csv('ais_train.csv', delimiter='|')

# Guarantee that every column referenced below exists, even if the file lacks it.
columns_needed = ['time', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude', 'longitude', 'vesselId']
for column in columns_needed:
    if column not in ais_data.columns:
        ais_data[column] = np.nan

# Unparseable timestamps become NaT instead of raising.
ais_data['time'] = pd.to_datetime(ais_data['time'], errors='coerce')

# For each numeric field: flag the out-of-range rows, then overwrite them with
# the mean of the remaining rows. NaN compares False everywhere, so missing
# values are neither flagged nor imputed here.

# Course over ground is valid on [0, 360); 360 is the AIS "not available"
# marker, so values such as 359.5 must be kept.
invalid_cog = (ais_data['cog'] < 0) | (ais_data['cog'] >= 360)
ais_data.loc[invalid_cog, 'cog'] = ais_data.loc[~invalid_cog, 'cog'].mean()

invalid_sog = (ais_data['sog'] > 102.2) | (ais_data['sog'] == 1023)
ais_data.loc[invalid_sog, 'sog'] = ais_data.loc[~invalid_sog, 'sog'].mean()

# Each comparison needs its own parentheses: `|` binds tighter than `>`, so
# `rot > 126 | (...)` would compare against the result of `126 | (...)`.
invalid_rot = (ais_data['rot'] < -126) | (ais_data['rot'] > 126) | (ais_data['rot'] == -128)
ais_data.loc[invalid_rot, 'rot'] = ais_data.loc[~invalid_rot, 'rot'].mean()

invalid_heading = (ais_data['heading'] < 0) | (ais_data['heading'] > 359) | (ais_data['heading'] == 511)
ais_data.loc[invalid_heading, 'heading'] = ais_data.loc[~invalid_heading, 'heading'].mean()

# navstat is categorical: impute with the most frequent *valid* code, so an
# out-of-range code can never be chosen as the replacement. `mode()` is empty
# when the column holds no valid value at all; nothing is replaced then.
invalid_navstat = (ais_data['navstat'] < 0) | (ais_data['navstat'] > 15)
valid_navstat_modes = ais_data.loc[~invalid_navstat, 'navstat'].mode()
if not valid_navstat_modes.empty:
    most_frequent_navstat = valid_navstat_modes.iloc[0]
    ais_data.loc[invalid_navstat, 'navstat'] = most_frequent_navstat

invalid_latitude = (ais_data['latitude'] < -90) | (ais_data['latitude'] > 90)
ais_data.loc[invalid_latitude, 'latitude'] = ais_data.loc[~invalid_latitude, 'latitude'].mean()

invalid_longitude = (ais_data['longitude'] < -180) | (ais_data['longitude'] > 180)
ais_data.loc[invalid_longitude, 'longitude'] = ais_data.loc[~invalid_longitude, 'longitude'].mean()

# Assign the filled column back instead of `fillna(..., inplace=True)` on a
# column selection, which is chained assignment and may not update the frame.
categorical_cols = ['vesselId']
for column in categorical_cols:
    ais_data[column] = ais_data[column].fillna('Unknown')

ais_data.to_csv('ais_train_processed.csv', index=False)

print("Processed AIS training data saved to 'ais_train_processed.csv'")


## Read processed data and test data

In [None]:
# Training set: read the cleaned file, discard the columns that are not used
# as features, parse the timestamps and order the rows per vessel by time.
train_data = (
    pd.read_csv('ais_train_processed.csv', sep=',')
    .drop(columns=['portId', 'etaRaw'])
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values(by=['vesselId', 'time'])
)

# Number of training rows.
print(len(train_data))

# Test set: only the timestamps need parsing.
test_data = pd.read_csv('ais_test.csv')
test_data['time'] = pd.to_datetime(test_data['time'])

# Number of test rows.
print(len(test_data))

## Feature engineering

In [None]:
from operator import index
import pandas as pd
import numpy as np
from tqdm import tqdm

# Radius used to turn a travelled distance into an angle. 3440.065 is the mean
# Earth radius expressed in nautical miles.
# NOTE(review): this presumes `sog` is in knots, so that sog * hours is a
# distance in nautical miles — confirm against the data description.
_EARTH_RADIUS_NM = 3440.065

# Length of one reference window in days (the window is then extended to
# 23:59:59 of its last day).
_WINDOW_DAYS = 5

# Fields copied from the reference row into `last_<field>` columns.
_REFERENCE_FIELDS = ("latitude", "longitude", "rot", "cog", "sog", "heading", "navstat")

# Columns derived from the reference row and the elapsed time.
_DERIVED_COLUMNS = ("time_diff", "estimated_distance", "latitude_diff", "longitude_diff")


def _add_self_reference_features(window: pd.DataFrame) -> None:
    """Fill the feature columns of a vessel's first window in place.

    There is no earlier observation to refer to, so every row is its own
    reference: the `last_*` columns repeat the row's own values and the
    elapsed time and displacement columns are zero.
    """
    for field in _REFERENCE_FIELDS:
        window[f"last_{field}"] = window[field]
    for column in ("latitude_diff", "longitude_diff", "time_diff", "estimated_distance"):
        window[column] = 0.0


def _add_reference_features(frame: pd.DataFrame, reference_row) -> None:
    """Fill the feature columns of `frame` in place from one reference row.

    Args:
        frame: Rows to annotate; must contain a datetime `time` column.
        reference_row: The last known observation of the vessel (a row as
            returned by `DataFrame.iloc[...]`), or None when the vessel has
            no known observation. With None, all feature columns are NaN.
    """
    if reference_row is None:
        for field in _REFERENCE_FIELDS:
            frame[f"last_{field}"] = np.nan
        for column in _DERIVED_COLUMNS:
            frame[column] = np.nan
        return

    for field in _REFERENCE_FIELDS:
        frame[f"last_{field}"] = reference_row[field]

    # Hours elapsed since the reference observation.
    frame["time_diff"] = (frame["time"] - reference_row["time"]).dt.total_seconds() / 3600
    # Dead reckoning: distance covered if the last speed had been kept.
    frame["estimated_distance"] = frame["last_sog"] * frame["time_diff"]

    course = np.deg2rad(frame["last_cog"])
    # distance / radius is an angle in radians; it must be *multiplied* by
    # 180/pi (rad2deg) to express the displacement in degrees.
    angular_distance = frame["estimated_distance"] / _EARTH_RADIUS_NM
    frame["latitude_diff"] = np.rad2deg(angular_distance * np.cos(course))
    # Meridians converge towards the poles, hence the 1/cos(latitude) factor.
    frame["longitude_diff"] = np.rad2deg(
        angular_distance * np.sin(course) / np.cos(np.deg2rad(frame["last_latitude"]))
    )


def _split_into_windows(vessel_rows: pd.DataFrame) -> list:
    """Cut one vessel's time-sorted rows into windows and add features to each.

    A window starts at the first remaining timestamp and ends at 23:59:59 of
    the day `_WINDOW_DAYS` days later. Rows of the first window reference
    themselves; rows of every later window reference the last row of the
    window before it.
    """
    times = vessel_rows["time"]
    windows = []
    reference_row = None
    start = 0

    while start < len(vessel_rows):
        window_end = (times.iloc[start] + pd.Timedelta(days=_WINDOW_DAYS)).replace(
            hour=23, minute=59, second=59
        )
        # The rows are sorted, so the window is the contiguous run of rows up
        # to the last timestamp <= window_end. window_end is never before the
        # window start, so `stop > start` and the loop always advances.
        stop = int(times.searchsorted(window_end, side="right"))
        window = vessel_rows.iloc[start:stop].copy()

        if reference_row is None:
            _add_self_reference_features(window)
        else:
            _add_reference_features(window, reference_row)

        reference_row = window.iloc[-1]
        windows.append(window)
        start = stop

    return windows


def _empty_with_feature_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a zero-row copy of `frame` extended with the feature columns."""
    feature_columns = [f"last_{field}" for field in _REFERENCE_FIELDS] + list(_DERIVED_COLUMNS)
    missing = [column for column in feature_columns if column not in frame.columns]
    return frame.iloc[0:0].reindex(columns=[*frame.columns, *missing])


def generate_training_features(df_train: pd.DataFrame, *, show_progress: bool = True) -> pd.DataFrame:
    """Add last-known-state and dead-reckoning features to the training rows.

    Every vessel's observations are sorted by time and cut into consecutive
    windows (see `_split_into_windows`). Each row gets `last_*` columns holding
    the state of its reference observation, plus `time_diff` (hours),
    `estimated_distance` (last speed * hours) and `latitude_diff` /
    `longitude_diff` (the estimated displacement in degrees).

    Rows whose `time` is missing cannot be placed in a window and are dropped.

    Args:
        df_train: Observations with at least the columns `vesselId`, `time`,
            `latitude`, `longitude`, `rot`, `cog`, `sog`, `heading`, `navstat`.
        show_progress: Show a tqdm progress bar over the vessels.

    Returns:
        A new frame with the feature columns added, grouped by vessel in order
        of first appearance and sorted by time within each vessel. An empty
        input yields an empty frame that still has the feature columns.
    """
    # One pass over the data instead of one boolean scan per vessel.
    grouped = df_train.groupby("vesselId", sort=False, dropna=False)
    vessel_groups = tqdm(grouped, total=grouped.ngroups) if show_progress else grouped

    compiled_data = []
    for _, vessel_rows in vessel_groups:
        ordered = vessel_rows.dropna(subset=["time"]).sort_values(by="time").reset_index(drop=True)
        compiled_data.extend(_split_into_windows(ordered))

    if not compiled_data:
        return _empty_with_feature_columns(df_train)
    return pd.concat(compiled_data, ignore_index=True)


def generate_test_features(df_train: pd.DataFrame, df_test: pd.DataFrame, *, show_progress: bool = True) -> pd.DataFrame:
    """Add the same features to the test rows, referenced to the training data.

    Every test row of a vessel references that vessel's most recent training
    observation (latest `time`, independent of the row order of `df_train`).
    Test vessels without any timestamped training observation keep their rows,
    with all feature columns set to NaN.

    Args:
        df_train: Training observations (same columns as required by
            `generate_training_features`).
        df_test: Rows to predict, with at least `ID`, `vesselId` and `time`.
        show_progress: Show a tqdm progress bar over the vessels.

    Returns:
        A new frame with the feature columns added, sorted by `ID`. An empty
        input yields an empty frame that still has the feature columns.
    """
    # Latest training observation per vessel that occurs in the test set. The
    # stable sort keeps the original order among equal timestamps.
    history = df_train[df_train["vesselId"].isin(df_test["vesselId"].unique())]
    history = history.dropna(subset=["time"]).sort_values(by="time", kind="stable")
    reference_rows = {
        vessel_id: rows.iloc[-1]
        for vessel_id, rows in history.groupby("vesselId", sort=False)
    }

    grouped = df_test.groupby("vesselId", sort=False, dropna=False)
    vessel_groups = tqdm(grouped, total=grouped.ngroups) if show_progress else grouped

    processed_data = []
    for vessel_id, vessel_rows in vessel_groups:
        ordered = vessel_rows.sort_values(by="time").reset_index(drop=True)
        _add_reference_features(ordered, reference_rows.get(vessel_id))
        processed_data.append(ordered)

    if not processed_data:
        return _empty_with_feature_columns(df_test)
    return pd.concat(processed_data, ignore_index=True).sort_values(by="ID")


In [None]:
# Build the feature tables: the test features reference the last known state
# of each vessel in the training features, so training comes first.
final_train = generate_training_features(train_data)
final_test = generate_test_features(final_train, test_data)

# Persist both tables for the modelling step.
for feature_table, csv_path in ((final_train, "final_train.csv"), (final_test, "final_test.csv")):
    feature_table.to_csv(csv_path, index=False)

## Model training, cross-validation and prediction

In [None]:
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
from joblib import Parallel, delayed
from sklearn.model_selection import KFold

# Load the feature tables written by the feature-engineering step.
test_data = pd.read_csv('final_test.csv')
training_data = pd.read_csv('final_train.csv')

# Everything except the targets and the identifier/timestamp columns is a feature.
input_features = training_data.drop(columns=['latitude', 'longitude', 'vesselId', 'time'])
target_outputs = training_data[['latitude', 'longitude']]

# Features that exist only in the training table are added to the test table
# as all-NaN columns; the imputer in the pipeline fills them with training means.
for column in input_features.columns:
    if column not in test_data.columns:
        test_data[column] = np.nan

cat_columns = input_features.select_dtypes(include=['object', 'category']).columns
for column in cat_columns:
    input_features[column] = input_features[column].astype('category')
    test_data[column] = test_data[column].astype('category')

# downcast
for column in input_features.select_dtypes(include=['float64']).columns:
    input_features[column] = pd.to_numeric(input_features[column], downcast='float')
for column in test_data.select_dtypes(include=['float64']).columns:
    test_data[column] = pd.to_numeric(test_data[column], downcast='float')

# Slice the test matrix only now: selecting columns copies the data, so a slice
# taken before the conversions above would not see them and the test features
# would keep different dtypes than the training features.
X_test = test_data[input_features.columns]

# One imputer + XGBoost pipeline per target (latitude, longitude).
# NOTE(review): `tree_method='gpu_hist'` and `gpu_id` need a CUDA GPU and are
# deprecated since XGBoost 2.0 in favour of `device='cuda'` with
# `tree_method='hist'` — confirm against the installed XGBoost version.
xgb_model = MultiOutputRegressor(
    Pipeline([
        ('fill_na', SimpleImputer(strategy='mean')),
        ('xgb_model', xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=200,
            seed=42,
            tree_method='gpu_hist',
            gpu_id=0,
            verbosity=0,
            reg_lambda=1.0, 
            alpha=0.1,
        ))
    ])
)

num_folds = 5
cross_validator = KFold(n_splits=num_folds, shuffle=True, random_state=42)
split_indices = list(cross_validator.split(input_features, target_outputs))

# Out-of-fold predictions for the training rows, and one set of test
# predictions per fold (last axis).
oof_predictions = np.zeros((input_features.shape[0], target_outputs.shape[1]))
test_predictions = np.zeros((X_test.shape[0], target_outputs.shape[1], num_folds))

def run_model(fold_idx, train_idx, validate_idx):
    """Fit one fold and predict its validation rows and the test set.

    Returns the fold number, the validation row positions, the validation
    predictions and the test predictions of the model fitted on this fold.
    """
    from sklearn.base import clone

    X_train, y_train = input_features.iloc[train_idx], target_outputs.iloc[train_idx]
    X_validate, y_validate = input_features.iloc[validate_idx], target_outputs.iloc[validate_idx]

    # Fit a fresh, unfitted copy so folds never share estimator state, whatever
    # joblib backend executes them.
    fold_model = clone(xgb_model)
    fold_model.fit(X_train, y_train)
    y_pred_validate = fold_model.predict(X_validate)
    y_pred_test = fold_model.predict(X_test)

    return fold_idx, validate_idx, y_pred_validate, y_pred_test

cv_results = Parallel(n_jobs=-1)(
    delayed(run_model)(fold, train_index, validate_index)
    for fold, (train_index, validate_index) in enumerate(split_indices)
)

for fold_idx, validate_idx, y_pred_validate, y_pred_test in cv_results:
    oof_predictions[validate_idx] = y_pred_validate
    test_predictions[:, :, fold_idx] = y_pred_test

# Final prediction: average of the fold models.
final_predictions = np.mean(test_predictions, axis=2)

# Target column 0 is latitude and column 1 is longitude (order of target_outputs).
output_dataframe = pd.DataFrame({
    'ID': test_data['ID'],
    'longitude_predicted': final_predictions[:, 1],
    'latitude_predicted': final_predictions[:, 0]
})

output_dataframe.to_csv('submission.csv', index=False)
