In [19]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Show up to 100 columns when a DataFrame is rendered, so wide frames are not elided.
pd.options.display.max_columns = 100

In [20]:
# Load the raw inputs. The three reads are independent of each other.
# NOTE(review): the port and training files are parsed as pipe-delimited while
# the test file uses pandas' default comma separator - confirm this matches
# the actual file formats.
ports_df = pd.read_csv("ports.csv", sep="|")
ais_train = pd.read_csv("first_50000_rows.csv", sep="|")
ais_test = pd.read_csv("ais_test.csv")


In [21]:
# Coordinate columns shared by the AIS and port tables.
coord_cols = ['latitude', 'longitude']

# Cast the coordinates of both tables to float (in place, column by column)
# so the arrays extracted below are purely numeric.
for frame in (ais_train, ports_df):
    for col in coord_cols:
        frame[col] = frame[col].astype(float)

# (n_rows, 2) arrays ordered [latitude, longitude], used as the query points
# (AIS messages) and the reference points (ports) of the nearest-neighbour search.
ais_coords = ais_train[coord_cols].to_numpy()
ports_coords = ports_df[coord_cols].to_numpy()

In [22]:
# Find the nearest port for every AIS position using great-circle distance.
#
# The coordinates are latitude/longitude in degrees, so plain Euclidean
# distance on them is misleading: one degree of longitude shrinks with
# latitude, and points on either side of the +/-180 degree meridian look far
# apart. The haversine metric measures distance on the sphere instead. It
# requires inputs ordered [latitude, longitude] and expressed in radians;
# `ports_coords` / `ais_coords` already have that column order.
nbrs = NearestNeighbors(
    n_neighbors=1,
    algorithm='ball_tree',
    metric='haversine',
).fit(np.radians(ports_coords))

# `indices[i, 0]` is the row position in `ports_df` of the port closest to AIS
# row i. `distances` holds the matching great-circle distance in radians;
# multiply by the Earth's radius (~6371 km) to convert to kilometres.
distances, indices = nbrs.kneighbors(np.radians(ais_coords))

In [23]:
# Attach the nearest port's id and coordinates to every AIS row.
#
# `indices` holds, for each AIS row, the row position of its nearest port in
# `ports_df`. The port attributes are looked up by position instead of being
# merged on portId, so that
#   * re-running this cell overwrites the three columns rather than creating
#     suffixed duplicates (portLatitude_x / portLatitude_y), and
#   * a portId that occurs more than once in `ports_df` cannot multiply AIS rows.
nearest_positions = indices.ravel()
assert len(nearest_positions) == len(ais_train), (
    "indices and ais_train are out of sync - re-run the nearest-neighbour cell"
)

nearest_port_rows = ports_df.iloc[nearest_positions]

# `.to_numpy()` drops the port index so values are assigned row-by-row in
# order, not aligned on index labels. `assign` returns a new frame, leaving
# the previously bound one untouched.
ais_train = ais_train.assign(
    closest_portId=nearest_port_rows['portId'].to_numpy(),
    portLatitude=nearest_port_rows['latitude'].to_numpy(),
    portLongitude=nearest_port_rows['longitude'].to_numpy(),
)

# Port lookup table with the "closest port" column names; no longer needed
# for the join above but still defined for any later cell that refers to it.
closest_ports = ports_df[['portId', 'latitude', 'longitude']].rename(
    columns={
        'portId': 'closest_portId',
        'latitude': 'portLatitude',
        'longitude': 'portLongitude',
    }
)

In [24]:
# Display the training frame to check the closest_portId / portLatitude /
# portLongitude columns added by the previous cell.
ais_train

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,closest_portId,portLatitude,portLongitude
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.74370,-57.85130,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,61d36f150a1807568ff9a0ad,-34.855278,-57.894167
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.89440,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,634c4de270937fc01c3a7689,8.967000,-79.533000
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,61d3847bb7b7526e1adf3d19,39.232500,-76.558889
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,61d36f770a1807568ff9a126,-34.462500,150.899444
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,634c4de270937fc01c3a74f3,35.783000,-5.817000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2024-01-05 07:36:12,5.5,6.1,0,0,0,01-04 20:00,-24.03371,-46.34749,61e9f410b937134a3c4c0049,61d36fdf0a1807568ff9a1b0,61d36fdf0a1807568ff9a1b0,-23.968889,-46.300833
49996,2024-01-05 07:36:13,94.9,0.0,0,31,5,01-02 18:00,39.64086,-0.22345,61e9f468b937134a3c4c0289,61d37fb629b60f6113c89e99,61d37fb629b60f6113c89e99,39.642778,-0.212500
49997,2024-01-05 07:36:16,221.9,17.8,0,223,0,01-11 22:00,34.46860,138.21546,61e9f3aab937134a3c4bfe0f,61d37a221366c3998241d928,61d37a221366c3998241d928,34.619444,138.216944
49998,2024-01-05 07:36:16,324.1,0.0,0,268,5,01-03 20:30,53.33446,7.16211,61e9f397b937134a3c4bfdaf,61d375e893c6feb83e5eb3e4,61d375e893c6feb83e5eb3e4,53.346111,7.198333


In [25]:


# Stack the training history and the test rows into one frame ordered per
# vessel and by time, so lag features can later be computed across both.
sort_keys = ['vesselId', 'time']

# Parse timestamps so that ordering is chronological.
ais_train['time'] = pd.to_datetime(ais_train['time'])
ais_test['time'] = pd.to_datetime(ais_test['time'])

# Order each source frame per vessel, then by time.
ais_train = ais_train.sort_values(sort_keys)
ais_test = ais_test.sort_values(sort_keys)

# Tag every row with its origin so the two sets can be separated again later.
ais_train['dataset'] = 'train'
ais_test['dataset'] = 'test'

# Concatenate, then re-sort so each vessel's train and test rows interleave
# chronologically; finish with a clean 0..n-1 index.
combined_df = (
    pd.concat([ais_train, ais_test], ignore_index=True)
    .sort_values(by=sort_keys)
    .reset_index(drop=True)
)


In [4]:
    
# Per-vessel lag features: for each signal and each lag k, the value the same
# vessel reported k rows earlier (rows are expected to be ordered by vesselId
# and time at this point).
lag_windows = [1, 2, 3, 4, 5]
lag_source_cols = ['latitude', 'longitude', 'cog', 'sog', 'rot', 'heading']

# Group once, on an isolated slice, instead of rebuilding the groupby for
# every column and lag; the slice is unaffected by the columns added below.
by_vessel = combined_df[['vesselId', *lag_source_cols]].groupby('vesselId')

for lag in lag_windows:
    # Shift all source columns together; the first `lag` rows of each vessel
    # have no predecessor and become NaN.
    shifted = by_vessel.shift(lag)
    for col in lag_source_cols:
        combined_df[f'{col}_lag{lag}'] = shifted[col]

# Forward fill missing position lags within each vessel, never across vessels.
# Only the latitude/longitude lags are filled here.
# `GroupBy.ffill()` replaces `GroupBy.fillna(method='ffill')`, which is
# deprecated in recent pandas releases.
position_lag_cols = [
    f'{axis}_lag{lag}'
    for axis in ('latitude', 'longitude')
    for lag in lag_windows
]
combined_df[position_lag_cols] = (
    combined_df.groupby('vesselId')[position_lag_cols].ffill()
)