In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Read the AIS training records and the port reference table.
# Both CSV files use '|' as the field separator.
ais_train, ports_df = (
    pd.read_csv(csv_name, sep='|') for csv_name in ('ais_train.csv', 'ports.csv')
)

In [2]:
# Coerce the coordinate columns of both tables to float, in place.
coord_cols = ['latitude', 'longitude']
for frame in (ais_train, ports_df):
    for col in coord_cols:
        frame[col] = frame[col].astype(float)

# Extract (latitude, longitude) pairs as plain arrays for the neighbor search.
ais_coords = ais_train[coord_cols].to_numpy()
ports_coords = ports_df[coord_cols].to_numpy()

In [3]:
# Initialize the Nearest Neighbors model.
# Plain Euclidean distance on raw (latitude, longitude) degrees is not a
# geographic distance: a degree of longitude shrinks with cos(latitude), and
# points on either side of the +/-180 degree meridian appear ~360 degrees apart.
# The haversine metric measures great-circle distance instead; scikit-learn
# expects its input as [latitude, longitude] pairs expressed in radians.
nbrs = NearestNeighbors(
    n_neighbors=1, algorithm='ball_tree', metric='haversine'
).fit(np.radians(ports_coords))

# Find the nearest port for each entry in ais_train.
# indices[i, 0] is the row position within ports_coords of the closest port.
distances, indices = nbrs.kneighbors(np.radians(ais_coords))

# kneighbors returns the great-circle angle in radians. Convert it to degrees
# of arc so the values stay on the same scale as the previous degree-based
# distances (multiply the radian value by ~6371 to obtain kilometres instead).
distances = np.degrees(distances)

In [9]:
# Widen the text display so wide frames are printed on a single line per row.
pd.options.display.width = 20000

In [11]:
# Port lookup table, renamed to the column names used in the enriched dataset.
# rename() returns a new frame, so ports_df itself is left untouched.
closest_ports = ports_df[['portId', 'longitude', 'latitude']].rename(
    columns={
        'portId': 'closest_portId',
        'longitude': 'portLongitude',
        'latitude': 'portLatitude',
    }
)

# indices[i, 0] is the row position in ports_df of the port nearest to AIS
# row i, so the matching port rows can be picked positionally. This avoids a
# merge on portId, which would silently multiply AIS rows if a portId were
# ever repeated in ports_df.
nearest_port_rows = closest_ports.iloc[indices.ravel()]

# Create a temporary dataset with all columns from ais_train plus the ID and
# coordinates of the closest port. assign() builds a new frame, so ais_train
# is not modified; .to_numpy() drops the port index so values align by position.
temp_ais_train = ais_train.assign(
    closest_portId=nearest_port_rows['closest_portId'].to_numpy(),
    portLongitude=nearest_port_rows['portLongitude'].to_numpy(),
    portLatitude=nearest_port_rows['portLatitude'].to_numpy(),
)

In [12]:
# Verify the temporary dataset: show only the first rows, as the cell's last
# expression, so the notebook renders a compact table instead of a text dump.
temp_ais_train.head()

                        time    cog   sog  rot  heading  navstat       etaRaw  latitude  longitude                   vesselId                    portId            closest_portId  closest_distance  portLongitude  portLatitude
0        2024-01-01 00:00:25  284.0   0.7    0       88        0  01-09 23:00 -34.74370  -57.85130   61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f  61d36f150a1807568ff9a0ad          0.119529     -57.894167    -34.855278
1        2024-01-01 00:00:36  109.6   0.0   -6      347        1  12-29 20:00   8.89440  -79.47939   61e9f3d4b937134a3c4bff1f  634c4de270937fc01c3a7689  634c4de270937fc01c3a7689          0.090249     -79.533000      8.967000
2        2024-01-01 00:01:45  111.0  11.0    0      112        0  01-02 09:00  39.19065  -76.47567   61e9f436b937134a3c4c0131  61d3847bb7b7526e1adf3d19  61d3847bb7b7526e1adf3d19          0.093149     -76.558889     39.232500
3        2024-01-01 00:03:11   96.4   0.0    0      142        1  12-31 20:00 -34.41189  151.02067  