In [21]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [22]:
# Read the three input tables. The training file and the vessel table are
# pipe-delimited; the test file is read with pandas' default comma delimiter.
ais_train = pd.read_csv('../first_50000_rows.csv', sep='|')
ais_test = pd.read_csv('../ais_test.csv')
vessel_df = pd.read_csv('../vessels.csv', sep='|')

# Parse the 'time' column of both AIS tables into datetimes so that time
# arithmetic works downstream.
for frame in (ais_train, ais_test):
    frame['time'] = pd.to_datetime(frame['time'])

In [23]:
# Build "last known state" features for every AIS row.
#
# Each vessel's track is cut into consecutive 5-day windows measured from that
# vessel's earliest report. Every row of a window receives, as last_known_*,
# the values of the final report in the vessel's previous window, together
# with the minutes elapsed since that report (time_since_last_known). The
# first window has no predecessor, so it is anchored on the vessel's earliest
# report instead.

# Window length in minutes (5 days = 7200 minutes).
WINDOW_MINUTES = 5 * 24 * 60

# Columns whose reference value is carried into the following window.
cols_to_propagate = ['latitude', 'longitude', 'cog', 'navstat', 'sog', 'heading', 'rot']

# Minutes since the vessel's earliest report. Anchoring on the minimum (rather
# than on whichever row happens to come first) keeps time_diff non-negative,
# so window numbers start at 0 even if the file is not sorted by time.
first_report_time = ais_train.groupby('vesselId')['time'].transform('min')
ais_train['time_diff'] = (ais_train['time'] - first_report_time).dt.total_seconds() / 60
ais_train['window_starts'] = (ais_train['time_diff'] // WINDOW_MINUTES).astype(int)

# (Re)initialise the output columns so re-running the cell starts clean.
for col in cols_to_propagate:
    ais_train[f'last_known_{col}'] = np.nan
ais_train['time_since_last_known'] = np.nan

# Chronological view of the columns the loop reads. sort_values keeps the
# original row labels, so the .loc writes below still hit the right rows of
# ais_train; a stable sort leaves already-ordered data in its existing order.
chronological = (
    ais_train[['vesselId', 'time', 'window_starts'] + cols_to_propagate]
    .sort_values('time', kind='mergesort')
)

for vessel_id, vessel_data in chronological.groupby('vesselId'):
    # Until a window has been completed, the vessel's earliest report is the
    # reference (this is what the first window is filled with).
    last_known_values = {col: vessel_data[col].iloc[0] for col in cols_to_propagate}
    last_known_time = vessel_data['time'].iloc[0]

    for window, window_data in vessel_data.groupby('window_starts'):
        window_index = window_data.index

        # Every row of the window shares the same reference report.
        for col in cols_to_propagate:
            ais_train.loc[window_index, f'last_known_{col}'] = last_known_values[col]
        ais_train.loc[window_index, 'time_since_last_known'] = (
            (window_data['time'] - last_known_time).dt.total_seconds() / 60
        )

        # The final report of this window becomes the reference for the next one.
        last_known_values = {col: window_data[col].iloc[-1] for col in cols_to_propagate}
        last_known_time = window_data['time'].iloc[-1]

# The earliest report of each vessel is its own reference, so its elapsed time is 0.
first_rows = chronological.groupby('vesselId').head(1).index
ais_train.loc[first_rows, 'time_since_last_known'] = 0

In [25]:
# Display the training frame, now including time_diff, window_starts,
# the last_known_* columns and time_since_last_known.
ais_train

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,time_diff,window_starts,last_known_latitude,last_known_longitude,last_known_cog,last_known_navstat,last_known_sog,last_known_heading,last_known_rot,time_since_last_known
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.74370,-57.85130,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,0.000000,0,-34.74370,-57.85130,284.0,0.0,0.7,88.0,0.0,0.000000
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.89440,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,0.000000,0,8.89440,-79.47939,109.6,1.0,0.0,347.0,-6.0,0.000000
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,0.000000,0,39.19065,-76.47567,111.0,0.0,11.0,112.0,0.0,0.000000
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,0.000000,0,-34.41189,151.02067,96.4,1.0,0.0,142.0,0.0,0.000000
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,0.000000,0,35.88379,-5.91636,214.0,0.0,19.7,215.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2024-01-05 07:36:12,5.5,6.1,0,0,0,01-04 20:00,-24.03371,-46.34749,61e9f410b937134a3c4c0049,61d36fdf0a1807568ff9a1b0,1449.266667,0,-24.01803,-43.30338,259.3,0.0,18.8,261.0,0.0,1449.266667
49996,2024-01-05 07:36:13,94.9,0.0,0,31,5,01-02 18:00,39.64086,-0.22345,61e9f468b937134a3c4c0289,61d37fb629b60f6113c89e99,5659.833333,0,39.09719,9.26304,299.3,0.0,15.4,301.0,0.0,5659.833333
49997,2024-01-05 07:36:16,221.9,17.8,0,223,0,01-11 22:00,34.46860,138.21546,61e9f3aab937134a3c4bfe0f,61d37a221366c3998241d928,5182.416667,0,33.89162,134.96944,5.7,0.0,12.3,5.0,0.0,5182.416667
49998,2024-01-05 07:36:16,324.1,0.0,0,268,5,01-03 20:30,53.33446,7.16211,61e9f397b937134a3c4bfdaf,61d375e893c6feb83e5eb3e4,6199.950000,0,50.44568,-3.13388,35.4,0.0,5.2,13.0,0.0,6199.950000
