In [11]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [12]:
ais_test = pd.read_csv('ais_test.csv')
ais_train = pd.read_csv('first_50000_rows.csv', sep='|')

# Merge test set with training set history
ais_train['time'] = pd.to_datetime(ais_train['time'])  # Ensure time is datetime
ais_test['time'] = pd.to_datetime(ais_test['time'])

# Sort by vesselId and time
ais_train = ais_train.sort_values(['vesselId', 'time'])
ais_test = ais_test.sort_values(['vesselId', 'time'])



# Concatenate both dataframes in order to add correct time series features

# label where data comes from before concatenation
ais_train['dataset'] = 'train'
ais_test['dataset'] = 'test'

combined_df = pd.concat([ais_train, ais_test], ignore_index=True)

# Sort by vesselId and time to ensure proper order for calculating lags
combined_df = combined_df.sort_values(['vesselId', 'time'])

In [13]:
# NEW FEATURES: Creating lag features and moving avg.
# ONE HOT ENCODING: Navstat is lagged, an then one hot encoded within all lag windows



# TODO: Ensure that lag does not point on any other vesselID.

# Create lag features (e.g., lag of 1, 2, 3 steps)
for lag in [1, 2, 3]:
    combined_df[f'sog_lag{lag}'] = combined_df.groupby('vesselId')['sog'].shift(lag).ffill()
    combined_df[f'cog_lag{lag}'] = combined_df.groupby('vesselId')['cog'].shift(lag).ffill()
    combined_df[f'rot_lag{lag}'] = combined_df.groupby('vesselId')['rot'].shift(lag).ffill()
    combined_df[f'heading_lag{lag}'] = combined_df.groupby('vesselId')['heading'].shift(lag).ffill()
    combined_df[f'navstat_lag{lag}'] = combined_df.groupby('vesselId')['navstat'].shift(lag).ffill()
    combined_df[f'etaRaw_lag{lag}'] = combined_df.groupby('vesselId')['etaRaw'].shift(lag).ffill()
    combined_df[f'latitude_lag{lag}'] = combined_df.groupby('vesselId')['latitude'].shift(lag).ffill()
    combined_df[f'longitude_lag{lag}'] = combined_df.groupby('vesselId')['longitude'].shift(lag).ffill()
    
# Adding moving averages on a set of attributes
combined_df['sog_ma5'] = combined_df.groupby('vesselId')['sog'].rolling(window=5, min_periods=1).apply(lambda x: x.dropna().mean()).reset_index(0, drop=True)
combined_df['cog_ma5'] = combined_df.groupby('vesselId')['cog'].rolling(window=5, min_periods=1).apply(lambda x: x.dropna().mean()).reset_index(0, drop=True)
combined_df['rot_ma5'] = combined_df.groupby('vesselId')['rot'].rolling(window=5, min_periods=1).apply(lambda x: x.dropna().mean()).reset_index(0, drop=True)
combined_df['heading_ma5'] = combined_df.groupby('vesselId')['heading'].rolling(window=5, min_periods=1).apply(lambda x: x.dropna().mean()).reset_index(0, drop=True)
combined_df['latitude_ma5'] = combined_df.groupby('vesselId')['latitude'].rolling(window=5, min_periods=1).apply(lambda x: x.dropna().mean()).reset_index(0, drop=True)
combined_df['longitude_ma5'] = combined_df.groupby('vesselId')['longitude'].rolling(window=5, min_periods=1).apply(lambda x: x.dropna().mean()).reset_index(0, drop=True)



# one-hot encode the lagged navstat lag features
lag_columns = ['navstat_lag1', 'navstat_lag2', 'navstat_lag3']
combined_df = pd.get_dummies(combined_df, columns=lag_columns, prefix=lag_columns)


In [14]:
## Split back


combined_df = combined_df.drop(columns=['cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'portId'])
combined_df

# Separate the enriched test set back from combined data
ais_test_enriched = combined_df[combined_df['dataset'] == 'test'].copy()
ais_train_enriched = combined_df[combined_df['dataset'] == 'train'].copy()

# Drop the 'dataset' column as it's no longer needed
ais_test_enriched = ais_test_enriched.drop(columns=['dataset', ])
ais_train_enriched = ais_train_enriched.drop(columns=['dataset', ])

ais_test_enriched

Unnamed: 0,time,latitude,longitude,vesselId,ID,scaling_factor,sog_lag1,cog_lag1,rot_lag1,heading_lag1,etaRaw_lag1,latitude_lag1,longitude_lag1,sog_lag2,cog_lag2,rot_lag2,heading_lag2,etaRaw_lag2,latitude_lag2,longitude_lag2,sog_lag3,cog_lag3,rot_lag3,heading_lag3,etaRaw_lag3,latitude_lag3,longitude_lag3,sog_ma5,cog_ma5,rot_ma5,heading_ma5,latitude_ma5,longitude_ma5,navstat_lag1_0.0,navstat_lag1_1.0,navstat_lag1_2.0,navstat_lag1_3.0,navstat_lag1_5.0,navstat_lag1_8.0,navstat_lag1_15.0,navstat_lag2_0.0,navstat_lag2_1.0,navstat_lag2_2.0,navstat_lag2_3.0,navstat_lag2_5.0,navstat_lag2_8.0,navstat_lag2_15.0,navstat_lag3_0.0,navstat_lag3_1.0,navstat_lag3_2.0,navstat_lag3_3.0,navstat_lag3_5.0,navstat_lag3_8.0,navstat_lag3_15.0
50000,2024-05-08 00:12:27,,,61e9f38eb937134a3c4bfd8d,4.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.4,242.0,0.0,246.0,01-06 11:30,49.50783,-4.15792,12.9,243.0,0.0,246.0,01-06 11:30,49.54392,-4.05385,13.200000,241.75,0.0,245.25,49.532125,-4.089072,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50001,2024-05-08 00:39:27,,,61e9f38eb937134a3c4bfd8d,201.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.4,242.0,0.0,246.0,01-06 11:30,49.50783,-4.15792,13.166667,243.00,0.0,246.00,49.517083,-4.131157,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50002,2024-05-08 01:33:28,,,61e9f38eb937134a3c4bfd8d,583.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.300000,243.00,0.0,246.00,49.503665,-4.169810,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50003,2024-05-08 01:51:26,,,61e9f38eb937134a3c4bfd8d,701.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.200000,244.00,0.0,246.00,49.499500,-4.181700,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50004,2024-05-08 02:03:29,,,61e9f38eb937134a3c4bfd8d,829.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,,,,,,,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101734,2024-05-12 22:37:33,,,clh6aqawa0007gh0z9h6zi9bo,51161.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
101735,2024-05-12 22:58:05,,,clh6aqawa0007gh0z9h6zi9bo,51302.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
101736,2024-05-12 23:18:20,,,clh6aqawa0007gh0z9h6zi9bo,51444.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
101737,2024-05-12 23:39:21,,,clh6aqawa0007gh0z9h6zi9bo,51595.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False


In [15]:
ais_test_enriched.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51739 entries, 50000 to 101738
Data columns (total 54 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   time               51739 non-null  datetime64[ns]
 1   latitude           0 non-null      float64       
 2   longitude          0 non-null      float64       
 3   vesselId           51739 non-null  object        
 4   ID                 51739 non-null  float64       
 5   scaling_factor     51739 non-null  float64       
 6   sog_lag1           51739 non-null  float64       
 7   cog_lag1           51739 non-null  float64       
 8   rot_lag1           51739 non-null  float64       
 9   heading_lag1       51739 non-null  float64       
 10  etaRaw_lag1        51739 non-null  object        
 11  latitude_lag1      51739 non-null  float64       
 12  longitude_lag1     51739 non-null  float64       
 13  sog_lag2           51739 non-null  float64       
 14  cog_la

In [16]:
# Check for null values in each column
null_values = ais_test_enriched.isnull().sum()

# Display the result
print(null_values)

time                     0
latitude             51739
longitude            51739
vesselId                 0
ID                       0
scaling_factor           0
sog_lag1                 0
cog_lag1                 0
rot_lag1                 0
heading_lag1             0
etaRaw_lag1              0
latitude_lag1            0
longitude_lag1           0
sog_lag2                 0
cog_lag2                 0
rot_lag2                 0
heading_lag2             0
etaRaw_lag2              0
latitude_lag2            0
longitude_lag2           0
sog_lag3                 0
cog_lag3                 0
rot_lag3                 0
heading_lag3             0
etaRaw_lag3              0
latitude_lag3            0
longitude_lag3           0
sog_ma5              51067
cog_ma5              51067
rot_ma5              51067
heading_ma5          51067
latitude_ma5         51067
longitude_ma5        51067
navstat_lag1_0.0         0
navstat_lag1_1.0         0
navstat_lag1_2.0         0
navstat_lag1_3.0         0
n

In [17]:
ais_test_enriched

Unnamed: 0,time,latitude,longitude,vesselId,ID,scaling_factor,sog_lag1,cog_lag1,rot_lag1,heading_lag1,etaRaw_lag1,latitude_lag1,longitude_lag1,sog_lag2,cog_lag2,rot_lag2,heading_lag2,etaRaw_lag2,latitude_lag2,longitude_lag2,sog_lag3,cog_lag3,rot_lag3,heading_lag3,etaRaw_lag3,latitude_lag3,longitude_lag3,sog_ma5,cog_ma5,rot_ma5,heading_ma5,latitude_ma5,longitude_ma5,navstat_lag1_0.0,navstat_lag1_1.0,navstat_lag1_2.0,navstat_lag1_3.0,navstat_lag1_5.0,navstat_lag1_8.0,navstat_lag1_15.0,navstat_lag2_0.0,navstat_lag2_1.0,navstat_lag2_2.0,navstat_lag2_3.0,navstat_lag2_5.0,navstat_lag2_8.0,navstat_lag2_15.0,navstat_lag3_0.0,navstat_lag3_1.0,navstat_lag3_2.0,navstat_lag3_3.0,navstat_lag3_5.0,navstat_lag3_8.0,navstat_lag3_15.0
50000,2024-05-08 00:12:27,,,61e9f38eb937134a3c4bfd8d,4.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.4,242.0,0.0,246.0,01-06 11:30,49.50783,-4.15792,12.9,243.0,0.0,246.0,01-06 11:30,49.54392,-4.05385,13.200000,241.75,0.0,245.25,49.532125,-4.089072,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50001,2024-05-08 00:39:27,,,61e9f38eb937134a3c4bfd8d,201.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.4,242.0,0.0,246.0,01-06 11:30,49.50783,-4.15792,13.166667,243.00,0.0,246.00,49.517083,-4.131157,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50002,2024-05-08 01:33:28,,,61e9f38eb937134a3c4bfd8d,583.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.300000,243.00,0.0,246.00,49.503665,-4.169810,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50003,2024-05-08 01:51:26,,,61e9f38eb937134a3c4bfd8d,701.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.200000,244.00,0.0,246.00,49.499500,-4.181700,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
50004,2024-05-08 02:03:29,,,61e9f38eb937134a3c4bfd8d,829.0,0.3,13.2,244.0,0.0,246.0,01-06 11:30,49.4995,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,13.2,244.0,0.0,246.0,01-06 11:30,49.49950,-4.18170,,,,,,,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101734,2024-05-12 22:37:33,,,clh6aqawa0007gh0z9h6zi9bo,51161.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
101735,2024-05-12 22:58:05,,,clh6aqawa0007gh0z9h6zi9bo,51302.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
101736,2024-05-12 23:18:20,,,clh6aqawa0007gh0z9h6zi9bo,51444.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
101737,2024-05-12 23:39:21,,,clh6aqawa0007gh0z9h6zi9bo,51595.0,0.1,0.0,0.0,1.0,134.0,01-05 05:00,53.8618,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,0.0,0.0,1.0,134.0,01-05 05:00,53.86180,8.72498,,,,,,,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
