In [3]:
import pandas as pd
import numpy as np
# Lift the column display limit so wide DataFrames are shown without "..." truncation.
pd.options.display.max_columns = None

In [6]:
# Read both AIS extracts. The training sample is pipe-separated; the test file
# is read with pandas' default comma separator.
ais_test = pd.read_csv('ais_test.csv')
ais_train = pd.read_csv('first_50000_rows.csv', sep='|')

# For each frame: parse 'time' into datetimes, order rows chronologically per
# vessel, and tag every row with its origin so the two sets can be separated
# again after feature engineering.
ais_train = (
    ais_train
    .assign(time=pd.to_datetime(ais_train['time']))
    .sort_values(['vesselId', 'time'])
    .assign(dataset='train')
)
ais_test = (
    ais_test
    .assign(time=pd.to_datetime(ais_test['time']))
    .sort_values(['vesselId', 'time'])
    .assign(dataset='test')
)

# Stack train and test into a single frame (fresh 0..n-1 index) so that
# time-series features can be computed over both at once, then re-sort by
# vessel and time because the lag calculations depend on that row order.
combined_df = (
    pd.concat([ais_train, ais_test], ignore_index=True)
    .sort_values(['vesselId', 'time'])
)

In [None]:
# NEW FEATURES: per-vessel lag features and moving averages.
# ONE-HOT ENCODING: navstat is lagged, and then one-hot encoded for every lag.

# Tunable feature-engineering parameters.
LAG_STEPS = [1, 2, 3]
LAG_SOURCE_COLUMNS = ['sog', 'cog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude', 'longitude']
MA_SOURCE_COLUMNS = ['sog', 'cog', 'rot', 'heading', 'latitude', 'longitude']
MA_WINDOW = 5

# Create lag features (value of the same vessel 1, 2, 3 rows earlier).
# combined_df must already be sorted by ['vesselId', 'time'] for "earlier" to
# mean "earlier in time".
for lag in LAG_STEPS:
    for col in LAG_SOURCE_COLUMNS:
        lagged = combined_df.groupby('vesselId')[col].shift(lag)
        # groupby().shift() returns a plain (ungrouped) Series, so calling
        # .ffill() directly on it would copy the previous vessel's last value
        # into the leading NaNs of the next vessel. Forward-fill inside each
        # vessel instead: gaps are filled only from that vessel's own history
        # and a vessel's first rows stay NaN.
        combined_df[f'{col}_lag{lag}'] = lagged.groupby(combined_df['vesselId']).ffill()

# Moving averages over the last MA_WINDOW rows of each vessel.
# rolling().mean() ignores NaN inside the window and, with min_periods=1,
# yields a value as soon as one non-NaN observation is available -- the same
# result as applying `x.dropna().mean()` per window, without the per-window
# Python callback.
# NOTE(review): the window includes the current row, so wherever the raw value
# is present the average contains the row's own value. Confirm this is intended
# if latitude/longitude are prediction targets.
for col in MA_SOURCE_COLUMNS:
    combined_df[f'{col}_ma{MA_WINDOW}'] = (
        combined_df.groupby('vesselId')[col]
        .rolling(window=MA_WINDOW, min_periods=1)
        .mean()
        # Drop the vesselId index level added by groupby so the result aligns
        # with combined_df's own index on assignment.
        .reset_index(level=0, drop=True)
    )

# One-hot encode the lagged navstat features; the original navstat_lag*
# columns are replaced by their indicator columns.
lag_columns = [f'navstat_lag{lag}' for lag in LAG_STEPS]
combined_df = pd.get_dummies(combined_df, columns=lag_columns, prefix=lag_columns)


In [None]:
## Split back

# Drop the raw signal columns together with 'ID' and 'portId', keeping only the
# engineered features. errors='ignore' keeps this cell re-runnable: on a second
# execution the columns are already gone and a plain drop would raise KeyError.
# NOTE(review): 'ID' is removed from the test rows as well -- confirm it is not
# needed later to map predictions back to the original test rows.
combined_df = combined_df.drop(
    columns=['ID', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'portId'],
    errors='ignore',
)

# Separate the enriched test and train sets back out of the combined data,
# using the 'dataset' label written before concatenation.
is_test = combined_df['dataset'] == 'test'
is_train = combined_df['dataset'] == 'train'

# The 'dataset' label has served its purpose once the rows are split, so it is
# dropped from both results.
ais_test_enriched = combined_df[is_test].copy().drop(columns=['dataset'])
ais_train_enriched = combined_df[is_train].copy().drop(columns=['dataset'])

ais_test_enriched

In [None]:
# Print a concise summary of the enriched test frame (columns, non-null counts, dtypes).
ais_test_enriched.info()

In [None]:
# Tally the missing values in every column of the enriched test set and print
# the per-column counts.
null_values = ais_test_enriched.isna().sum()
print(null_values)