In [124]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [125]:
# Load the raw inputs. The training file is pipe-delimited; the test file is
# read with pandas' default comma separator.
# NOTE(review): the training file name suggests it is only a 50,000-row sample
# of the full training set -- confirm before using the output for modelling.
ais_train = pd.read_csv('../first_50000_rows.csv', sep='|')
ais_test = pd.read_csv('../ais_test.csv')

In [126]:
# Tag every row with its origin so the combined frame can be split again later,
# and remove the two test-only columns so both frames can be stacked.
ais_test = ais_test.drop(columns=['ID', 'scaling_factor']).assign(source='test')
ais_train = ais_train.assign(source='train')

# Stack train on top of test with a fresh 0..n-1 index, then parse the
# timestamps into datetimes.
df = pd.concat([ais_train, ais_test], ignore_index=True)
df = df.assign(time=pd.to_datetime(df['time']))

In [127]:
# Build "last known state" features per vessel over fixed 5-day windows.
#
# Every row receives:
#   * last_known_<col>      -- value of <col> at the reference row, and
#   * time_since_last_known -- minutes between the row and that reference row.
# In a vessel's first window the reference row is the vessel's earliest row;
# in later windows it is the most recent earlier row that carries at least one
# observed value.
WINDOW_MINUTES = 5 * 24 * 60  # 5 days = 7200 minutes

# Minutes elapsed since each vessel's earliest timestamp. min() is used instead
# of the first row in frame order so the offset is never negative when the
# rows of a vessel are not already sorted by time.
df['time_diff'] = df.groupby('vesselId')['time'].transform(lambda x: (x - x.min()).dt.total_seconds() / 60)
df['window_starts'] = (df['time_diff'] // WINDOW_MINUTES).astype(int)

# Initialize columns for last known values and time_since_last_known
cols_to_propagate = ['latitude', 'longitude', 'cog', 'navstat', 'sog', 'heading', 'rot']
for col in cols_to_propagate:
    df[f'last_known_{col}'] = np.nan
df['time_since_last_known'] = np.nan

first_row_labels = []
for vessel_id, vessel_data in df.groupby('vesselId'):
    # Work on a time-ordered view of the vessel. Results are written back by
    # index label, so the row order of df itself is left untouched.
    vessel_data = vessel_data.sort_values('time', kind='stable')
    first_row_labels.append(vessel_data.index[0])

    last_known_values = {}
    last_known_time = None
    seen_observation = False

    for window, window_data in vessel_data.groupby('window_starts'):
        window_index = window_data.index

        if last_known_time is None:
            # First window of the vessel: there is no earlier state, so the
            # window's own first row acts as the reference.
            reference_values = {col: window_data[col].iloc[0] for col in cols_to_propagate}
            reference_time = window_data['time'].iloc[0]
        else:
            # Later windows: reference the state carried over from earlier windows.
            reference_values = last_known_values
            reference_time = last_known_time

        for col in cols_to_propagate:
            df.loc[window_index, f'last_known_{col}'] = reference_values[col]
        df.loc[window_index, 'time_since_last_known'] = (window_data['time'] - reference_time).dt.total_seconds() / 60

        # Update the carried state from the last row of this window that holds
        # at least one observed value. Rows without any observation (every
        # propagated column is NaN) must not overwrite a known state with NaN.
        has_observation = window_data[cols_to_propagate].notna().any(axis=1)
        if has_observation.any():
            state_rows = window_data[has_observation]
            seen_observation = True
        elif not seen_observation:
            # Nothing has been observed for this vessel yet: fall back to the
            # last row of the window so later windows still have a time reference.
            state_rows = window_data
        else:
            # No new observation in this window: keep the previous state.
            continue

        last_known_values = {col: state_rows[col].iloc[-1] for col in cols_to_propagate}
        last_known_time = state_rows['time'].iloc[-1]

# Set time_since_last_known to 0 for the (chronologically) first row of each vessel
first_rows = pd.Index(first_row_labels)
df.loc[first_rows, 'time_since_last_known'] = 0

In [128]:
# Remove the raw per-row signals and identifiers; only the derived
# last_known_* features are kept from here on.
raw_columns = ['cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'vesselId', 'portId']
df = df.drop(columns=raw_columns)

# Encode the two angular features (given in degrees) as sine/cosine pairs.
for angle_name in ('cog', 'heading'):
    angle_rad = np.radians(df[f'last_known_{angle_name}'])
    df[f'last_known_{angle_name}_sin'] = np.sin(angle_rad)
    df[f'last_known_{angle_name}_cos'] = np.cos(angle_rad)

# navstat codes that this notebook treats as "underway"
underway_values = {0, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15}
is_underway = df['last_known_navstat'].isin(underway_values)
# 'underway' flag: 1 when the last known navstat is in underway_values, else 0
df['last_navstat_underway'] = is_underway.astype('int64')
# 'anchored' flag: the exact complement of the flag above. A missing navstat
# is not in the set, so such rows are flagged as anchored.
df['last_navstat_anchored'] = (~is_underway).astype('int64')

# TODO: distance travelled / triangulation features

# The encoded columns replace their raw counterparts.
encoded_sources = ['last_known_cog', 'last_known_heading', 'last_known_navstat']
df = df.drop(columns=encoded_sources)



In [129]:
# Persist the enriched frame (train and test rows together). The row index is
# written as the first, unnamed column of the file.
df.to_csv('hoy.csv', index=True)

In [None]:
## Split the combined frame back into its train and test parts.
# The 'source' tag has served its purpose once the rows are separated, so it
# is removed from both results.
is_train = df['source'] == 'train'
is_test = df['source'] == 'test'

enriched_train = df[is_train].drop(columns=['source'])
enriched_test = df[is_test].drop(columns=['source'])