In [4]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None


In [None]:
# AIS extract used as the source of each vessel's last known state
# (pipe-delimited file).
df = pd.read_csv(
    '../first_50000_rows.csv',
    sep='|',
)

# Rows to build prediction features for; read with pandas' default
# comma separator.
test_df = pd.read_csv('../ais_test.csv')

In [7]:
# Per-vessel metadata (pipe-delimited); joined onto the test rows by vesselId.
vessel_df = pd.read_csv(
    '../vessels.csv',
    sep='|',
)

In [None]:
# Prepare test data
#
# Builds `test_df_prepared`: one row per test record, carrying the vessel's
# last known observation from `df` plus the time elapsed between that
# observation and the test timestamp.
# Relies on `test_df`, `df` and `vessel_df` loaded in the earlier cells.
print("Start")
test_df_input = test_df.copy()

# Convert 'time' columns to datetime (unparseable values become NaT)
df['time'] = pd.to_datetime(df['time'], errors='coerce')
test_df_input['time'] = pd.to_datetime(test_df_input['time'], errors='coerce')

# Merge vessel information.
# how="left" keeps every test row even when its vessel is missing from
# vessel_df; the default inner join would silently drop those rows.
vessel_df_test = vessel_df.drop(
    columns=["maxHeight", "maxWidth", "rampCapacity", "shippingLineId", "homePort"]
)
test_df_input = test_df_input.merge(vessel_df_test, on="vesselId", how="left")

# Retrieve last known values from the training data.
# GroupBy.last() returns the last non-null entry of each column in row order,
# so the rows must be in chronological order for "last" to mean "most recent".
# The stable sort keeps file order for equal timestamps; rows whose time could
# not be parsed (NaT) are placed first so they only serve as a fallback.
last_values_df = (
    df.sort_values('time', kind='stable', na_position='first')
    .groupby('vesselId')
    .last()
    .reset_index()
)

# Merge last known features into test data.
# Columns present on both sides keep the test value under the plain name and
# get the training value under '<name>_last' (e.g. 'time_last').
test_df_input = test_df_input.merge(last_values_df, on='vesselId', how='left', suffixes=('', '_last'))

# Calculate time_since_last_known in minutes
elapsed = test_df_input['time'] - test_df_input['time_last']
test_df_input['time_since_last_known'] = elapsed.dt.total_seconds() / 60

# Decompose the last known SOG and COG values into x and y components.
# NOTE(review): assumes test_df has no 'sog'/'cog' columns of its own, so the
# unsuffixed columns are the ones merged in from last_values_df - confirm.
cog_radians = np.radians(test_df_input['cog'])
test_df_input['last_known_sog_x'] = test_df_input['sog'] * np.cos(cog_radians)
test_df_input['last_known_sog_y'] = test_df_input['sog'] * np.sin(cog_radians)

# Calculate moving and stationary flags based on last known values.
# Missing navstat values yield False for both flags.
# NOTE(review): the code lists presumably follow the AIS navigational-status
# table - confirm against the training feature pipeline.
test_df_input['last_known_isMoving'] = test_df_input['navstat'].isin([0, 2, 3, 4])
test_df_input['last_known_isStationary'] = test_df_input['navstat'].isin([1, 5, 6])

# Select and rename final columns to match the training data features.
# NOTE(review): 'area_covered' is not created anywhere in this cell; it must
# already be a column of test_df, vessel_df or df - confirm, otherwise this
# selection raises KeyError.
test_df_prepared = test_df_input[['ID', 'longitude', 'latitude', 'last_known_sog_x', 'last_known_sog_y',
                                  'last_known_isMoving', 'last_known_isStationary', 'rot',
                                  'time_since_last_known', 'area_covered']]

# Rename last known columns for latitude, longitude, and rot to match training set format
test_df_prepared = test_df_prepared.rename(columns={
    'latitude': 'last_known_latitude',
    'longitude': 'last_known_longitude',
    'rot': 'last_known_rot'
})

# Fill any remaining NaN values with 0 (e.g. vessels with no row in df)
test_df_prepared = test_df_prepared.fillna(0)

print("Prepared Test Data Columns:", test_df_prepared.columns)
# info() writes its summary itself and returns None, so it is not wrapped in print().
test_df_prepared.info()

test_df_prepared