In [160]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [161]:
# Load the pipe-delimited training extract into the working frame `df`
df = pd.read_csv('../first_50000_rows.csv', delimiter='|')

In [162]:
# Parse the raw 'time' column into pandas datetimes so time arithmetic works downstream
df['time'] = df['time'].pipe(pd.to_datetime)

In [163]:
# Build "last known state" features: every row receives the values of a reference
# observation of the same vessel plus the minutes elapsed since that observation.
# Each vessel's history is cut into independent 5-day windows (7200 minutes).
WINDOW_MINUTES = 5 * 24 * 60

# Minutes since the vessel's EARLIEST observation. min() (rather than iloc[0]) keeps
# the offset non-negative even if the file is not in chronological order, so window 0
# always exists and is always the first window.
df['time_diff'] = df.groupby('vesselId')['time'].transform(
    lambda x: (x - x.min()).dt.total_seconds() / 60
)
df['window_starts'] = (df['time_diff'] // WINDOW_MINUTES).astype(int)

# Initialize columns for last known values and time_since_last_known
cols_to_propagate = ['latitude', 'longitude', 'cog', 'navstat', 'sog', 'heading', 'rot']
for col in cols_to_propagate:
    df[f'last_known_{col}'] = np.nan
df['time_since_last_known'] = np.nan

# Iterate over a time-ordered copy so that iloc[0] / iloc[-1] inside each window really
# are the earliest / latest observations. Results are written back to `df` by index
# label, so the row order of `df` itself is left untouched.
chronological = df.sort_values('time', kind='stable')

for vessel_id, vessel_data in chronological.groupby('vesselId'):
    last_known_values = None
    last_known_time = None

    for window, window_data in vessel_data.groupby('window_starts'):
        window_index = window_data.index

        if last_known_time is None:
            # First window of the vessel: there is no earlier window to look back to,
            # so every row in it is anchored on the window's first observation.
            reference_values = {col: window_data[col].iloc[0] for col in cols_to_propagate}
            reference_time = window_data['time'].iloc[0]
        else:
            # Later windows: anchor on the last row of the previous non-empty window.
            reference_values = last_known_values
            reference_time = last_known_time

        for col in cols_to_propagate:
            df.loc[window_index, f'last_known_{col}'] = reference_values[col]
        df.loc[window_index, 'time_since_last_known'] = (
            (window_data['time'] - reference_time).dt.total_seconds() / 60
        )

        # Update last known values and time for the next window
        last_known_values = {col: window_data[col].iloc[-1] for col in cols_to_propagate}
        last_known_time = window_data['time'].iloc[-1]

# The earliest row of each vessel is its own reference observation, so its
# time_since_last_known is already 0 and needs no separate reset.

In [164]:
# Split the last known speed over ground into two components using the last known
# course over ground (converted from degrees to radians): x uses cos, y uses sin.
# NOTE(review): if COG is a compass bearing (clockwise from north), cos yields the
# northward component — confirm the x/y naming matches the intended axes.
last_known_cog_rad = np.deg2rad(df['last_known_cog'])
df['last_known_sog_x'] = np.cos(last_known_cog_rad) * df['last_known_sog']
df['last_known_sog_y'] = np.sin(last_known_cog_rad) * df['last_known_sog']

In [165]:
# df = df.drop(columns=['cog','sog','rot','heading', 'navstat', 'etaRaw','vesselId', 'portId'])
# df['last_known_cog_sin'] = np.sin(np.radians(df['last_known_cog']))
# df['last_known_cog_cos'] = np.cos(np.radians(df['last_known_cog']))
# df['last_known_heading_sin'] = np.sin(np.radians(df['last_known_heading']))
# df['last_known_heading_cos'] = np.cos(np.radians(df['last_known_heading']))

# # Define the set of values that correspond to the "underway" state
# underway_values = {0, 3, 4, 7, 9, 10, 11, 12, 13, 14, 15}
# # Create the 'underway' column: 1 if 'navstat' is in underway_values, otherwise 0
# df['last_navstat_underway'] = df['last_known_navstat'].apply(lambda x: 1 if x in underway_values else 0)
# # Create the 'anchored' column: 1 if 'navstat' is not in underway_values, otherwise 0
# df['last_navstat_anchored'] = df['last_known_navstat'].apply(lambda x: 1 if x not in underway_values else 0)

# #TODO: Add distance-travelled / triangulation features

# df = df.drop(columns=['last_known_cog', 'last_known_heading', 'last_known_navstat'])


In [166]:
# Vessel metadata is pipe-delimited; the test set is read with the default comma separator
vessel_df = pd.read_csv('../vessels.csv', delimiter='|')
test_df = pd.read_csv('../ais_test.csv')

In [167]:
# Prepare test data: give every test row the most recent observation of its vessel
# found in `df`, then derive the same feature columns the training frame uses.
print("Start")
test_df_input = test_df.copy()

# Convert 'time' columns to datetime (unparseable values become NaT)
df['time'] = pd.to_datetime(df['time'], errors='coerce')
test_df_input['time'] = pd.to_datetime(test_df_input['time'], errors='coerce')

# Merge vessel information. A left join keeps every test row (and its order) even when
# a vessel is absent from the vessel table; the default inner join would drop such rows
# and silently misalign the positional predictions made later in the notebook.
vessel_df_test = vessel_df.drop(columns=["maxHeight", "maxWidth", "rampCapacity", "shippingLineId", "homePort"])
test_df_input = test_df_input.merge(vessel_df_test, on="vesselId", how="left")

# Retrieve the last known observation of each vessel from the training data.
# groupby().last() returns the last NON-NULL value per column, which can stitch
# together fields from different rows and depends on file order. Selecting the
# chronologically latest row keeps position, speed and timestamp from one instant.
last_values_df = (
    df.dropna(subset=['vesselId', 'time'])
      .sort_values('time', kind='stable')
      .drop_duplicates(subset='vesselId', keep='last')
)

# Merge last known features into test data; overlapping columns coming from the
# training side (e.g. 'time') receive the '_last' suffix.
test_df_input = test_df_input.merge(
    last_values_df,
    on='vesselId',
    how='left',
    suffixes=('', '_last'),
    validate='many_to_one',
)

# Predictions are matched to test rows by position, so no merge may add or drop rows.
assert len(test_df_input) == len(test_df), "merging changed the number of test rows"

# Calculate time_since_last_known in minutes
test_df_input['time_since_last_known'] = (
    (test_df_input['time'] - test_df_input['time_last']).dt.total_seconds() / 60
)

# Decompose the last known SOG and COG values into x and y components
# (same cos/sin convention as the training features)
test_df_input['last_known_sog_x'] = test_df_input['sog'] * np.cos(np.radians(test_df_input['cog']))
test_df_input['last_known_sog_y'] = test_df_input['sog'] * np.sin(np.radians(test_df_input['cog']))

# Select the final columns and rename the raw last-row fields to the training names
test_df_prepared = test_df_input[['longitude', 'latitude', 'last_known_sog_x', 'last_known_sog_y',
                                   'rot', 'time_since_last_known']]
test_df_prepared = test_df_prepared.rename(columns={
    'latitude': 'last_known_latitude',
    'longitude': 'last_known_longitude',
    'rot': 'last_known_rot'
})

# Fill any remaining NaN values with 0.
# NOTE(review): vessels with no history in `df` end up at latitude/longitude (0, 0)
# with time_since_last_known 0 — confirm this placeholder is acceptable for the model.
test_df_prepared = test_df_prepared.fillna(0)

print("Prepared Test Data Columns:", test_df_prepared.columns)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
test_df_prepared.info()


Start
Prepared Test Data Columns: Index(['last_known_longitude', 'last_known_latitude', 'last_known_sog_x',
       'last_known_sog_y', 'last_known_rot', 'time_since_last_known'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51739 entries, 0 to 51738
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   last_known_longitude   51739 non-null  float64
 1   last_known_latitude    51739 non-null  float64
 2   last_known_sog_x       51739 non-null  float64
 3   last_known_sog_y       51739 non-null  float64
 4   last_known_rot         51739 non-null  float64
 5   time_since_last_known  51739 non-null  float64
dtypes: float64(6)
memory usage: 2.4 MB
None


In [168]:
# Put the train and test frames into the same feature column order.
# Selecting an explicit column list already discards every other column, so no
# separate drop() is needed — and the cell can be re-run without a KeyError.
feature_columns = ['last_known_latitude', 'last_known_longitude', 'last_known_rot',
                   'last_known_sog_x', 'last_known_sog_y', 'time_since_last_known']
target_columns = ['latitude', 'longitude']

test_df_prepared = test_df_prepared[feature_columns]
df = df[feature_columns + target_columns]

In [169]:
# Display the prepared test feature frame
test_df_prepared

Unnamed: 0,last_known_latitude,last_known_longitude,last_known_rot,last_known_sog_x,last_known_sog_y,time_since_last_known
0,9.45839,-79.85208,5.0,0.0,-0.0,178612.933333
1,9.45839,-79.85208,5.0,0.0,-0.0,178645.900000
2,9.45839,-79.85208,5.0,0.0,-0.0,178660.866667
3,9.45839,-79.85208,5.0,0.0,-0.0,178681.900000
4,9.45839,-79.85208,5.0,0.0,-0.0,178699.900000
...,...,...,...,...,...,...
51734,0.00000,0.00000,0.0,0.0,0.0,0.000000
51735,0.00000,0.00000,0.0,0.0,0.0,0.000000
51736,0.00000,0.00000,0.0,0.0,0.0,0.000000
51737,0.00000,0.00000,0.0,0.0,0.0,0.000000


In [170]:
# Display the training frame (feature columns followed by the latitude/longitude targets)
df

Unnamed: 0,last_known_latitude,last_known_longitude,last_known_rot,last_known_sog_x,last_known_sog_y,time_since_last_known,latitude,longitude
0,-34.74370,-57.85130,0.0,0.169345,-0.679207,0.000000,-34.74370,-57.85130
1,8.89440,-79.47939,-6.0,-0.000000,0.000000,0.000000,8.89440,-79.47939
2,39.19065,-76.47567,0.0,-3.942047,10.269385,0.000000,39.19065,-76.47567
3,-34.41189,151.02067,0.0,-0.000000,0.000000,0.000000,-34.41189,151.02067
4,35.88379,-5.91636,0.0,-16.332040,-11.016100,0.000000,35.88379,-5.91636
...,...,...,...,...,...,...,...,...
49995,-24.01803,-43.30338,0.0,-3.490532,-18.473121,1449.266667,-24.03371,-46.34749
49996,39.09719,9.26304,0.0,7.536490,-13.429867,5659.833333,39.64086,-0.22345
49997,33.89162,134.96944,0.0,12.239184,1.221633,5182.416667,34.46860,138.21546
49998,50.44568,-3.13388,0.0,4.238665,3.012262,6199.950000,53.33446,7.16211


### Machine learning part

In [171]:
# Separate the training frame into model inputs and the two coordinates to predict
coordinate_columns = ['latitude', 'longitude']
y_train = df[coordinate_columns]
X_train = df.drop(columns=coordinate_columns)

# The test frame holds features only; its columns are expected to line up with X_train
X_test = test_df_prepared

In [None]:
import xgboost as xgb

# One shared feature matrix and one target series per coordinate
X = X_train
y_latitude, y_longitude = y_train['latitude'], y_train['longitude']

# Booster settings, identical for both regressors
params = dict(
    objective='reg:squarederror',  # squared-error regression
    eval_metric='rmse',            # evaluation metric
    max_depth=6,                   # example tree depth
    eta=0.1,                       # learning rate
    seed=42,                       # fixed seed for reproducibility
)

# Pair the features with each target in XGBoost's DMatrix container
dtrain_lat = xgb.DMatrix(X, label=y_latitude)
dtrain_lon = xgb.DMatrix(X, label=y_longitude)

# Fit two independent boosters (100 rounds each): one for latitude, one for longitude
model_latitude = xgb.train(params, dtrain_lat, num_boost_round=100)
model_longitude = xgb.train(params, dtrain_lon, num_boost_round=100)

In [None]:
# Wrap the test features for XGBoost
dtest = xgb.DMatrix(X_test)

# Run both boosters on the same test matrix (latitude first, then longitude)
pred_latitude, pred_longitude = (
    booster.predict(dtest) for booster in (model_latitude, model_longitude)
)

# Collect the two prediction arrays side by side, one row per test row
predictions = pd.DataFrame(dict(latitude=pred_latitude, longitude=pred_longitude))