In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb

In [9]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the raw AIS tables. The test file uses the default comma separator,
# while the training extract is pipe-delimited.
ais_test = pd.read_csv('ais_test.csv')
ais_train = pd.read_csv('first_50000_rows.csv', sep='|')

# For each table: parse 'time' to datetime, order rows by vessel and then
# chronologically, and tag every row with its origin so the two tables can be
# told apart again after they are concatenated.
ais_train = (
    ais_train
    .assign(time=pd.to_datetime(ais_train['time']))
    .sort_values(['vesselId', 'time'])
    .assign(dataset='train')
)
ais_test = (
    ais_test
    .assign(time=pd.to_datetime(ais_test['time']))
    .sort_values(['vesselId', 'time'])
    .assign(dataset='test')
)

In [10]:
# Stack the training history and the test rows into one frame so per-vessel
# features can later be computed across both.
#
# `ais_train` and `ais_test` come out of the loading cell already parsed to
# datetime, sorted by ('vesselId', 'time') and tagged with a 'dataset' label.
# Those steps are therefore not repeated here; the check below only confirms
# that the labels are in place before the frames are combined.
assert ais_train['dataset'].eq('train').all() and ais_test['dataset'].eq('test').all(), \
    "unexpected 'dataset' labels - run the loading cell before this one"

# ignore_index=True gives every row a unique integer label; the sort then
# interleaves train and test rows chronologically within each vessel.
combined_df = (
    pd.concat([ais_train, ais_test], ignore_index=True)
    .sort_values(['vesselId', 'time'])
)
combined_df

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,dataset,ID,scaling_factor
0,2024-01-01 00:14:36,348.0,0.0,0.0,333.0,5.0,12-29 21:00,51.30883,3.23027,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,train,,
1,2024-01-01 00:35:36,8.0,0.0,0.0,333.0,5.0,12-29 21:00,51.30882,3.23025,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,train,,
2,2024-01-01 00:56:34,20.0,0.0,0.0,333.0,5.0,12-29 21:00,51.30882,3.23027,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,train,,
3,2024-01-01 01:17:35,6.0,0.0,0.0,334.0,5.0,12-29 21:00,51.30880,3.23023,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,train,,
4,2024-01-01 01:35:36,353.0,0.0,0.0,334.0,5.0,12-29 21:00,51.30882,3.23030,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,train,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101734,2024-05-12 22:37:33,,,,,,,,,clh6aqawa0007gh0z9h6zi9bo,,test,51161.0,0.1
101735,2024-05-12 22:58:05,,,,,,,,,clh6aqawa0007gh0z9h6zi9bo,,test,51302.0,0.1
101736,2024-05-12 23:18:20,,,,,,,,,clh6aqawa0007gh0z9h6zi9bo,,test,51444.0,0.1
101737,2024-05-12 23:39:21,,,,,,,,,clh6aqawa0007gh0z9h6zi9bo,,test,51595.0,0.1


In [11]:
# Feature engineering, model fit and submission export.
#
# This cell modifies `combined_df` in place (adds lag / rolling columns,
# replaces 'navstat' with dummy columns, drops 'ID'), so it cannot be re-run on
# its own: rebuild `combined_df` first.
#
# NOTE(review): the displayed tail of `combined_df` shows test rows with no
# sog / cog / rot / heading / navstat values. If that holds for all test rows,
# those columns and most lag / rolling features derived from them are NaN at
# prediction time while the model is fit on populated values - confirm, and
# consider restricting the features to what is known at prediction time.

# Take each test row's ID from `combined_df` itself, before the column is
# dropped below, so that IDs and predictions are read from the same rows in the
# same order instead of relying on `ais_test` happening to be ordered alike.
test_ids = combined_df.loc[combined_df['dataset'] == 'test', 'ID']

# Create lag features for 'sog', 'cog', 'rot', and 'heading' (previous 1-3
# observations of the same vessel).
for lag in [1, 2, 3]:
    for feature in ['sog', 'cog', 'rot', 'heading']:
        combined_df[f'{feature}_lag{lag}'] = combined_df.groupby('vesselId')[feature].shift(lag)

# Create 5-observation moving averages per vessel. reset_index(0, drop=True)
# removes the 'vesselId' level added by groupby so the result aligns back onto
# the original row labels.
for feature in ['sog', 'cog', 'rot', 'heading']:
    combined_df[f'{feature}_ma5'] = (
        combined_df.groupby('vesselId')[feature]
        .rolling(window=5)
        .mean()
        .reset_index(0, drop=True)
    )

# One-hot encode 'navstat'
combined_df = pd.get_dummies(combined_df, columns=['navstat'], prefix='navstat')

# 'ID' is a row identifier, not a feature
combined_df = combined_df.drop(columns=['ID'])

# Separate the enriched test set from combined data
ais_test_enriched = combined_df[combined_df['dataset'] == 'test'].copy()
ais_train_enriched = combined_df[combined_df['dataset'] == 'train'].copy()

# Drop the 'dataset' column as it's no longer needed
ais_test_enriched = ais_test_enriched.drop(columns=['dataset'])
ais_train_enriched = ais_train_enriched.drop(columns=['dataset'])

# Drop non-numeric or unnecessary columns to avoid dtype errors
ais_train_enriched = ais_train_enriched.drop(columns=['time', 'vesselId', 'etaRaw', 'portId'], errors='ignore')
ais_test_enriched = ais_test_enriched.drop(columns=['time', 'vesselId', 'etaRaw', 'portId'], errors='ignore')

# Split into features and targets; the targets are (latitude, longitude), in
# that column order.
X_train = ais_train_enriched.drop(columns=['latitude', 'longitude'])
y_train = ais_train_enriched[['latitude', 'longitude']]
X_test = ais_test_enriched.drop(columns=['latitude', 'longitude'])

# Keep only the columns present in both frames, in the same order
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

# Initialize and wrap XGBoost with MultiOutputRegressor (one regressor per
# target column)
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
multi_output_model = MultiOutputRegressor(xgb_model)

# Train the model on the training data
multi_output_model.fit(X_train, y_train)

# Make predictions on the test data; columns follow y_train:
# [:, 0] is latitude, [:, 1] is longitude.
predictions = multi_output_model.predict(X_test)

# Create the output DataFrame with predictions. The IDs were captured from the
# same rows (and row order) that produced `predictions`; they are cast back to
# the dtype they had in `ais_test` because concatenating with the ID-less
# training rows turned the column into floats.
assert len(test_ids) == len(predictions), "ID / prediction row count mismatch"
output = pd.DataFrame({
    'ID': test_ids.astype(ais_test['ID'].dtype).to_numpy(),  # Row ID from the test set
    'longitude_predicted': predictions[:, 1],  # Longitude prediction
    'latitude_predicted': predictions[:, 0]    # Latitude prediction
})

# Display the first few rows of the output DataFrame
print(output.head())

# Save the output to a CSV file (if needed)
output.to_csv('ais_predictions.csv', index=False)

      ID  longitude_predicted  latitude_predicted
4      4            18.793764           18.113619
201  201            11.716568           19.403973
583  583            12.506432           12.497017
701  701             5.596546           11.587506
829  829             5.596546           11.587506
