In [1]:
import os 
import sys

from catboost import CatBoostRegressor
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

sys.path.append('../../')
import config
from notebooks.scripts import evaluator, constants

In [2]:
# Locations of the AIS train/validation splits under the configured data directory.
train_csv_path = config.DATA_DIR + '/01-ais/02-train_data.csv'
val_csv_path = config.DATA_DIR + '/01-ais/02-val_data.csv'

# Parse 'time' as datetime on load so the .dt accessors can be used on it later.
train_data = pd.read_csv(train_csv_path, parse_dates=['time'])
val_data = pd.read_csv(val_csv_path, parse_dates=['time'])

In [3]:
# Keep only the timestamp, the vessel identifier and the two position columns.
# The explicit .copy() makes base_data an independent frame: feature columns are
# added to it further down, and without the copy those writes would target a
# slice of train_data (pandas' SettingWithCopyWarning case, which is invisible
# here because warnings are globally silenced).
base_data = train_data[['time', 'vesselId', 'latitude', 'longitude']].copy()

In [4]:
# Inspect the validation split as loaded from CSV.
val_data

Unnamed: 0,time,vesselId,latitude,longitude,scaling_factor
0,2024-05-03 00:02:06,631e3cc6192150e13fa41625,30.78370,-14.12051,3.0
1,2024-05-03 00:02:11,61e9f39bb937134a3c4bfdbd,15.59584,-102.14670,3.0
2,2024-05-03 00:02:53,61e9f441b937134a3c4c018b,32.93787,34.58350,3.0
3,2024-05-03 00:04:23,6326ec84c46d6a20d22ca318,35.38995,139.64783,3.0
4,2024-05-03 00:05:56,61e9f470b937134a3c4c02cd,40.71222,29.46758,3.0
...,...,...,...,...,...
60501,2024-05-07 23:59:07,clh6aqawa0002gh0zypfa5dut,52.19131,-5.82223,1.0
60502,2024-05-07 23:59:08,61e9f3aeb937134a3c4bfe43,38.96142,-12.00502,1.0
60503,2024-05-07 23:59:08,61e9f43db937134a3c4c0169,49.71372,-5.22042,1.0
60504,2024-05-07 23:59:08,61e9f469b937134a3c4c029b,38.27895,10.78280,1.0


In [5]:
# Derive calendar/clock features from the timestamp column.
# .assign() builds a new frame instead of writing column-by-column into the
# existing one, so the cell never mutates a slice of train_data and gives the
# same result when re-run.
base_data = base_data.assign(
    day_of_week=base_data['time'].dt.dayofweek,
    day_of_month=base_data['time'].dt.day,
    month=base_data['time'].dt.month,
    hour=base_data['time'].dt.hour,
    minute=base_data['time'].dt.minute,
    # Whole seconds since the Unix epoch. Floor-dividing a timedelta by one
    # second does not depend on how the datetimes are stored, whereas casting
    # to int64 and dividing by 10**9 is only correct for datetime64[ns].
    # Assumes 'time' is timezone-naive, as the CSV preview suggests — TODO confirm.
    unix_time=(base_data['time'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1),
)

In [6]:
# Check the training frame after the time-derived feature columns were added.
base_data

Unnamed: 0,time,vesselId,latitude,longitude,day_of_week,day_of_month,month,hour,minute,unix_time
0,2024-01-01 00:00:25,61e9f3a8b937134a3c4bfdf7,-34.74370,-57.85130,0,1,1,0,0,1704067225
1,2024-01-01 00:00:36,61e9f3d4b937134a3c4bff1f,8.89440,-79.47939,0,1,1,0,0,1704067236
2,2024-01-01 00:01:45,61e9f436b937134a3c4c0131,39.19065,-76.47567,0,1,1,0,1,1704067305
3,2024-01-01 00:03:11,61e9f3b4b937134a3c4bfe77,-34.41189,151.02067,0,1,1,0,3,1704067391
4,2024-01-01 00:03:51,61e9f41bb937134a3c4c0087,35.88379,-5.91636,0,1,1,0,3,1704067431
...,...,...,...,...,...,...,...,...,...,...
1461553,2024-05-02 23:58:23,61e9f415b937134a3c4c0061,1.17215,103.83209,3,2,5,23,58,1714694303
1461554,2024-05-02 23:58:23,61e9f42ab937134a3c4c00ef,52.85952,4.25145,3,2,5,23,58,1714694303
1461555,2024-05-02 23:58:23,630fecca698dd2548ac1ee64,52.33208,-5.88908,3,2,5,23,58,1714694303
1461556,2024-05-02 23:58:24,61e9f43cb937134a3c4c0165,18.40001,-69.84725,3,2,5,23,58,1714694304


In [7]:
# Derive the same calendar/clock features for the validation split.
# .assign() returns a new frame, so re-running the cell is safe and the column
# order matches the keyword order below.
val_data = val_data.assign(
    day_of_week=val_data['time'].dt.dayofweek,
    day_of_month=val_data['time'].dt.day,
    month=val_data['time'].dt.month,
    hour=val_data['time'].dt.hour,
    minute=val_data['time'].dt.minute,
    # Whole seconds since the Unix epoch. Floor-dividing a timedelta by one
    # second does not depend on how the datetimes are stored, whereas casting
    # to int64 and dividing by 10**9 is only correct for datetime64[ns].
    # Assumes 'time' is timezone-naive, as the CSV preview suggests — TODO confirm.
    unix_time=(val_data['time'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1),
)

In [8]:
# Check the validation frame after the time-derived feature columns were added.
val_data

Unnamed: 0,time,vesselId,latitude,longitude,scaling_factor,day_of_week,day_of_month,month,hour,minute,unix_time
0,2024-05-03 00:02:06,631e3cc6192150e13fa41625,30.78370,-14.12051,3.0,4,3,5,0,2,1714694526
1,2024-05-03 00:02:11,61e9f39bb937134a3c4bfdbd,15.59584,-102.14670,3.0,4,3,5,0,2,1714694531
2,2024-05-03 00:02:53,61e9f441b937134a3c4c018b,32.93787,34.58350,3.0,4,3,5,0,2,1714694573
3,2024-05-03 00:04:23,6326ec84c46d6a20d22ca318,35.38995,139.64783,3.0,4,3,5,0,4,1714694663
4,2024-05-03 00:05:56,61e9f470b937134a3c4c02cd,40.71222,29.46758,3.0,4,3,5,0,5,1714694756
...,...,...,...,...,...,...,...,...,...,...,...
60501,2024-05-07 23:59:07,clh6aqawa0002gh0zypfa5dut,52.19131,-5.82223,1.0,1,7,5,23,59,1715126347
60502,2024-05-07 23:59:08,61e9f3aeb937134a3c4bfe43,38.96142,-12.00502,1.0,1,7,5,23,59,1715126348
60503,2024-05-07 23:59:08,61e9f43db937134a3c4c0169,49.71372,-5.22042,1.0,1,7,5,23,59,1715126348
60504,2024-05-07 23:59:08,61e9f469b937134a3c4c029b,38.27895,10.78280,1.0,1,7,5,23,59,1715126348


In [9]:
# CatBoost regressor configured for multi-target regression: MultiRMSE is used
# both as the training objective and as the evaluation metric.
model = CatBoostRegressor(
    loss_function='MultiRMSE',
    eval_metric='MultiRMSE',
    random_state=42,          # fixed seed for reproducible fits
    thread_count=12,
    logging_level='Silent',   # suppress per-iteration training output
)

In [10]:
# Columns the model has to predict.
target = constants.target_columns

# Every object-dtype column of base_data is handed to CatBoost as categorical.
cat_columns = list(base_data.select_dtypes(object).columns)

# Features are everything except the raw timestamp and the target columns.
train_features = base_data.drop(columns=['time'] + target)
train_targets = base_data[target]

model.fit(train_features, train_targets, cat_features=cat_columns)

<catboost.core.CatBoostRegressor at 0x128c9ba90>

In [1]:
# Shows the feature frame that is passed to model.fit (timestamp and targets removed).
# NOTE(review): the saved output of this cell is a NameError and its execution
# count is 1, i.e. it was last run on a fresh kernel before base_data existed.
# It looks like a leftover scratch cell — confirm whether it can be removed.
base_data.drop(columns = ['time'] + target)

NameError: name 'base_data' is not defined

In [11]:
# Validation frame (with time features) shown again before it is used for prediction.
val_data

Unnamed: 0,time,vesselId,latitude,longitude,scaling_factor,day_of_week,day_of_month,month,hour,minute,unix_time
0,2024-05-03 00:02:06,631e3cc6192150e13fa41625,30.78370,-14.12051,3.0,4,3,5,0,2,1714694526
1,2024-05-03 00:02:11,61e9f39bb937134a3c4bfdbd,15.59584,-102.14670,3.0,4,3,5,0,2,1714694531
2,2024-05-03 00:02:53,61e9f441b937134a3c4c018b,32.93787,34.58350,3.0,4,3,5,0,2,1714694573
3,2024-05-03 00:04:23,6326ec84c46d6a20d22ca318,35.38995,139.64783,3.0,4,3,5,0,4,1714694663
4,2024-05-03 00:05:56,61e9f470b937134a3c4c02cd,40.71222,29.46758,3.0,4,3,5,0,5,1714694756
...,...,...,...,...,...,...,...,...,...,...,...
60501,2024-05-07 23:59:07,clh6aqawa0002gh0zypfa5dut,52.19131,-5.82223,1.0,1,7,5,23,59,1715126347
60502,2024-05-07 23:59:08,61e9f3aeb937134a3c4bfe43,38.96142,-12.00502,1.0,1,7,5,23,59,1715126348
60503,2024-05-07 23:59:08,61e9f43db937134a3c4c0169,49.71372,-5.22042,1.0,1,7,5,23,59,1715126348
60504,2024-05-07 23:59:08,61e9f469b937134a3c4c029b,38.27895,10.78280,1.0,1,7,5,23,59,1715126348


In [12]:
# Preview the raw predictions (one output column per target).
# Only the columns the model was fitted on are selected, in training order.
# val_data also holds the ground-truth latitude/longitude and scaling_factor;
# passing those along would make the result depend on CatBoost matching input
# columns by name and ignoring the extras.
model.predict(val_data[model.feature_names_])

array([[ 46.16714089,  -7.49259114],
       [ 29.00215358, -32.78659827],
       [ 40.19956528,  21.07924306],
       ...,
       [ 53.10935032,   5.60182478],
       [ 15.48006652, 110.27654847],
       [ 27.28564721, -22.29941254]])

In [13]:
# Store the predictions next to the ground truth.
# The input is restricted to the columns the model was fitted on, so the
# ground-truth positions and scaling_factor are never fed to predict(). It also
# keeps the cell idempotent: on a re-run the previously written prediction
# columns are not passed back into the model.
val_data[constants.predicted_columns] = model.predict(val_data[model.feature_names_])

In [33]:
# Validation frame with the predicted latitude/longitude columns attached.
# NOTE(review): the saved output shows an 'ID' column that no cell visible in this
# notebook creates, and the execution count jumps from 13 to 33 — re-run from a
# fresh kernel to confirm the displayed state is reproducible.
val_data

Unnamed: 0,ID,time,vesselId,latitude,longitude,scaling_factor,day_of_week,day_of_month,month,hour,minute,unix_time,latitude_predicted,longitude_predicted
0,0,2024-05-03 00:02:06,631e3cc6192150e13fa41625,30.78370,-14.12051,3.0,4,3,5,0,2,1714694526,46.167141,-7.492591
1,1,2024-05-03 00:02:11,61e9f39bb937134a3c4bfdbd,15.59584,-102.14670,3.0,4,3,5,0,2,1714694531,29.002154,-32.786598
2,2,2024-05-03 00:02:53,61e9f441b937134a3c4c018b,32.93787,34.58350,3.0,4,3,5,0,2,1714694573,40.199565,21.079243
3,3,2024-05-03 00:04:23,6326ec84c46d6a20d22ca318,35.38995,139.64783,3.0,4,3,5,0,4,1714694663,16.500878,113.989904
4,4,2024-05-03 00:05:56,61e9f470b937134a3c4c02cd,40.71222,29.46758,3.0,4,3,5,0,5,1714694756,37.598126,26.725346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60501,60501,2024-05-07 23:59:07,clh6aqawa0002gh0zypfa5dut,52.19131,-5.82223,1.0,1,7,5,23,59,1715126347,50.742748,2.210771
60502,60502,2024-05-07 23:59:08,61e9f3aeb937134a3c4bfe43,38.96142,-12.00502,1.0,1,7,5,23,59,1715126348,0.297041,185.720439
60503,60503,2024-05-07 23:59:08,61e9f43db937134a3c4c0169,49.71372,-5.22042,1.0,1,7,5,23,59,1715126348,53.109350,5.601825
60504,60504,2024-05-07 23:59:08,61e9f469b937134a3c4c029b,38.27895,10.78280,1.0,1,7,5,23,59,1715126348,15.480067,110.276548


In [44]:

from geopy.distance import geodesic

def score(val_data):
    """Return the mean weighted prediction error, converted from metres to kilometres.

    ``calculate_distance`` is applied to every row of ``val_data``. As a side
    effect the per-row results are stored on ``val_data`` in a
    ``weighted_distance`` column. Rows for which it returns NaN are ignored by
    the mean.
    """
    per_row_error = val_data.apply(calculate_distance, axis=1)
    val_data['weighted_distance'] = per_row_error
    mean_error_m = val_data['weighted_distance'].mean()
    return mean_error_m / 1000.0

def calculate_distance(row):
    """Return the scaled geodesic error, in metres, for one prediction row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``latitude``, ``longitude``, ``latitude_predicted``,
        ``longitude_predicted`` and ``scaling_factor``.

    Returns
    -------
    float
        Geodesic distance in metres between the actual and the predicted
        position, multiplied by ``scaling_factor``. ``np.nan`` is returned
        when any of the four coordinates is missing.
    """
    coordinate_fields = ('latitude', 'longitude',
                         'latitude_predicted', 'longitude_predicted')
    # A missing value in any coordinate makes the distance undefined. Checking
    # all four (not only the latitudes) keeps a row with a missing longitude
    # from being handed to geodesic(), which is expected to reject non-finite
    # input — confirm for the installed geopy version.
    if any(pd.isna(row[field]) for field in coordinate_fields):
        return np.nan

    actual_position = (row['latitude'], row['longitude'])
    predicted_position = (row['latitude_predicted'], row['longitude_predicted'])

    # Geodesic distance in metres, weighted by the row's scaling factor.
    distance = geodesic(actual_position, predicted_position).meters
    return distance * row['scaling_factor']

In [45]:
score(val_data)

5628.711589869587