In [51]:
from xgboost import XGBRegressor
import numpy as np

In [52]:
x_train = np.load('../data/xgb/X_train.npz')['arr_0']
y_train = np.load('../data/xgb/y_train.npz')['arr_0']
x_val = np.load('../data/xgb/X_val.npz')['arr_0']
y_val = np.load('../data/xgb/y_val.npz')['arr_0']

In [53]:
# print x_train[0] but not whith numbers on scientific notation
np.set_printoptions(suppress=True)
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)
print(x_train[0])

(1515194, 174) (1515194, 12) (297840, 174) (297840, 12)
[   7.50361    77.5834    308.1        17.1         1.          0.
    0.          0.          0.          0.          0.          0.
    0.          7.57302    77.49505   307.6        17.3         1.
    0.          0.          0.          0.          0.          0.
    0.         23.216667    7.65043    77.39404   306.8        16.9
    1.          0.          0.          0.          0.          0.
    0.          0.         26.383333    7.71275    77.31394   307.9
   16.9         1.          0.          0.          0.          0.
    0.          0.          0.         21.416666    7.77191    77.23585
  307.         16.3         1.          0.          0.          0.
    0.          0.          0.          0.         20.983334    7.81285
   77.18147   307.6        16.1         1.          0.          0.
    0.          0.          0.          0.          0.         15.016666
    7.86929    77.11032   309.5        16.1         1. 

In [54]:
model = XGBRegressor(n_estimators=50, max_depth=9, learning_rate=0.1, tree_method='hist', subsample=0.95, colsample_bytree=0.7, objective='reg:squarederror')

model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

model.save_model('../models/xgb_model.json')

[0]	validation_0-rmse:56.72972
[1]	validation_0-rmse:51.97716
[2]	validation_0-rmse:47.53011
[3]	validation_0-rmse:43.73538
[4]	validation_0-rmse:40.24186
[5]	validation_0-rmse:37.16665
[6]	validation_0-rmse:34.47039
[7]	validation_0-rmse:32.22323
[8]	validation_0-rmse:30.15011
[9]	validation_0-rmse:28.37117
[10]	validation_0-rmse:26.92095
[11]	validation_0-rmse:25.58990
[12]	validation_0-rmse:24.46076
[13]	validation_0-rmse:23.49653
[14]	validation_0-rmse:22.68042
[15]	validation_0-rmse:21.99121
[16]	validation_0-rmse:21.40270
[17]	validation_0-rmse:20.92262
[18]	validation_0-rmse:20.51221
[19]	validation_0-rmse:20.16772
[20]	validation_0-rmse:19.91798
[21]	validation_0-rmse:19.66154
[22]	validation_0-rmse:19.45342
[23]	validation_0-rmse:19.27750
[24]	validation_0-rmse:19.12958
[25]	validation_0-rmse:19.02033
[26]	validation_0-rmse:18.92748
[27]	validation_0-rmse:18.83695
[28]	validation_0-rmse:18.76346
[29]	validation_0-rmse:18.69636
[30]	validation_0-rmse:18.64676
[31]	validation_0-

In [None]:
# use optuna to find the best hyperparameters. using device cuda

import optuna
from optuna.integration import XGBoostPruningCallback
from optuna.integration.xgboost import XGBoostPruningCallback


def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'tree_method': 'hist',
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'device': 'cuda'
    }

    pruning_callback = XGBoostPruningCallback(trial, 'validation_0-rmse')
    model = XGBRegressor(**param)
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False, callbacks=[pruning_callback])

    y_pred = model.predict(x_val)
    rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))
    return rmse




In [55]:
def progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [112]:
import pandas as pd

def format_last_known_sequence(last_known_seq, example_length, vessel_data_length, test_time):
    data = last_known_seq[0]

    time_prev = last_known_seq[1][-1]
    test_time = pd.Timestamp(test_time)
    time_delta = np.array((test_time - time_prev).total_seconds() / 60)

    history = data[example_length:example_length*11]
    vessel_data = data[example_length*11:(example_length*11 + vessel_data_length)]
    vessel_type = data[-4:]

    time_data = np.zeros(5+24+7)

    # get 5 dummy variables for month
    month = test_time.month
    for i in range(5):
        time_data[i] = month == i + 1

    # get 24 dummy variables for hour
    hour = test_time.hour
    for i in range(24):
        time_data[5 + i] = hour == i

    # get 7 dummy variables for weekday
    weekday = test_time.weekday()
    for i in range(7):
        time_data[29 + i] = weekday == i

    last_known_sequence = np.hstack([history, time_delta, vessel_data, time_data, vessel_type])

    return last_known_sequence

def adjust_prediction(pred):
    pred[0] = max(-90, min(90, pred[0]))
    pred[1] = max(-180, min(180, pred[1]))
    pred[2] = max(0, min(359, pred[2]))
    pred[3] = max(0, min(1022, pred[3]))
    
    # set pred[4:] to 1 for the highest and 0 for the rest
    max_index = np.argmax(pred[4:])
    pred[4:] = 0
    pred[4 + max_index] = 1

    return pred

def format_recurrently(last_known_seq, pred, example_length, vessel_data_length, time_prev, test_time):

    data = last_known_seq[example_length:]
    history = data[:(example_length*9)]

    pred_input = np.hstack([adjust_prediction(pred), np.array(data[example_length*9])])
    
    time_prev = pd.Timestamp(time_prev)
    test_time = pd.Timestamp(test_time)
    time_delta = np.array((test_time - time_prev).total_seconds() / 60)

    vessel_data = data[(example_length*9 + 1):(example_length*9 + 1 + vessel_data_length)]
    vessel_type = data[-4:]

    time_data = np.zeros(5+24+7)

    # get 5 dummy variables for month
    month = test_time.month
    for i in range(5):
        time_data[i] = month == i + 1

    # get 24 dummy variables for hour
    hour = test_time.hour
    for i in range(24):
        time_data[5 + i] = hour == i

    # get 7 dummy variables for weekday
    weekday = test_time.weekday()
    for i in range(7):
        time_data[29 + i] = weekday == i

    last_known_sequence = np.hstack([history, pred_input, time_delta, vessel_data, time_data, vessel_type])


    return last_known_sequence



def format_history_visual(last_known_seq, vesselId):
    times = pd.to_datetime(last_known_seq[1])
    last_known_seq = last_known_seq[0]

    latitudes = last_known_seq[:11*13:13]
    longitudes = last_known_seq[1:1+11*13:13]

    history = pd.DataFrame({'time': times, 'vesselId': [vesselId for i in range(len(latitudes))], 'latitude': latitudes, 'longitude': longitudes})

    return history



def predict_future_positions(model, test_data, x_final, example_length, vessel_data_length, seq_length=10, return_first=False):

    IDs = []
    latitudes = [] 
    longitudes = []
    vessel_ids = []

    times_visual, latitudes_visual, longitudes_visual, vessel_ids_visual = [], [], [], []

    vesselId = ''

    # iterate over the test_data
    j = 0
    for i in range(len(test_data)):

        if test_data.iloc[i]['vesselId'] != vesselId:
            vesselId = test_data.iloc[i]['vesselId']
            last_known_sequence = format_last_known_sequence(x_final[j], example_length, vessel_data_length, test_data.iloc[i]['time'])

            if j != 0:
                result = pd.DataFrame({'ID': IDs, 'latitude': latitudes, 'longitude': longitudes})
                result = result.sort_values(by='ID').reset_index(drop=True)
                visualizer = pd.concat([visualizer, pd.DataFrame({'time': times_visual, 'vesselId': vessel_ids_visual, 'latitude': latitudes_visual, 'longitude': longitudes_visual})])
                visualizer.to_csv(f'../data/visual/visualizer_{j}.csv', index=False)

                times_visual, latitudes_visual, longitudes_visual, vessel_ids_visual = [], [], [], []
                visualizer = format_history_visual(x_final[j], vesselId)
            else:
                visualizer = format_history_visual(x_final[j], vesselId)
                
            j += 1

        else:
            last_known_sequence = format_recurrently(last_known_sequence, pred[0], example_length, vessel_data_length, time_prev, test_data.iloc[i]['time'])


        # Predict future positions
        pred = model.predict(last_known_sequence.reshape(1, -1))

        vessel_ids.append('pred_'+vesselId)
        time_prev = test_data.iloc[i]['time']

        IDs.append(test_data.iloc[i]['ID'])
        latitudes.append(pred[0][0])
        longitudes.append(pred[0][1])

        times_visual.append(test_data.iloc[i]['time'])
        latitudes_visual.append(pred[0][0])
        longitudes_visual.append(pred[0][1])
        vessel_ids_visual.append('pred_' + vesselId)

        if j > 10:
            break
        

        #progress_bar(i, len(test_data))

        

    # return a dataframe sorted by ID
    result = pd.DataFrame({'ID': IDs, 'latitude': latitudes, 'longitude': longitudes})
    result = result.sort_values(by='ID').reset_index(drop=True)
    return result


In [136]:
import pickle

x_final_train = pickle.load(open('../data/xgb/with_val/x_final_train.pkl', 'rb'))
x_val_stripped = pd.read_csv('../data/xgb/with_val/stripped_val_data.csv')

# load model
model = XGBRegressor()
model.load_model('../models/xgb_model.json')

example_length = 13
vessel_data_length = 3

result = predict_future_positions(model, x_val_stripped, x_final_train, example_length, vessel_data_length, return_first=True)

In [137]:
# store result to csv
result.to_csv('../data/xgb/xgb_result.csv', index=False)

In [140]:
# load the result and the stripped val data. calculate the mean geodesic distance in kilomiters
result = pd.read_csv('../data/xgb/xgb_result.csv')
val_data = pd.read_csv('../data/xgb/y_val_stripped.csv')

result = result.sort_values(by='ID').reset_index(drop=True)
val_data = val_data.sort_values(by='ID').reset_index(drop=True)

print(result.head())
print(val_data.head())

from geopy.distance import geodesic

mean_distance = 0
for i in range(len(result)):
    pred = (result.iloc[i]['latitude'], result.iloc[i]['longitude'])
    true = (val_data.iloc[i]['latitude'], val_data.iloc[i]['longitude'])
    mean_distance += geodesic(pred, true).kilometers

mean_distance /= len(result)
print('Mean geodesic distance in kilometers:', mean_distance)

    ID   latitude  longitude
0  321  41.139430   2.326052
1  322  41.156430   2.469431
2  323  41.156430   2.532059
3  324  41.154522   2.646584
4  325  41.154522   2.828715
   latitude  longitude   ID
0  41.21099    2.20128  321
1  41.25944    2.19747  322
2  41.26718    2.20361  323
3  41.26558    2.21807  324
4  41.26185    2.23374  325
Mean geodesic distance in kilometers: 3163.442090075849


In [64]:
def predict_future_positions_test(model, test_data, x_final, example_length, vessel_data_length, vesselID_dict ,seq_length=10, return_first=False):

    # sort the test_data by vesselId and time
    test_data = test_data.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

    IDs = []
    latitudes = [] 
    longitudes = []
    times = []
    vessel_ids = []

    vesselId = ''
    formated_output = []

    # iterate over the test_data
    j = 0
    for i in range(len(test_data)):

        if test_data.iloc[i]['vesselId'] != vesselId:
            vesselId = test_data.iloc[i]['vesselId']
            index = int(vesselID_dict[vesselId])
            last_known_sequence = format_last_known_sequence(x_final[index], example_length, vessel_data_length, test_data.iloc[i]['time'])
            #if j == 0:
                #visualizer = format_history_visual(x_final[index])
            j += 1
            if return_first and j == 2:
                result = pd.DataFrame({'ID': IDs, 'latitude': latitudes, 'longitude': longitudes})
                result = result.sort_values(by='ID').reset_index(drop=True)
                #visualizer = pd.concat([visualizer, pd.DataFrame({'time': times, 'vesselId': vessel_ids, 'latitude': latitudes, 'longitude': longitudes})])
                #visualizer.to_csv('../data/visualizer.csv', index=False)
                return result

        else:
            last_known_sequence = format_recurrently(last_known_sequence, pred[0], example_length, vessel_data_length, time_prev, test_data.iloc[i]['time'])

        if j < 10:
            print(last_known_sequence)
        else:
            return None
        
        # Predict future positions
        pred = model.predict(last_known_sequence.reshape(1, -1))

        #print(pred)

        times.append(test_data.iloc[i]['time'])
        vessel_ids.append('pred_'+vesselId)
        time_prev = test_data.iloc[i]['time']

        IDs.append(test_data.iloc[i]['ID'])
        latitudes.append(pred[0][0])
        longitudes.append(pred[0][1])
        

        #progress_bar(i, len(test_data))

        

    # return a dataframe sorted by ID
    result = pd.DataFrame({'ID': IDs, 'latitude': latitudes, 'longitude': longitudes})
    result = result.sort_values(by='ID').reset_index(drop=True)
    return result

In [61]:
print(x_test['vesselId'].nunique())
print(len(x_final_train))

215
687


In [None]:
x_final_train = pickle.load(open('../data/xgb/x_final_train.pkl', 'rb'))
x_test = pd.read_csv('../data/ais_test.csv')
vesselId_dict = pickle.load(open('../data/xgb/vesselId_dict.pkl', 'rb'))

# load model
model = XGBRegressor()
model.load_model('../models/xgb_model.json')

example_length = 13
vessel_data_length = 3

result = predict_future_positions_test(model, x_test, x_final_train, example_length, vessel_data_length, vesselId_dict)

In [63]:
# store result to csv. rename the columns to match the submission format ('ID', 'latitude_predicted', 'longitude_predicted')
result.columns = ['ID', 'latitude_predicted', 'longitude_predicted']
result.to_csv('../predictions/submission_xgb_1.csv', index=False)

In [137]:
def full_visualizer(filepath, train):


    i = 1
    while True:
        
        try:

            path = f'{filepath}visualizer_{i}.csv'

            # Load the visualizer
            visualizer = pd.read_csv(path)

            # find the vesselId of the first row
            vesselId = visualizer['vesselId'].iloc[0]

            # find the corresponding rows in the train data
            train_vessel = train[train['vesselId'] == vesselId]

            # find the corresponding times for the 'pred_' vesselId
            times = visualizer[visualizer['vesselId'] == 'pred_' + vesselId]['time']

            # find the values in train with the corresponding times
            train_vessel = train_vessel[train_vessel['time'].isin(times)]

            # add prefix 'true_' to the vesselId in train
            train_vessel['vesselId'] = 'true_' + train_vessel['vesselId']
            train_vessel = train_vessel[['time', 'vesselId', 'latitude', 'longitude']]

            # add rows to the visualizer where the vesselId is 'true_vesselId'
            visualizer = pd.concat([visualizer, train_vessel])

            # save full_visualizer to csv
            visualizer.to_csv(path)

        except:
            break

        i += 1



In [138]:
train = pd.read_csv('../data/ais_train.csv', sep='|')

In [139]:
full_visualizer('../data/visual/', train)