### Import relevant packages

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import tqdm as tqdm

### Import datasets 

In [11]:
train_df = pd.read_csv('../first_50000_rows.csv', sep = "|")
vessels_df = pd.read_csv('../vessels.csv', sep = '|')
ports_df = pd.read_csv('../ports.csv', sep = '|')
ports_df['portLatitude'] = ports_df['latitude']
ports_df['portLongitude'] = ports_df['longitude']
ports_df = ports_df.drop(columns = ['latitude', 'longitude'])
test_df = pd.read_csv('../ais_test.csv', sep = ',')
# schedules_df = pd.read_csv('../schedules_to_may_2024.csv', sep = '|')

### Some preprocessing 

In [12]:
## adding vessel and port info
train_df = pd.merge(train_df, ports_df, on = 'portId', how = 'left')
train_df = pd.merge(train_df, vessels_df[['vesselId', 'shippingLineId', ]], on = 'vesselId')


train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

timestamps = train_df[['time', 'vesselId']]

train_df = train_df.drop(columns = ['countryName', 'ISO', 'UN_LOCODE', 'name', 'portLocation', ])
train_df

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,portLatitude,portLongitude,shippingLineId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.74370,-57.85130,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900
1,2024-01-01 06:09:08,92.8,14.2,0,90,0,01-09 23:00,-35.16787,-56.77210,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900
2,2024-01-01 06:38:19,90.5,14.3,0,88,0,01-09 23:00,-35.16863,-56.63185,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900
3,2024-01-01 06:58:55,88.2,14.3,0,86,0,01-09 23:00,-35.16805,-56.53190,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900
4,2024-01-01 07:15:56,88.3,12.3,0,86,0,01-09 23:00,-35.16715,-56.45306,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2024-01-05 07:36:07,9.0,16.1,0,12,0,01-05 04:00,22.85250,119.84820,61e9f394b937134a3c4bfda3,61d3801529b60f6113c89f1e,22.565278,120.311111,61a8e673f9cba188601e84ae
49996,2024-01-05 06:56:21,297.7,9.7,0,302,0,01-05 01:00,36.93161,-75.93341,61e9f3b1b937134a3c4bfe57,61d38482b7b7526e1adf3d25,36.901944,-76.297778,61a8e672f9cba188601e84ac
49997,2024-01-05 07:17:42,331.2,11.4,0,335,0,01-05 07:00,36.96040,-75.98922,61e9f3b1b937134a3c4bfe57,61d38482b7b7526e1adf3d25,36.901944,-76.297778,61a8e672f9cba188601e84ac
49998,2024-01-05 06:57:49,262.0,9.8,0,265,0,01-05 10:00,1.31167,104.29918,61e9f3b9b937134a3c4bfe89,61d37ee429b60f6113c89d01,1.292778,103.725278,61a8e673f9cba188601e84ad


### Label encoding

In [None]:

# le_vesselid = LabelEncoder()
# all_vesselId = pd.concat([train_df['vesselId'], schedules_df['vesselId']], axis = 0)
# le_vesselid.fit(all_vesselId)
# train_df['vesselId'] = le_vesselid.transform(train_df['vesselId'])
# test_df['vesselId'] =  le_vesselid.transform(test_df['vesselId'])
# schedules_df['vesselId'] = le_vesselid.transform(schedules_df['vesselId'])

# le_shippingLineId = LabelEncoder()
# all_shippingLineId = pd.concat([train_df['shippingLineId'], schedules_df['shippingLineId']], axis = 0)
# le_shippingLineId.fit(all_shippingLineId)
# train_df['shippingLineId'] = le_shippingLineId.transform(train_df['shippingLineId'])
# schedules_df['shippingLineId'] = le_shippingLineId.transform(schedules_df['shippingLineId'])

# le_portid = LabelEncoder()
# le_portid.fit(train_df['portId'])
# train_df['portId'] = le_portid.transform(train_df['portId'])

# train_df['navstat'] = pd.Categorical(train_df['navstat']).codes


### Feature engineering
First for the training set

In [13]:

X = train_df.sort_values(by = 'vesselId', kind =  'stable')
X_1 = X.copy()
X_1[['time_x', 'longitude', 'latitude']] = (X_1[['time', 'vesselId', 'longitude', 'latitude']].groupby(by = 'vesselId').shift(-1))
X_1['vesselId'] = X['vesselId']
X_1['time_diff'] =  (X_1['time_x'] - X_1['time']).dt.total_seconds()
X_1 = X_1.dropna()
display(X)
display(X_1)

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,portLatitude,portLongitude,shippingLineId
12876,2024-01-01 00:14:36,348.0,0.0,0,333,5,12-29 21:00,51.30883,3.23027,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a
12877,2024-01-01 00:35:36,8.0,0.0,0,333,5,12-29 21:00,51.30882,3.23025,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a
12878,2024-01-01 00:56:34,20.0,0.0,0,333,5,12-29 21:00,51.30882,3.23027,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a
12879,2024-01-01 01:17:35,6.0,0.0,0,334,5,12-29 21:00,51.30880,3.23023,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a
12880,2024-01-01 01:35:36,353.0,0.0,0,334,5,12-29 21:00,51.30882,3.23030,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43774,2024-01-05 05:56:19,0.0,0.0,-2,135,5,01-05 05:00,53.86176,8.72497,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3
43775,2024-01-05 06:17:19,0.0,0.0,3,135,5,01-05 05:00,53.86175,8.72496,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3
43776,2024-01-05 06:35:19,0.0,0.0,-2,135,5,01-05 05:00,53.86178,8.72498,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3
43777,2024-01-05 06:56:19,0.0,0.0,2,134,5,01-05 05:00,53.86178,8.72498,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,portLatitude,portLongitude,shippingLineId,time_x,time_diff
12876,2024-01-01 00:14:36,348.0,0.0,0,333,5,12-29 21:00,51.30882,3.23025,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a,2024-01-01 00:35:36,1260.0
12877,2024-01-01 00:35:36,8.0,0.0,0,333,5,12-29 21:00,51.30882,3.23027,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a,2024-01-01 00:56:34,1258.0
12878,2024-01-01 00:56:34,20.0,0.0,0,333,5,12-29 21:00,51.30880,3.23023,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a,2024-01-01 01:17:35,1261.0
12879,2024-01-01 01:17:35,6.0,0.0,0,334,5,12-29 21:00,51.30882,3.23030,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a,2024-01-01 01:35:36,1081.0
12880,2024-01-01 01:35:36,353.0,0.0,0,334,5,12-29 21:00,51.30882,3.23035,61e9f38eb937134a3c4bfd8d,61d36f9a0a1807568ff9a156,51.336389,3.207222,61ec94f1a8cafc0e93f0e92a,2024-01-01 01:56:34,1258.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43773,2024-01-05 05:35:18,0.0,0.0,-1,135,5,01-05 05:00,53.86176,8.72497,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3,2024-01-05 05:56:19,1261.0
43774,2024-01-05 05:56:19,0.0,0.0,-2,135,5,01-05 05:00,53.86175,8.72496,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3,2024-01-05 06:17:19,1260.0
43775,2024-01-05 06:17:19,0.0,0.0,3,135,5,01-05 05:00,53.86178,8.72498,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3,2024-01-05 06:35:19,1080.0
43776,2024-01-05 06:35:19,0.0,0.0,-2,135,5,01-05 05:00,53.86178,8.72498,clh6aqawa0007gh0z9h6zi9bo,61d375e793c6feb83e5eb3e3,53.895833,9.135833,61a8e673f9cba188601e84b3,2024-01-05 06:56:19,1260.0


In [14]:
X = train_df.copy()
X[['time_y', 'longitude_y', 'latitude_y']] = (X[['time', 'vesselId', 'longitude', 'latitude']].groupby(by = 'vesselId').shift(-1))
X['vesselId'] = X['vesselId']
X['time_diff'] =  (X['time_y'] - X['time']).dt.total_seconds()
X = X.dropna()
for k in tqdm.tqdm([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]):
    print(k , end = '\r')
    X_k = train_df.copy()
    X_k[['time_y', 'longitude_y', 'latitude_y']] = (X_k[['time', 'vesselId', 'longitude', 'latitude']].groupby(by = 'vesselId').shift(-2**k))
    X_k['vesselId'] = train_df['vesselId']
    X_k['time_diff'] =  (X_k['time_y'] - X_k['time']).dt.total_seconds()
    X_k = X_k.dropna()
    X = pd.concat([X, X_k])
    
for k in tqdm.tqdm(range(3, 15)):
    if k in [2, 4, 8]:
        continue
    print(k , end = '\r')
    X_k = train_df.copy()
    X_k[['time_y', 'longitude_y', 'latitude_y']] = (X_k[['time', 'vesselId', 'longitude', 'latitude']].groupby(by = 'vesselId').shift(-k))
    X_k['vesselId'] = train_df['vesselId']
    X_k['time_diff'] =  (X_k['time_y'] - X_k['time']).dt.total_seconds()
    X_k = X_k.dropna()
    X = pd.concat([X, X_k])

display(X)

 20%|██        | 2/10 [00:00<00:00, 13.72it/s]

3

 40%|████      | 4/10 [00:00<00:00, 13.02it/s]

6

 80%|████████  | 8/10 [00:00<00:00, 11.59it/s]

9

100%|██████████| 10/10 [00:00<00:00, 12.02it/s]


10

  0%|          | 0/12 [00:00<?, ?it/s]

3

  8%|▊         | 1/12 [00:00<00:01,  8.15it/s]

5

 25%|██▌       | 3/12 [00:00<00:00, 13.68it/s]

7

 42%|████▏     | 5/12 [00:00<00:00, 10.41it/s]

9

 58%|█████▊    | 7/12 [00:00<00:00,  9.96it/s]

11

 92%|█████████▏| 11/12 [00:01<00:00,  9.44it/s]

13

100%|██████████| 12/12 [00:01<00:00,  9.67it/s]

14




Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,portLatitude,portLongitude,shippingLineId,time_y,longitude_y,latitude_y,time_diff
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.74370,-57.85130,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900,2024-01-01 06:09:08,-56.77210,-35.16787,22123.0
1,2024-01-01 06:09:08,92.8,14.2,0,90,0,01-09 23:00,-35.16787,-56.77210,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900,2024-01-01 06:38:19,-56.63185,-35.16863,1751.0
2,2024-01-01 06:38:19,90.5,14.3,0,88,0,01-09 23:00,-35.16863,-56.63185,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900,2024-01-01 06:58:55,-56.53190,-35.16805,1236.0
3,2024-01-01 06:58:55,88.2,14.3,0,86,0,01-09 23:00,-35.16805,-56.53190,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900,2024-01-01 07:15:56,-56.45306,-35.16715,1021.0
4,2024-01-01 07:15:56,88.3,12.3,0,86,0,01-09 23:00,-35.16715,-56.45306,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,61ec65aea8cafc0e93f0e900,2024-01-01 07:28:15,-56.40306,-35.16646,739.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49938,2024-01-05 02:38:39,281.0,12.3,0,280,0,01-05 02:00,1.21047,103.80817,61e9f430b937134a3c4c010b,61d37ee429b60f6113c89d01,1.292778,103.725278,61a8e673f9cba188601e84ae,2024-01-05 07:17:19,103.75122,1.28265,16720.0
49939,2024-01-05 02:59:10,315.0,10.2,0,315,0,01-05 02:00,1.26095,103.76688,61e9f430b937134a3c4c010b,61d37ee429b60f6113c89d01,1.292778,103.725278,61a8e673f9cba188601e84ae,2024-01-05 07:35:19,103.75122,1.28265,16569.0
49958,2024-01-05 01:57:58,244.4,17.5,8,245,0,12-27 13:00,-29.76792,31.33055,61e9f3c5b937134a3c4bfec7,61d37ef929b60f6113c89d1b,-29.881111,31.027222,61be24564ea00ae59d0fe37a,2024-01-05 06:37:31,31.14744,-29.81433,16773.0
49959,2024-01-05 02:18:17,247.2,13.6,5,248,0,12-27 13:00,-29.80352,31.23361,61e9f3c5b937134a3c4bfec7,61d37ef929b60f6113c89d1b,-29.881111,31.027222,61be24564ea00ae59d0fe37a,2024-01-05 06:57:50,31.13015,-29.80968,16773.0


### Feature engineering 2 
For test set 

In [None]:
display(test_df)

vessels = test_df['vesselId'].unique()
last_values = {}
for vessel in vessels:
    data_vessel = train_df[train_df['vesselId'] == vessel]
    last_values[vessel] = data_vessel.tail(1)


In [None]:
test_data = {}
for index, row in test_df.iterrows():
    print(index, end = '\r')
    last_data = last_values[row['vesselId']]
    row = pd.DataFrame(row).T    
    row = pd.merge(last_data, row, left_on='vesselId', right_on='vesselId', how='right')
    test_data[index] = row
test_data = pd.concat(test_data)

In [None]:
test_data['time_diff'] = (pd.to_datetime(test_data['time_y']) - pd.to_datetime(test_data['time_x'])).dt.total_seconds()
display(test_data)

In [None]:
X['time_x'] = X['time']
features1 = ['time_x', 'time_y', 'time_diff', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude', 'longitude', 'portLatitude', 'portLongitude']
features2 = ['time_x', 'time_y', 'time_diff', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude', 'longitude', 'portLatitude', 'portLongitude', 'latitude_y', 'longitude_y']

display(test_data[features1])
display(X[features2])


In [None]:
train_data = X.copy()
features = ['time_diff', 'vesselId', 'cog', 'sog', 'rot', 'heading', 'latitude', 'longitude']
target_lat = 'latitude_y'
target_lon = 'longitude_y'

X_train = train_data[features]
y_lat = train_data[target_lat]
y_lon = train_data[target_lon]
# display(X_train)
# display(y_lat)
# display(y_lon)

# Train the model
model_lat = RandomForestRegressor(n_jobs=-1, n_estimators=20, verbose=3, random_state=42, warm_start=False, criterion='squared_error',max_depth=25)
model_lat.fit(X_train, y_lat)

model_lon = RandomForestRegressor(n_jobs=-1, n_estimators=20, verbose=3, random_state=42, warm_start=False, criterion='squared_error', max_depth=35)
model_lon.fit(X_train, y_lon)


In [None]:
# Make predictions
lat_predictions = model_lat.predict(test_data[features])
lon_predictions = model_lon.predict(test_data[features])

# Combine predictions into a DataFrame
predictions = pd.DataFrame({
    'ID': test_data['ID'],  # Assuming 'ID' is a column in test_data
    'longitude_predicted': lon_predictions,
    'latitude_predicted': lat_predictions,
})

# Save predictions to a file
predictions.to_csv('../../data/predictions/predictions_large.csv', index=False)