In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime as dt
import math
import warnings
warnings.filterwarnings('ignore')

In [55]:
# Load the three raw CSV datasets used throughout the notebook
ais_dataset = pd.read_csv('./datasets/ais_dataset.csv')
port_call_dataset = pd.read_csv('./datasets/port_call_dataset.csv')
distance_dataset = pd.read_csv('./datasets/distance_dataset.csv')

# Number of distinct `imo` values in the AIS and port-call datasets
print(len(ais_dataset['imo'].unique()))
print(len(port_call_dataset['imo'].unique()))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line per call.
ais_dataset.info()
port_call_dataset.info()
distance_dataset.info()
# Observation: no terminal, maneuvering-zone or berth information is available,
# which suggests vessels are only ever anchored or in transit,
# so it is enough to work out whether a vessel is anchored or not

2543
4166
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251769 entries, 0 to 251768
Data columns (total 30 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imo                    251769 non-null  int64  
 1   mmsi                   251769 non-null  int64  
 2   vessel_name            251769 non-null  object 
 3   date_of_build          251769 non-null  object 
 4   vessel_type            251769 non-null  object 
 5   group                  251769 non-null  object 
 6   timestamp              251769 non-null  object 
 7   lon                    251769 non-null  float64
 8   lat                    251769 non-null  float64
 9   nav_stat               251769 non-null  int64  
 10  speed                  249752 non-null  float64
 11  course                 251769 non-null  int64  
 12  heading                250464 non-null  float64
 13  fuel_category          251769 non-null  int64  
 14  main_engine_fuel_type  251

### Dropping Columns
---

##### AIS

In [56]:
# Drop the berth, terminal and maneuvering_zone columns from the AIS data
ais_dataset.drop(['berth' , 'terminal' , 'maneuvering_zone'] , axis=1 , inplace=True)
# ais_dataset.dropna(inplace=True) # < 5% missing data

In [57]:
# Background research and the provided information say ABL and AEL must not be 0.
# Metadata columns removed here:
#   port_name, mmsi (unusable because the other datasets only carry imo), date_of_build
useless_columns = ['port_name' , 'mmsi' , 'date_of_build']
ais_dataset.drop(useless_columns , axis='columns' , inplace=True)

In [58]:
# Keep only the rows where both abl and ael are strictly positive
abl_ael_not_zero_condition = ais_dataset['abl'].gt(0) & ais_dataset['ael'].gt(0)
ais_dataset = ais_dataset.loc[abl_ael_not_zero_condition]

In [59]:
# Print the remaining columns and their non-null counts after the drops and the ABL/AEL filter
ais_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248013 entries, 0 to 251768
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imo                    248013 non-null  int64  
 1   vessel_name            248013 non-null  object 
 2   vessel_type            248013 non-null  object 
 3   group                  248013 non-null  object 
 4   timestamp              248013 non-null  object 
 5   lon                    248013 non-null  float64
 6   lat                    248013 non-null  float64
 7   nav_stat               248013 non-null  int64  
 8   speed                  246213 non-null  float64
 9   course                 248013 non-null  int64  
 10  heading                246742 non-null  float64
 11  fuel_category          248013 non-null  int64  
 12  main_engine_fuel_type  248013 non-null  object 
 13  aux_engine_fuel_type   248013 non-null  object 
 14  engine_type            247590 non-null  o

### Imputing

In [60]:
# When looking through the speed we notice each imo has a null speed
# This suggests that each ship is anchored-travels to sg-travels further/stays in sg
# Speed null values can be imputed as 0
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is not
# guaranteed to update ais_dataset (it does nothing under pandas Copy-on-Write).
ais_dataset['speed'] = ais_dataset['speed'].fillna(0)

In [61]:
# Keep only the rows whose speed is strictly below 30; readings of 30 or more are dropped
less_than_30_condition = ais_dataset['speed'].lt(30)
ais_dataset = ais_dataset.loc[less_than_30_condition]

In [62]:
# Print the column summary again to check the speed column after imputing and filtering
ais_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248013 entries, 0 to 251768
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imo                    248013 non-null  int64  
 1   vessel_name            248013 non-null  object 
 2   vessel_type            248013 non-null  object 
 3   group                  248013 non-null  object 
 4   timestamp              248013 non-null  object 
 5   lon                    248013 non-null  float64
 6   lat                    248013 non-null  float64
 7   nav_stat               248013 non-null  int64  
 8   speed                  248013 non-null  float64
 9   course                 248013 non-null  int64  
 10  heading                246742 non-null  float64
 11  fuel_category          248013 non-null  int64  
 12  main_engine_fuel_type  248013 non-null  object 
 13  aux_engine_fuel_type   248013 non-null  object 
 14  engine_type            247590 non-null  o

### Feature Engineering

Port call dataset

In [63]:
# Duration columns (time spent during an event):
#   duration travelled = current earliest time - previous earliest time
#   duration in port   = current latest time - current earliest time
def get_time_difference_in_seconds_from_utc(s1 , s2):
  """Return the seconds elapsed from timestamp string `s2` to timestamp string `s1`.

  Both strings are parsed with the format '%Y-%m-%dT%H:%M:%S.%f%z'. The result is
  the POSIX timestamp of `s1` minus that of `s2`, so it is negative when `s1` is
  the earlier instant.
  """
  utc_format = '%Y-%m-%dT%H:%M:%S.%f%z'
  later_epoch , earlier_epoch = [dt.strptime(stamp , utc_format).timestamp() for stamp in (s1 , s2)]
  return later_epoch - earlier_epoch

def get_port_to_sg_duration(rows):
  """Return the hours from `prev_country_earliest_visit_time_utc` to
  `current_earliest_visit_time_utc` for one row, rounded to 5 decimal places."""
  current_earliest = rows['current_earliest_visit_time_utc']
  previous_earliest = rows['prev_country_earliest_visit_time_utc']
  elapsed_seconds = get_time_difference_in_seconds_from_utc(current_earliest , previous_earliest)
  elapsed_hours = elapsed_seconds / 3600
  return round(elapsed_hours , 5)

def get_sg_port_duration(rows):
  """Return the hours from `current_earliest_visit_time_utc` to
  `current_latest_visit_time_utc` for one row, rounded to 5 decimal places."""
  current_latest = rows['current_latest_visit_time_utc']
  current_earliest = rows['current_earliest_visit_time_utc']
  elapsed_seconds = get_time_difference_in_seconds_from_utc(current_latest , current_earliest)
  elapsed_hours = elapsed_seconds / 3600
  return round(elapsed_hours , 5)



# Compute both duration columns (in hours) row by row from the relevant timestamp columns
port_call_dataset['Time from Prev port to SG (h)'] = (
  port_call_dataset[['current_earliest_visit_time_utc' , 'prev_country_earliest_visit_time_utc']]
  .apply(get_port_to_sg_duration , axis=1)
)
port_call_dataset['Duration in Port (h)'] = (
  port_call_dataset[['current_latest_visit_time_utc' , 'current_earliest_visit_time_utc']]
  .apply(get_sg_port_duration , axis=1)
)

AIS dataset

In [64]:
# Handling state (transit or anchored)
def get_state(row):
  """Label one AIS record as 'Transit' or 'Anchored'.

  Parameters:
    row: a mapping / pandas row providing 'speed' and 'nav_stat'.

  Returns:
    'Transit' when the speed is above 1 or `nav_stat` is one of the codes in
    TRANSIT_NAV_STAT; 'Anchored' for every other record.

  The label is binary on purpose. The earlier 'Anchored' test required
  nav_stat == 0, but 0 is itself a transit code, so that branch could never
  run and every non-transit record was labelled None.
  """
  TRANSIT_NAV_STAT = [0 , 3 , 4 , 8 , 11 , 12]
  if row['speed'] > 1 or row['nav_stat'] in TRANSIT_NAV_STAT:
    return 'Transit'
  # NOTE(review): a record with nav_stat 0 and speed <= 1 is still 'Transit' here,
  # as before - confirm whether stationary nav_stat 0 records should be 'Anchored'.
  return 'Anchored'

In [65]:
# Apply get_state to every row (axis=1) and store the labels in a new 'state' column
ais_dataset['state'] = ais_dataset.apply(get_state , axis=1)

In [66]:
# Display the derived 'state' column
ais_dataset['state']

0         Transit
1         Transit
2         Transit
3         Transit
4         Transit
           ...   
251764    Transit
251765    Transit
251766    Transit
251767    Transit
251768    Transit
Name: state, Length: 248013, dtype: object

SyntaxError: unterminated string literal (detected at line 1) (3674507783.py, line 1)