In [527]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime as dt
import math
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [528]:
ais_dataset = pd.read_csv('./datasets/ais_dataset.csv')
port_call_dataset = pd.read_csv('./datasets/port_call_dataset.csv')
distance_dataset = pd.read_csv('./datasets/distance_dataset.csv')
llaf_table = pd.read_csv('./datasets/llaf_table.csv')




print(len(ais_dataset['imo'].unique()))
print(len(port_call_dataset['imo'].unique()))
print(ais_dataset.info())
print(port_call_dataset.info())
print(distance_dataset.info())
# no terminal, maneuvering , berth
# suggests that only anchored and transit
# we can simply find if anchored or not

2543
4166
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251769 entries, 0 to 251768
Data columns (total 30 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imo                    251769 non-null  int64  
 1   mmsi                   251769 non-null  int64  
 2   vessel_name            251769 non-null  object 
 3   date_of_build          251769 non-null  object 
 4   vessel_type            251769 non-null  object 
 5   group                  251769 non-null  object 
 6   timestamp              251769 non-null  object 
 7   lon                    251769 non-null  float64
 8   lat                    251769 non-null  float64
 9   nav_stat               251769 non-null  int64  
 10  speed                  249752 non-null  float64
 11  course                 251769 non-null  int64  
 12  heading                250464 non-null  float64
 13  fuel_category          251769 non-null  int64  
 14  main_engine_fuel_type  251

### Dropping Columns
---

##### AIS

In [529]:
# dropping missing values
ais_dataset.drop(columns=['berth' , 'terminal' , 'maneuvering_zone'] , inplace=True)
# ais_dataset.dropna(inplace=True) # 《 5% missing data

In [530]:
# From background research cum provided information
# ABL AEL !== 0
# drop meta data:
# port_name , mmsi (cannot be used since other datasets only have imo) , date_of_build
useless_columns = ['port_name' , 'mmsi' , 'date_of_build']
ais_dataset.drop(columns=useless_columns , inplace=True)

In [531]:
abl_ael_not_zero_condition = (ais_dataset['abl'] > 0) & (ais_dataset['ael'] > 0)
ais_dataset = ais_dataset[abl_ael_not_zero_condition]

In [532]:
ais_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248013 entries, 0 to 251768
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imo                    248013 non-null  int64  
 1   vessel_name            248013 non-null  object 
 2   vessel_type            248013 non-null  object 
 3   group                  248013 non-null  object 
 4   timestamp              248013 non-null  object 
 5   lon                    248013 non-null  float64
 6   lat                    248013 non-null  float64
 7   nav_stat               248013 non-null  int64  
 8   speed                  246213 non-null  float64
 9   course                 248013 non-null  int64  
 10  heading                246742 non-null  float64
 11  fuel_category          248013 non-null  int64  
 12  main_engine_fuel_type  248013 non-null  object 
 13  aux_engine_fuel_type   248013 non-null  object 
 14  engine_type            247590 non-null  o

### Imputing

In [533]:
# When looking through the speed we notice each imo has a null speed
# This suggests that each ship is anchored-travels to sg-travels further/stays in sg
# Speed null values can be imputed as '0'
ais_dataset['speed'].fillna(0 , inplace=True)

In [534]:
# We can also drop speeds that are greather than 30
less_than_30_condition = ais_dataset['speed'] < 30
ais_dataset = ais_dataset[less_than_30_condition]

In [535]:
ais_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248013 entries, 0 to 251768
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imo                    248013 non-null  int64  
 1   vessel_name            248013 non-null  object 
 2   vessel_type            248013 non-null  object 
 3   group                  248013 non-null  object 
 4   timestamp              248013 non-null  object 
 5   lon                    248013 non-null  float64
 6   lat                    248013 non-null  float64
 7   nav_stat               248013 non-null  int64  
 8   speed                  248013 non-null  float64
 9   course                 248013 non-null  int64  
 10  heading                246742 non-null  float64
 11  fuel_category          248013 non-null  int64  
 12  main_engine_fuel_type  248013 non-null  object 
 13  aux_engine_fuel_type   248013 non-null  object 
 14  engine_type            247590 non-null  o

In [536]:
# imputing the max speed for the distance dataset
# Adding vref to the dataset
# we will calculate for null max speeds to be vref * 1.06

distance_dataset = pd.merge(distance_dataset , ais_dataset[['imo' , 'vref']] , on='imo' , how='inner')
distance_dataset = distance_dataset.drop_duplicates()

distance_dataset['max_speed_kt'] = distance_dataset.apply(lambda row: row['vref'] * 1.06 if pd.isna(row['max_speed_kt']) else row['max_speed_kt'] , axis=1)

### Feature Engineering

port call

In [537]:
# creating time col (duration spent during an event)
# duration travelled = current - prev earliest time
# duration in port = current latest - current earliest
def get_time_difference_in_seconds_from_utc(s1 , s2):
  try: 
    current_time_timestamp = dt.strptime(s1, '%Y-%m-%dT%H:%M:%S.%f%z').timestamp()
    prev_time_timestamp = dt.strptime(s2, '%Y-%m-%dT%H:%M:%S.%f%z').timestamp()
    duration_timestamp_seconds = abs(current_time_timestamp - prev_time_timestamp)
    return duration_timestamp_seconds
  except:
    # For when value is null
    return np.nan

def get_duration(col1 , col2):
  def calculate(rows):
    s1 = rows[col1]
    s2 = rows[col2]
    return round(get_time_difference_in_seconds_from_utc(s1 , s2) / 3600 , 5) # rounding to 5 dp
  
  return calculate


prev_anchor = ['prev_country_earliest_visit_time_utc' , 'prev_country_latest_visit_time_utc']
prev_to_sg_transit = ['prev_country_latest_visit_time_utc' , 'current_earliest_visit_time_utc']
sg_anchor = ['current_latest_visit_time_utc' , 'current_earliest_visit_time_utc']
sg_to_next_transit = ['current_latest_visit_time_utc' , 'next_country_earliest_visit_time_utc']
next_anchor = ['next_country_earliest_visit_time_utc' , 'next_country_latest_visit_time_utc']

port_call_dataset['Prev Anchor'] = port_call_dataset.loc[: , prev_anchor].apply(get_duration(*prev_anchor) , axis=1)
port_call_dataset['Prev to SG Transit'] = port_call_dataset.loc[: , prev_to_sg_transit].apply(get_duration(*prev_to_sg_transit), axis=1)
port_call_dataset['SG Anchor'] = port_call_dataset.loc[: , sg_anchor].apply(get_duration(*sg_anchor), axis=1)
port_call_dataset['SG to Next Transit'] = port_call_dataset.loc[: , sg_to_next_transit].apply(get_duration(*sg_to_next_transit) , axis=1)
port_call_dataset['Next Anchor'] = port_call_dataset.loc[: , next_anchor].apply(get_duration(*next_anchor) , axis=1)

AIS dataset

In [538]:
def get_state(row):
  TRANSIT_NAV_STAT = [0 , 3 , 4 , 8 , 11 , 12]
  if row['speed'] > 1 or row['nav_stat'] in TRANSIT_NAV_STAT:
    return 'Transit'
  return 'Anchored'

In [539]:
ais_dataset['state'] = ais_dataset.apply(get_state , axis=1)

In [540]:
ais_dataset['state']

0         Transit
1         Transit
2         Transit
3         Transit
4         Transit
           ...   
251764    Transit
251765    Transit
251766    Transit
251767    Transit
251768    Transit
Name: state, Length: 248013, dtype: object

In [541]:
# Adding the admission tier


### Calculating Before Emissions

In [542]:
# Calculating ef
ais_dataset['ef_me'] = ais_dataset['sfc_me'] * 0.867 * 3.667
ais_dataset['ef_ae'] = ais_dataset['sfc_ae'] * 0.867 * 3.667
ais_dataset['ef_ab'] = ais_dataset['sfc_ab'] * 0.867 * 3.667


In [543]:
# result["ratio"] = result["max_speed_kt"] / result["vref"]
# result["rounded"] = result["ratio"].apply(lambda x: 0 if x < 1 else 1)

# # Count the number of 0s and 1s
# counts = result["rounded"].value_counts()

# # Plot the pie chart
# plt.figure(figsize=(8, 6))
# counts.plot.pie(
#     labels=["Below 1 (0)", "Above or Equal to 1 (1)"],
#     autopct="%1.1f%%",
#     startangle=90,
#     colors=["lightblue", "orange"]
# )

# # Add a title
# plt.title("Distribution of Rounded Values", fontsize=14)
# plt.ylabel("")  # Hide the y-axis label
# plt.show()

In [544]:
print(np.mean(port_call_dataset['SG Anchor']))
print(np.mean(port_call_dataset['Prev Anchor']))
print(np.mean(port_call_dataset['Next Anchor'].notnull()))

3.9299050600096015
1.5400323979836774
0.971435429668747


In [545]:
print(np.sort(port_call_dataset['current_earliest_visit_time_utc'].unique()))
print(ais_dataset.columns)

['2024-06-30T00:20:08.000+0000' '2024-06-30T00:20:42.000+0000'
 '2024-06-30T00:45:03.000+0000' ... '2024-07-31T22:41:13.000+0000'
 '2024-07-31T23:15:01.000+0000' '2024-07-31T23:45:02.000+0000']
Index(['imo', 'vessel_name', 'vessel_type', 'group', 'timestamp', 'lon', 'lat',
       'nav_stat', 'speed', 'course', 'heading', 'fuel_category',
       'main_engine_fuel_type', 'aux_engine_fuel_type', 'engine_type',
       'anchorage', 'p', 'vref', 'sfc_me', 'sfc_ae', 'sfc_ab', 'ael', 'abl',
       'distance', 'state', 'ef_me', 'ef_ae', 'ef_ab'],
      dtype='object')


In [546]:
avgtime = pd.merge(port_call_dataset, ais_dataset[['imo', 'vessel_type']], how='inner', on='imo')
# Check the columns of the merged DataFrame to ensure 'vessel_type' is present
print(avgtime.columns)

# Now group by 'vessel_type' and calculate the mean of 'Sg Anchor'
avg_time_per_vessel_type = avgtime.groupby('vessel_type')['SG Anchor'].mean()
avg_speed_per_vessel_type = avgtime.groupby('vessel_type')['Prev to SG Transit'].mean()# Print the result
print(avg_time_per_vessel_type.nlargest(10))
print(avg_speed_per_vessel_type.nlargest(10))

Index(['current_snapshot_date_utc', 'imo', 'current_port_name',
       'current_port_country_name', 'current_earliest_visit_time_utc',
       'current_latest_visit_time_utc', 'current_zone_polygon_name',
       'current_zone_type', 'next_port_name', 'next_country_name',
       'next_country_earliest_visit_time_utc',
       'next_country_latest_visit_time_utc', 'next_zone_polygon_name',
       'next_snapshot_date', 'next_zone_type', 'prev_port_name',
       'prev_country_name', 'prev_country_earliest_visit_time_utc',
       'prev_country_latest_visit_time_utc', 'prev_zone_polygon_name',
       'prev_snapshot_date', 'prev_zone_type', 'Prev Anchor',
       'Prev to SG Transit', 'SG Anchor', 'SG to Next Transit', 'Next Anchor',
       'vessel_type'],
      dtype='object')
vessel_type
Deck Cargo Ship              22.287801
Shuttle Tanker               17.537780
Livestock Carrier            17.138988
Heavy Load Carrier           16.533120
LNG Tanker                   16.162102
Crude Oil Tank

### Attaching Emissions to port call

In [547]:
def calculate_emissions_aux(row):
    LA = row['ael']
    LBL = row['abl']
    EF_SFCAE = row['ef_ae']
    EF_SFCAB = row['ef_ab']
    base_emissions = (LA * EF_SFCAE + LBL * EF_SFCAB)
    return base_emissions


# Apply emissions calculations for 'Aux' and 'Transit' columns
ais_dataset['Aux'] = ais_dataset.apply(calculate_emissions_aux, axis=1)


In [548]:
ais_dataset['Aux_total'] = ais_dataset.groupby('imo')['Aux'].transform('sum')


In [549]:
print(ais_dataset['Aux_total'])


0         6.868089e+07
1         6.868089e+07
2         6.868089e+07
3         6.868089e+07
4         6.868089e+07
              ...     
251764    1.313553e+08
251765    1.313553e+08
251766    1.313553e+08
251767    1.313553e+08
251768    1.313553e+08
Name: Aux_total, Length: 248013, dtype: float64


In [586]:
def calculate_emissions_transit(row):
    try:
        if row['state'] != 'Transit':
            return 0

        IMO = row['imo']
        transit_and_imo_condition = (ais_dataset['state'] == 'Transit') & (ais_dataset['imo'] == IMO)

        speeds_in_transit = ais_dataset[transit_and_imo_condition]['speed']
        if speeds_in_transit.empty:
            return 0
        
        AVERAGE_SPEED = speeds_in_transit.mean()

        max_speed_info = distance_dataset[distance_dataset['imo'] == IMO]
        if max_speed_info.empty:
            return 0
        MAX_SPEED = max_speed_info['max_speed_kt'].values[0]

        LF = (AVERAGE_SPEED / MAX_SPEED) ** 3
        SUMMED_SPEED = speeds_in_transit.sum()

        EF_SFME = row['ef_me']
        P = row['p']
        A = row['speed'] / SUMMED_SPEED

        load_factor = f'{math.ceil(LF * 100)}%'
        LLAF = llaf_table[llaf_table['Load'] == load_factor]

        if LLAF.empty:
            LLAF = 1
        else:
            LLAF = LLAF.iloc[0]['CO2']

        emissions = P * LF * A * EF_SFME * LLAF
        return emissions
    except Exception as e:
        print(f"Error processing IMO {row['imo']}: {e}")
        return 0

# Apply the function to the dataset and test it with print statements
ais_dataset['Transit'] = ais_dataset.apply(calculate_emissions_transit, axis=1)

In [555]:

a= llaf_table[llaf_table['Load'] == '2%']['CO2']
