# Using Predictive Analytics to Predict Delays in Maritime Container Shipping

In [1]:
%matplotlib inline

**Import Libraries**

In [2]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import pylab 
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

#http://pythondata.com/dask-large-csv-python/
#import dask.dataframe as dd
 
#from collections import Counter
#from patsy import dmatrices
#import statsmodels.api as sm

#https://www.dataquest.io/blog/pandas-big-data/

**Read in file**

In [3]:
#load the pre-processed 2014 dataset from a pickle file in the working directory
#NOTE(review): unpickling can execute arbitrary code - only load this file if it comes from a trusted source
df = pd.read_pickle('2014_data.pkl')

In [4]:
#check that all rows imported
df.shape

(10816209, 24)

In [5]:
#check datatypes
df.dtypes

identifier                             uint64
trade_update_date              datetime64[ns]
run_date                       datetime64[ns]
vessel_name                          category
port_of_unlading                     category
estimated_arrival_date         datetime64[ns]
foreign_port_of_lading               category
record_status_indicator              category
place_of_receipt                     category
port_of_destination                  category
foreign_port_of_destination          category
actual_arrival_date            datetime64[ns]
consignee_name                       category
shipper_party_name                   category
container_number                     category
description_sequence_number            uint64
piece_count                            uint64
description_text                     category
carrier                              category
day_of_week_est                         uint8
day_of_week_act                         uint8
month_est                         

In [6]:
df.head()

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,container_number,description_sequence_number,piece_count,description_text,carrier,day_of_week_est,day_of_week_act,month_est,month_act,delay_days
0,2014082158845,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,TGHU8036820,1,7881,IKEA HOME FURNISHING PRODUCTS TARIFF NUMBER 44...,HYUNDAI,3,1,8,8,5
1,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,APZU3172532,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
2,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,APZU3599329,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
3,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,TRLU3176120,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
4,2014082158983,2014-08-06,2014-08-21,OOCL SHENZHEN,"Long Beach, California",2014-08-19,"Yantian,China (Mainland)",New,"YANTIAN,CHINA",,...,OOLU1064780,1,498,P.O.NO.050803/050938/051058/051197/051345/051 ...,OOCL,1,2,8,8,1


In [7]:
list(df)

['identifier',
 'trade_update_date',
 'run_date',
 'vessel_name',
 'port_of_unlading',
 'estimated_arrival_date',
 'foreign_port_of_lading',
 'record_status_indicator',
 'place_of_receipt',
 'port_of_destination',
 'foreign_port_of_destination',
 'actual_arrival_date',
 'consignee_name',
 'shipper_party_name',
 'container_number',
 'description_sequence_number',
 'piece_count',
 'description_text',
 'carrier',
 'day_of_week_est',
 'day_of_week_act',
 'month_est',
 'month_act',
 'delay_days']

**Get all vessels for the top West and East Coast U.S. ports**  
'Los Angeles, California',  
'New York/Newark Area, Newark, New Jersey' & 'New York, New York'  

In [8]:
#restrict the full dataset to the records whose port of unlading is Los Angeles
is_LA = df['port_of_unlading'] == 'Los Angeles, California'
df_LA = df[is_LA]

In [9]:
#distinct vessel names that appear in the Los Angeles records (df_LA holds that single port only)
vessels_LA = df_LA['vessel_name'].unique()
#number of distinct vessels
len(vessels_LA)

(1640, 1255)

**Get all observations for these vessels for vessel voyages that served these ports**  
- where the estimated arrival date is within d = 15 days before or after (roughly a 30-day window) any unlading date at these ports

In [10]:
#Get all unladings at the port by vessel_name and estimated_arrival_date
unl_dates_LA = df_LA[['vessel_name','estimated_arrival_date']].drop_duplicates()
#print() with a single argument gives the same output on Python 2 and Python 3
print(len(unl_dates_LA))
unl_dates_LA.head(10)

17635


Unnamed: 0,vessel_name,estimated_arrival_date
7,COSCO HARMONY,2014-08-19
66,YM MUTUALITY,2014-08-19
67,YM MUTUALITY,2014-08-18
69,SEA-LAND CHARGER,2014-07-28
74,CSCL SUMMER,2014-08-17
84,MOL CONTRIBUTION,2014-08-18
87,NYK TERRA,2014-08-18
111,YM MUTUALITY,2014-08-20
139,MAERSK WOLFSBURG,2014-08-20
147,GUSTAV MAERSK,2014-08-17


In [11]:
#identify the range d (+/- days around the estimated arrival dates)
d = 15

#start from the distinct (vessel, estimated arrival date) pairs at the port rather than
#mutating unl_dates_LA in place, so this cell can be re-run without failing on a column
#that a previous run already dropped
arrivals_LA = df_LA[['vessel_name','estimated_arrival_date']].drop_duplicates()

#keep only the vessel name, then add one date column per offset: col-15 ... col0 ... col15
unl_dates_LA = arrivals_LA[['vessel_name']].copy()
cols = []
for i in range(-d, d+1):
    colname = 'col' + str(i)
    unl_dates_LA[colname] = arrivals_LA.estimated_arrival_date + timedelta(days=i)
    cols.append(colname)

In [12]:
#view the data
#print() with a single argument gives the same output on Python 2 and Python 3
print(cols)
unl_dates_LA.head()

['col-15', 'col-14', 'col-13', 'col-12', 'col-11', 'col-10', 'col-9', 'col-8', 'col-7', 'col-6', 'col-5', 'col-4', 'col-3', 'col-2', 'col-1', 'col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col15']


Unnamed: 0,vessel_name,col-15,col-14,col-13,col-12,col-11,col-10,col-9,col-8,col-7,...,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15
7,COSCO HARMONY,2014-08-04,2014-08-05,2014-08-06,2014-08-07,2014-08-08,2014-08-09,2014-08-10,2014-08-11,2014-08-12,...,2014-08-25,2014-08-26,2014-08-27,2014-08-28,2014-08-29,2014-08-30,2014-08-31,2014-09-01,2014-09-02,2014-09-03
66,YM MUTUALITY,2014-08-04,2014-08-05,2014-08-06,2014-08-07,2014-08-08,2014-08-09,2014-08-10,2014-08-11,2014-08-12,...,2014-08-25,2014-08-26,2014-08-27,2014-08-28,2014-08-29,2014-08-30,2014-08-31,2014-09-01,2014-09-02,2014-09-03
67,YM MUTUALITY,2014-08-03,2014-08-04,2014-08-05,2014-08-06,2014-08-07,2014-08-08,2014-08-09,2014-08-10,2014-08-11,...,2014-08-24,2014-08-25,2014-08-26,2014-08-27,2014-08-28,2014-08-29,2014-08-30,2014-08-31,2014-09-01,2014-09-02
69,SEA-LAND CHARGER,2014-07-13,2014-07-14,2014-07-15,2014-07-16,2014-07-17,2014-07-18,2014-07-19,2014-07-20,2014-07-21,...,2014-08-03,2014-08-04,2014-08-05,2014-08-06,2014-08-07,2014-08-08,2014-08-09,2014-08-10,2014-08-11,2014-08-12
74,CSCL SUMMER,2014-08-02,2014-08-03,2014-08-04,2014-08-05,2014-08-06,2014-08-07,2014-08-08,2014-08-09,2014-08-10,...,2014-08-23,2014-08-24,2014-08-25,2014-08-26,2014-08-27,2014-08-28,2014-08-29,2014-08-30,2014-08-31,2014-09-01


In [13]:
#reshape wide -> long so that every offset column becomes its own row
#the value column is called estimated_arrival_date so it can be compared to the original file in the next step
melt_options = dict(
    id_vars=['vessel_name'],
    value_vars=cols,
    var_name='colname',
    value_name='estimated_arrival_date',
)
unl_flag_LA = pd.melt(unl_dates_LA, **melt_options)

In [14]:
#view the data
#print() with a single argument gives the same output on Python 2 and Python 3
print(unl_flag_LA.shape)
unl_flag_LA.head()

(546685, 3)


Unnamed: 0,vessel_name,colname,estimated_arrival_date
0,COSCO HARMONY,col-15,2014-08-04
1,YM MUTUALITY,col-15,2014-08-04
2,YM MUTUALITY,col-15,2014-08-03
3,SEA-LAND CHARGER,col-15,2014-07-13
4,CSCL SUMMER,col-15,2014-08-02


In [15]:
#keep only the two join keys and collapse repeated (vessel, date) pairs
key_cols = ['vessel_name', 'estimated_arrival_date']
unl_flag_LA = unl_flag_LA.loc[:, key_cols].drop_duplicates()
unl_flag_LA.shape

(155116, 2)

In [16]:
#flag every observation (at any port of unlading) that belongs to a vessel using these ports:
#a row matches when its vessel_name and estimated_arrival_date appear in unl_flag_LA,
#i.e. the date lies within d days of an unlading observation at these ports
#the left join keeps every row of the original dataframe; the indicator column 'flag'
#reads 'both' for matched rows and 'left_only' for the rest
join_keys = ['vessel_name', 'estimated_arrival_date']
df_flag_LA = df.merge(unl_flag_LA, how='left', on=join_keys, indicator='flag')

In [19]:
df_flag_LA.head()

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,description_sequence_number,piece_count,description_text,carrier,day_of_week_est,day_of_week_act,month_est,month_act,delay_days,flag
0,2014082158845,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,1,7881,IKEA HOME FURNISHING PRODUCTS TARIFF NUMBER 44...,HYUNDAI,3,1,8,8,5,left_only
1,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5,left_only
2,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5,left_only
3,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5,left_only
4,2014082158983,2014-08-06,2014-08-21,OOCL SHENZHEN,"Long Beach, California",2014-08-19,"Yantian,China (Mainland)",New,"YANTIAN,CHINA",,...,1,498,P.O.NO.050803/050938/051058/051197/051345/051 ...,OOCL,1,2,8,8,1,both


In [30]:
#get just the observations where the flag = both (rows that matched an entry in unl_flag_LA)
matched = df_flag_LA['flag'] == 'both'
df_v_LA = df_flag_LA[matched]
df_v_LA.shape

(5126351, 25)

**Identify Vessel Voyages**  
group vessels by dates within d days of each other - assume longer than d days is a different voyage

In [39]:
#order the rows by vessel, then estimated arrival date, then port of unlading
sort_keys = ['vessel_name', 'estimated_arrival_date', 'port_of_unlading']
df_v_LA = df_v_LA.sort_values(by=sort_keys)
df_v_LA.head(10)

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,description_sequence_number,piece_count,description_text,carrier,day_of_week_est,day_of_week_act,month_est,month_act,delay_days,flag
7154900,2014081953955,2014-07-31,2014-08-19,GUSTAV MAERSK,"Los Angeles, California",2014-08-17,"Yantian,China (Mainland)",New,YANTIAN,,...,1,224,RATCHET RATCHET,GUSTAV,6,0,8,8,1,both
7915738,2014060342997,2014-05-15,2014-06-03,XIN YA ZHOU,"Los Angeles, California",2014-05-30,"Ningpo ,China (Mainland)",New,NINGBO,,...,1,4,ROLLER . .,XIN,4,6,5,6,2,both
10172071,2014060343078,2014-05-15,2014-06-03,XIN YA ZHOU,"Los Angeles, California",2014-05-30,"Ningpo ,China (Mainland)",New,NINGBO,,...,1,10,RADIAL BALL BEARING . .,XIN,4,6,5,6,2,both
1843760,201404011097,2014-02-07,2014-04-01,1406,"Los Angeles, California",2014-03-17,"Kwangyang,South Korea",New,KARACHI-PAKISTAN,,...,1,772,KNITTED SOCKS,1406,0,0,3,3,14,both
138311,2014030426213,2014-02-04,2014-03-04,610140003896,"Los Angeles, California",2014-02-27,"Singapore,Singapore",New,JAKARTA,,...,1,190,COMPONENT FOR HYDRAULIC EXCAVATOR,610140003896,3,5,2,3,2,both
4184140,2014080128972,2014-07-10,2014-08-01,9074389,"Los Angeles, California",2014-07-29,"Xingang,China (Mainland)",New,TIANJIN,,...,1,39,AIR MOVING DEVICE FMC#8400-00-0482-0001 39CTNS...,9074389,1,3,7,7,2,both
7815745,2014012138431,2014-01-03,2014-01-21,9074391,"Los Angeles, California",2014-01-20,"Pusan,South Korea",New,XINGANG,,...,1,156,GLASSWARE,9074391,0,0,1,1,0,both
1988280,2014092437176,2014-09-05,2014-09-24,9074547,"Los Angeles, California",2014-09-23,"Xingang,China (Mainland)",New,TIANJIN,,...,1,4,AC TRACTION MOTOR,9074547,1,1,9,9,0,both
4373677,2014040245889,2014-03-19,2014-04-02,9077276,"Los Angeles, California",2014-03-21,"Yokohama,Japan",New,YOKOHAMA,,...,1,3,WELDER HS CODE: 8515.21,9077276,4,1,3,4,11,both
9005066,2014040245882,2014-03-19,2014-04-02,9077276,"Los Angeles, California",2014-03-21,"Tokyo ,Japan",New,TOKYO,,...,1,1,"""SEIKO"" BRAND INDUSTRIAL SEWING MACHINE HS COD...",9077276,4,1,3,4,11,both


In [43]:
#within each vessel, shift estimated_arrival_date down one row so every row also carries the prior row's date
#the first row of each vessel has no prior row and gets NaT
est_dates_by_vessel = df_v_LA.groupby('vessel_name')['estimated_arrival_date']
df_v_LA['prior_est_arr_date'] = est_dates_by_vessel.shift(1)

In [44]:
df_v_LA.head()

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,piece_count,description_text,carrier,day_of_week_est,day_of_week_act,month_est,month_act,delay_days,flag,prior_est_arr_date
7154900,2014081953955,2014-07-31,2014-08-19,GUSTAV MAERSK,"Los Angeles, California",2014-08-17,"Yantian,China (Mainland)",New,YANTIAN,,...,224,RATCHET RATCHET,GUSTAV,6,0,8,8,1,both,NaT
7915738,2014060342997,2014-05-15,2014-06-03,XIN YA ZHOU,"Los Angeles, California",2014-05-30,"Ningpo ,China (Mainland)",New,NINGBO,,...,4,ROLLER . .,XIN,4,6,5,6,2,both,NaT
10172071,2014060343078,2014-05-15,2014-06-03,XIN YA ZHOU,"Los Angeles, California",2014-05-30,"Ningpo ,China (Mainland)",New,NINGBO,,...,10,RADIAL BALL BEARING . .,XIN,4,6,5,6,2,both,2014-05-30
1843760,201404011097,2014-02-07,2014-04-01,1406,"Los Angeles, California",2014-03-17,"Kwangyang,South Korea",New,KARACHI-PAKISTAN,,...,772,KNITTED SOCKS,1406,0,0,3,3,14,both,NaT
138311,2014030426213,2014-02-04,2014-03-04,610140003896,"Los Angeles, California",2014-02-27,"Singapore,Singapore",New,JAKARTA,,...,190,COMPONENT FOR HYDRAULIC EXCAVATOR,610140003896,3,5,2,3,2,both,NaT


In [63]:
#check that the first prior date is missing (NaT)
#NaT is defined to compare unequal to everything, itself included, so use an explicit null test instead of ==
pd.isnull(df_v_LA.prior_est_arr_date.values[0])

  if __name__ == '__main__':


True

In [78]:
#check if the current estimated arrival date is within d days of the prior estimated arrival date
#rows whose prior date is NaT come out as False
latest_same_voyage_date = df_v_LA.prior_est_arr_date + timedelta(days=d)
df_v_LA['same_as_prior'] = df_v_LA.estimated_arrival_date <= latest_same_voyage_date

In [80]:
#mark the start of a voyage: 1 where same_as_prior is False, 0 where it is True
#(a running sum over these markers gives the voyage numbers)
starts_new_voyage = ~df_v_LA['same_as_prior']
df_v_LA['new_voyage'] = starts_new_voyage * 1

In [100]:
#running sum of the new_voyage markers, computed separately for each vessel, gives the voyage number
new_voyage_by_vessel = df_v_LA.groupby('vessel_name')['new_voyage']
df_v_LA['voyage'] = new_voyage_by_vessel.cumsum()

In [103]:
df_v_LA.head(50)

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,day_of_week_est,day_of_week_act,month_est,month_act,delay_days,flag,prior_est_arr_date,same_as_prior,new_voyage,voyage
7154900,2014081953955,2014-07-31,2014-08-19,GUSTAV MAERSK,"Los Angeles, California",2014-08-17,"Yantian,China (Mainland)",New,YANTIAN,,...,6,0,8,8,1,both,NaT,False,1,1
7915738,2014060342997,2014-05-15,2014-06-03,XIN YA ZHOU,"Los Angeles, California",2014-05-30,"Ningpo ,China (Mainland)",New,NINGBO,,...,4,6,5,6,2,both,NaT,False,1,1
10172071,2014060343078,2014-05-15,2014-06-03,XIN YA ZHOU,"Los Angeles, California",2014-05-30,"Ningpo ,China (Mainland)",New,NINGBO,,...,4,6,5,6,2,both,2014-05-30,True,0,1
1843760,201404011097,2014-02-07,2014-04-01,1406,"Los Angeles, California",2014-03-17,"Kwangyang,South Korea",New,KARACHI-PAKISTAN,,...,0,0,3,3,14,both,NaT,False,1,1
138311,2014030426213,2014-02-04,2014-03-04,610140003896,"Los Angeles, California",2014-02-27,"Singapore,Singapore",New,JAKARTA,,...,3,5,2,3,2,both,NaT,False,1,1
4184140,2014080128972,2014-07-10,2014-08-01,9074389,"Los Angeles, California",2014-07-29,"Xingang,China (Mainland)",New,TIANJIN,,...,1,3,7,7,2,both,NaT,False,1,1
7815745,2014012138431,2014-01-03,2014-01-21,9074391,"Los Angeles, California",2014-01-20,"Pusan,South Korea",New,XINGANG,,...,0,0,1,1,0,both,NaT,False,1,1
1988280,2014092437176,2014-09-05,2014-09-24,9074547,"Los Angeles, California",2014-09-23,"Xingang,China (Mainland)",New,TIANJIN,,...,1,1,9,9,0,both,NaT,False,1,1
4373677,2014040245889,2014-03-19,2014-04-02,9077276,"Los Angeles, California",2014-03-21,"Yokohama,Japan",New,YOKOHAMA,,...,4,1,3,4,11,both,NaT,False,1,1
9005066,2014040245882,2014-03-19,2014-04-02,9077276,"Los Angeles, California",2014-03-21,"Tokyo ,Japan",New,TOKYO,,...,4,1,3,4,11,both,2014-03-21,True,0,1


**Explore what voyages look like for one vessel**

In [107]:
#number of voyages per vessel (the highest voyage number seen), most voyages first,
#used to pick a vessel with a lot of voyages
voyages_per_vessel = df_v_LA.groupby('vessel_name')[['voyage']].max()
voyages_per_vessel.sort_values('voyage', ascending=False)

Unnamed: 0_level_0,voyage
vessel_name,Unnamed: 1_level_1
MAERSK WINNIPEG,13
MOL MANEUVER,13
APL THAILAND,13
MAERSK WOLFSBURG,13
ESTHER SCHULTE,12
YM MASCULINITY,12
YM MOVEMENT,11
MOL MAGNIFICENCE,11
HAMMONIA ROMA,11
MOL MISSION,11


In [115]:
#check that there are other ports besides LA in the new dataframe
df_v_LA.port_of_unlading.unique()

[Los Angeles, California, Seattle, Washington, Oakland, California, Long Beach, California, Los Angeles International Airport, Los Angeles..., ..., Brunswick, Georgia, San Francisco International Airport, San Franc..., Port San Luis, California, Omaha, Nebraska, Buffalo Niagara Falls, New York]
Length: 119
Categories (118, object): [Los Angeles, California, Seattle, Washington, Oakland, California, Long Beach, California, ..., San Francisco International Airport, San Franc..., Port San Luis, California, Omaha, Nebraska, Buffalo Niagara Falls, New York]

In [201]:
#pull the voyage-related columns for a single vessel
voyage_cols = ['vessel_name', 'foreign_port_of_lading', 'port_of_unlading',
               'voyage', 'estimated_arrival_date', 'actual_arrival_date',
               'delay_days']
vessel1 = df_v_LA.loc[df_v_LA.vessel_name == 'MAERSK WINNIPEG', voyage_cols]

In [124]:
#first 50 distinct rows, ordered by voyage and then foreign port of lading
vessel1_distinct = vessel1.sort_values(['voyage', 'foreign_port_of_lading']).drop_duplicates()
vessel1_distinct.head(50)
#d = 15 seems reasonable for this vessel

Unnamed: 0,vessel_name,foreign_port_of_lading,port_of_unlading,voyage,estimated_arrival_date,actual_arrival_date,delay_days
1225053,MAERSK WINNIPEG,"Balboa,Panama","Los Angeles, California",1,2013-12-26,2013-12-26,0
8951218,MAERSK WINNIPEG,"Acajutla,El Salvador","Los Angeles, California",2,2014-01-22,2014-01-22,0
2352499,MAERSK WINNIPEG,"Balboa,Panama","Los Angeles, California",2,2014-01-22,2014-01-22,0
948146,MAERSK WINNIPEG,"Corinto,Nicaragua","Los Angeles, California",2,2014-01-22,2014-01-22,0
8654451,MAERSK WINNIPEG,"Lazaro Cardenas,Mexico","Los Angeles, California",2,2014-01-22,2014-01-22,0
948183,MAERSK WINNIPEG,"Puerto Quetzal ,Guatemala","Los Angeles, California",2,2014-01-22,2014-01-22,0
3796786,MAERSK WINNIPEG,"Acajutla,El Salvador","Los Angeles, California",3,2014-02-19,2014-02-19,0
1265139,MAERSK WINNIPEG,"Balboa,Panama","Los Angeles, California",3,2014-02-19,2014-02-19,0
3518328,MAERSK WINNIPEG,"Corinto,Nicaragua","Los Angeles, California",3,2014-02-19,2014-02-19,0
7280020,MAERSK WINNIPEG,"Lazaro Cardenas,Mexico","Los Angeles, California",3,2014-02-19,2014-02-19,0


**Delay Range by Voyage**  
look at the ranges around delays by voyage - look at examples that have a wide range

In [171]:
#get min and max delay by vessel name and voyage
#the aggregations are given by name ('min', 'max') rather than as the Python builtins
#the group keys are left in the index here; the following cell's reset_index turns them into columns
delay_by_voyage = df_v_LA[['vessel_name','voyage','delay_days']].groupby(['vessel_name','voyage']).agg(['min','max'])

In [172]:
#reformat columns: flatten the two-level (column, aggregate) labels into single names such as delay_days_min
#only do this while the labels are still two-level, so re-running the cell is a no-op;
#joining an already flat string label would insert '_' between every character
if isinstance(delay_by_voyage.columns, pd.MultiIndex):
    delay_by_voyage.columns = ['_'.join(tup).rstrip('_') for tup in delay_by_voyage.columns.values]
    #move the index levels into ordinary columns (assignment instead of inplace=True)
    delay_by_voyage = delay_by_voyage.reset_index()
delay_by_voyage.head()

Unnamed: 0,vessel_name,voyage,delay_days_min,delay_days_max
0,GUSTAV MAERSK,1,1,1
1,XIN YA ZHOU,1,2,2
2,1406,1,14,14
3,610140003896,1,2,2
4,9074389,1,2,2


In [177]:
#spread of the delay within each voyage: largest delay minus smallest delay
longest_delay = delay_by_voyage['delay_days_max']
shortest_delay = delay_by_voyage['delay_days_min']
delay_by_voyage['delay_diff'] = longest_delay - shortest_delay

In [180]:
#view shape
delay_by_voyage.shape

(3755, 5)

In [181]:
#see number of rows where the difference between the min and max is 5 days or more
wide_range = delay_by_voyage['delay_diff'] >= 5
delay_by_voyage[wide_range].shape

(2202, 5)

In [184]:
#look at some of the large discrepancies
delay_by_voyage.sort_values('delay_diff', ascending = False).head()

Unnamed: 0,vessel_name,voyage,delay_days_min,delay_days_max,delay_diff
2723,NYK PEGASUS,6,-724,15,739
2769,NYK THESEUS,6,-559,5,564
3024,SANTA PAOLA,2,-525,7,532
2870,OOCL LONG BEACH,2,-467,28,495
878,COSCO KOREA,8,-460,30,490


In [189]:
delay_by_voyage[delay_by_voyage.vessel_name == 'NYK PEGASUS']

Unnamed: 0,vessel_name,voyage,delay_days_min,delay_days_max,delay_diff
2718,NYK PEGASUS,1,732,733,1
2719,NYK PEGASUS,2,6,370,364
2720,NYK PEGASUS,3,0,0,0
2721,NYK PEGASUS,4,15,15,0
2722,NYK PEGASUS,5,-8,11,19
2723,NYK PEGASUS,6,-724,15,739
2724,NYK PEGASUS,7,-13,11,24
2725,NYK PEGASUS,8,-11,5,16


In [202]:
#view one of the vessels - unique records only
voyage_cols = ['vessel_name', 'foreign_port_of_lading', 'port_of_unlading',
               'voyage', 'estimated_arrival_date', 'actual_arrival_date',
               'delay_days']
is_pegasus = df_v_LA.vessel_name == 'NYK PEGASUS'
vessel2 = df_v_LA.loc[is_pegasus, voyage_cols].drop_duplicates()

In [205]:
#check the number of records
vessel2.shape

(258, 7)

Observations on vessel 2:
- voyage 1: the actual arrival dates and estimated arrival dates appear to have errors in the year. Possible rule: change the year of the estimated arrival date if the year is greater than 2014 and the month and day are the same - the arrival is on time if the years are changed.
- voyage 2: is probably the same as voyage 1 - has similar issues, the years appear to be wrong: possibly change all non-2014 years to 2014 except december 2013 estimated arrival dates
- Combine port of Los Angeles and port of Long Beach  (like we did with NY and Newark) - and add a column that combines these as one
- Chicago appears in the list and this is clearly a mistake - need to identify west coast vs east coast ports and either modify the entry to make it the correct port or delete these records if there aren't very many
- appears that in this case ships go to LA first and then to Oakland, which means oakland wouldn't be a stop along the way to LA
- Voyage 3: is probably part of a later voyage, once year is changed to 2014
- Voyage 4??? dates are from late in 2013 - not sure if this is an error or should be 2014
- Voyage 5: texas and east coast ports are showing up in the list see fourth bullet
- Voyage 6: there is both a 02-16 and a 02-03 arrival into LA - this doesn't make sense... the 02-16 would sync up with voyage 1 - the 02-03 appears to be wrong, but not sure how to fix this systematically - the 03-29 is a new voyage, but there is a 03-03 and 03-18 in the estimated arrival date that is causing it to be lumped into 6 - solution might be to check if there are estimated_arrival_date values that are unique for the vessel, and drop them if there aren't very many or adjust them to the next lower estimated_arrival_date - possible they aren't actually unique though since I'm only showing a drop duplicates view...
- voyage 7 - looks ok
- keep only Los Angeles and Long Beach as port_of_unlading but check how many observations have non-west coast ports.


In [228]:
#rows at positions 250 through 257 of vessel2
vessel2.iloc[250:258]

Unnamed: 0,vessel_name,foreign_port_of_lading,port_of_unlading,voyage,estimated_arrival_date,actual_arrival_date,delay_days
9621807,NYK PEGASUS,"Shanghai ,China (Mainland)","Los Angeles, California",8,2014-06-26,2014-06-20,-6
10464317,NYK PEGASUS,"Shanghai ,China (Mainland)","Los Angeles, California",8,2014-06-27,2014-06-20,-7
9337768,NYK PEGASUS,"Shanghai ,China (Mainland)","Los Angeles, California",8,2014-06-28,2014-06-20,-8
8765823,NYK PEGASUS,"Shanghai ,China (Mainland)","Chicago, Illinois",8,2014-06-29,2014-06-20,-9
1622462,NYK PEGASUS,"Shanghai ,China (Mainland)","Dallas/Ft. Worth Airport, Texas",8,2014-06-29,2014-06-20,-9
6798565,NYK PEGASUS,"Shanghai ,China (Mainland)","Long Beach, California",8,2014-06-29,2014-06-20,-9
1344891,NYK PEGASUS,"Shanghai ,China (Mainland)","Los Angeles, California",8,2014-06-30,2014-06-20,-10
9337818,NYK PEGASUS,"Shanghai ,China (Mainland)","Los Angeles, California",8,2014-07-01,2014-06-20,-11
