# Using Predictive Analytics to Predict Delays in Maritime Container Shipping

**Import Libraries**

In [10]:
import csv
import pandas as pd
import numpy as np 
import scipy.stats as stats

#http://pythondata.com/dask-large-csv-python/
#import dask.dataframe as dd
#https://www.dataquest.io/blog/pandas-big-data/

**Read in files**

In [3]:
#path to the 2014 Bill of Lading summary CSV (this cell only stores the path; the file is read in a later cell)
filename = "C:/temp/Enigma-BillofLadingSummary-2014.csv"

In [4]:
#columns to read in; names that are commented out are deliberately left out of the import
columns = [
    'identifier', 'trade_update_date', 'run_date',
    'vessel_name', 'port_of_unlading', 'estimated_arrival_date',
    'foreign_port_of_lading', 'record_status_indicator', 'place_of_receipt',
    'port_of_destination', 'foreign_port_of_destination',
    #'secondary_notify_party_1',
    'actual_arrival_date',
    'consignee_name',
    #'consignee_address', 'consignee_contact_name',
    #'consignee_comm_number_qualifier', 'consignee_comm_number',
    'shipper_party_name',
    #'shipper_address', 'shipper_contact_name',
    #'shipper_comm_number_qualifier', 'shipper_comm_number',
    'container_number', 'description_sequence_number',
    'piece_count', 'description_text',
    #'harmonized_number', 'harmonized_value',
    #'harmonized_weight', 'harmonized_weight_unit',
]

In [5]:
#names of the columns that hold dates
parse_dates = ['trade_update_date', 'run_date', 'estimated_arrival_date', 'actual_arrival_date']

In [6]:
#Data type for every imported column.
#String columns use category where possible (more memory efficient than object),
#integer columns use uint64 (unsigned integer), and the date columns are declared as str.
#Entries that are commented out belong to columns that are not imported.
dtypes2 = dict(
    identifier='uint64',
    trade_update_date='str',
    run_date='str',
    vessel_name='category',
    port_of_unlading='category',
    estimated_arrival_date='str',
    foreign_port_of_lading='category',
    record_status_indicator='category',
    place_of_receipt='category',
    port_of_destination='category',
    foreign_port_of_destination='category',
    #secondary_notify_party_1='category',
    actual_arrival_date='str',
    consignee_name='category',
    #consignee_address='category',
    #consignee_contact_name='category',
    #consignee_comm_number_qualifier='category',
    #consignee_comm_number='category',
    shipper_party_name='category',
    #shipper_address='category',
    #shipper_contact_name='category',
    #shipper_comm_number_qualifier='category',
    #shipper_comm_number='category',
    container_number='category',
    description_sequence_number='uint64',
    piece_count='uint64',
    description_text='category',
    #harmonized_number='category',
    #harmonized_value='category',
    #harmonized_weight='category',
    #harmonized_weight_unit='category',
)

In [7]:
#read the selected columns of the CSV into a data frame, using the declared dtypes and parsing the date columns
df = pd.read_csv(
    filename,
    usecols=columns,
    dtype=dtypes2,
    parse_dates=parse_dates,
)

In [8]:
#check that all rows imported; shape is a (number of rows, number of columns) tuple
df.shape

(11025607, 18)

In [9]:
#check the data type of each imported column
df.dtypes

identifier                             uint64
trade_update_date              datetime64[ns]
run_date                       datetime64[ns]
vessel_name                          category
port_of_unlading                     category
estimated_arrival_date         datetime64[ns]
foreign_port_of_lading               category
record_status_indicator              category
place_of_receipt                     category
port_of_destination                  category
foreign_port_of_destination          category
actual_arrival_date            datetime64[ns]
consignee_name                       category
shipper_party_name                   category
container_number                     category
description_sequence_number            uint64
piece_count                            uint64
description_text                     category
dtype: object

In [9]:
#save the complete data frame to a pickle file
df.to_pickle(path='D:/CUNY Files/capstone/code files/2014_data_with_deleted.pkl')

**Remove records where the record_status_indicator is Deleted**

In [10]:
#distinct values of the record status indicator
df['record_status_indicator'].unique()

[New, Amended, Deleted]
Categories (3, object): [New, Amended, Deleted]

In [11]:
#keep only the records whose status indicator is not 'Deleted'
df = df[df['record_status_indicator'].ne('Deleted')]

In [12]:
#row and column counts after the Deleted records were removed
df.shape

(10816209, 18)

In [13]:
#carrier = first word of the vessel name
#(will need to research this, probably doesn't always apply)
df['carrier'] = df['vessel_name'].str.split().str.get(0).astype('category')

In [14]:
#distinct carrier values derived from the vessel names
df['carrier'].unique()

[HYUNDAI, OOCL, COSCO, MSC, NAGOYA, ..., BOSTSWANA, MORSTON, HAMBURG, FLAG, EVERBRIGHT]
Length: 3735
Categories (3735, object): [HYUNDAI, OOCL, COSCO, MSC, ..., MORSTON, HAMBURG, FLAG, EVERBRIGHT]

In [25]:
#day of the week of the estimated and of the actual arrival date
#(dayofweek counts from Monday=0 to Sunday=6)
df['day_of_week_est'] = df['estimated_arrival_date'].dt.dayofweek.astype('uint8')
df['day_of_week_act'] = df['actual_arrival_date'].dt.dayofweek.astype('uint8')

In [17]:
#preview the first five rows, including the new day_of_week columns
df.head()

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,actual_arrival_date,consignee_name,shipper_party_name,container_number,description_sequence_number,piece_count,description_text,carrier,day_of_week_est,day_of_week_act
0,2014082158845,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,2014-08-19,IKEA DISTRIBUTION SERVICES INC. - S,TIANJIN TONG SHI INDUSTRY TRADE C,TGHU8036820,1,7881,IKEA HOME FURNISHING PRODUCTS TARIFF NUMBER 44...,HYUNDAI,3,1
1,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,2014-08-19,,,APZU3172532,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1
2,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,2014-08-19,,,APZU3599329,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1
3,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,2014-08-19,,,TRLU3176120,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1
4,2014082158983,2014-08-06,2014-08-21,OOCL SHENZHEN,"Long Beach, California",2014-08-19,"Yantian,China (Mainland)",New,"YANTIAN,CHINA",,...,2014-08-20,SHEPHERD CASTER CORP,BEST HOPEFUL INDUSTRIAL DEVELOPMENT,OOLU1064780,1,498,P.O.NO.050803/050938/051058/051197/051345/051 ...,OOCL,1,2


In [26]:
#month number of the estimated and of the actual arrival date
df['month_est'] = df['estimated_arrival_date'].dt.month.astype('uint8')
df['month_act'] = df['actual_arrival_date'].dt.month.astype('uint8')

In [19]:
#preview the first five rows, including the new month columns
df.head()

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,shipper_party_name,container_number,description_sequence_number,piece_count,description_text,carrier,day_of_week_est,day_of_week_act,month_est,month_act
0,2014082158845,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,TIANJIN TONG SHI INDUSTRY TRADE C,TGHU8036820,1,7881,IKEA HOME FURNISHING PRODUCTS TARIFF NUMBER 44...,HYUNDAI,3,1,8,8
1,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,,APZU3172532,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8
2,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,,APZU3599329,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8
3,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,,TRLU3176120,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8
4,2014082158983,2014-08-06,2014-08-21,OOCL SHENZHEN,"Long Beach, California",2014-08-19,"Yantian,China (Mainland)",New,"YANTIAN,CHINA",,...,BEST HOPEFUL INDUSTRIAL DEVELOPMENT,OOLU1064780,1,498,P.O.NO.050803/050938/051058/051197/051345/051 ...,OOCL,1,2,8,8


In [20]:
#number of days delayed: actual arrival date minus estimated arrival date, in whole days
df['delay_days'] = df['actual_arrival_date'].sub(df['estimated_arrival_date']).dt.days

In [29]:
#preview the first five rows, including the new delay_days column
df.head()

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,container_number,description_sequence_number,piece_count,description_text,carrier,day_of_week_est,day_of_week_act,month_est,month_act,delay_days
0,2014082158845,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,TGHU8036820,1,7881,IKEA HOME FURNISHING PRODUCTS TARIFF NUMBER 44...,HYUNDAI,3,1,8,8,5
1,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,APZU3172532,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
2,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,APZU3599329,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
3,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,TRLU3176120,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
4,2014082158983,2014-08-06,2014-08-21,OOCL SHENZHEN,"Long Beach, California",2014-08-19,"Yantian,China (Mainland)",New,"YANTIAN,CHINA",,...,OOLU1064780,1,498,P.O.NO.050803/050938/051058/051197/051345/051 ...,OOCL,1,2,8,8,1


In [28]:
#store delay_days as a 16-bit signed integer
df['delay_days'] = df['delay_days'].astype('int16')

In [30]:
#check the data types again now that the derived columns have been added
df.dtypes

identifier                             uint64
trade_update_date              datetime64[ns]
run_date                       datetime64[ns]
vessel_name                          category
port_of_unlading                     category
estimated_arrival_date         datetime64[ns]
foreign_port_of_lading               category
record_status_indicator              category
place_of_receipt                     category
port_of_destination                  category
foreign_port_of_destination          category
actual_arrival_date            datetime64[ns]
consignee_name                       category
shipper_party_name                   category
container_number                     category
description_sequence_number            uint64
piece_count                            uint64
description_text                     category
carrier                              category
day_of_week_est                         uint8
day_of_week_act                         uint8
month_est                         

In [None]:
#df.describe()

**Create pickle files**

In [31]:
#save the full data frame to a pickle file
df.to_pickle(path='D:/CUNY Files/capstone/code files/2014_data.pkl')

In [32]:
#read the pickle file back into a separate data frame to test that it loads
df_test = pd.read_pickle('D:/CUNY Files/capstone/code files/2014_data.pkl')

In [33]:
#view the first five rows of the data frame loaded from the pickle file
df_test.head()

Unnamed: 0,identifier,trade_update_date,run_date,vessel_name,port_of_unlading,estimated_arrival_date,foreign_port_of_lading,record_status_indicator,place_of_receipt,port_of_destination,...,container_number,description_sequence_number,piece_count,description_text,carrier,day_of_week_est,day_of_week_act,month_est,month_act,delay_days
0,2014082158845,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,TGHU8036820,1,7881,IKEA HOME FURNISHING PRODUCTS TARIFF NUMBER 44...,HYUNDAI,3,1,8,8,5
1,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,APZU3172532,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
2,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,APZU3599329,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
3,2014082158905,2014-07-30,2014-08-21,HYUNDAI FORCE,"Seattle, Washington",2014-08-14,"Pusan,South Korea",New,"XINGANG, PC",,...,TRLU3176120,1,80,KLA-GARD PACKED ON 60 PALLETS -DRILLING MUD AD...,HYUNDAI,3,1,8,8,5
4,2014082158983,2014-08-06,2014-08-21,OOCL SHENZHEN,"Long Beach, California",2014-08-19,"Yantian,China (Mainland)",New,"YANTIAN,CHINA",,...,OOLU1064780,1,498,P.O.NO.050803/050938/051058/051197/051345/051 ...,OOCL,1,2,8,8,1


In [34]:
#check the data types of the data frame loaded from the pickle file
df_test.dtypes

identifier                             uint64
trade_update_date              datetime64[ns]
run_date                       datetime64[ns]
vessel_name                          category
port_of_unlading                     category
estimated_arrival_date         datetime64[ns]
foreign_port_of_lading               category
record_status_indicator              category
place_of_receipt                     category
port_of_destination                  category
foreign_port_of_destination          category
actual_arrival_date            datetime64[ns]
consignee_name                       category
shipper_party_name                   category
container_number                     category
description_sequence_number            uint64
piece_count                            uint64
description_text                     category
carrier                              category
day_of_week_est                         uint8
day_of_week_act                         uint8
month_est                         

In [35]:
#top 10 US ports in 2014
US_ports_top10 = [
    'Los Angeles, California',
    'Long Beach, California',
    'New York/Newark Area, Newark, New Jersey',
    'Tacoma, Washington',
    'Houston, Texas',
    'Savannah, Georgia',
    'New York, New York',
    'Charleston, South Carolina',
    'Seattle, Washington',
    'Norfolk, Virginia',
]

In [36]:
#keep only the rows whose port_of_unlading is one of the top 10 US ports
df_top10_ports = df[df['port_of_unlading'].isin(US_ports_top10)]

In [37]:
#compare the shape of the top 10 ports subset with the shape of the full data frame
df_top10_ports.shape, df.shape

((8003697, 24), (10816209, 24))

In [39]:
#save the top 10 ports subset to its own pickle file
df_top10_ports.to_pickle(path='D:/CUNY Files/capstone/code files/2014_data_top10.pkl')