In [156]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [157]:
df = pd.read_excel("VesselData.xlsx")
df

Unnamed: 0,eta,ata,atd,vesseldwt,vesseltype,discharge1,load1,discharge2,load2,discharge3,...,load4,stevedorenames,hasnohamis,earliesteta,latesteta,traveltype,previousportid,nextportid,isremarkable,vesselid
0,2017-09-19 00:00:00+00,2017-09-19 00:00:00+00,2017-09-22 00:00:00+00,109290.0,5.0,0.0,0.0,0.0,0.0,90173.0,...,0.0,Stevedore_104,,2017-09-19 00:00:00+00,2017-09-19 00:00:00+00,ARRIVAL,981.0,731.0,f,2242.0
1,2017-10-02 00:00:00+00,2017-10-02 00:00:00+00,2017-10-03 00:00:00+00,67170.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Stevedore_109,,2017-10-02 00:00:00+00,2017-10-02 00:00:00+00,ARRIVAL,19.0,15.0,f,5462.0
2,2017-09-30 00:00:00+00,2017-09-30 00:00:00+00,2017-10-01 00:00:00+00,67737.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Stevedore_57,,2017-09-30 00:00:00+00,2017-09-30 00:00:00+00,ARRIVAL,19.0,19.0,f,5251.0
3,2017-10-02 00:00:00+00,2017-10-02 00:00:00+00,2017-10-03 00:00:00+00,43600.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Stevedore_57,,2017-10-02 00:00:00+00,2017-10-02 00:00:00+00,ARRIVAL,15.0,18.0,f,5268.0
4,2017-10-02 00:00:00+00,2017-10-02 00:00:00+00,2017-10-02 00:00:00+00,9231.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Stevedore_98,,2017-10-02 00:00:00+00,2017-10-02 00:00:00+00,ARRIVAL,74.0,27.0,f,5504.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8203,2017-11-03 00:00:00+00,2017-11-03 00:00:00+00,2017-11-04 00:00:00+00,9587.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Stevedore_64,,2017-11-02 00:00:00+00,2017-11-03 00:00:00+00,ARRIVAL,5.0,19.0,f,5681.0
8204,2017-11-04 00:00:00+00,2017-11-04 00:00:00+00,2017-11-06 00:00:00+00,9654.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,"Stevedore_110,Stevedore_57,Stevedore_99,Steved...",,2017-11-04 00:00:00+00,2017-11-05 00:00:00+00,SHIFT,391.0,102.0,f,4843.0
8205,2017-11-08 00:00:00+00,2017-11-07 00:00:00+00,2017-11-11 00:00:00+00,4726.0,5.0,0.0,0.0,0.0,0.0,0.0,...,3051.0,"Stevedore_89,Stevedore_79,Stevedore_75,Stevedo...",,2017-11-07 00:00:00+00,2017-11-10 00:00:00+00,SHIFT,1043.0,19.0,f,3115.0
8206,2017-11-10 00:00:00+00,2017-11-10 00:00:00+00,2017-11-10 00:00:00+00,13320.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,Stevedore_46,,2017-11-10 00:00:00+00,2017-11-10 00:00:00+00,ARRIVAL,54.0,71.0,f,4623.0


In [158]:
df.columns

Index(['eta', 'ata', 'atd', 'vesseldwt', 'vesseltype', 'discharge1', 'load1',
       'discharge2', 'load2', 'discharge3', 'load3', 'discharge4', 'load4',
       'stevedorenames', 'hasnohamis', 'earliesteta', 'latesteta',
       'traveltype', 'previousportid', 'nextportid', 'isremarkable',
       'vesselid'],
      dtype='object')

We need to check the null values

In [159]:
df.isnull().sum(axis = 0)

eta                  0
ata                  0
atd                  0
vesseldwt            2
vesseltype           0
discharge1           0
load1                0
discharge2           0
load2                0
discharge3           0
load3                0
discharge4           0
load4                0
stevedorenames       2
hasnohamis        8208
earliesteta          0
latesteta            0
traveltype           0
previousportid       0
nextportid           0
isremarkable         0
vesselid             0
dtype: int64

All the parameters of interest like load1, load2, etc. are devoid of null values hence we can proceed to the next step.

We need to find out the total of load and discharge activity, hence I am summing them up and storing as a column referred to as total. The problem is regression problem.

In [160]:
df["Total"] = df["load1"]+df["load2"]+ df["load3"] + df["load4"] + df["discharge1"]+df["discharge2"]+df["discharge3"]+df["discharge4"]

The column hasnohamis is full of nan values hence is dropped.

In [161]:
df = df.drop(columns = ["hasnohamis"])

In [162]:
df.columns

Index(['eta', 'ata', 'atd', 'vesseldwt', 'vesseltype', 'discharge1', 'load1',
       'discharge2', 'load2', 'discharge3', 'load3', 'discharge4', 'load4',
       'stevedorenames', 'earliesteta', 'latesteta', 'traveltype',
       'previousportid', 'nextportid', 'isremarkable', 'vesselid', 'Total'],
      dtype='object')

In [163]:
df.isnull().sum(axis = 0)

eta               0
ata               0
atd               0
vesseldwt         2
vesseltype        0
discharge1        0
load1             0
discharge2        0
load2             0
discharge3        0
load3             0
discharge4        0
load4             0
stevedorenames    2
earliesteta       0
latesteta         0
traveltype        0
previousportid    0
nextportid        0
isremarkable      0
vesselid          0
Total             0
dtype: int64

We replace the nan values with mode of the data, we could also have chosen median or mean. We could have even used linear regression and interplated the data, but since time is short I chose to use mode of the data and replace nan values with it.

In [164]:
df['stevedorenames'].fillna(df['stevedorenames'].mode()[0], inplace=True)
df['vesseldwt'].fillna(df['vesseldwt'].mode()[0], inplace=True)



In [165]:
df.isnull().sum(axis = 0)

eta               0
ata               0
atd               0
vesseldwt         0
vesseltype        0
discharge1        0
load1             0
discharge2        0
load2             0
discharge3        0
load3             0
discharge4        0
load4             0
stevedorenames    0
earliesteta       0
latesteta         0
traveltype        0
previousportid    0
nextportid        0
isremarkable      0
vesselid          0
Total             0
dtype: int64

In [166]:
len(set(df["stevedorenames"]))

1374

Now the column stevedorenames could has 1374 types(given the size of the data which is 8000, hence it feels like a good way to convert it to category)

In [167]:
le = LabelEncoder()

df["stevedorenames"] = le.fit_transform(df["stevedorenames"])
df["traveltype"] = le.fit_transform(df["traveltype"])
df["isremarkable"] = le.fit_transform(df["isremarkable"])





In [168]:
Y = np.array(df["Total"])

In [169]:
df = df.drop(["eta","ata","atd","earliesteta","latesteta","Total"],axis = 1)

In [170]:
df

Unnamed: 0,vesseldwt,vesseltype,discharge1,load1,discharge2,load2,discharge3,load3,discharge4,load4,stevedorenames,traveltype,previousportid,nextportid,isremarkable,vesselid
0,109290.0,5.0,0.0,0.0,0.0,0.0,90173.0,0.0,0.0,0.0,66,0,981.0,731.0,0,2242.0
1,67170.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116,0,19.0,15.0,0,5462.0
2,67737.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,547,0,19.0,19.0,0,5251.0
3,43600.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,547,0,15.0,18.0,0,5268.0
4,9231.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1188,0,74.0,27.0,0,5504.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8203,9587.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,729,0,5.0,19.0,0,5681.0
8204,9654.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,154,1,391.0,102.0,0,4843.0
8205,4726.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3537.0,3051.0,1128,1,1043.0,19.0,0,3115.0
8206,13320.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500,0,54.0,71.0,0,4623.0


In [171]:
X = np.array(df)

In [172]:
X

array([[1.0929e+05, 5.0000e+00, 0.0000e+00, ..., 7.3100e+02, 0.0000e+00,
        2.2420e+03],
       [6.7170e+04, 3.0000e+00, 0.0000e+00, ..., 1.5000e+01, 0.0000e+00,
        5.4620e+03],
       [6.7737e+04, 3.0000e+00, 0.0000e+00, ..., 1.9000e+01, 0.0000e+00,
        5.2510e+03],
       ...,
       [4.7260e+03, 5.0000e+00, 0.0000e+00, ..., 1.9000e+01, 0.0000e+00,
        3.1150e+03],
       [1.3320e+04, 3.0000e+00, 0.0000e+00, ..., 7.1000e+01, 0.0000e+00,
        4.6230e+03],
       [1.1020e+04, 3.0000e+00, 0.0000e+00, ..., 1.4000e+01, 0.0000e+00,
        4.7850e+03]])

In [173]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [175]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X_train,y_train)
melb_preds = forest_model.predict(X_test)
print(mean_absolute_error(y_test, melb_preds))

518.8145182724253


There are several models that could have been chosen for this task like neural network based approaches(since it is time series data, so We could also have gone for LSTM, GRU), support vector machine. But time is short and I feel random forest is one of the best choice to make. It is one of the most powerful model, it is less prone to overfitting, we need not spend a lot of time for data processing and the results we get are great. Hence I chose this model. 


How long did you take to complete the test?

It took me around 1.5 hour to do it.


Was it easy or challenging? Which parts of the test were easy and which were challenging?

The test would have been very easy if I had one more hour. It took time to understand the data, decide as to what should be done. I had to find a solution which is quick. Now, if I had gone for other techniques like neural networks, I would have to spend some time to finalize the number of hidden layers, the network itself, should I use variants of RNN as this is a time series data. Even for svm I had to decide which kernel to use. But random forest is quite robust model and perhabs the best model to go for when the time is short.

In [None]:
What resources did you use to learn how to solve the test (i.e. Google, forums, books)?

I used stack overflow a lot, I used it for data processing, I used sklearn documentation for the model.

Briefly describe the process that you went through to find the solution for the problem.
The problem was to predict the total of load & discharge per cargo type. Its a regression problems, hence I summed all the variables like discharge and load to get the total. I dropped some of the columns which I felft were not that relevant like date. I changed some of the variables to categorical data. I divided the data as follows. 67 percent for training and 33 percent for testing used random forest model to make the prediction.

Are there any other key experiences/notes that you would like us to know in regards to your
experience in taking the test?

I have never been subjected to timed tests like these. So, it was a new experience for me. I had to always think in a way such that the job gets done on time. So, the quality of work is not good. I would definitely like to try out different ways to solve the problem. I added the discharges and load and made it a parameter, this is what I could think of at this time, I am not sure if my approach is right. I would love to know whether my solution is right or wrong and what could have been done for this problem.
