# Predicting Freight for the Average Case

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
data = pd.read_csv("C:/Users/msteinme/Documents/BDIdollarpredictfreightavg.csv")
freightdollar = data[['Date','BDI','Freight','Dollar_All']][0:1006]
freightdollar.tail()


Unnamed: 0,Date,BDI,Freight,Dollar_All
1001,3/13/2015,562,5.7,100.33
1002,3/20/2015,591,5.7,97.909
1003,3/27/2015,596,5.7,97.291
1004,4/3/2015,588,5.65,96.545
1005,4/10/2015,580,6.85,99.338


In [71]:
# Pairwise correlations between the three numeric series. The columns are
# named explicitly because 'Date' is still a string column at this point and
# newer pandas releases no longer drop non-numeric columns from corr() silently.
freightdollar[['BDI','Freight','Dollar_All']].corr()

Unnamed: 0,BDI,Freight,Dollar_All
BDI,1.0,0.95391,-0.422487
Freight,0.95391,1.0,-0.551958
Dollar_All,-0.422487,-0.551958,1.0


In [72]:
from datetime import datetime
# Convert the 'Date' strings to datetime64. assign() returns a new frame
# instead of writing a column into the slice taken from `data`, which avoids
# pandas' SettingWithCopyWarning here and in the later cells that add columns.
freightdollar = freightdollar.assign(Date=pd.to_datetime(freightdollar['Date']))
date = freightdollar['Date']
# Confirm that 'Date' is now datetime64[ns].
freightdollar.dtypes

Date          datetime64[ns]
BDI                    int64
Freight              float64
Dollar_All           float64
dtype: object

In [8]:
# I will fit both an MLR and a random forest (RDF) model so that there are at least two freight predictions, in case I want to average them.
# The MLR model coefficients are already available from the "BDI to Coal and Freight" notebook (ipynb).

In [77]:
from sklearn.ensemble import RandomForestRegressor
# train_test_split lives in sklearn.model_selection in newer scikit-learn
# releases; fall back to the legacy module so old installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Features are [BDI, Dollar_All] (in that column order); the target is Freight.
X=freightdollar[['BDI','Dollar_All']].values
y=freightdollar['Freight'].values
# Seeded random 70/30 split.
# NOTE(review): the rows are a dated series, so a random split puts
# neighbouring dates on both sides; the test score may be optimistic.
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.3,random_state=1)
# Only the non-default settings are passed. The original also spelled out
# criterion='mse' and max_features='auto', which newer scikit-learn releases
# reject; the library defaults give the same squared-error / all-features
# behaviour on old and new versions. random_state is fixed (it was None) so
# that the metrics below and every later prediction are reproducible.
forest=RandomForestRegressor(n_estimators=125, max_depth=6, random_state=1)
forest.fit(X_train, y_train)
y_train_pred= forest.predict(X_train)
y_test_pred= forest.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test,y_test_pred)))
# Importances are reported in feature order: [BDI, Dollar_All].
print(forest.feature_importances_)

MSE train: 1.758, test: 5.894
R^2 train: 0.983, test: 0.952
[ 0.93619907  0.06380093]


In [78]:
#adding to table to see predictions of the rdf and mlr model for avg case
# Random-forest predictions for every historical row (rows used for training
# and rows held out for testing alike).
freight_predicted = pd.DataFrame({'Predicted_Freight_RDF': forest.predict(X)})
# Attach by position (.values): X was built from freightdollar row by row, so
# the result must not depend on the two frames sharing the same index labels.
freightdollar['Predicted_Freight_RDF'] = freight_predicted['Predicted_Freight_RDF'].values
# Absolute percentage error of the forest against the observed freight.
rdf_error = (abs(freightdollar['Predicted_Freight_RDF'] - freightdollar['Freight'])/freightdollar['Freight'])*100
freightdollar['RDF_Error'] = rdf_error
# Linear model with hard-coded coefficients (intercept, BDI, Dollar_All); per
# the note earlier in this notebook they were obtained in a separate notebook.
mlr_predicted = 17.0825 + (freightdollar['BDI']*0.0044) - (freightdollar['Dollar_All']*0.1589)
freightdollar['Predicted_Freight_MLR'] = mlr_predicted
# Same absolute percentage error for the linear model.
mlr_error = (abs(freightdollar['Predicted_Freight_MLR'] - freightdollar['Freight'])/freightdollar['Freight'])*100
freightdollar['MLR_Error'] = mlr_error

print("Average RDF error is: %.3f" % (rdf_error.mean())) 
print("Average MLR error is: %.3f" % (mlr_error.mean()))
# Show a preview rather than the whole 1006-row frame.
freightdollar.head(10)

Average RDF error is: 8.955
Average MLR error is: 18.823


Unnamed: 0,Date,BDI,Freight,Dollar_All,Predicted_Freight_RDF,RDF_Error,Predicted_Freight_MLR,MLR_Error
0,1996-01-05,1583,4.15,85.060,6.760079,62.893474,10.531666,153.775084
1,1996-01-12,1550,4.55,85.060,6.819291,49.874528,10.386466,128.273978
2,1996-01-19,1538,4.55,86.750,7.325654,61.003392,10.065125,121.211538
3,1996-01-26,1529,4.55,87.640,7.073984,55.472181,9.884104,117.233055
4,1996-02-02,1484,4.60,87.010,7.090538,54.142132,9.786211,112.743717
5,1996-02-09,1453,4.60,86.870,7.227075,57.110322,9.672057,110.262109
6,1996-02-16,1437,4.15,85.910,7.370948,77.613215,9.754201,135.040988
7,1996-02-23,1400,4.25,85.430,6.742460,58.646120,9.667673,127.474659
8,1996-03-01,1357,4.35,86.490,7.127693,63.855018,9.310039,114.023885
9,1996-03-08,1350,5.50,86.810,7.154582,30.083312,9.228391,67.788927


In [79]:
# Everything from row 1006 onward is the part of the file to be predicted.
# newpred2 keeps the date for reporting; newpred holds only the two model inputs.
newpred2 = data[['Date', 'BDI', 'Dollar_All']].iloc[1006:]
newpred = newpred2[['BDI', 'Dollar_All']]
newpred2.tail()

Unnamed: 0,Date,BDI,Dollar_All
2793,12/27/2020,2065,86.909863
2794,12/28/2020,2067,86.907397
2795,12/29/2020,2069,86.904932
2796,12/30/2020,2071,86.902466
2797,12/31/2020,2073,86.9


In [80]:
#new predictions
# Renumber the forecast rows from 0. The original did this by round-tripping
# through np.array, which turned every column (and the MLR prediction derived
# from them) into object dtype; reset_index keeps the numeric dtypes intact.
newpred2 = newpred2.reset_index(drop=True)
newpred2['Date'] = pd.to_datetime(newpred2['Date'])
# Linear model with the same hard-coded coefficients used for the history rows.
mlr_predictednew = 17.0825 + (newpred2['BDI']*0.0044) - (newpred2['Dollar_All']*0.1589)
newpred2['Predicted_Freight_MLR'] = mlr_predictednew
# The forest was fitted on a plain array, so hand it a plain array here too.
newpred_predicted = forest.predict(newpred.values)
df_new_pred = pd.DataFrame(newpred_predicted)
# Attach by position: newpred and newpred2 cover the same rows in the same order.
newpred2['Predicted_Freight_RDF'] = newpred_predicted
# Show a preview rather than the whole forecast frame.
newpred2.head(10)

Unnamed: 0,Date,BDI,Dollar_All,Predicted_Freight_MLR,Predicted_Freight_RDF
0,2015-04-17,597,97.52,4.21337,5.305703
1,2015-04-24,600,96.922,4.32159,5.305773
2,2015-05-01,587,95.297,4.52261,5.447493
3,2015-05-08,574,94.794,4.54533,5.469075
4,2015-05-15,634,93.135,5.07295,5.520270
5,2015-05-22,586,96.014,4.40428,5.435476
6,2015-05-29,589,96.907,4.27558,5.305773
7,2015-06-05,610,96.306,4.46348,5.290474
8,2015-06-12,642,94.972,4.81625,5.437867
9,2015-06-19,779,94.085,5.55999,5.325000


In [81]:
# Stack the historical rows on top of the forecast rows. Columns that exist
# only in the history frame (Freight, RDF_Error, MLR_Error) are left empty
# for the forecast rows.
frames = [freightdollar,newpred2]
# Both frames are numbered from 0, so without ignore_index the combined frame
# (and the CSV written from it) would carry duplicate index labels.
combined = pd.concat(frames, ignore_index=True)
combined.head(10)

Unnamed: 0,BDI,Date,Dollar_All,Freight,MLR_Error,Predicted_Freight_MLR,Predicted_Freight_RDF,RDF_Error
0,1583,1996-01-05,85.06,4.15,153.775084,10.5317,6.760079,62.893474
1,1550,1996-01-12,85.06,4.55,128.273978,10.3865,6.819291,49.874528
2,1538,1996-01-19,86.75,4.55,121.211538,10.0651,7.325654,61.003392
3,1529,1996-01-26,87.64,4.55,117.233055,9.8841,7.073984,55.472181
4,1484,1996-02-02,87.01,4.60,112.743717,9.78621,7.090538,54.142132
5,1453,1996-02-09,86.87,4.60,110.262109,9.67206,7.227075,57.110322
6,1437,1996-02-16,85.91,4.15,135.040988,9.7542,7.370948,77.613215
7,1400,1996-02-23,85.43,4.25,127.474659,9.66767,6.742460,58.646120
8,1357,1996-03-01,86.49,4.35,114.023885,9.31004,7.127693,63.855018
9,1350,1996-03-08,86.81,5.50,67.788927,9.22839,7.154582,30.083312


In [82]:
# Write the combined history + forecast table for the average case to disk.
combined.to_csv(path_or_buf="C:/Users/msteinme/Documents/freightdollavg.csv")

# Predicting Freight for the Best Case

In [52]:
# Best-case scenario file: same layout as the other scenario files, with the
# historical rows first and the rows to be predicted after them.
data = pd.read_csv("C:/Users/msteinme/Documents/BDIdollarpredictfreightbest.csv")
# Rows 0-1005 are used as the history with an observed 'Freight' value.
# .copy() makes this an independent frame, so later column assignments do not
# write into a slice of `data` and do not trigger SettingWithCopyWarning.
freightdollar = data[['Date','BDI','Freight','Dollar_All']].iloc[0:1006].copy()
freightdollar.dtypes

Date           object
BDI             int64
Freight       float64
Dollar_All    float64
dtype: object

In [53]:
from datetime import datetime
# Convert the 'Date' strings to datetime64. assign() returns a new frame
# instead of writing a column into the slice taken from `data`, which avoids
# pandas' SettingWithCopyWarning here and in the later cells that add columns.
freightdollar = freightdollar.assign(Date=pd.to_datetime(freightdollar['Date']))
date = freightdollar['Date']
# Confirm that 'Date' is now datetime64[ns].
freightdollar.dtypes

Date          datetime64[ns]
BDI                    int64
Freight              float64
Dollar_All           float64
dtype: object

In [56]:
from sklearn.ensemble import RandomForestRegressor
# train_test_split lives in sklearn.model_selection in newer scikit-learn
# releases; fall back to the legacy module so old installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Features are [BDI, Dollar_All] (in that column order); the target is Freight.
X=freightdollar[['BDI','Dollar_All']].values
y=freightdollar['Freight'].values
# Seeded random 70/30 split.
# NOTE(review): the rows are a dated series, so a random split puts
# neighbouring dates on both sides; the test score may be optimistic.
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.3,random_state=1)
# Only the non-default settings are passed. The original also spelled out
# criterion='mse' and max_features='auto', which newer scikit-learn releases
# reject; the library defaults give the same squared-error / all-features
# behaviour on old and new versions. random_state is fixed (it was None) so
# that the metrics below and every later prediction are reproducible.
forest=RandomForestRegressor(n_estimators=125, max_depth=6, random_state=1)
forest.fit(X_train, y_train)
y_train_pred= forest.predict(X_train)
y_test_pred= forest.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test,y_test_pred)))
# Importances are reported in feature order: [BDI, Dollar_All].
print(forest.feature_importances_)

MSE train: 1.783, test: 5.854
R^2 train: 0.983, test: 0.952
[ 0.93423661  0.06576339]


In [57]:
#adding to table to see predictions of the rdf and mlr model for best case
# Random-forest predictions for every historical row (rows used for training
# and rows held out for testing alike).
freight_predicted = pd.DataFrame({'Predicted_Freight_RDF': forest.predict(X)})
# Attach by position (.values): X was built from freightdollar row by row, so
# the result must not depend on the two frames sharing the same index labels.
freightdollar['Predicted_Freight_RDF'] = freight_predicted['Predicted_Freight_RDF'].values
# Absolute percentage error of the forest against the observed freight.
rdf_error = (abs(freightdollar['Predicted_Freight_RDF'] - freightdollar['Freight'])/freightdollar['Freight'])*100
freightdollar['RDF_Error'] = rdf_error
# Linear model with hard-coded coefficients (intercept, BDI, Dollar_All); per
# the note in the average-case section they were obtained in a separate notebook.
mlr_predicted = 17.0825 + (freightdollar['BDI']*0.0044) - (freightdollar['Dollar_All']*0.1589)
freightdollar['Predicted_Freight_MLR'] = mlr_predicted
# Same absolute percentage error for the linear model.
mlr_error = (abs(freightdollar['Predicted_Freight_MLR'] - freightdollar['Freight'])/freightdollar['Freight'])*100
freightdollar['MLR_Error'] = mlr_error

print("Average RDF error is: %.3f" % (rdf_error.mean())) 
print("Average MLR error is: %.3f" % (mlr_error.mean()))
# Show a preview rather than the whole 1006-row frame.
freightdollar.head(10)

Average RDF error is: 8.896
Average MLR error is: 18.823


Unnamed: 0,Date,BDI,Freight,Dollar_All,Predicted_Freight_RDF,RDF_Error,Predicted_Freight_MLR,MLR_Error
0,1996-01-05,1583,4.15,85.060,6.766295,63.043254,10.531666,153.775084
1,1996-01-12,1550,4.55,85.060,6.670972,46.614776,10.386466,128.273978
2,1996-01-19,1538,4.55,86.750,7.138009,56.879323,10.065125,121.211538
3,1996-01-26,1529,4.55,87.640,7.003704,53.927567,9.884104,117.233055
4,1996-02-02,1484,4.60,87.010,6.966547,51.446667,9.786211,112.743717
5,1996-02-09,1453,4.60,86.870,7.065555,53.599031,9.672057,110.262109
6,1996-02-16,1437,4.15,85.910,7.278426,75.383750,9.754201,135.040988
7,1996-02-23,1400,4.25,85.430,6.730514,58.365027,9.667673,127.474659
8,1996-03-01,1357,4.35,86.490,6.953851,59.858633,9.310039,114.023885
9,1996-03-08,1350,5.50,86.810,6.928959,25.981066,9.228391,67.788927


In [58]:
# Everything from row 1006 onward is the part of the file to be predicted.
# newpred2 keeps the date for reporting; newpred holds only the two model inputs.
newpred2 = data[['Date', 'BDI', 'Dollar_All']].iloc[1006:]
newpred = newpred2[['BDI', 'Dollar_All']]
newpred2.tail()

Unnamed: 0,Date,BDI,Dollar_All
2793,12/27/2020,3880,85.020822
2794,12/28/2020,3884,85.015616
2795,12/29/2020,3888,85.010411
2796,12/30/2020,3892,85.005205
2797,12/31/2020,3897,85.0


In [59]:
#new predictions
# Renumber the forecast rows from 0. The original did this by round-tripping
# through np.array, which turned every column (and the MLR prediction derived
# from them) into object dtype; reset_index keeps the numeric dtypes intact.
newpred2 = newpred2.reset_index(drop=True)
newpred2['Date'] = pd.to_datetime(newpred2['Date'])
# Linear model with the same hard-coded coefficients used for the history rows.
mlr_predictednew = 17.0825 + (newpred2['BDI']*0.0044) - (newpred2['Dollar_All']*0.1589)
newpred2['Predicted_Freight_MLR'] = mlr_predictednew
# The forest was fitted on a plain array, so hand it a plain array here too.
newpred_predicted = forest.predict(newpred.values)
df_new_pred = pd.DataFrame(newpred_predicted)
# Attach by position: newpred and newpred2 cover the same rows in the same order.
newpred2['Predicted_Freight_RDF'] = newpred_predicted
# Show a preview rather than the whole forecast frame.
newpred2.head(10)

Unnamed: 0,Date,BDI,Dollar_All,Predicted_Freight_MLR,Predicted_Freight_RDF
0,2015-04-17,597,97.52,4.21337,5.275017
1,2015-04-24,600,96.922,4.32159,5.286920
2,2015-05-01,587,95.297,4.52261,5.500874
3,2015-05-08,574,94.794,4.54533,5.489065
4,2015-05-15,634,93.135,5.07295,5.556831
5,2015-05-22,586,96.014,4.40428,5.456347
6,2015-05-29,589,96.907,4.27558,5.286920
7,2015-06-05,610,96.306,4.46348,5.286920
8,2015-06-12,642,94.972,4.81625,5.489065
9,2015-06-19,779,94.085,5.55999,5.328136


In [60]:
# Stack the historical rows on top of the forecast rows. Columns that exist
# only in the history frame (Freight, RDF_Error, MLR_Error) are left empty
# for the forecast rows.
frames = [freightdollar,newpred2]
# Both frames are numbered from 0, so without ignore_index the combined frame
# (and the CSV written from it) would carry duplicate index labels.
combined = pd.concat(frames, ignore_index=True)
combined.head(10)

Unnamed: 0,BDI,Date,Dollar_All,Freight,MLR_Error,Predicted_Freight_MLR,Predicted_Freight_RDF,RDF_Error
0,1583,1996-01-05,85.06,4.15,153.775084,10.5317,6.766295,63.043254
1,1550,1996-01-12,85.06,4.55,128.273978,10.3865,6.670972,46.614776
2,1538,1996-01-19,86.75,4.55,121.211538,10.0651,7.138009,56.879323
3,1529,1996-01-26,87.64,4.55,117.233055,9.8841,7.003704,53.927567
4,1484,1996-02-02,87.01,4.60,112.743717,9.78621,6.966547,51.446667
5,1453,1996-02-09,86.87,4.60,110.262109,9.67206,7.065555,53.599031
6,1437,1996-02-16,85.91,4.15,135.040988,9.7542,7.278426,75.383750
7,1400,1996-02-23,85.43,4.25,127.474659,9.66767,6.730514,58.365027
8,1357,1996-03-01,86.49,4.35,114.023885,9.31004,6.953851,59.858633
9,1350,1996-03-08,86.81,5.50,67.788927,9.22839,6.928959,25.981066


In [61]:
# Write the combined history + forecast table for the best case to disk.
combined.to_csv(path_or_buf="C:/Users/msteinme/Documents/freightdollbest.csv")

# Predicting Freight for the Worst Case

In [62]:
# Worst-case scenario file: same layout as the other scenario files, with the
# historical rows first and the rows to be predicted after them.
data = pd.read_csv("C:/Users/msteinme/Documents/BDIdollarpredictfreightworst.csv")
# Rows 0-1005 are used as the history with an observed 'Freight' value.
# .copy() makes this an independent frame, so later column assignments do not
# write into a slice of `data` and do not trigger SettingWithCopyWarning.
freightdollar = data[['Date','BDI','Freight','Dollar_All']].iloc[0:1006].copy()
freightdollar.dtypes

Date           object
BDI             int64
Freight       float64
Dollar_All    float64
dtype: object

In [63]:
from datetime import datetime
# Convert the 'Date' strings to datetime64. assign() returns a new frame
# instead of writing a column into the slice taken from `data`, which avoids
# pandas' SettingWithCopyWarning here and in the later cells that add columns.
freightdollar = freightdollar.assign(Date=pd.to_datetime(freightdollar['Date']))
date = freightdollar['Date']
# Confirm that 'Date' is now datetime64[ns].
freightdollar.dtypes

Date          datetime64[ns]
BDI                    int64
Freight              float64
Dollar_All           float64
dtype: object

In [64]:
from sklearn.ensemble import RandomForestRegressor
# train_test_split lives in sklearn.model_selection in newer scikit-learn
# releases; fall back to the legacy module so old installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Features are [BDI, Dollar_All] (in that column order); the target is Freight.
X=freightdollar[['BDI','Dollar_All']].values
y=freightdollar['Freight'].values
# Seeded random 70/30 split.
# NOTE(review): the rows are a dated series, so a random split puts
# neighbouring dates on both sides; the test score may be optimistic.
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.3,random_state=1)
# Only the non-default settings are passed. The original also spelled out
# criterion='mse' and max_features='auto', which newer scikit-learn releases
# reject; the library defaults give the same squared-error / all-features
# behaviour on old and new versions. random_state is fixed (it was None) so
# that the metrics below and every later prediction are reproducible.
forest=RandomForestRegressor(n_estimators=125, max_depth=6, random_state=1)
forest.fit(X_train, y_train)
y_train_pred= forest.predict(X_train)
y_test_pred= forest.predict(X_test)
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test,y_test_pred)))
# Importances are reported in feature order: [BDI, Dollar_All].
print(forest.feature_importances_)

MSE train: 1.752, test: 5.887
R^2 train: 0.983, test: 0.952
[ 0.93720325  0.06279675]


In [65]:
#adding to table to see predictions of the rdf and mlr model for worst case
# Random-forest predictions for every historical row (rows used for training
# and rows held out for testing alike).
freight_predicted = pd.DataFrame({'Predicted_Freight_RDF': forest.predict(X)})
# Attach by position (.values): X was built from freightdollar row by row, so
# the result must not depend on the two frames sharing the same index labels.
freightdollar['Predicted_Freight_RDF'] = freight_predicted['Predicted_Freight_RDF'].values
# Absolute percentage error of the forest against the observed freight.
rdf_error = (abs(freightdollar['Predicted_Freight_RDF'] - freightdollar['Freight'])/freightdollar['Freight'])*100
freightdollar['RDF_Error'] = rdf_error
# Linear model with hard-coded coefficients (intercept, BDI, Dollar_All); per
# the note in the average-case section they were obtained in a separate notebook.
mlr_predicted = 17.0825 + (freightdollar['BDI']*0.0044) - (freightdollar['Dollar_All']*0.1589)
freightdollar['Predicted_Freight_MLR'] = mlr_predicted
# Same absolute percentage error for the linear model.
mlr_error = (abs(freightdollar['Predicted_Freight_MLR'] - freightdollar['Freight'])/freightdollar['Freight'])*100
freightdollar['MLR_Error'] = mlr_error

print("Average RDF error is: %.3f" % (rdf_error.mean())) 
print("Average MLR error is: %.3f" % (mlr_error.mean()))
# Show a preview rather than the whole 1006-row frame.
freightdollar.head(10)

Average RDF error is: 8.946
Average MLR error is: 18.823


Unnamed: 0,Date,BDI,Freight,Dollar_All,Predicted_Freight_RDF,RDF_Error,Predicted_Freight_MLR,MLR_Error
0,1996-01-05,1583,4.15,85.060,6.813171,64.172799,10.531666,153.775084
1,1996-01-12,1550,4.55,85.060,6.678646,46.783437,10.386466,128.273978
2,1996-01-19,1538,4.55,86.750,7.011143,54.091044,10.065125,121.211538
3,1996-01-26,1529,4.55,87.640,6.988997,53.604321,9.884104,117.233055
4,1996-02-02,1484,4.60,87.010,7.007034,52.326824,9.786211,112.743717
5,1996-02-09,1453,4.60,86.870,7.019827,52.604945,9.672057,110.262109
6,1996-02-16,1437,4.15,85.910,7.527445,81.384219,9.754201,135.040988
7,1996-02-23,1400,4.25,85.430,6.778334,59.490210,9.667673,127.474659
8,1996-03-01,1357,4.35,86.490,6.946137,59.681310,9.310039,114.023885
9,1996-03-08,1350,5.50,86.810,6.902248,25.495419,9.228391,67.788927


In [66]:
# Everything from row 1006 onward is the part of the file to be predicted.
# newpred2 keeps the date for reporting; newpred holds only the two model inputs.
newpred2 = data[['Date', 'BDI', 'Dollar_All']].iloc[1006:]
newpred = newpred2[['BDI', 'Dollar_All']]
newpred2.tail()

Unnamed: 0,Date,BDI,Dollar_All
2793,12/27/2020,250,90.035068
2794,12/28/2020,250,90.026301
2795,12/29/2020,250,90.017534
2796,12/30/2020,250,90.008767
2797,12/31/2020,250,90.0


In [67]:
#new predictions
# Renumber the forecast rows from 0. The original did this by round-tripping
# through np.array, which turned every column (and the MLR prediction derived
# from them) into object dtype; reset_index keeps the numeric dtypes intact.
newpred2 = newpred2.reset_index(drop=True)
newpred2['Date'] = pd.to_datetime(newpred2['Date'])
# Linear model with the same hard-coded coefficients used for the history rows.
mlr_predictednew = 17.0825 + (newpred2['BDI']*0.0044) - (newpred2['Dollar_All']*0.1589)
newpred2['Predicted_Freight_MLR'] = mlr_predictednew
# The forest was fitted on a plain array, so hand it a plain array here too.
newpred_predicted = forest.predict(newpred.values)
df_new_pred = pd.DataFrame(newpred_predicted)
# Attach by position: newpred and newpred2 cover the same rows in the same order.
newpred2['Predicted_Freight_RDF'] = newpred_predicted
# Show a preview rather than the whole forecast frame.
newpred2.head(10)

Unnamed: 0,Date,BDI,Dollar_All,Predicted_Freight_MLR,Predicted_Freight_RDF
0,2015-04-17,597,97.52,4.21337,5.330665
1,2015-04-24,600,96.922,4.32159,5.349534
2,2015-05-01,587,95.297,4.52261,5.440685
3,2015-05-08,574,94.794,4.54533,5.478048
4,2015-05-15,634,93.135,5.07295,5.548799
5,2015-05-22,586,96.014,4.40428,5.434590
6,2015-05-29,589,96.907,4.27558,5.349534
7,2015-06-05,610,96.306,4.46348,5.349534
8,2015-06-12,642,94.972,4.81625,5.440685
9,2015-06-19,779,94.085,5.55999,5.301136


In [68]:
# Stack the historical rows on top of the forecast rows. Columns that exist
# only in the history frame (Freight, RDF_Error, MLR_Error) are left empty
# for the forecast rows.
frames = [freightdollar,newpred2]
# Both frames are numbered from 0, so without ignore_index the combined frame
# (and the CSV written from it) would carry duplicate index labels.
combined = pd.concat(frames, ignore_index=True)
combined.head(10)

Unnamed: 0,BDI,Date,Dollar_All,Freight,MLR_Error,Predicted_Freight_MLR,Predicted_Freight_RDF,RDF_Error
0,1583,1996-01-05,85.06,4.15,153.775084,10.5317,6.813171,64.172799
1,1550,1996-01-12,85.06,4.55,128.273978,10.3865,6.678646,46.783437
2,1538,1996-01-19,86.75,4.55,121.211538,10.0651,7.011143,54.091044
3,1529,1996-01-26,87.64,4.55,117.233055,9.8841,6.988997,53.604321
4,1484,1996-02-02,87.01,4.60,112.743717,9.78621,7.007034,52.326824
5,1453,1996-02-09,86.87,4.60,110.262109,9.67206,7.019827,52.604945
6,1437,1996-02-16,85.91,4.15,135.040988,9.7542,7.527445,81.384219
7,1400,1996-02-23,85.43,4.25,127.474659,9.66767,6.778334,59.490210
8,1357,1996-03-01,86.49,4.35,114.023885,9.31004,6.946137,59.681310
9,1350,1996-03-08,86.81,5.50,67.788927,9.22839,6.902248,25.495419


In [69]:
# Write the combined history + forecast table for the worst case to disk.
combined.to_csv(path_or_buf="C:/Users/msteinme/Documents/freightdollworst.csv")