__Machine Learning applied to Shipbuilding Market Analysis__

*Technical University of Denmark (DTU) - s182244 - MIT License*

__Imports__

In [1]:
from utilities import *
from framework import Baseline, Selector, Model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
# Load the dataset used by every cell below; ts.columns is later passed to inverse_transform.
# NOTE(review): the file name suggests the series are standardised — confirm in utilities.load_data.
ts = load_data('data/ts_std.csv')

Loaded: 852 samples x 72 features


In [2]:
# Estimator classes (instantiated later with default arguments) paired with the
# short labels used as result-column names and in figure file names.
names, estimators = map(list, zip(*[
    ('Baseline', Baseline),
    ('LR', LinearRegression),
    ('RF', RandomForestRegressor),
    ('GB', GradientBoostingRegressor),
    ('GP', GaussianProcessRegressor),
    ('NN', MLPRegressor),
]))

# <font color=orange> 1. </font> Short-Term Forecasts

In [3]:
# Short-term setup: monthly train/test periods and the Selector parameter sets
# reused by the cells of this section.
idX_train = pd.period_range('2015-01', '2018-12', freq='M')
idX_test = pd.period_range('2019-01', '2020-12', freq='M')
X_params = dict(s=3, w=12)
y_params = dict(s=3)
B_params = dict(w=3)

# Ground truth over the test months: target indicator selected with the target
# parameters, flattened, then passed through inverse_transform.
selector = Selector(indIDs=['999999'])
selector.set_params(**y_params)
y_true = inverse_transform(selector.fit_transform(idX_test).reshape(-1), '999999', ts.columns)

## <font color=orange> 1.1 </font> Autoregressive Models

In [4]:
model_type = 'short-ar'

# Data: autoregressive setup, so inputs, target and baseline inputs all select indicator '999999';
# they differ only in the parameter set applied to each selector.
X_selector = Selector(indIDs=['999999'])
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,1.018065,0.800465,0.692808,0.743109,0.791396,0.6847
1,RAE,1.189387,0.935169,0.809395,0.868161,0.924574,0.799924
2,RMSE,1.178567,1.021305,0.873455,0.972818,0.97365,0.809379
3,R2,0.0,0.078412,0.325927,0.16384,0.162409,0.421198


## <font color=orange> 1.2 </font> Endogenous Demand Models

In [5]:
model_type = 'short-endog'

# Data: inputs are every indicator whose Type is 'Demand' (IDs converted to str);
# target and baseline inputs are indicator '999999'.
X_mask = (indicators.Type == 'Demand')
X_selector = Selector(indIDs=list(indicators[X_mask].ID.apply(str)))
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,1.018065,2.562144,3.697727,4.376711,1.976622,1.02911
1,RAE,1.189387,2.993308,4.31999,5.113236,2.309253,1.202291
2,RMSE,1.178567,3.100545,3.882823,4.583127,2.236951,1.293466
3,R2,0.0,0.0,0.0,0.0,0.0,0.0


## <font color=orange> 1.3 </font> Exogenous Models

In [6]:
model_type = 'short-exog'

# Data: inputs are every indicator of Type Economy, Demand, Prices or Supply (IDs converted
# to str); target and baseline inputs are indicator '999999'.
X_mask = indicators.Type.isin(['Economy', 'Demand', 'Prices', 'Supply'])
X_selector = Selector(indIDs=list(indicators[X_mask].ID.apply(str)))
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,1.018065,1.529236,0.731723,0.70903,1.978614,1.075537
1,RAE,1.189387,1.78658,0.854859,0.828348,2.311581,1.256531
2,RMSE,1.178567,1.867333,0.913738,0.89814,2.239822,1.368596
3,R2,0.0,0.0,0.262317,0.287288,0.0,0.0


## <font color=orange> 1.X </font> Autoregressive MSP Models

In [7]:
model_type = 'short-msp-ar'

# Data: autoregressive setup, so inputs, target and baseline inputs all select indicator '999999';
# they differ only in the parameter set applied to each selector.
X_selector = Selector(indIDs=['999999'])
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    # This cell calls predict_steps where the other short-term cells call predict.
    # NOTE(review): presumably a step-by-step (multi-step) forecast — confirm in framework.Model.
    y_pred = model.predict_steps(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,0.914028,0.920497,0.916364,2.034364,1.118804,1.964589
1,RAE,1.067843,1.075401,1.070572,2.376712,1.307079,2.295195
2,RMSE,1.10305,1.113941,1.123408,2.296493,1.280145,2.476791
3,R2,0.0,0.0,0.0,0.0,0.0,0.0


# <font color=orange> 2. </font> Medium-Term Forecasts

In [8]:
# Medium-term setup: monthly train/test periods and the Selector parameter sets
# reused by the cells of this section.
idX_train = pd.period_range('2010-01', '2015-12', freq='M')
idX_test = pd.period_range('2016-01', '2020-12', freq='M')
X_params = dict(s=9, w=24)
y_params = dict(s=9)
B_params = dict(w=9)

# Ground truth over the test months: target indicator selected with the target
# parameters, flattened, then passed through inverse_transform.
selector = Selector(indIDs=['999999'])
selector.set_params(**y_params)
y_true = inverse_transform(selector.fit_transform(idX_test).reshape(-1), '999999', ts.columns)

## <font color=orange> 2.1 </font> Autoregressive Models

In [9]:
model_type = 'medium-ar'

# Data: autoregressive setup, so inputs, target and baseline inputs all select indicator '999999';
# they differ only in the parameter set applied to each selector.
X_selector = Selector(indIDs=['999999'])
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,1.491035,0.484111,1.105854,1.050972,1.373822,3.926182
1,RAE,1.131507,0.367379,0.839204,0.797555,1.042557,2.979476
2,RMSE,2.026267,1.242421,1.464294,1.448489,1.721613,6.497874
3,R2,0.0,0.566578,0.397953,0.41088,0.167768,0.0


## <font color=orange> 2.2 </font> Endogenous Demand Models

In [10]:
model_type = 'medium-endog'

# Data: inputs are every indicator whose Type is 'Demand' (IDs converted to str);
# target and baseline inputs are indicator '999999'.
X_mask = (indicators.Type == 'Demand')
X_selector = Selector(indIDs=list(indicators[X_mask].ID.apply(str)))
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,1.491035,,2.735495,2.469975,2.027848,8.997532
1,RAE,1.131507,,2.075895,1.874399,1.53888,6.82799
2,RMSE,2.026267,,2.959076,2.771232,2.38708,10.019207
3,R2,0.0,0.0,0.0,0.0,0.0,0.0


## <font color=orange> 2.3 </font> Exogenous Models

In [11]:
model_type = 'medium-exog'

# Data: inputs are every indicator of Type Economy, Demand, Prices or Supply (IDs converted
# to str); target and baseline inputs are indicator '999999'.
X_mask = indicators.Type.isin(['Economy', 'Demand', 'Prices', 'Supply'])
X_selector = Selector(indIDs=list(indicators[X_mask].ID.apply(str)))
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,1.491035,25.93284,3.533244,3.549585,2.106751,8.131966
1,RAE,1.131507,19.679748,2.681285,2.693687,1.598758,6.171135
2,RMSE,2.026267,27.276026,3.993682,4.231394,2.625312,9.562556
3,R2,0.0,0.0,0.0,0.0,0.0,0.0


## <font color=orange> 2.X </font> Autoregressive MSP Models

In [12]:
model_type = 'medium-msp-ar'

# Data: autoregressive setup, so inputs, target and baseline inputs all select indicator '999999';
# they differ only in the parameter set applied to each selector.
X_selector = Selector(indIDs=['999999'])
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    # This cell calls predict_steps where the other medium-term cells call predict.
    # NOTE(review): presumably a step-by-step (multi-step) forecast — confirm in framework.Model.
    y_pred = model.predict_steps(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,18.002924,,9.572283,7.614073,4.358428,
1,RAE,13.661944,,7.264153,5.778119,3.307496,
2,RMSE,18.181643,,9.713827,7.901389,5.449192,
3,R2,0.0,0.0,0.0,0.0,0.0,0.0


# <font color=orange> 3. </font> Long-Term Forecasts

In [13]:
# Long-term setup: monthly train/test periods and the Selector parameter sets
# reused by the cells of this section.
idX_train = pd.period_range('2001-01', '2009-12', freq='M')
idX_test = pd.period_range('2010-01', '2020-12', freq='M')
X_params = dict(s=18, w=36)
y_params = dict(s=18)
B_params = dict(w=18)

# Ground truth over the test months: target indicator selected with the target
# parameters, flattened, then passed through inverse_transform.
selector = Selector(indIDs=['999999'])
selector.set_params(**y_params)
y_true = inverse_transform(selector.fit_transform(idX_test).reshape(-1), '999999', ts.columns)

## <font color=orange> 3.1 </font> Autoregressive Models

In [14]:
model_type = 'long-ar'

# Data: autoregressive setup, so inputs, target and baseline inputs all select indicator '999999';
# they differ only in the parameter set applied to each selector.
X_selector = Selector(indIDs=['999999'])
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,2.218025,0.242575,0.62238,0.631214,2.820071,1.784167
1,RAE,0.934139,0.102162,0.262121,0.265841,1.187696,0.751416
2,RMSE,3.159036,0.392429,0.866375,0.884786,3.688186,2.304881
3,R2,0.0,0.979663,0.900876,0.896618,0.0,0.298439


## <font color=orange> 3.2 </font> Endogenous Demand Models

In [15]:
model_type = 'long-endog'

# Data: inputs are every indicator whose Type is 'Demand' (IDs converted to str);
# target and baseline inputs are indicator '999999'.
X_mask = (indicators.Type == 'Demand')
X_selector = Selector(indIDs=list(indicators[X_mask].ID.apply(str)))
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,2.218025,,0.545923,0.509423,3.719232,3.280614
1,RAE,0.934139,,0.22992,0.214548,1.566385,1.381658
2,RMSE,3.159036,,0.734745,0.674147,4.600766,3.911006
3,R2,0.0,0.0,0.928708,0.939983,0.0,0.0


## <font color=orange> 3.3 </font> Exogenous Models

In [16]:
model_type = 'long-exog'

# Data: inputs are every indicator of Type Economy, Demand, Prices or Supply (IDs converted
# to str); target and baseline inputs are indicator '999999'.
X_mask = indicators.Type.isin(['Economy', 'Demand', 'Prices', 'Supply'])
X_selector = Selector(indIDs=list(indicators[X_mask].ID.apply(str)))
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    y_pred = model.predict(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,2.218025,1.406168,1.94504,2.145994,3.721731,4.599282
1,RAE,0.934139,0.592219,0.819169,0.903803,1.567438,1.937025
2,RMSE,3.159036,1.84146,2.791823,3.074988,4.60116,5.500177
3,R2,0.0,0.55219,0.0,0.0,0.0,0.0


## <font color=orange> 3.X </font> Autoregressive MSP Models

In [17]:
model_type = 'long-msp-ar'

# Data: autoregressive setup, so inputs, target and baseline inputs all select indicator '999999';
# they differ only in the parameter set applied to each selector.
X_selector = Selector(indIDs=['999999'])
y_selector = Selector(indIDs=['999999'])
B_selector = Selector(indIDs=['999999'])
X_selector.set_params(**X_params)
y_selector.set_params(**y_params)
B_selector.set_params(**B_params)

# Predictions: one column per estimator next to the ground truth.
# NOTE(review): estimators are built with default arguments, so the stochastic ones
# (RF, GB, NN) are unseeded and the scores can differ between runs.
predictions = pd.DataFrame(y_true, index=idX_test, columns=['y_true'])
for estimator, name in zip(estimators, names):
    # Baseline needs only lagged targets as inputs. Picking the selector first avoids
    # building a Model (and an estimator instance) that is immediately thrown away.
    in_selector = B_selector if name == 'Baseline' else X_selector
    model = Model(in_selector, y_selector, estimator())
    model.fit(idX_train)
    # This cell calls predict_steps where the other long-term cells call predict.
    # NOTE(review): presumably a step-by-step (multi-step) forecast — confirm in framework.Model.
    y_pred = model.predict_steps(idX_test)
    y_pred = inverse_transform(y_pred.reshape(-1), '999999', ts.columns)
    predictions[name] = y_pred
    # One PDF per estimator, named '<model_type>-<name>.pdf' under path_fig + 'forecasting/'.
    plot_pred(y_pred, idX_test, 999999, y_true, idX_test, show=False, save=path_fig+'forecasting/'+model_type+'-'+name+'.pdf')

get_scores(predictions, names, model_type)

Unnamed: 0,Measures,Baseline,LR,RF,GB,GP,NN
0,MAE,3.180599,19.316552,4.326637,2.526205,5.359041,
1,RAE,1.339535,8.135324,1.822199,1.063932,2.257004,
2,RMSE,4.096678,32.6133,5.370241,3.501291,6.046781,
3,R2,0.0,0.0,0.0,0.0,0.0,0.0
