__Machine Learning applied to Shipbuilding Market Analysis__

*Technical University of Denmark (DTU) - s182244 - MIT License*

__Imports__

In [1]:
from utilities import *
from framework import Splitter, Selector, Baseline, Model, CrossValidation

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Time-series table indexed by monthly periods (later cells select rows with ts.loc[idX]).
# NOTE(review): the file name suggests the values are standardized — confirm.
ts = load_data('data/ts_std.csv')
# Indicator metadata; later cells read its `Type` and `ID` columns to choose model inputs.
# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it is
# provided by `from utilities import *` — confirm.
indicators = pd.read_csv('data/indicators.csv')

Loaded: 852 samples x 72 features


In [2]:
#Tag: remove
# Notebook tooling cell: imports the project's conversion helper and runs it for
# the notebook named 'evaluation'.
# NOTE(review): the tag on the first line presumably marks this cell for exclusion
# from the converted output — confirm against xx_utilities.
from xx_utilities import convert_nb
convert_nb(path='', name='evaluation')

__Framework Usage Examples__

In [3]:
# Show the first/last period of every train and test window produced by the Splitter.
idX = pd.period_range(start='2000-01', end='2020-12', freq='M')
tsX = ts.loc[idX].copy()
splitter = Splitter(n_splits=3, tt_split=0.85)
for train_split, test_split in splitter.split(idX):
    # The split arrays are positional, so map them back to period labels via tsX.
    train_periods = tsX.iloc[train_split].index
    test_periods = tsX.iloc[test_split].index
    train_start, train_end = str(train_periods.min()), str(train_periods.max())
    test_start, test_end = str(test_periods.min()), str(test_periods.max())
    print('Train split: %s -> %s, test split: %s -> %s'
          % (train_start, train_end, test_start, test_end))

Train split: 2000-02 -> 2005-12, test split: 2006-01 -> 2006-12
Train split: 2007-02 -> 2012-12, test split: 2013-01 -> 2013-12
Train split: 2014-02 -> 2019-12, test split: 2020-01 -> 2020-12


In [4]:
# Selector demo: turn the 2020 periods into an input matrix X and a target y.
idX = pd.period_range(start='2020-01', end='2020-12', freq='M')
X_selector = Selector(indIDs=['999999', '999998'])
y_selector = Selector(indIDs=['999999'])
X_sel_params = dict(
    s=1,  # Moving average smoothing window size
    d=0,  # Differentiation order
    w=2,  # Autoregressive window size
)
X_selector.set_params(**X_sel_params)
y_selector.set_params(w=0, s=0, d=0)  # Default parameters (no transformations)
X = X_selector.fit_transform(idX)
y = y_selector.fit_transform(idX)
print('X shape: %d x %d ' % X.shape)
print('y shape: %d x %d ' % y.shape)
print('\nLast values in y and X:')
# Pair up the last three rows of X and y and number them 0..2.
for i, (X_i, y_i) in enumerate(zip(X[-3:], y[-3:])):
    print('y_%d:' % i, y_i, 'X_%d:' % i, X_i)

X shape: 12 x 4 
y shape: 12 x 1 

Last values in y and X:
y_0: [-0.3979711] X_0: [ 0.80290981  0.32027148  0.37563704 -1.21588929]
y_1: [1.34039283] X_1: [-0.3979711  -1.18794807  0.80290981  0.32027148]
y_2: [0.15646264] X_2: [ 1.34039283 -0.35766817 -0.3979711  -1.18794807]


In [5]:
# Combine Splitter and Selectors: print the rows each split feeds to training and testing.
idX = pd.period_range(start='2020-01', end='2020-12', freq='M')
splitter = Splitter(n_splits=2, tt_split=0.6)
split = 0
for train_split, test_split in splitter.split(idX):
    print('Split %d' % split)
    train_idX = idX[train_split].copy()
    test_idX = idX[test_split].copy()
    # Selectors are fitted on the training periods and then applied to the test periods.
    X_train = X_selector.fit_transform(train_idX)
    X_test = X_selector.transform(test_idX)
    y_train = y_selector.fit_transform(train_idX)
    y_test = y_selector.transform(test_idX)
    for tag, periods, X_part, y_part in (('Train:', train_idX, X_train, y_train),
                                         ('Test:', test_idX, X_test, y_test)):
        for p, X_i, y_i in zip(periods, X_part, y_part):
            print(tag, str(p), y_i, X_i)
    print('')
    split += 1

Split 0
Train: 2020-02 [-0.15007671] [0.89087102 1.21685736 1.53882928 1.49196871]
Train: 2020-03 [1.10444427] [-0.15007671  0.03315504  0.89087102  1.21685736]
Train: 2020-04 [0.64832836] [ 1.10444427  0.9475209  -0.15007671  0.03315504]
Test: 2020-05 [0.04408773] [0.64832836 1.11364206 1.10444427 0.9475209 ]
Test: 2020-06 [0.13616214] [ 0.04408773 -1.04136549  0.64832836  1.11364206]

Split 1
Train: 2020-08 [0.37563704] [-0.30425231 -0.68592606  0.13616214  0.71813093]
Train: 2020-09 [0.80290981] [ 0.37563704 -1.21588929 -0.30425231 -0.68592606]
Train: 2020-10 [-0.3979711] [ 0.80290981  0.32027148  0.37563704 -1.21588929]
Test: 2020-11 [1.34039283] [-0.3979711  -1.18794807  0.80290981  0.32027148]
Test: 2020-12 [0.15646264] [ 1.34039283 -0.35766817 -0.3979711  -1.18794807]



In [6]:
# Score the Baseline estimator directly on selector output (reference for the Model wrapper).
idX = pd.period_range(start='2020-01', end='2020-12', freq='M')
X_selector.set_params(s=3, d=1, w=2)
y_selector.set_params(s=1, d=1, w=0)
X = X_selector.fit_transform(idX)
y = y_selector.fit_transform(idX)
estimator = Baseline()
estimator.fit(X, y)
# Last expression of the cell, so the score is displayed as the cell output.
estimator.score(X, y)

-0.045270351641879536

In [7]:
# Same Baseline experiment through the Model wrapper; selector parameters are
# addressed with the '<selector>__<param>' naming.
model = Model(X_selector, y_selector, estimator=Baseline())
model.set_params(X_selector__s=3, X_selector__d=1, X_selector__w=2)
model.set_params(y_selector__s=1, y_selector__d=1, y_selector__w=0)
model.fit(idX)
model.score(idX)

-0.045270351641879536

In [8]:
# Linear regression fitted directly on X, y (scikit-learn's fit returns the estimator itself).
estimator = LinearRegression().fit(X, y)
estimator.score(X, y)

0.3441595774229117

In [9]:
# Linear regression through the Model wrapper, fitted and scored on the period index.
model = Model(X_selector, y_selector, estimator=LinearRegression())
model.fit(idX)
model.score(idX)

0.3441595774229117

In [10]:
# Feedforward network fitted directly on X, y with default hyper-parameters.
# No random_state is passed, so the score changes between runs.
estimator = MLPRegressor().fit(X, y)
estimator.score(X, y)

0.43729120766991547

In [11]:
# Feedforward network through the Model wrapper (default hyper-parameters, unseeded).
model = Model(X_selector, y_selector, MLPRegressor())
model.fit(idX)
model.score(idX)

0.4691181564824761

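Note: In the run shown above, the MLPRegressor score differs slightly between the direct call (0.437) and the Model wrapper (0.469). The wrapper allows a few fewer gradient-descent iterations, and because no `random_state` is set, the network's random weight initialization also changes the score from run to run.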

__Demand Model Selection__

In [12]:
# Period index shared by all model-selection searches below.
idX = pd.period_range(start='2000-01', end='2020-12', freq='M')

# Full search space for the X selector parameters.
grid = dict(
    X_selector__s=list(range(37)),             # Moving average smoothing window size
    X_selector__d=list(range(37)),             # Differentiation order
    X_selector__w=[1] + list(range(6, 61)),    # Autoregressive window size
)

#test grid
# Deliberately overrides the full grid above with a small one for quick runs.
grid = dict(
    X_selector__s=[0],          # X Moving average smoothing window size
    X_selector__d=[0],          # X Differentiation order
    X_selector__w=[6, 12, 24],  # X Autoregressive window size #1 yields worst scores
)

_Estimator: Baseline_

In [14]:
# Cross-validated grid search for the Baseline estimator; results go to results/.
X_selector = Selector(indIDs=['999999'])  # The baseline takes only y as inputs
y_selector = Selector(indIDs=['999999'])
splitter = Splitter(n_splits=3, tt_split=0.85)

model_B = Model(X_selector, y_selector, estimator=Baseline())
search_B = CrossValidation(model_B, grid, splitter)
search_B.fit(idX)
search_B.save_results('BSearch', path='results/')

_Estimator: Linear Regression_

In [15]:
# Cross-validated grid search for linear regression.
# Inputs are all indicators whose Type is 'Economy' or 'Demand'; their IDs are passed as strings.
X_mask = indicators.Type.isin(['Economy', 'Demand'])
X_selector = Selector(indIDs=indicators.loc[X_mask, 'ID'].map(str).tolist())
y_selector = Selector(indIDs=['999999'])
splitter = Splitter(n_splits=3, tt_split=0.85)

model_LR = Model(X_selector, y_selector, estimator=LinearRegression())
search_LR = CrossValidation(model_LR, grid, splitter)
search_LR.fit(idX)
search_LR.save_results('LRSearch', path='results/')

_Estimator: Feedforward Neural Network_

In [16]:
# Cross-validated grid search for the feedforward neural network.
# Inputs are all indicators whose Type is 'Economy' or 'Demand'; their IDs are passed as strings.
X_mask = indicators.Type.isin(['Economy', 'Demand'])
X_selector = Selector(indIDs=indicators.loc[X_mask, 'ID'].map(str).tolist())
y_selector = Selector(indIDs=['999999'])
splitter = Splitter(n_splits=3, tt_split=0.85)

model_NN = Model(X_selector, y_selector, estimator=MLPRegressor())
search_NN = CrossValidation(model_NN, grid, splitter)
search_NN.fit(idX)
search_NN.save_results('NNSearch', path='results/')