In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Lift pandas' column-count display limit so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

In [2]:
def preprocess_data(df, features, target_variable, scaler=None):
    """Pick the model inputs and target out of ``df`` and scale the inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    features : list of str
        Column names used as model inputs.
    target_variable : str
        Name of the column to predict.
    scaler : object with a ``transform`` method, optional
        Scaler to apply to the inputs. It is only used for ``transform``
        (never re-fitted here). When omitted, a new ``StandardScaler`` is
        fitted on ``df[features]``.

    Returns
    -------
    tuple
        ``(X, y, scaler)``: the scaled inputs, the untouched target column,
        and the scaler that produced ``X`` (the one passed in, or the newly
        fitted one).
    """
    inputs = df[features]
    target = df[target_variable]

    # A caller-supplied scaler is applied as-is so new data is scaled with
    # the same statistics as the data it was fitted on.
    if scaler is not None:
        return scaler.transform(inputs), target, scaler

    fresh_scaler = StandardScaler()
    return fresh_scaler.fit_transform(inputs), target, fresh_scaler

In [3]:
def calculate_metrics(model, X, y_true):
    """Print the model's score and its RMSE on ``(X, y_true)``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict(X)`` and ``score(X, y)``. The value of
        ``score`` is printed under the label "coefficient of determination"
        (scikit-learn regressors return R^2 from ``score``).
    X : array-like
        Inputs, already preprocessed the way the model expects.
    y_true : array-like
        Ground-truth target values matching the rows of ``X``.

    Returns
    -------
    None
        The metrics are only printed, not returned.
    """
    predictions = model.predict(X)

    print(f"coefficient of determination: {model.score(X, y_true)}")

    # RMSE = square root of the mean squared error, shown with 4 decimals.
    rmse = np.sqrt(mean_squared_error(y_true, predictions))
    print('RMSE: ', round(rmse, 4))

In [4]:
def build_model(
    model,
    features,
    df_path='../data/data_vessel2.csv',
    df_eval_path='../data/data_vessel1.csv',
    target_variable="Speed Through Water (knots)",
    test_size=0.25,
    random_state=42,
):
    """Fit ``model`` and print metrics on a hold-out split and on a second dataset.

    The CSV at ``df_path`` is split into train and test rows. The model is
    fitted on the training rows and scored on the test rows, then scored
    again on the CSV at ``df_eval_path``, which takes no part in fitting.

    Parameters
    ----------
    model : estimator
        Unfitted regressor exposing ``fit``, ``predict`` and ``score``.
        It is fitted in place.
    features : list of str
        Column names used as model inputs; must exist in both CSV files.
    df_path : str
        CSV used for training and the hold-out test split.
    df_eval_path : str
        CSV used only for the final evaluation.
    target_variable : str
        Name of the column to predict.
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed for the train/test split.

    Returns
    -------
    estimator
        The same ``model`` object, now fitted.
    """
    df = pd.read_csv(df_path)
    df_eval = pd.read_csv(df_eval_path)

    # Split the raw rows BEFORE fitting the scaler. Fitting the scaler on
    # the full frame would let the test rows influence the scaling mean and
    # variance (data leakage) and make the test metrics optimistic.
    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # The scaler is fitted on the training rows only and then reused,
    # unchanged, for the test rows and for the evaluation dataset.
    X_train, y_train, scaler = preprocess_data(df_train, features, target_variable)
    X_test, y_test, _ = preprocess_data(df_test, features, target_variable, scaler)

    model.fit(X_train, y_train)

    print('Metrics on test set')
    calculate_metrics(model, X_test, y_test)

    print('-'*30)

    X_eval, y_eval, _ = preprocess_data(df_eval, features, target_variable, scaler)

    print('Metrics on eval set')
    calculate_metrics(model, X_eval, y_eval)

    return model

In [5]:
# Baseline feature set: propulsion power is the only predictor.
# NOTE(review): `features` is re-bound to a wider list in a later cell, so
# every cell that reads it uses whichever assignment ran last in the kernel.
features = ['Propulsion Power (MW)']

# Linear regression fitted with build_model's default paths and target.
build_model(LinearRegression(), features)

Metrics on test set
coefficient of determination: 0.8154794963170807
RMSE:  3.2916
------------------------------
Metrics on eval set
coefficient of determination: 0.8259371783529124
RMSE:  3.1743


In [6]:
# XGBoost regressor with default hyperparameters, using the `features` list
# currently bound in the kernel.
build_model(XGBRegressor(), features)

Metrics on test set
coefficient of determination: 0.9585003762412104
RMSE:  1.561
------------------------------
Metrics on eval set
coefficient of determination: 0.9664623709336826
RMSE:  1.3933


In [7]:
# Wider feature set: auxiliary power consumers, fuel flows, wind, draft/trim
# and propulsion/thruster power. One name per line keeps edits diff-friendly.
features = [
    'Power Galley (MW)',
    'Power Service (MW)',
    'HVAC Chiller Power (MW)',
    'Scrubber Power (MW)',
    'Sea Temperature (Celsius)',
    'Boiler Fuel Flow Rate (L/h)',
    'Incinerator 1 Fuel Flow Rate (L/h)',
    'Relative Wind Angle (Degrees)',
    'Relative Wind Direction (Degrees)',
    'Draft (m)',
    'Relative Wind Speed (knots)',
    'Trim (m)',
    'Propulsion Power (MW)',
    'Bow Thruster Power (MW)',
    'Stern Thruster Power (MW)',
]

build_model(LinearRegression(), features)

Metrics on test set
coefficient of determination: 0.8622358241275337
RMSE:  2.8442
------------------------------
Metrics on eval set
coefficient of determination: 0.8513340971007143
RMSE:  2.9336


In [8]:
# XGBoost regressor with default hyperparameters, using the `features` list
# currently bound in the kernel.
build_model(XGBRegressor(), features)

Metrics on test set
coefficient of determination: 0.9919211436078355
RMSE:  0.6888
------------------------------
Metrics on eval set
coefficient of determination: 0.9420623195657902
RMSE:  1.8313
