In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# display.max_columns=None lifts the column limit so wide DataFrames are shown in full
pd.set_option('display.max_columns', None)

In [2]:
# split a dataframe into scaled features and target
# pass the scaler returned by a previous call to reuse it when evaluating the model

def preprocess_data(df, features, target_variable, scaler=None):
    """Select the feature/target columns of ``df`` and scale the features.

    Parameters
    ----------
    df : DataFrame holding the feature and target columns.
    features : column labels used to select the feature columns.
    target_variable : column label of the target.
    scaler : optional object exposing ``transform``. When given it is applied
        as-is; when ``None`` a new ``StandardScaler`` is fitted on the
        selected feature columns.

    Returns
    -------
    (X, y, scaler) : the transformed features, the target column, and the
        scaler that produced ``X`` (the one passed in, or the newly fitted one).
    """
    feature_frame = df[features]
    target = df[target_variable]

    if scaler is not None:
        # A scaler was supplied: only transform, never refit.
        return scaler.transform(feature_frame), target, scaler

    fitted_scaler = StandardScaler()
    scaled_features = fitted_scaler.fit_transform(feature_frame)
    return scaled_features, target, fitted_scaler

In [3]:
# calculate r² and rmse

def calculate_metrics(model, X, y_true):
    """Print the coefficient of determination (R²) and the RMSE of ``model``.

    Parameters
    ----------
    model : fitted regressor exposing ``predict``.
    X : feature matrix, passed unchanged to ``model.predict``.
    y_true : observed target values, one per row of ``X``.

    Returns
    -------
    None. Both metrics are written to stdout.
    """
    # Predict once and derive both metrics from the same predictions;
    # model.score(X, y_true) would run a second prediction pass.
    y_pred = model.predict(X)

    r_sq = r2_score(y_true, y_pred)
    print(f"coefficient of determination: {r_sq}")

    mse = mean_squared_error(y_true, y_pred)
    print('RMSE: ', round(np.sqrt(mse),4))

In [4]:
# read the 2 dataframes
# split into train/test rows, then scale (scaler is fitted on the training rows only)
# fit model
# evaluate model on test and eval data

def build_model(model, features, df_path='../data/data_vessel2.csv', df_eval_path='../data/data_vessel1.csv', target_variable="Speed Through Water (knots)", test_size = 0.25, random_state = 42):
    """Fit ``model`` on one CSV and report metrics on a held-out split and on a second CSV.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``. It is fitted
        in place and the same object is returned, so passing one estimator
        to several calls leaves only the last fit.
    features : column labels used as model inputs; must exist in both CSVs.
    df_path : CSV that is split into training and test rows.
    df_eval_path : CSV used only for evaluation.
    target_variable : column label of the regression target.
    test_size, random_state : forwarded to ``train_test_split``.

    Returns
    -------
    The fitted ``model``.
    """
    df = pd.read_csv(df_path)
    df_eval = pd.read_csv(df_eval_path)

    # Split the raw rows BEFORE scaling so that the scaler statistics come
    # from the training rows only (fitting on the full frame leaks test-row
    # information into preprocessing).
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state)

    X_train, y_train, scaler = preprocess_data(df_train, features, target_variable)
    X_test, y_test, _ = preprocess_data(df_test, features, target_variable, scaler)

    model.fit(X_train, y_train)

    print('Metrics on test set')
    calculate_metrics(model, X_test, y_test)

    print('-'*30)

    # The evaluation data goes through the scaler fitted on the training rows.
    X_eval, y_eval, _ = preprocess_data(df_eval, features, target_variable, scaler)

    print('Metrics on eval set')
    calculate_metrics(model, X_eval, y_eval)

    return model

In [5]:
# estimators used in the experiments below; XGBoost gets a fixed random_state
lr = LinearRegression()
xgb = XGBRegressor(random_state=42)

In [6]:
# use only 1 feature

features = ['Propulsion Power (MW)']

# build_model fits the estimator it receives in place and returns that same object;
# pass an unfitted copy so lr_model1 stays a distinct single-feature model
lr_model1 = build_model(clone(lr), features)

Metrics on test set
coefficient of determination: 0.8154794963170807
RMSE:  3.2916
------------------------------
Metrics on eval set
coefficient of determination: 0.8259371783529124
RMSE:  3.1743


In [7]:
# pass an unfitted copy (same parameters, including random_state) so xgb_model1
# is not the same object as any other model built from `xgb`
xgb_model1 = build_model(clone(xgb), features)

Metrics on test set
coefficient of determination: 0.9585003762412104
RMSE:  1.561
------------------------------
Metrics on eval set
coefficient of determination: 0.9664623709336826
RMSE:  1.3933


In [8]:
# include other power consumption related features, and environmental effects
# omit diesel generator power and main engine fuel flow rate as they are highly correlated with propulsion power

features = ['Power Galley (MW)', 'Power Service (MW)', 'HVAC Chiller Power (MW)', 'Scrubber Power (MW)', 'Sea Temperature (Celsius)', 'Boiler Fuel Flow Rate (L/h)', 'Incinerator 1 Fuel Flow Rate (L/h)', 'Relative Wind Angle (Degrees)', 'Relative Wind Direction (Degrees)', 'Draft (m)', 'Relative Wind Speed (knots)', 'Trim (m)', 'Propulsion Power (MW)', 'Bow Thruster Power (MW)', 'Stern Thruster Power (MW)']

# build_model fits the estimator it receives in place; fitting an unfitted copy
# keeps this multi-feature model from replacing a model fitted earlier on `lr`
lr_model = build_model(clone(lr), features)

Metrics on test set
coefficient of determination: 0.8622358241275337
RMSE:  2.8442
------------------------------
Metrics on eval set
coefficient of determination: 0.8513340971007143
RMSE:  2.9336


In [9]:
# fit an unfitted copy (same parameters, including random_state) so xgb_model
# stays distinct from any other model built from `xgb`
xgb_model = build_model(clone(xgb), features)

Metrics on test set
coefficient of determination: 0.9919211436078355
RMSE:  0.6888
------------------------------
Metrics on eval set
coefficient of determination: 0.9420623195657902
RMSE:  1.8313


In [10]:
# show top 5 coefficients for linear regression model (values can be negative or greater than 1)
# features are ranked by absolute coefficient; the signed value is what gets displayed

coef = pd.Series(data=lr_model.coef_, index=features)
coef_abs_sorted = coef.abs().sort_values(ascending=False)
top_lr_features = coef.loc[coef_abs_sorted.index].head(5)
print('Top 5 key features for linear regression model:')
top_lr_features

Top 5 key features for linear regression model:


Propulsion Power (MW)                 5.147457
Sea Temperature (Celsius)            -1.030675
Scrubber Power (MW)                   0.982692
Boiler Fuel Flow Rate (L/h)          -0.816490
Incinerator 1 Fuel Flow Rate (L/h)    0.730900
dtype: float64

In [11]:
# show top 5 important features for xgb regressor model (values are between 0-1)
# importances are sorted from largest to smallest before taking the first five

feature_importances = pd.Series(data=xgb_model.feature_importances_, index=features)
feature_importances_sorted = feature_importances.sort_values(ascending=False)
top_xgb_features = feature_importances_sorted.head(5)
print('Top 5 key features for XGB regression model:')
top_xgb_features

Top 5 key features for XGB regression model:


Propulsion Power (MW)                 0.934900
Bow Thruster Power (MW)               0.019853
Draft (m)                             0.008021
Incinerator 1 Fuel Flow Rate (L/h)    0.006431
Scrubber Power (MW)                   0.005884
dtype: float32