# 1. Creating the Training Node

In [1]:
import time
import pandas as pd
import scipy as sci
from typing import Tuple, Dict, Any
from sklearn import svm
from sklearn.preprocessing._data import StandardScaler


def train(
        X: pd.DataFrame,
        Y: pd.DataFrame,
        kernel: str,
        gamma: str,
    ) -> Tuple[svm._classes.SVR,Dict]:

    # Initializing hyperparameters
    response_scale = sci.stats.iqr(Y)
    box_constraint = response_scale/1.349
    epsilon = 2*response_scale/13.49
    
    # Initializing the model
    model = svm.SVR(
        kernel = kernel,
        C = box_constraint,
        gamma = gamma,
        epsilon = epsilon
    )

    # Training the model
    time_start = time.time()
    model.fit(X.drop('partition_key', axis=1), Y)
    time_end = time.time()

    # Elapsed time in seconds for tp_x
    elapsed_time = time_end - time_start

    metrics = {
        "training_elapsed_time": {
            "value": float(elapsed_time),
             "step": 1
         },
    }

    return model, metrics

  and should_run_async(code)


In [2]:
model, train_metrics = train(
    X = catalog.load("x_train"),
    Y = catalog.load("y_train"),
    kernel = catalog.load("params:regressor.hyperp.kernel"),
    gamma = catalog.load("params:regressor.hyperp.gamma"),
)

train_metrics

2021-02-24 18:52:35,865 - kedro.io.data_catalog - INFO - Loading data from `x_train` (CSVDataSet)...
2021-02-24 18:52:35,880 - kedro.io.data_catalog - INFO - Loading data from `y_train` (CSVDataSet)...
2021-02-24 18:52:35,889 - kedro.io.data_catalog - INFO - Loading data from `params:regressor.hyperp.kernel` (MemoryDataSet)...
2021-02-24 18:52:35,892 - kedro.io.data_catalog - INFO - Loading data from `params:regressor.hyperp.gamma` (MemoryDataSet)...


  return f(*args, **kwargs)


{'training_elapsed_time': {'value': 0.014623880386352539, 'step': 1}}

# 2. Creating the Testing Node

In [3]:
from sklearn.metrics import mean_squared_error


def test(
        model: svm._classes.SVR,
        X_scaler: StandardScaler,
        Y_scaler: StandardScaler,
        X_test: pd.DataFrame,
        Y_test: pd.DataFrame,
    ) -> Dict[str, Dict[str, Any]]:

    # Removing partition key from the data frame
    X_test.drop('partition_key', axis=1, inplace=True)

    # Generate predictions
    Y_pred = model.predict(X_test)
    
    # Inverse transform the data with the scaler
    X_test_inversed = X_scaler.inverse_transform(X_test)
    Y_test_inversed = Y_scaler.inverse_transform(Y_test)
    Y_pred_inversed = Y_scaler.inverse_transform(Y_pred)

    rmse = mean_squared_error(
        y_true = Y_test_inversed,
        y_pred = Y_pred_inversed
    )
    
    return {
        "rmse": {"value": float(rmse), "step": 1},
    }

  and should_run_async(code)


In [4]:
test_metrics = test(
    model = model,
    X_scaler = catalog.load("x_scaler"),
    Y_scaler = catalog.load("y_scaler"),
    X_test = catalog.load("x_test"),
    Y_test = catalog.load("y_test"),
)

test_metrics

2021-02-24 18:52:35,966 - kedro.io.data_catalog - INFO - Loading data from `x_scaler` (MlflowMlflowModelSaverDataSet)...
2021-02-24 18:52:35,975 - kedro.io.data_catalog - INFO - Loading data from `y_scaler` (MlflowMlflowModelSaverDataSet)...
2021-02-24 18:52:35,982 - kedro.io.data_catalog - INFO - Loading data from `x_test` (CSVDataSet)...
2021-02-24 18:52:35,995 - kedro.io.data_catalog - INFO - Loading data from `y_test` (CSVDataSet)...


{'rmse': {'value': 47.063334844394234, 'step': 1}}

# 3. Next steps

- 1. Update **nodes.py** file for data science pipeline
- 2. Update **pipeline.py** file for data science pipeline
- 3. Update **hooks.py** file
- 4. Update **conf/base/catalog.yml** file
- 5. **Commit code to repo**