# Multi-target Regression Model

Example of training and comparing several regression models, each of which predicts 3 target variables.

In [None]:
import pandas as pd
import seaborn as sns
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet

%matplotlib inline
# Render every float in DataFrame output with exactly four decimal places.
pd.set_option("display.float_format", "{:.4f}".format)

# Load the Data

In [None]:
# Read the dataset from "data.csv" (path relative to the working directory)
# into the DataFrame used by every later cell.
df = pd.read_csv("data.csv")

# Data Exploration

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Peek at the first rows of the data (head() defaults to five rows).
df.head()

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# Pairwise scatter plots for the first 8 columns only.
# The trailing semicolon stops the notebook from echoing the returned grid object.
sns.pairplot(df.iloc[:, :8]);

In [None]:
# Correlation heatmap of the first 30 columns, drawn on a 20x20 figure with
# the coefficient printed inside each cell.
corrMatrix = df.iloc[:, :30].corr()
fig = plt.figure(figsize=(20, 20))
sns.heatmap(data=corrMatrix, annot=True)
plt.show()

# Prepare Data for Training

In [None]:
# Split the frame into targets (y) and features (X = everything else).
target_columns = ["target_a", "target_b", "target_c"]
y = df[target_columns]
X = df.drop(columns=target_columns)

In [None]:
# Display the feature matrix: every column of df except the target columns.
X

In [None]:
# Display the target frame: only the columns listed in target_columns.
y

1). X_train - The independent variables (features) used to train the model. Because we specify test_size = 0.2, 80% of the observations in the complete data set are used to train/fit the model and the remaining 20% are held out to test it.

2). X_test - The remaining 20% of the independent variables. These rows are not used in the training phase; the model makes predictions on them so that its accuracy can be tested on unseen data.

3). y_train - The dependent variables that the model has to predict: the continuous target values (three per observation) that belong to the rows of X_train. They must be supplied when training/fitting the model.

4). y_test - The true target values for the test data. They are compared with the model's predictions to measure how close the predicted values are to the actual values.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score


def cross_val(model):
    """Return the mean 10-fold cross-validation score of ``model``.

    The folds are taken from the module-level ``X`` and ``y`` (the full data,
    not the train/test split). No ``scoring`` is passed, so the estimator's
    default scorer is used.
    """
    fold_scores = cross_val_score(model, X, y, cv=10)
    return fold_scores.mean()


def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 for ``predicted`` against ``true``.

    The numbers come from ``evaluate`` so that the printed report and the
    programmatic result can never drift apart.

    Parameters
    ----------
    true : array-like of ground-truth values.
    predicted : array-like of model predictions, aligned with ``true``.
    """
    mae, mse, rmse, r2_square = evaluate(true, predicted)
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R2 Square", r2_square)
    print("__________________________________")


def evaluate(true, predicted):
    """Compute the standard regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like of ground-truth values.
    predicted : array-like of model predictions, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_square)`` in that order.
    """
    mae = metrics.mean_absolute_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    # RMSE is derived from the MSE already computed instead of scoring twice.
    rmse = np.sqrt(mse)
    r2_square = metrics.r2_score(true, predicted)
    return mae, mse, rmse, r2_square


def evaluate_model(pipeline, X_test, y_test):
    """Score a fitted multi-output pipeline on the held-out data.

    Works for any number of targets: one ``act_``/``pred_``/``R2_`` column is
    produced per entry of the module-level ``target_columns`` list.

    Parameters
    ----------
    pipeline : fitted pipeline whose ``get_params()`` contains the key
        ``"multioutputregressor__estimator"`` (the wrapped base estimator).
    X_test : features passed to ``pipeline.predict``.
    y_test : pandas.DataFrame of true values, one column per target, in the
        same order as ``target_columns``.

    Returns
    -------
    combined : pandas.DataFrame
        ``act_<target>`` columns followed by ``pred_<target>`` columns, one
        row per test sample, indexed 0..n-1.
    results_df : pandas.DataFrame
        A single row holding ``"Model"`` (estimator class name) and one
        ``R2_<target>`` score per target.
    """
    estimator = pipeline.get_params()["multioutputregressor__estimator"]
    # Use the class name rather than parsing the repr: it is the same for
    # every estimator, so no per-library special case (e.g. CatBoost) is needed.
    mod_name = type(estimator).__name__
    print(f"Starting {mod_name}...")

    act_names = [f"act_{target}" for target in target_columns]
    pred_names = [f"pred_{target}" for target in target_columns]

    # reset_index returns a new frame, so renaming its columns leaves the
    # caller's y_test untouched and aligns rows positionally with the predictions.
    act_df = y_test.reset_index(drop=True)
    act_df.columns = act_names
    pred_df = pd.DataFrame(pipeline.predict(X_test), columns=pred_names)
    combined = pd.concat([act_df, pred_df], axis=1, join="inner")

    scores = {"Model": mod_name}
    for target, act_name, pred_name in zip(target_columns, act_names, pred_names):
        scores[f"R2_{target}"] = metrics.r2_score(
            combined[act_name], combined[pred_name]
        )
    results_df = pd.DataFrame([scores])
    return combined, results_df

# Scale the variables

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Preprocessing pipeline with a single standard-scaling step.
pipeline = Pipeline([("std_scalar", StandardScaler())])

# Fit the scaler on the training split only, then apply the already-fitted
# scaler to the test split. Both names are rebound to the scaled output, so
# re-running this cell would scale data that has already been scaled.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# Train the model

In [None]:
# Candidate estimator classes (not instances); the training loop instantiates
# each one with default arguments. Order matters: results are stored in this
# order and a later cell selects a model's results by position.
algs = [
    LinearRegression,
    XGBRegressor,
    SVR,
    LGBMRegressor,
    BayesianRidge,
    KernelRidge,
    GradientBoostingRegressor,
    RandomForestRegressor,
    CatBoostRegressor,
    ElasticNet,
]

In [None]:
# Fit every candidate with default hyper-parameters and score it on the test
# split. Both lists are filled in the same order as ``algs``.
combined_df = []  # per-model frame of actual vs. predicted values
eval_results = []  # per-model one-row frame of R2 scores
for alg in algs:
    # make_pipeline names the step "multioutputregressor", which is the key
    # evaluate_model uses to find the wrapped estimator.
    pipe = make_pipeline(MultiOutputRegressor(alg()))
    pipe.fit(X_train, y_train)
    model_combined, model_results = evaluate_model(pipe, X_test, y_test)
    combined_df.append(model_combined)
    eval_results.append(model_results)

result_df = pd.concat(eval_results, ignore_index=True)
# Average R2 over all targets, however many there are. skipna=False keeps the
# behaviour of a plain sum: a missing score makes the mean missing too.
r2_columns = [f"R2_{target}" for target in target_columns]
result_df["Mean"] = result_df[r2_columns].mean(axis=1, skipna=False)

In [None]:
# Leaderboard: best mean R2 first, renumbered from 0, with a colour gradient
# applied to the numeric columns.
(
    result_df.sort_values("Mean", ascending=False)
    .reset_index(drop=True)
    .style.background_gradient()
)

In [None]:
# Plot true vs. predicted values of the first target for the CatBoost model.
# combined_df is filled in the same order as ``algs``, so look the position up
# by class instead of hard-coding the index.
combined = combined_df[algs.index(CatBoostRegressor)]
pd.DataFrame(
    {
        "True Values": combined[f"act_{target_columns[0]}"],
        "Predicted Values": combined[f"pred_{target_columns[0]}"],
    }
).hvplot.scatter(x="True Values", y="Predicted Values")