## Set of Toy Multiple Regression Problems for HPH development (and general interest)


In [1]:
from sklearn.datasets import make_friedman1,make_friedman2,make_friedman3, make_regression
from sklearn.linear_model import LinearRegression, Ridge,MultiTaskLasso,MultiTaskElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import RadiusNeighborsRegressor,KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler


import pandas as pd
import numpy as np

### Set up a linear multiple regression problem using SKEARN "make_regression" function

In [11]:
#Trivial Linear Multiple Regression Problem with a little Noise (0.1)

X,Y = make_regression(n_samples=1000, n_features=4, n_informative=4, n_targets=4,noise=0.1,random_state=42)

#let's make pandas in case we need it
X_df = pd.DataFrame(X,columns=['x1','x2','x3','x4'])
Y_df = pd.DataFrame(Y, columns=['y1','y2','y3','y4'])
linear_df = pd.concat([X_df,Y_df],axis=1)
linear_df.head()
    

Unnamed: 0,x1,x2,x3,x4,y1,y2,y3,y4
0,0.572583,-0.571179,1.797687,0.640843,133.362376,53.348308,202.145002,110.54607
1,-0.513196,1.217959,-1.104863,0.220541,-40.926813,-17.096546,-34.122116,-21.819102
2,0.677875,-2.703232,0.189582,1.001046,63.650521,30.88557,-107.538471,-7.989816
3,-0.978373,0.195845,-0.53976,-0.778305,-91.537221,-70.295644,-148.930892,-62.683464
4,0.757508,0.614167,-0.112328,-0.22097,0.302214,38.655431,87.921205,4.225219


### Split for training/testing

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_scaled, test_size=0.10, random_state=42)

### Scale everything using standard scaler - good practice for all models

In [12]:
x_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.transform(X_test)

y_scaler = StandardScaler()
Y_train_scaled = y_scaler.fit_transform(Y_train)
Y_test_scaled = y_scaler.transform(Y_test)


### Several SKlearn Models - Plus XGBoost

In [8]:
#several SKLEARN models which support multiple regression

#Note Multi-output Regressor can allow support for many single-regression models - SVR is one of the most powerful

names = ["LinearRegression", "KNeighborsRegressor",  
         "DecisionTreeRegressor", "MultiTaskLasso", "MultiTaskElasticNet", 
         "Ridge", "MLPRegressor", "MultiOutputRegressor(SVR())","MultiOutputRegressor(XGBRegressor())"]

regressors = [
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    MultiTaskLasso(alpha=0.01),
    MultiTaskElasticNet(alpha=0.01),
    Ridge(alpha=0.05),
    MLPRegressor(hidden_layer_sizes=(5,),
                                       activation='relu',
                                       solver='adam',
                                       learning_rate='adaptive',
                                       max_iter=1000,
                                       learning_rate_init=0.01,
                                       alpha=0.01),
    MultiOutputRegressor(SVR(kernel='linear', C=10.0)),
    MultiOutputRegressor(XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7))]



In [13]:
mse = {} 
for name, clf in zip(names, regressors):
        print("training ",name)
        clf.fit(X_train_scaled, Y_train_scaled)
        mse[name] = mean_squared_error(Y_test_scaled, clf.predict(X_test_scaled))
        

training  LinearRegression
training  KNeighborsRegressor
training  DecisionTreeRegressor
training  MultiTaskLasso
training  MultiTaskElasticNet
training  Ridge
training  MLPRegressor
training  MultiOutputRegressor(SVR())
training  MultiOutputRegressor(XGBRegressor())


In [14]:
# NOTES
# simple linear models (linear, ridge, lasso) have few parameters and do better with a simple linear problem
#
# SVR (support vector regression) is super-powerful but scales poorly with more data and is very sensitive to it's hyperparameters
#
# MPL (neural networks) is theoretically the most powerful but requires by far the most fiddling with hyper-parameters
#
# tree-based models seem to be overfitting this simple dataset


sorted(mse.items(), key=lambda kv: kv[1])

[('LinearRegression', 1.6801173293002426e-06),
 ('Ridge', 1.688384444118822e-06),
 ('MultiTaskElasticNet', 8.865890093355445e-05),
 ('MultiTaskLasso', 9.122255569091759e-05),
 ('MultiOutputRegressor(SVR())', 0.0009746860904641888),
 ('MLPRegressor', 0.005245655963964053),
 ('MultiOutputRegressor(XGBRegressor())', 0.017870360224913985),
 ('KNeighborsRegressor', 0.036867841893117634),
 ('DecisionTreeRegressor', 0.13898775841571587)]

## Set up a hard Non-linear Problem using SKLEARN "make_friedman" functions

In [20]:
X1,y1 = make_friedman1(n_samples=1000, n_features=5, noise=0.1, random_state=42)
X1.shape

(1000, 5)

In [21]:
X2,y2 = make_friedman2(n_samples=1000,  noise=0.1, random_state=42)
X2.shape

(1000, 4)

In [22]:
X3,y3 = make_friedman3(n_samples=1000,  noise=0.1, random_state=42)
X3.shape

(1000, 4)

In [23]:
#13 inputs
Xnl = np.hstack((X1,X2,X3))
Xnl.shape

(1000, 13)

In [24]:
y1.T.shape

(1000,)

In [25]:
#make some brutal non-linear relationships to infer
Ynl = np.vstack((y1,y2,y3,y1*y2,y2*y3,y1*y2)).T
Ynl.shape

(1000, 6)

In [26]:
#let's make pandas in case we need it
X_nl_df = pd.DataFrame(Xnl,columns=['x'+str(i+1) for i in range(0,13)])
Y_nl_df = pd.DataFrame(Ynl, columns=['y'+str(i+1) for i in range(0,6)])
nl_df = pd.concat([X_nl_df ,Y_nl_df],axis=1)
nl_df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,y1,y2,y3,y4,y5,y6
0,0.37454,0.950714,0.731994,0.598658,0.156019,37.454012,1678.777388,0.731994,6.986585,37.454012,1678.777388,0.731994,6.986585,16.778564,1229.558301,1.673191,20630.2228,2057.285942,20630.2228
1,0.155995,0.058084,0.866176,0.601115,0.708073,15.601864,380.50075,0.058084,9.661761,15.601864,380.50075,0.058084,9.661761,12.278795,27.08412,0.987408,332.560345,26.743077,332.560345
2,0.020584,0.96991,0.832443,0.212339,0.181825,60.111501,1282.391023,0.020584,10.699099,60.111501,1282.391023,0.020584,10.699099,5.828467,65.591539,0.353135,382.298125,23.162659,382.298125
3,0.183405,0.304242,0.524756,0.431945,0.291229,83.244264,472.546861,0.181825,2.834045,83.244264,472.546861,0.181825,2.834045,7.623226,119.677802,0.846805,912.330974,101.343768,912.330974
4,0.611853,0.139494,0.292145,0.366362,0.45607,30.424224,982.9206,0.431945,3.912291,30.424224,982.9206,0.431945,3.912291,9.511135,425.610182,1.45335,4048.035749,618.560655,4048.035749


### Split for training/testing

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(Xnl, Ynl, test_size=0.10, random_state=42)

### Scale everything using standard scaler - good practice for all models

In [28]:
x_scaler_nl = StandardScaler()
X_train_scaled_nl = x_scaler_nl.fit_transform(X_train)
X_test_scaled_nl = x_scaler_nl.transform(X_test)

y_scaler_nl = StandardScaler()
Y_train_scaled_nl = y_scaler_nl.fit_transform(Y_train)
Y_test_scaled_nl = y_scaler_nl.transform(Y_test)


In [29]:
X_train_scaled_nl.shape

(900, 13)

In [30]:
msenl = {} 
for name, clf in zip(names, regressors):
        print("training ",name)
        clf.fit(X_train_scaled_nl, Y_train_scaled_nl)
        msenl[name] = mean_squared_error(Y_test_scaled_nl, clf.predict(X_test_scaled_nl))

training  LinearRegression
training  KNeighborsRegressor
training  DecisionTreeRegressor
training  MultiTaskLasso
training  MultiTaskElasticNet
training  Ridge
training  MLPRegressor
training  MultiOutputRegressor(SVR())
training  MultiOutputRegressor(XGBRegressor())


In [31]:
# NOTES
# The more complex models XGBoost and MLPRegressor do a better job on the highly non-linear dataset
# it's possible that with more fiddling MSE could be reduced to (usually required for fielding) ~0.01

sorted(msenl.items(), key=lambda kv: kv[1])

[('MultiOutputRegressor(XGBRegressor())', 0.05278559956656891),
 ('MLPRegressor', 0.12170215887420384),
 ('KNeighborsRegressor', 0.2537543277088086),
 ('LinearRegression', 0.2591236938628191),
 ('Ridge', 0.2591262850622057),
 ('MultiTaskElasticNet', 0.2600599092054938),
 ('MultiTaskLasso', 0.2605453559159993),
 ('MultiOutputRegressor(SVR())', 0.2822419835285024),
 ('DecisionTreeRegressor', 0.3286053406230079)]