In [27]:
import numpy as np
from sklearn import preprocessing
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [19]:
dat = np.load('/content/drive/MyDrive/data.npz')
X_train,y_train = dat['X_train'],dat['y_train']
X_val,y_val = dat['X_val'],dat['y_val']
X_test,y_test = dat['X_test'],dat['y_test']

In [12]:
def Dummy_Regr():
  dummy_regr = DummyRegressor()
  dummy_regr.fit(X_train, y_train)
  y_pred = dummy_regr.predict(X_test)
  print(f'Test MSE: {mse(y_test, y_pred):.3f}')

Dummy_Regr()

Test MSE: 13704.069


Explain your idea and why it would be a reasonable baseline. Implement
your baseline regressor and report the testing MSE.

Ans: This is a dummy Regressor which always predicts the mean value of the training data. So it can be treated as a resonable baseline as the predictions demonstrate almost to no learning at all. The above value is the baseline test_MSE of dummy regressor which has not learnt any weights post training. 

In [9]:
models = ["OLS", "Ridge", "LASSO"]
def grid_search(X_train, X_val, X_test):

    for model_name in models: 

      if model_name == "OLS":
        best_alpha = None
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_val_hat = model.predict(X_val)
        best_val_mse = mse(y_val, y_val_hat)
        best_model = model
        y_hat = model.predict(X_test)
      else:
        best_val_mse = float('inf')
        best_model = None
        best_alpha = None
        for alpha in np.linspace(-10, 10, 20):
          if model_name == 'LASSO':
            model = Lasso(alpha=alpha)
          else: 
            model = Ridge(alpha=alpha)
          model.fit(X_train, y_train)
          y_hat_val = model.predict(X_val)
          val_mse = mse(y_val,y_hat_val)
          if val_mse < best_val_mse:
            best_val_mse = val_mse
            best_model = model
            best_alpha = alpha
        y_hat = best_model.predict(X_test)

      print("")
      print(f'Model Coeffs.: {best_model.coef_}')
      print(f"Model       : {model_name}")
      print(f"Val_MSE     : {best_val_mse}")
      print(f"Test_MSE    : {mse(y_test, y_hat)}")
      print(f"Best lambda : {best_alpha}")

In [10]:
grid_search(X_train, X_val, X_test)


Model Coeffs.: [ 6.78392782e-08 -9.46961087e-07 -2.01559722e-05 -9.44137992e-03
 -3.58051575e-01  7.99676039e-03  7.76618779e-01  3.36026253e-01
  1.07191588e-02 -1.02676180e-01 -5.07021725e-03  1.70750137e-01
  1.70548307e-01 -7.85617873e-02  8.08841109e-02  1.91114765e-02
  1.51462054e-01  3.19292162e-02 -2.12960339e-01  2.88362100e-01
 -1.12241911e-02 -8.67993656e-01 -3.59423010e-01  6.18997673e-02
  4.31522108e-03  2.16059287e-03  1.92880833e-02 -4.09630461e-02
  6.54875549e-02 -2.05499951e-02  1.41233615e-01  5.99102193e-02
 -2.19882293e-04  8.13233954e-02 -1.98919313e-01  2.16878679e-04
  2.25805759e-03  2.22044605e-16  4.05009131e-01 -6.47656867e-01
 -3.10081883e-01 -3.54953823e-01  6.57339759e-01  6.40429550e-01
  3.85662550e-01 -3.70739286e-01 -6.63693444e-01  1.10864850e+00
 -5.97729892e-02  8.91788515e-01 -4.55125561e-01 -2.63133332e-01
 -5.58711691e-01]
Model       : OLS
Val_MSE     : 638.5917288630449
Test_MSE    : 11377.149595301527
Best lambda : None


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T



Model Coeffs.: [ 6.78336415e-08 -9.46622047e-07 -2.01546319e-05 -9.43948746e-03
 -3.57667644e-01  7.98769849e-03  7.76191279e-01  3.36071899e-01
  1.09373463e-02 -1.02666138e-01 -5.07456109e-03  1.70729499e-01
  1.70550256e-01 -7.85276767e-02  8.08215003e-02  1.91097430e-02
  1.51455006e-01  3.19769274e-02 -2.12941821e-01  2.87967800e-01
 -1.12149728e-02 -8.67555137e-01 -3.59462193e-01  6.16768682e-02
  4.31445667e-03  2.16439237e-03  1.92744938e-02 -4.09447337e-02
  6.54656118e-02 -2.05426505e-02  1.41233909e-01  5.99100855e-02
 -2.27811659e-04  8.13238195e-02 -1.98920470e-01  2.16896122e-04
  2.25800812e-03  0.00000000e+00  4.05030540e-01 -6.42787538e-01
 -3.06672763e-01 -3.53512869e-01  6.53799183e-01  6.35338126e-01
  3.82889292e-01 -3.69053430e-01 -6.61309331e-01  1.10361470e+00
 -6.29122480e-02  8.87735836e-01 -4.53244353e-01 -2.59531651e-01
 -5.54352953e-01]
Model       : Ridge
Val_MSE     : 638.5901256333506
Test_MSE    : 11377.0929798394
Best lambda : 10.0


  positive)
  positive)
  positive)
  positive)



Model Coeffs.: [ 5.82117421e-08 -2.29799128e-05 -1.35085655e-05 -0.00000000e+00
 -0.00000000e+00  1.87157208e-02  0.00000000e+00  3.25733542e-02
  4.61552767e-02 -0.00000000e+00 -8.49798340e-04  0.00000000e+00
  5.59733578e-02  0.00000000e+00 -0.00000000e+00  4.10013618e-04
  0.00000000e+00  1.02566189e-01  0.00000000e+00 -0.00000000e+00
 -2.14813365e-02  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  1.97680773e-01  0.00000000e+00
 -1.87258135e-02  2.54629575e-02 -1.79679921e-01  4.39740825e-04
  1.96134130e-03  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00]
Model       : LASSO
Val_MSE     : 634.5342810078747
Test_MSE    : 11339.621227295804
Best lambda : 10.0


  positive)


In [11]:
scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

grid_search(X_train_scaled, X_val_scaled, X_test_scaled)


Model Coeffs.: [ 4.52849309e-01 -2.23140363e-02 -2.26974778e+00 -1.84115062e-01
 -8.38522184e+00  4.00769020e+00  6.87992051e+01  2.40985306e+01
  8.77405547e-01 -1.23485928e+00 -1.87308527e+00 -3.00590459e+11
  3.97802187e+00 -4.33924858e+00  1.84647889e-01  6.28758829e+00
  2.46546761e+11  4.22101606e-01 -1.09002660e+01  6.08389717e+00
 -5.32820872e+00 -7.18790912e+01 -2.43987728e+01  4.77850407e+00
  1.25723867e+00  8.00248383e-01  1.51475343e+11 -6.93806682e-01
  4.93493804e+00 -2.80440697e+00 -2.38865349e+11  2.22882104e+11
 -1.67104729e-02  2.91197922e+11 -4.15406799e+00  7.12585449e-02
  2.47277069e+00  5.83006817e+09  7.92939186e-01  4.90484477e+11
  5.26718161e+11  5.32952948e+11  5.47715823e+11  5.23769030e+11
  5.30072548e+11  5.17256389e+11 -3.97547272e+12 -3.81927501e+12
 -3.90428999e+12 -4.05512450e+12 -4.03245710e+12 -3.96244649e+12
 -3.98550008e+12]
Model       : OLS
Val_MSE     : 638.5407275830877
Test_MSE    : 11377.604447368545
Best lambda : None

Model Coeffs.: [ 4

i. Do you observe any difference on the learned coefficients among three
models? Explain.

As you can see, the regularization penalty actually depends on the magnitude of the coefficients, which in turn depends on the magnitude of the features themselves. So there you have it, when you change the scale of the features you also change the scale of the coefficients, which are thus penalized differently, resulting in different solutions. The exponents were mostly in negative range for most of the coefficients. They are adjusted and we can see coefficients with more positive exponential values and many exponents are closer to 0.

ii. Compare test_MSE with those in (b). Which methods have obvious changes
and which not? Explain

test_MSE of OLS has not changed which means there is no effect of standardization but significantly impacts L1 and L2 penalized regression. There is no term affecting the coefficients for OLS (like $\lambda$ for L1 and L2). So it is unaffected by change in scale of the values. 

With LASSO (L1), it fairs better than Ridge (L2) as it reduces the coefficients of insignificant features to 0 whereas L2 penalty is not strict and doesnt reduce the coefficients to 0 but to a lower value. So, this works well for feature selection in case we have a huge number of features. 

Due to reason stated above, as a result, we can see the test_MSE of L1 < test_MSE of L2



In [29]:
def CART():
  best_val_mse = float('inf')
  best_model = None
  best_height = 1
  for i in range(1,11):
    
    CARTRegr = DecisionTreeRegressor(criterion='mse', max_depth=i, max_features=None, random_state=0)
    CARTRegr.fit(X_train, y_train)
    y_hat_val = CARTRegr.predict(X_val)
    val_mse = mse(y_val, y_hat_val)
    if val_mse < best_val_mse:
        best_val_mse = val_mse
        best_model = CARTRegr
        best_height = i
  
  y_hat = best_model.predict(X_test)
  print(f"Max Depth.  : {best_height}")
  print(f"Val_MSE     : {best_val_mse}")
  print(f"Test_MSE    : {mse(y_test, y_hat)}")
  return best_model

decision_tree_estimator = CART()

Max Depth.  : 3
Val_MSE     : 505.0707734117222
Test_MSE    : 10152.585792514701


In [26]:
def RandomForests():
  best_val_mse = float('inf')
  best_model = None
  best_estimator = 1
  for i in range(2,31):
    RandForesRegr = RandomForestRegressor(n_estimators=i, criterion='mse', max_depth=3)
    RandForesRegr.fit(X_train, y_train)
    y_hat_val = RandForesRegr.predict(X_val)
    val_mse = mse(y_val, y_hat_val)
    if val_mse < best_val_mse:
        best_val_mse = val_mse
        best_model = RandForesRegr
        best_estimator = i
  
  y_hat = best_model.predict(X_test)
  print(f"Best Estimator. : {best_estimator}")
  print(f"Val_MSE         : {best_val_mse}")
  print(f"Test_MSE        : {mse(y_test, y_hat)}")
RandomForests()

Best Estimator. : 28
Val_MSE         : 411.19748505922365
Test_MSE        : 9285.749130124314


In [33]:
def AdaBoost():
  best_val_mse = float('inf')
  best_model = None
  best_lr = 1
  for i in np.linspace(1e-1,2,20):
    AdaBoostRegr = AdaBoostRegressor(base_estimator=decision_tree_estimator, learning_rate=i, n_estimators=28)
    AdaBoostRegr.fit(X_train, y_train)
    y_hat_val = AdaBoostRegr.predict(X_val)
    val_mse = mse(y_val, y_hat_val)
    if val_mse < best_val_mse:
        best_val_mse = val_mse
        best_model = AdaBoostRegr
        best_estimator = i
  
  y_hat = best_model.predict(X_test)
  print(f"Best lr.    : {best_lr}")
  print(f"Val_MSE     : {best_val_mse}")
  print(f"Test_MSE    : {mse(y_test, y_hat)}")
AdaBoost()

Best lr.    : 1
Val_MSE     : 540.6539292581144
Test_MSE    : 8950.906714711358


(g) Do those regressors learn from the data compared with the baseline? Compare and comment on (explain) their performances relative to the baseline, and relative to each other.

| Model  | Val_MSE   |  test_MSE |  
|---|---|---|
| DummyRegressor  | N/A   |  13704.069 |
| LinearRegressor  | 638.54  | 11377.60  |
| Ridge(L2)  |638.37   |  11369.23 |
| LASSO(L1)  | 638.47  | 11306.21  |
|CART|505.07| 10152.58|
| RandomForests  | 411.19   | 9285.74  | 
|  AdaBoost | 540.65  |  8950.90 | 

As we can see from the table, test_MSE is reducing as we use more capable regressors and show that they have learnt better than the baseline model which doesnt change in state/prediction. 

Linear regression is weaker than other algorithms in terms of reducing error rates. Its possibly due to not being able to represent the features linearly. In simple linear regression, outliers can significantly disrupt the outcomes.

With LASSO (L1), it fairs better than Ridge (L2) as it reduces the coefficients of insignificant features to 0 whereas L2 penalty is not strict and doesnt reduce the coefficients to 0 but to a lower value. So, this works well for feature selection in case we have a huge number of features. As a result, we can see the test_MSE of L1 < test_MSE of L2

The CART model gives high importance to a particular set of features. But the random forest chooses features randomly during the training process. Therefore, it does not depend highly on any specific set of features. Thus, the random forest can generalize over the data in a better way. This randomized feature selection makes random forest much more accurate than a decision tree.

With random forests, you train however many decision trees using samples of BOTH the data points and the features. From there, each decision should be de-correlated. You can them take an average of the prediction (regression) 

With AdaBoost, you combine predictors by adaptively weighting the difficult-to-regress samples more heavily. From there, you make predictions based on the predictions of the various weak learners in the ensemble.This additive model (ensemble) works in a forward stage-wise manner, introducing a weak learner to improve the shortcomings of existing weak learners. 

If you carefully tune parameters, Adaboost results in better performance than random forests.


(h) Some of the features only have 0/1 values because they are converted from
categorical features. Now for some of those categorical features, you modify the
converted numerical values to 0/12345 (i.e., that attribute is either 0 or 12345) for all data including training, val and test and then do the regression problem again with the same random seed. Will your regressor give different estimations compared to your previous results? Give Yes/No answers for linear regression, CART, random forest and Adaboost, and explain why. Note: Assume that the change of value doesn’t affect any random process such as the
feature sampling in random forest


1.   Linear Regression : Yes. These values would be directly interpreted as a data point and would lead to different predictions and hence the categorical features have to be one hot encoded. 
2.   Random Forests, CART, AdaBoost are unaffected by some of the scaling methods. Trees are not affected by scaling because the splitting criterion first orders the values of each feature and then calculate the gini/entropy of the split. When CART looks for the best splits, it going to use entropy or gini to calculate information gain, this is not dependent on the scale of your predictor variable, rather on the resultant purity of the variable. Some scaling methods keep this order, so no change to the accuracy score as you should see the same variables and order of selection, just different cut off points.
