# HSE 2022: Mathematical Methods for Data Analysis

## Homework 2

In [277]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import datasets
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLSResults
from math import sqrt
import random
import sys



import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

sns.set(style="darkgrid")

### Data

For this homework we use Dataset from seaborn on diamonds prices.

In [278]:
data = sns.load_dataset('diamonds')
y = data.price
X = data.drop(['price'], axis=1)
columns = data.drop(['price'], axis=1).columns

## Linear regression

#### 0. [0.25 points] Encode categorical variables.

In [279]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ

print(X.info())
categories = (data.dtypes =="category")
object_cols = list(categories[categories].index)
print("Categorical variables:")
print(object_cols)

encoded_data = X.copy()
label_encoder = LabelEncoder()
for col in object_cols:
    encoded_data[col] = label_encoder.fit_transform(encoded_data[col])

# check encoded
encoded_data.head()

# encoded with labelEncoder, maybe to rewrite with OneHotEncoder

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   x        53940 non-null  float64 
 7   y        53940 non-null  float64 
 8   z        53940 non-null  float64 
dtypes: category(3), float64(6)
memory usage: 2.6 MB
None
Categorical variables:
['cut', 'color', 'clarity']


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75


#### 1. [0.25 points] Split the data into train and test sets with ratio 80:20 with random_state=17.

In [280]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ
X_train, X_test, y_train, y_test = train_test_split(encoded_data, y, test_size=0.2, random_state=17)

#### 2. [1 point] Train models on train data using StatsModels library and apply it to the test set; use $RMSE$ and $R^2$ as the quality measure.

* [`LinearRegression`](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html);
* [`Ridge`](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) with $\alpha = 0.01$;
* [`Lasso`](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) with $\alpha = 0.01$
* [`ElasticNet`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html) with $\alpha = 0.01$, $l_{1}$_$ratio = 0.6$

Don't forget to scale the data before training the models with StandardScaler!

In [281]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ

# scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [282]:
# for statsmodels
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
model= sm.OLS(y_train, X_train)

# Linear Regression statsmodels
results_lr = model.fit()
y_test_predicted = results_lr.predict(X_test)
y_train_predicted = results_lr.predict(X_train)

print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
print("Test RMSE = %.4f" % mean_squared_error(y_test, y_test_predicted, squared=False))
print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
print("Test R2 = %.4f" % r2_score(y_test, y_test_predicted))

Train RMSE = 1347.9933
Test RMSE = 1370.9682
Train R2 = 0.8853
Test R2 = 0.8839


In [283]:
# Ridge statsmodels
results_ridge = model.fit_regularized(L1_wt=0, alpha=0.01)
y_test_predicted = results_ridge.predict(X_test)
y_train_predicted = results_ridge.predict(X_train)

print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
print("Test RMSE = %.4f" % mean_squared_error(y_test, y_test_predicted, squared=False))
print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
print("Test R2 = %.4f" % r2_score(y_test, y_test_predicted))

Train RMSE = 1365.9920
Test RMSE = 1383.9541
Train R2 = 0.8822
Test R2 = 0.8817


In [284]:
# Lasso statsmodels
results_lasso = model.fit_regularized(L1_wt=1, alpha=0.01)
# results.summary2()
y_test_predicted = results_lasso.predict(X_test)
y_train_predicted = results_lasso.predict(X_train)

print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
print("Test RMSE = %.4f" % mean_squared_error(y_test, y_test_predicted, squared=False))
print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
print("Test R2 = %.4f" % r2_score(y_test, y_test_predicted))

Train RMSE = 1348.6037
Test RMSE = 1370.0240
Train R2 = 0.8852
Test R2 = 0.8841


In [285]:
# elastic statsmodels
results_elastic = model.fit_regularized(L1_wt=0.6, alpha=0.01)
y_test_predicted = results_elastic.predict(X_test)
y_train_predicted = results_elastic.predict(X_train)

print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
print("Test RMSE = %.4f" % mean_squared_error(y_test, y_test_predicted, squared=False))
print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
print("Test R2 = %.4f" % r2_score(y_test, y_test_predicted))

Train RMSE = 1353.6044
Test RMSE = 1372.8616
Train R2 = 0.8844
Test R2 = 0.8836


In [286]:
# # LinearRegression
# model_lr = LinearRegression()
# model_lr.fit(X_train, y_train)
# y_predicted = model_lr.predict(X_test)
# y_train_predicted = model_lr.predict(X_train)
#
# print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
# print("Test RMSE = %.4f" % mean_squared_error(y_test, y_predicted, squared=False))
# print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
# print("Test R2 = %.4f" % r2_score(y_test, y_predicted))

In [287]:
# # ridge
# model_ridge = Ridge(alpha=0.01)
# model_ridge.fit(X_train, y_train)
# y_predicted = model_ridge.predict(X_test)
# y_train_predicted = model_ridge.predict(X_train)
#
# print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
# print("Test RMSE = %.4f" % mean_squared_error(y_test, y_predicted, squared=False))
# print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
# print("Test R2 = %.4f" % r2_score(y_test, y_predicted))

In [288]:
# # Lasso
# model_lasso = Lasso(alpha=0.01)
# model_lasso.fit(X_train, y_train)
# y_predicted = model_lasso.predict(X_test)
# y_train_predicted = model_lasso.predict(X_train)
#
# print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
# print("Test RMSE = %.4f" % mean_squared_error(y_test, y_predicted, squared=False))
# print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
# print("Test R2 = %.4f" % r2_score(y_test, y_predicted))

In [289]:
# # ElasticNet
# model_elastic = ElasticNet(alpha=0.01, l1_ratio=0.6)
# model_elastic.fit(X_train, y_train)
# y_predicted = model_elastic.predict(X_test)
# y_train_predicted = model_elastic.predict(X_train)
#
# print("Train RMSE = %.4f" % mean_squared_error(y_train, y_train_predicted, squared=False))
# print("Test RMSE = %.4f" % mean_squared_error(y_test, y_predicted, squared=False))
# print("Train R2 = %.4f" % r2_score(y_train, y_train_predicted))
# print("Test R2 = %.4f" % r2_score(y_test, y_predicted))

#### 3. [1 point] Explore the values of the parameters of the resulting models and compare the number of zero weights in them. Comment on the significance of the coefficients, overal model significance and other related factors from the results table

In [290]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ

# linear reg
results_lr.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.885
Dependent Variable:,price,AIC:,744418.8263
Date:,2022-10-13 00:49,BIC:,744505.5512
No. Observations:,43152,Log-Likelihood:,-372200.0
Df Model:,9,F-statistic:,37010.0
Df Residuals:,43142,Prob (F-statistic):,0.0
R-squared:,0.885,Scale:,1817500.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,3928.6813,6.4899,605.3537,0.0000,3915.9610,3941.4016
x1,5257.1453,31.0710,169.1979,0.0000,5196.2456,5318.0450
x2,76.4610,6.6590,11.4824,0.0000,63.4093,89.5128
x3,-455.4350,6.8100,-66.8778,0.0000,-468.7826,-442.0874
x4,491.4240,6.7022,73.3231,0.0000,478.2876,504.5604
x5,-226.2704,7.9533,-28.4498,0.0000,-241.8590,-210.6818
x6,-213.2612,6.9923,-30.4996,0.0000,-226.9662,-199.5562
x7,-1383.2878,48.3537,-28.6077,0.0000,-1478.0619,-1288.5137
x8,42.1665,29.5137,1.4287,0.1531,-15.6809,100.0138

0,1,2,3
Omnibus:,11265.146,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,360275.892
Skew:,0.611,Prob(JB):,0.0
Kurtosis:,17.103,Condition No.:,18.0


In [291]:
# ridge
OLSResults(model, results_ridge.params, model.normalized_cov_params).summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.882
Dependent Variable:,price,AIC:,745563.5541
Date:,2022-10-13 00:49,BIC:,745650.279
No. Observations:,43152,Log-Likelihood:,-372770.0
Df Model:,9,F-statistic:,35910.0
Df Residuals:,43142,Prob (F-statistic):,0.0
R-squared:,0.882,Scale:,1866400.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,3889.7835,6.5765,591.4627,0.0000,3876.8933,3902.6736
x1,4219.7372,31.4858,134.0201,0.0000,4158.0243,4281.4500
x2,81.8556,6.7479,12.1306,0.0000,68.6296,95.0816
x3,-424.7483,6.9009,-61.5498,0.0000,-438.2742,-411.2225
x4,496.6610,6.7917,73.1280,0.0000,483.3492,509.9728
x5,-162.6547,8.0595,-20.1817,0.0000,-178.4514,-146.8579
x6,-203.6562,7.0856,-28.7421,0.0000,-217.5442,-189.7682
x7,-312.9137,48.9993,-6.3861,0.0000,-408.9533,-216.8741
x8,21.4162,29.9077,0.7161,0.4739,-37.2035,80.0359

0,1,2,3
Omnibus:,13066.102,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,183804.27
Skew:,1.067,Prob(JB):,0.0
Kurtosis:,12.883,Condition No.:,18.0


In [292]:
# lasso
OLSResults(model, results_lasso.params, model.normalized_cov_params).summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.885
Dependent Variable:,price,AIC:,744457.9003
Date:,2022-10-13 00:49,BIC:,744544.6252
No. Observations:,43152,Log-Likelihood:,-372220.0
Df Model:,9,F-statistic:,36970.0
Df Residuals:,43142,Prob (F-statistic):,0.0
R-squared:,0.885,Scale:,1819200.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,3928.6713,6.4928,605.0782,0.0000,3915.9452,3941.3974
x1,5063.8698,31.0850,162.9037,0.0000,5002.9425,5124.7970
x2,77.5642,6.6620,11.6428,0.0000,64.5066,90.6218
x3,-453.1266,6.8130,-66.5087,0.0000,-466.4802,-439.7729
x4,495.2762,6.7052,73.8644,0.0000,482.1339,508.4186
x5,-214.9442,7.9569,-27.0135,0.0000,-230.5399,-199.3485
x6,-213.4343,6.9954,-30.5105,0.0000,-227.1455,-199.7231
x7,-1201.4586,48.3756,-24.8361,0.0000,-1296.2757,-1106.6416
x8,54.7441,29.5270,1.8540,0.0637,-3.1294,112.6176

0,1,2,3
Omnibus:,11656.263,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,315053.588
Skew:,0.713,Prob(JB):,0.0
Kurtosis:,16.16,Condition No.:,18.0


In [293]:
# elastic
OLSResults(model, results_elastic.params, model.normalized_cov_params).summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.884
Dependent Variable:,price,AIC:,744777.3304
Date:,2022-10-13 00:49,BIC:,744864.0552
No. Observations:,43152,Log-Likelihood:,-372380.0
Df Model:,9,F-statistic:,36660.0
Df Residuals:,43142,Prob (F-statistic):,0.0
R-squared:,0.884,Scale:,1832700.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,3913.0232,6.5169,600.4416,0.0000,3900.2499,3925.7965
x1,4675.8946,31.2003,149.8669,0.0000,4614.7414,4737.0478
x2,79.7319,6.6867,11.9240,0.0000,66.6258,92.8379
x3,-440.5288,6.8383,-64.4208,0.0000,-453.9320,-427.1256
x4,496.4432,6.7301,73.7649,0.0000,483.2521,509.6342
x5,-190.1759,7.9864,-23.8124,0.0000,-205.8294,-174.5223
x6,-209.3948,7.0214,-29.8225,0.0000,-223.1568,-195.6328
x7,-766.0692,48.5550,-15.7774,0.0000,-861.2378,-670.9005
x8,20.1115,29.6365,0.6786,0.4974,-37.9767,78.1996

0,1,2,3
Omnibus:,12406.161,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,244618.648
Skew:,0.894,Prob(JB):,0.0
Kurtosis:,14.526,Condition No.:,18.0


#### 4. [1 point] Implement one of the elimination algorithms that were described in the Seminar_4 (Elimination by P-value, Forward elimination, Backward elimination), make conclusions.

In [294]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ

#### 5. [1 point] Find the best (in terms of RMSE) $\alpha$ for Lasso regression using cross-validation with 4 folds. You must select values from range $[10^{-4}, 10^{3}]$.

In [295]:
# your code here
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ

## Gradient descent

#### 6. [3.5 points] Implement a Ridge regression model for the MSE loss function, trained by gradient descent.

All calculations must be vectorized, and python loops can only be used for gradient descent iterations. As a stop criterion, you must use (simultaneously):

* checking for the Absolute-value norm of the weight difference on two adjacent iterations (for example, less than some small number of the order of $10^{-6}$, set by the `tolerance` parameter);
* reaching the maximum number of iterations (for example, 10000, set by the `max_iter` parameter).

You need to implement:

* Full gradient descent:

$$
w_{k + 1} = w_{k} - \eta_{k} \nabla_{w} Q(w_{k}).
$$

* Stochastic Gradient Descent:

$$
w_{k + 1} = w_{k} - \eta_{k} \nabla_{w} q_{i_{k}}(w_{k}).
$$

$\nabla_{w} q_{i_{k}}(w_{k}) \, $ is the estimate of the gradient over the batch of objects selected randomly.

* Momentum method:

$$
h_0 = 0, \\
h_{k + 1} = \alpha h_{k} + \eta_k \nabla_{w} Q(w_{k}), \\
w_{k + 1} = w_{k} - h_{k + 1}.
$$

* Adagrad method:

$$
G_0 = 0, \\
G_{k + 1} = G_{k} + (\nabla_{w} Q(w_{k+1}))^2, \\
w_{k + 1} = w_{k} - \eta * \frac{\nabla_{w} Q(w_{k+1})}{\sqrt{G_{k+1} + \epsilon}}.
$$



To make sure that the optimization process really converges, we will use the `loss_history` class attribute. After calling the `fit` method, it should contain the values of the loss function for all iterations, starting from the first one (before the first step on the anti-gradient).

You need to initialize the weights with a random vector from normal distribution. The following is a template class that needs to contain the code implementing all variations of the models.

In [296]:
from sklearn.base import BaseEstimator

class LinReg(BaseEstimator):
    def __init__(self, delta=1.0, gd_type='Momentum', 
                 tolerance=1e-4, max_iter=1000, w0=None, eta=1e-2, alpha=1e-3):
        """
        gd_type: str
            'GradientDescent', 'StochasticDescent', 'Momentum', 'Adagrad'
        delta: float
            proportion of object in a batch (for stochastic GD)
        tolerance: float
            for stopping gradient descent
        max_iter: int
            maximum number of steps in gradient descent
        w0: np.array of shape (d)
            init weights
        eta: float
            learning rate
        alpha: float
            momentum coefficient
        reg_cf: float
            regularization coefficient
        epsilon: float
            numerical stability
        """
        
        self.delta = delta
        self.gd_type = gd_type
        self.tolerance = tolerance
        self.max_iter = max_iter
        self.w0 = w0
        self.alpha = alpha
        self.w = None
        self.eta = eta
        self.loss_history = None # list of loss function values at each training iteration
    
    def fit(self, X, y):
        """
        X: np.array of shape (l, d)
        y: np.array of shape (l)
        ---
        output: self
        """
        self.loss_history = []

        return self
    
    def predict(self, X):
        if self.w is None:
            raise Exception('Not trained yet')
        
        pass
    
    def calc_gradient(self, X, y):
        """
        X: np.array of shape (l, d) (l can be equal to 1 if stochastic)
        y: np.array of shape (l)
        ---
        output: np.array of shape (d)
        """
        pass

    def calc_loss(self, X, y):
        """
        X: np.array of shape (l, d)
        y: np.array of shape (l)
        ---
        output: float 
        """ 
        pass

#### 7. [1 points] Train and validate "hand-written" models on the same data, and compare the quality with the Sklearn or StatsModels methods. Investigate the effect of the `max_iter` and `alpha` parameters on the optimization process. Is it consistent with your expectations?

In [297]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ

#### 8. [1 points] Plot graphs (on the same picture) of the dependence of the loss function value on the iteration number for Full GD, SGD, Momentum and Adagrad. Draw conclusions about the rate of convergence of various modifications of gradient descent.

Don't forget about what *beautiful* graphics should look like!

In [298]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ