# 09 Regularized Model-LASSO code 실습

## [목적]
### 1. LASSO
* Regularized Linear Model을 활용하여 Overfitting을 방지함
* Hyperparameter lambda를 튜닝할 때 for loop와 GridsearchCV를 활용
### 2. Regularized Linear Models의 경우 X's scaling을 필수적으로 진행해야함
## [Process]
### 1. Define X's &y
### 2. Split Train & Valid dataset
### 3. Modeling
### 4. Model 해석

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from sklearn.datasets import load_diabetes

In [2]:
import scipy
from sklearn import metrics

def sse(clf, X, y):
    """Calculate the standard squared error of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The standard squared error of the model.
    """
    y_hat = clf.predict(X)
    sse = np.sum((y_hat - y) ** 2)
    return sse / X.shape[0]


def adj_r2_score(clf, X, y):
    """Calculate the adjusted :math:`R^2` of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The adjusted :math:`R^2` of the model.
    """
    n = X.shape[0]  # Number of observations
    p = X.shape[1]  # Number of features
    r_squared = metrics.r2_score(y, clf.predict(X))
    return 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))


def coef_se(clf, X, y):
    """Calculate standard error for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of standard errors for the beta coefficients.
    """
    n = X.shape[0]
    X1 = np.hstack((np.ones((n, 1)), np.matrix(X)))
    se_matrix = scipy.linalg.sqrtm(
        metrics.mean_squared_error(y, clf.predict(X)) *
        np.linalg.inv(X1.T * X1)
    )
    return np.diagonal(se_matrix)


def coef_tval(clf, X, y):
    """Calculate t-statistic for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of t-statistic values.
    """
    a = np.array(clf.intercept_ / coef_se(clf, X, y)[0])
    b = np.array(clf.coef_ / coef_se(clf, X, y)[1:])
    return np.append(a, b)


def coef_pval(clf, X, y):
    """Calculate p-values for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of p-values.
    """
    n = X.shape[0]
    t = coef_tval(clf, X, y)
    p = 2 * (1 - scipy.stats.t.cdf(abs(t), n - 1))
    return p

def summary(clf, X, y, xlabels=None):
    """
    Output summary statistics for a fitted regression model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    xlabels : list, tuple
        The labels for the predictors.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')
    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))
    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    try:
        coef_df['Estimate'] = np.concatenate(
            (np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6)))
    except Exception as e:
        coef_df['Estimate'] = np.concatenate(
            (
                np.round(np.array([clf.intercept_]), 6),
                np.round((clf.coef_), 6)
            ), axis = 1
    )[0,:]
    coef_df['Std. Error'] = np.round(coef_se(clf, X, y), 6)
    coef_df['t value'] = np.round(coef_tval(clf, X, y), 4)
    coef_df['p value'] = np.round(coef_pval(clf, X, y), 6)
    # Output results
    print('Coefficients:')
    print(coef_df.to_string(index=True))
    print('---')
    print('R-squared:  {0:.6f},    Adjusted R-squared:  {1:.6f},    MSE: {2:.1f}'.format(
        metrics.r2_score(y, clf.predict(X)), adj_r2_score(clf, X, y), sse(clf, X, y)))

In [3]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

data_df = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AGE     442 non-null    int64  
 1   SEX     442 non-null    int64  
 2   BMI     442 non-null    float64
 3   BP      442 non-null    float64
 4   S1      442 non-null    int64  
 5   S2      442 non-null    float64
 6   S3      442 non-null    float64
 7   S4      442 non-null    float64
 8   S5      442 non-null    float64
 9   S6      442 non-null    int64  
 10  Y       442 non-null    int64  
dtypes: float64(6), int64(5)
memory usage: 38.1 KB


In [4]:
y = data_df['Y']
X = data_df.drop(columns=['Y'])
X = pd.get_dummies(X, columns=['SEX'])
X.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_1,SEX_2
0,59,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,0,1
1,48,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,1,0
2,72,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,0,1
3,24,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,1,0
4,50,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,1,0


In [6]:
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(len(train_idx), len(valid_idx))

309 133


In [7]:
scaler = MinMaxScaler().fit(X.iloc[train_idx]) # train data로만 훈련
X_scal = scaler.transform(X)
X_scal = pd.DataFrame(X_scal, columns=X.columns)
X_scal.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_1,SEX_2
0,0.666667,0.57384,0.565217,0.294118,0.297578,0.197368,0.318471,0.562217,0.439394,0.0,1.0
1,0.483333,0.130802,0.362319,0.421569,0.355248,0.618421,0.159236,0.222437,0.166667,1.0,0.0
2,0.883333,0.506329,0.449275,0.289216,0.299885,0.236842,0.318471,0.496578,0.409091,0.0,1.0
3,0.083333,0.28692,0.318841,0.495098,0.517878,0.223684,0.477707,0.572923,0.469697,1.0,0.0
4,0.516667,0.189873,0.565217,0.465686,0.483276,0.381579,0.318471,0.362385,0.333333,1.0,0.0


## [LASSO Regression]
* Hyperparameter Tuning using for Loop
* Hyperparameter Tuning using GridSearchCV

## [LASSO Regression Parameters]
  - Package : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
  - alpha : L1-norm Penalty Term
    - alpha : 0 일 때, Just Linear Regression
  - fit_intercept : Centering to zero
    - 베타0를 0로 보내는 것 (베타0는 상수이기 때문에)
  - max_iter : Maximum number of interation
    - Loss Function의 LASSO Penalty Term은 절대 값이기 때문에 Gradient Descent와 같은 최적화가 필요함
    - Penalty Term : $||y - Xw||^2 + alpha * ||w||^1$

In [12]:
penalty = [0.00001, 0.00005, 0.0001, 0.001, 0.01, 0.02, 0.05, 0.1, 0.3, 0.5, 0.7, 1, 5, 10]

In [13]:
for p in penalty:
    model = Lasso(alpha=p).fit(X_scal.iloc[train_idx], y.iloc[train_idx])
    score = model.score(X_scal.iloc[valid_idx], y.iloc[valid_idx])
    pred_y = model.predict(X_scal.iloc[valid_idx])
    mse = mean_squared_error(y.iloc[valid_idx], pred_y)
    print(f"Alpha: {p:.7f}, R2: {score:.7f}, MSE: {mse:.7f}, RMSE: {np.sqrt(mse):.7f}")

Alpha: 0.0000100, R2: 0.5301656, MSE: 3084.6092464, RMSE: 55.5392586
Alpha: 0.0000500, R2: 0.5301674, MSE: 3084.5972157, RMSE: 55.5391503
Alpha: 0.0001000, R2: 0.5301697, MSE: 3084.5822667, RMSE: 55.5390157
Alpha: 0.0010000, R2: 0.5302081, MSE: 3084.3301897, RMSE: 55.5367463
Alpha: 0.0100000, R2: 0.5304264, MSE: 3082.8971348, RMSE: 55.5238429
Alpha: 0.0200000, R2: 0.5306024, MSE: 3081.7414647, RMSE: 55.5134350
Alpha: 0.0500000, R2: 0.5295088, MSE: 3088.9213387, RMSE: 55.5780653
Alpha: 0.1000000, R2: 0.5281775, MSE: 3097.6617244, RMSE: 55.6566413
Alpha: 0.3000000, R2: 0.5210059, MSE: 3144.7454151, RMSE: 56.0780297
Alpha: 0.5000000, R2: 0.5131125, MSE: 3196.5679788, RMSE: 56.5381993
Alpha: 0.7000000, R2: 0.5050718, MSE: 3249.3577616, RMSE: 57.0031382
Alpha: 1.0000000, R2: 0.4906431, MSE: 3344.0866656, RMSE: 57.8280785
Alpha: 5.0000000, R2: 0.2420607, MSE: 4976.1074015, RMSE: 70.5415296
Alpha: 10.0000000, R2: -0.0256145, MSE: 6733.4784067, RMSE: 82.0577748


In [14]:
model_best = Lasso(alpha=0.02).fit(X_scal.iloc[train_idx], y.iloc[train_idx])
summary(model_best, X_scal.iloc[valid_idx], y.iloc[valid_idx], xlabels=X.columns)

Coefficients:
              Estimate    Std. Error  t value   p value
_intercept    4.144976  3.802265e+08   0.0000  1.000000
AGE         -13.359132  2.394732e+01  -0.5579  0.577888
BMI         127.876677  3.181667e+01   4.0192  0.000098
BP           66.897382  2.831870e+01   2.3623  0.019623
S1         -153.025383  1.637126e+02  -0.9347  0.351640
S2          102.155001  1.161538e+02   0.8795  0.380739
S3          -10.583686  7.080118e+01  -0.1495  0.881399
S4            9.263867  5.689926e+01   0.1628  0.870916
S5          181.017864  5.011407e+01   3.6121  0.000430
S6           18.390761  3.419937e+01   0.5378  0.591654
SEX_1        20.777166  3.802265e+08   0.0000  1.000000
SEX_2        -0.000000  3.802265e+08  -0.0000  1.000000
---
R-squared:  0.530602,    Adjusted R-squared:  0.487930,    MSE: 3081.7


In [17]:
# using LassoCV
lasso_cv = LassoCV(alphas=penalty, cv=5)
model = lasso_cv.fit(X_scal.iloc[train_idx], y.iloc[train_idx])
print(f"best alpha : {model.alpha_: .5f}")

best alpha :  0.30000


In [19]:
# LassoCV result
model_best = Lasso(alpha=model.alpha_).fit(X_scal.iloc[train_idx], y.iloc[train_idx])
score = model_best.score(X_scal.iloc[valid_idx], y.iloc[valid_idx])
pred_y = model_best.predict(X_scal.iloc[valid_idx])
mse = np.sqrt(mean_squared_error(y.iloc[valid_idx], pred_y))
print(f"alpha : {model.alpha_: .5f}")
summary(model_best, X_scal.iloc[valid_idx], y.iloc[valid_idx], xlabels=X_scal.columns)

alpha :  0.30000
Coefficients:
              Estimate    Std. Error  t value   p value
_intercept   35.476784  3.840936e+08   0.0000  1.000000
AGE          -3.436299  2.360743e+01  -0.1456  0.884491
BMI         127.199861  3.204797e+01   3.9690  0.000118
BP           60.468162  2.836414e+01   2.1319  0.034871
S1           -6.083815  1.651941e+02  -0.0368  0.970678
S2           -0.000000  1.170923e+02  -0.0000  1.000000
S3          -62.922734  7.104278e+01  -0.8857  0.377389
S4            0.000000  5.734173e+01   0.0000  1.000000
S5          124.200563  5.040213e+01   2.4642  0.015018
S6            8.099838  3.405674e+01   0.2378  0.812379
SEX_1        18.108153  3.840936e+08   0.0000  1.000000
SEX_2        -0.000000  3.840936e+08  -0.0000  1.000000
---
R-squared:  0.521006,    Adjusted R-squared:  0.477461,    MSE: 3144.7


In [25]:
# BMI, S3, S5 만 LinearRegression 적용
target_column = ['BMI', 'BP', 'S3', 'S5']

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
results = LinearRegression().fit(X.iloc[train_idx][target_column], y.iloc[train_idx])

In [28]:
summary(results, X.iloc[valid_idx][target_column], y.loc[valid_idx], xlabels=target_column)

Coefficients:
              Estimate  Std. Error  t value   p value
_intercept -242.768534   64.246649  -3.7787  0.000238
BMI           5.719775    1.133027   5.0482  0.000001
BP            0.737781    0.189800   3.8871  0.000160
S3           -0.670649    0.296573  -2.2613  0.025374
S5           45.066168    9.656949   4.6667  0.000007
---
R-squared:  0.504676,    Adjusted R-squared:  0.489197,    MSE: 3252.0
