# 07 Reqularized Model - Ridge Code 실습
## 목표
### 1. Linear Regression
* 변수의 중요도 및 방향성 파악
* 큰 데이터에 적합하지 않음
* 설명력에서는 큰 장점이 있음
### 2. Ridge Regression
* Regularized Linear Model을 활용하여 Overffing을 방지
* Hyperparameter lambda를 튜닝할 때 for loop 뿐만 아니라 GridsearchCV를 통해 도출함
### 3. Regularized Linear Models의 경우 X's Scaling을 필수적으로 진행해야함
## Process
* Define X's & Y
* Split Train & Valid dataset
* Modeling
* Model 해석

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from sklearn.datasets import load_diabetes # 당료병

In [2]:
data = load_diabetes()

In [3]:
data['data'].shape

(442, 10)

In [4]:
data['target'].shape

(442,)

In [5]:
data_df = pd.DataFrame(data['data'], columns=data['feature_names'])
data_df.describe().applymap(lambda x: f'{x:0.2f}')

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0
std,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
min,-0.11,-0.04,-0.09,-0.11,-0.13,-0.12,-0.1,-0.08,-0.13,-0.14
25%,-0.04,-0.04,-0.03,-0.04,-0.03,-0.03,-0.04,-0.04,-0.03,-0.03
50%,0.01,-0.04,-0.01,-0.01,-0.0,-0.0,-0.01,-0.0,-0.0,-0.0
75%,0.04,0.05,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03
max,0.11,0.05,0.17,0.13,0.15,0.2,0.18,0.19,0.13,0.14


In [6]:
print(data['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [7]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

data_df = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AGE     442 non-null    int64  
 1   SEX     442 non-null    int64  
 2   BMI     442 non-null    float64
 3   BP      442 non-null    float64
 4   S1      442 non-null    int64  
 5   S2      442 non-null    float64
 6   S3      442 non-null    float64
 7   S4      442 non-null    float64
 8   S5      442 non-null    float64
 9   S6      442 non-null    int64  
 10  Y       442 non-null    int64  
dtypes: float64(6), int64(5)
memory usage: 38.1 KB


In [8]:
y = data_df['Y']
X = data_df.drop(columns=['Y'])
X = pd.get_dummies(X, columns=['SEX'])
X

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_1,SEX_2
0,59,32.1,101.00,157,93.2,38.0,4.00,4.8598,87,0,1
1,48,21.6,87.00,183,103.2,70.0,3.00,3.8918,69,1,0
2,72,30.5,93.00,156,93.6,41.0,4.00,4.6728,85,0,1
3,24,25.3,84.00,198,131.4,40.0,5.00,4.8903,89,1,0
4,50,23.0,101.00,192,125.4,52.0,4.00,4.2905,80,1,0
...,...,...,...,...,...,...,...,...,...,...,...
437,60,28.2,112.00,185,113.8,42.0,4.00,4.9836,93,0,1
438,47,24.9,75.00,225,166.0,42.0,5.00,4.4427,102,0,1
439,60,24.9,99.67,162,106.6,43.0,3.77,4.1271,95,0,1
440,36,30.0,95.00,201,125.2,42.0,4.79,5.1299,85,1,0


In [9]:
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(len(train_idx), len(valid_idx))

309 133


In [10]:
results = LinearRegression().fit(X.iloc[train_idx], y.iloc[train_idx])

In [11]:
import scipy
from sklearn import metrics

def sse(clf, X, y):
    """Calculate the standard squared error of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The standard squared error of the model.
    """
    y_hat = clf.predict(X)
    sse = np.sum((y_hat - y) ** 2)
    return sse / X.shape[0]


def adj_r2_score(clf, X, y):
    """Calculate the adjusted :math:`R^2` of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The adjusted :math:`R^2` of the model.
    """
    n = X.shape[0]  # Number of observations
    p = X.shape[1]  # Number of features
    r_squared = metrics.r2_score(y, clf.predict(X))
    return 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))


def coef_se(clf, X, y):
    """Calculate standard error for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of standard errors for the beta coefficients.
    """
    n = X.shape[0]
    X1 = np.hstack((np.ones((n, 1)), np.matrix(X)))
    se_matrix = scipy.linalg.sqrtm(
        metrics.mean_squared_error(y, clf.predict(X)) *
        np.linalg.inv(X1.T * X1)
    )
    return np.diagonal(se_matrix)


def coef_tval(clf, X, y):
    """Calculate t-statistic for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of t-statistic values.
    """
    a = np.array(clf.intercept_ / coef_se(clf, X, y)[0])
    b = np.array(clf.coef_ / coef_se(clf, X, y)[1:])
    return np.append(a, b)


def coef_pval(clf, X, y):
    """Calculate p-values for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of p-values.
    """
    n = X.shape[0]
    t = coef_tval(clf, X, y)
    p = 2 * (1 - scipy.stats.t.cdf(abs(t), n - 1))
    return p

def summary(clf, X, y, xlabels=None):
    """
    Output summary statistics for a fitted regression model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    xlabels : list, tuple
        The labels for the predictors.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')
    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))
    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    try:
        coef_df['Estimate'] = np.concatenate(
            (np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6)))
    except Exception as e:
        coef_df['Estimate'] = np.concatenate(
            (
                np.round(np.array([clf.intercept_]), 6),
                np.round((clf.coef_), 6)
            ), axis = 1
    )[0,:]
    coef_df['Std. Error'] = np.round(coef_se(clf, X, y), 6)
    coef_df['t value'] = np.round(coef_tval(clf, X, y), 4)
    coef_df['p value'] = np.round(coef_pval(clf, X, y), 6)
    # Output results
    print('Coefficients:')
    print(coef_df.to_string(index=True))
    print('---')
    print('R-squared:  {0:.6f},    Adjusted R-squared:  {1:.6f},    MSE: {2:.1f}'.format(
        metrics.r2_score(y, clf.predict(X)), adj_r2_score(clf, X, y), sse(clf, X, y)))

In [12]:
summary(results, X.iloc[valid_idx], y.loc[valid_idx], xlabels=X.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept -353.422717  5.805365e+08+0.000000e+00j -0.0000+0.0000j  1.000000
AGE          -0.241046  6.848335e+00+0.000000e+00j -0.0352+0.0000j  0.971975
BMI           5.364734  1.280634e+00+0.000000e+00j  4.1891-0.0000j  0.000051
BP            0.973515  3.714890e-01+0.000000e+00j  2.6206-0.0000j  0.009807
S1           -1.128987  3.149810e-01+0.000000e+00j -3.5843+0.0000j  0.000474
S2            0.935342  3.621640e-01+0.000000e+00j  2.5827-0.0000j  0.010894
S3            0.295834  4.261810e-01+0.000000e+00j  0.6942+0.0000j  0.488808
S4            2.577375  1.013887e+01+0.000000e+00j  0.2542-0.0000j  0.799731
S5           72.840272  2.226409e+01+0.000000e+00j  3.2716+0.0000j  0.001364
S6            0.292290  3.922070e-01+0.000000e+00j  0.7452-0.0000j  0.457449
SEX_1        10.444984  5.805365e+08+0.000000e+00j  0.0000+0.0000j  1.000000
SEX_2       -10.444984  5.805364e+08+0.000000e+00j -0.0000+0.0

In [13]:
summary(results, X.iloc[train_idx], y.loc[train_idx], xlabels=X.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept -353.422717  7.843017e+08+3.233200e-02j -0.0000+0.0000j  1.000000
AGE          -0.241046  4.648774e+00+3.960723e+00j -0.0300+0.0256j  0.968542
BMI           5.364734  8.470680e-01+7.852000e-03j  6.3328-0.0587j  0.000000
BP            0.973515  2.558100e-01-1.800000e-05j  3.8056+0.0003j  0.000171
S1           -1.128987  2.130450e-01-4.780000e-04j -5.2993-0.0119j  0.000000
S2            0.935342  2.437880e-01-4.790000e-04j  3.8367+0.0075j  0.000151
S3            0.295834  3.389010e-01+3.558000e-03j  0.8728-0.0092j  0.383413
S4            2.577375  6.594358e+00+3.056900e-02j  0.3908-0.0018j  0.696185
S5           72.840272  1.421280e+01+2.071070e-01j  5.1239-0.0747j  0.000001
S6            0.292290  3.059540e-01+4.600000e-05j  0.9553-0.0001j  0.340155
SEX_1        10.444984  7.843016e+08+1.826773e+00j  0.0000-0.0000j  1.000000
SEX_2       -10.444984  7.843016e+08+1.342689e+00j -0.0000+0.0

In [14]:
scaler = MinMaxScaler().fit(X.iloc[train_idx]) # train data로만 훈련
X_scal = scaler.transform(X)
X_scal = pd.DataFrame(X_scal, columns=X.columns)
X_scal.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_1,SEX_2
0,0.666667,0.57384,0.565217,0.294118,0.297578,0.197368,0.318471,0.562217,0.439394,0.0,1.0
1,0.483333,0.130802,0.362319,0.421569,0.355248,0.618421,0.159236,0.222437,0.166667,1.0,0.0
2,0.883333,0.506329,0.449275,0.289216,0.299885,0.236842,0.318471,0.496578,0.409091,0.0,1.0
3,0.083333,0.28692,0.318841,0.495098,0.517878,0.223684,0.477707,0.572923,0.469697,1.0,0.0
4,0.516667,0.189873,0.565217,0.465686,0.483276,0.381579,0.318471,0.362385,0.333333,1.0,0.0


In [15]:
results = LinearRegression().fit(X_scal.iloc[train_idx], y.iloc[train_idx])
summary(results, X_scal.iloc[valid_idx], y.iloc[valid_idx], xlabels=X_scal.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept   -2.765884  3.804036e+08+0.000000e+00j -0.0000+0.0000j  1.000000
AGE         -14.462769  2.365959e+01-0.000000e+00j -0.6113-0.0000j  0.542062
BMI         127.144195  3.180163e+01+0.000000e+00j  3.9980+0.0000j  0.000106
BP           67.172560  2.822302e+01-0.000000e+00j  2.3801+0.0000j  0.018740
S1         -230.313267  1.637134e+02+0.000000e+00j -1.4068+0.0000j  0.161834
S2          162.188278  1.161212e+02+0.000000e+00j  1.3967+0.0000j  0.164843
S3           22.483360  7.068009e+01-0.000000e+00j  0.3181+0.0000j  0.750912
S4           16.185916  5.686624e+01-0.000000e+00j  0.2846+0.0000j  0.776373
S5          207.514650  5.002405e+01+0.000000e+00j  4.1483+0.0000j  0.000060
S6           19.291168  3.403284e+01-0.000000e+00j  0.5668+0.0000j  0.571786
SEX_1        10.444984  3.804036e+08+0.000000e+00j  0.0000+0.0000j  1.000000
SEX_2       -10.444984  3.804036e+08+0.000000e+00j -0.0000+0.0

## [Ridge Regression]
* Hyperparameter Tuning using for Loop
* Hyperparameter Tuning using for GridSearchCV

## [Ridge Regression Parameters]
* Package : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
* alpha : L2-norm Penalty Term
    * alpha : 0 일 때, Just Linear Regression
    * fit_intercept : Centering to zero
        - 베타0를 0로 보내는 것 (베타0는 상수이기 때문에)
    * max_iter : Maximum number of interation
        - Loss Function의 Ridge Penalty Term은 Closed Form 값이기는 하지만 값을 찾아 나감
        - Penalty Term : (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_2

In [16]:
penalty = [0.00001, 0.00005, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1, 5, 10]

In [17]:
# for loop, using ridge
for p in penalty:
    model = Ridge(alpha=p).fit(X_scal.iloc[train_idx], y.iloc[train_idx])
    score = model.score(X_scal.iloc[valid_idx], y.iloc[valid_idx])
    pred_y = model.predict(X_scal.iloc[valid_idx])
    mse = mean_squared_error(y.iloc[valid_idx], pred_y)
    print(f"Alpha {p: .5f}, R2 {score: .7f}, MSE {mse: .7f}, RMSE {np.sqrt(mse): .7f}")

Alpha  0.00001, R2  0.5301655, MSE  3084.6095744, RMSE  55.5392616
Alpha  0.00005, R2  0.5301672, MSE  3084.5988322, RMSE  55.5391648
Alpha  0.00010, R2  0.5301692, MSE  3084.5854446, RMSE  55.5390443
Alpha  0.00100, R2  0.5302048, MSE  3084.3519225, RMSE  55.5369420
Alpha  0.01000, R2  0.5304637, MSE  3082.6522133, RMSE  55.5216373
Alpha  0.05000, R2  0.5306739, MSE  3081.2719053, RMSE  55.5092056
Alpha  0.10000, R2  0.5304511, MSE  3082.7343410, RMSE  55.5223769
Alpha  0.30000, R2  0.5294946, MSE  3089.0142040, RMSE  55.5789007
Alpha  0.50000, R2  0.5285641, MSE  3095.1232917, RMSE  55.6338323
Alpha  0.70000, R2  0.5275205, MSE  3101.9751863, RMSE  55.6953785
Alpha  1.00000, R2  0.5257398, MSE  3113.6657269, RMSE  55.8002305
Alpha  5.00000, R2  0.4932187, MSE  3327.1766532, RMSE  57.6816839
Alpha  10.00000, R2  0.4513724, MSE  3601.9109235, RMSE  60.0159222


In [18]:
model_best = Ridge(alpha=0.05).fit(X_scal.iloc[train_idx], y.iloc[train_idx])
summary(model_best, X_scal.iloc[valid_idx], y.iloc[valid_idx], xlabels=X_scal.columns)

Coefficients:
              Estimate    Std. Error  t value   p value
_intercept   17.825740  3.801975e+08   0.0000  1.000000
AGE         -13.785283  2.357059e+01  -0.5849  0.559646
BMI         127.651862  3.178432e+01   4.0162  0.000099
BP           67.519687  2.819772e+01   2.3945  0.018048
S1         -138.819177  1.636202e+02  -0.8484  0.397738
S2           90.895898  1.160603e+02   0.7832  0.434925
S3          -17.147463  7.064699e+01  -0.2427  0.808599
S4            9.075568  5.681209e+01   0.1597  0.873324
S5          174.903805  4.997324e+01   3.4999  0.000635
S6           19.246301  3.401304e+01   0.5659  0.572456
SEX_1        10.540953  3.801975e+08   0.0000  1.000000
SEX_2       -10.540953  3.801975e+08  -0.0000  1.000000
---
R-squared:  0.530674,    Adjusted R-squared:  0.488008,    MSE: 3081.3


In [19]:
# using RidgeCV
ridge_cv = RidgeCV(alphas=penalty, cv=5)
model = ridge_cv.fit(X_scal.iloc[train_idx], y.iloc[train_idx])
print(f"best alpha : {model.alpha_: .5f}, R2: {model.best_score_:.4f}")

best alpha :  1.00000, R2: 0.4419


In [20]:
# RidgeCV result
model_best = Ridge(alpha=model.alpha_).fit(X_scal.iloc[train_idx], y.iloc[train_idx])
score = model_best.score(X_scal.iloc[valid_idx], y.iloc[valid_idx])
pred_y = model_best.predict(X_scal.iloc[valid_idx])
mse = np.sqrt(mean_squared_error(y.iloc[valid_idx], pred_y))
print(f"alpha : {model.alpha_: .5f}, R2: {model.best_score_:.4f}")
summary(model_best, X_scal.iloc[valid_idx], y.iloc[valid_idx], xlabels=X_scal.columns)

alpha :  1.00000, R2: 0.4419
Coefficients:
              Estimate    Std. Error  t value   p value
_intercept   43.015535  3.821909e+08   0.0000  1.000000
AGE         -10.212337  2.384223e+01  -0.4283  0.669109
BMI         120.864809  3.195330e+01   3.7825  0.000234
BP           66.451269  2.837207e+01   2.3421  0.020670
S1          -19.444296  1.644912e+02  -0.1182  0.906082
S2           -6.763887  1.166733e+02  -0.0580  0.953858
S3          -55.007168  7.100133e+01  -0.7747  0.439882
S4           18.933213  5.710622e+01   0.3315  0.740760
S5          117.213598  5.025068e+01   2.3326  0.021183
S6           22.940872  3.418407e+01   0.6711  0.503331
SEX_1        10.460773  3.821908e+08   0.0000  1.000000
SEX_2       -10.460773  3.821908e+08  -0.0000  1.000000
---
R-squared:  0.525740,    Adjusted R-squared:  0.482625,    MSE: 3113.7


In [21]:
# BMI, S3, S5 만 LinearRegression 적용
target_column = ['BMI', 'BP', 'S3', 'S5']

In [22]:
results = LinearRegression().fit(X.iloc[train_idx][target_column], y.iloc[train_idx])

In [23]:
summary(results, X.iloc[valid_idx][target_column], y.loc[valid_idx], xlabels=target_column)

Coefficients:
              Estimate  Std. Error  t value   p value
_intercept -242.768534   64.246649  -3.7787  0.000238
BMI           5.719775    1.133027   5.0482  0.000001
BP            0.737781    0.189800   3.8871  0.000160
S3           -0.670649    0.296573  -2.2613  0.025374
S5           45.066168    9.656949   4.6667  0.000007
---
R-squared:  0.504676,    Adjusted R-squared:  0.489197,    MSE: 3252.0
