In [57]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression

from scipy.stats import pearsonr

제3유형 다중회귀분석 및 상관분석

- 당뇨병 환자의 질병 진행정도 데이터셋

In [58]:
from sklearn.datasets import load_diabetes

# Fetch the scikit-learn diabetes dataset (features plus the disease-progression target).
diabetes = load_diabetes()

# Features go into a DataFrame with named columns; the target goes into a
# one-column DataFrame whose column is called 'target'.
x = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target, columns=['target'])

In [59]:
# Using the scikit-learn library (LinearRegression)

# Restrict the predictors to the three columns used in the regression.
x = x.loc[:, ['age', 'sex', 'bmi']]

# Preview the first five rows of the predictors and of the target.
print(x.head(5))
print(y.head(5))

        age       sex       bmi
0  0.038076  0.050680  0.061696
1 -0.001882 -0.044642 -0.051474
2  0.085299  0.050680  0.044451
3 -0.089063 -0.044642 -0.011595
4  0.005383 -0.044642 -0.036385
   target
0   151.0
1    75.0
2   141.0
3   206.0
4   135.0


- 회귀식 : $y = b_0 + b_1 x_1 + b_2 x_2 + b_3 x_3$

  ($x_1$ = age, $x_2$ = sex, $x_3$ = bmi)

In [60]:
# Create a LinearRegression estimator with its default arguments.
linear_model = LinearRegression()

# Fit target ~ age + sex + bmi. y is passed as a one-column DataFrame here, and the
# coefficient array printed in a later cell is 2-D ([[...]]).
linear_model.fit(x, y)

In [61]:
# Regression metrics

# 1. R-squared (coefficient of determination)
# .score on the fitted model is the coefficient of determination.
r_squared = linear_model.score(x, y)
print(round(r_squared, 3))

0.351


In [62]:
# 2. Regression coefficients (coef_)

coefficients = linear_model.coef_
print(np.round(coefficients, 2))  # all coefficients at once

# Then one per line, in column order: b1 (X1), b2 (X2), b3 (X3).
for slope in coefficients[0]:
    print(np.round(slope, 2))

[[138.9  -36.14 926.91]]
138.9
-36.14
926.91


In [63]:
# Intercept term b0 (intercept_)

# Take element 0 of intercept_ and round it to two decimals before printing.
print(np.round(linear_model.intercept_[0], 2))

152.13


즉, 회귀식은 다음과 같다.
#### y = 152.13 + 138.9 × age − 36.14 × sex + 926.91 × bmi

In [64]:
# Same regression with statsmodels (statsmodels.api, not the formula interface)
import statsmodels.api as sm

# Choose the independent and dependent variables.
x = x[['age','sex','bmi']]
# Take the 'target' column only while y is still a DataFrame: after the first run y is
# already a Series, and indexing it with 'target' again would raise a KeyError, so this
# guard keeps the cell safe to re-run.
if isinstance(y, pd.DataFrame):
    y = y['target']

print(x.head())
print(y.head())

        age       sex       bmi
0  0.038076  0.050680  0.061696
1 -0.001882 -0.044642 -0.051474
2  0.085299  0.050680  0.044451
3 -0.089063 -0.044642 -0.011595
4  0.005383 -0.044642 -0.036385
0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: target, dtype: float64


In [70]:
# Caution: the constant (intercept) column has to be added explicitly.
x = sm.add_constant(x)

# Mind the argument order: dependent variable (endog) first, then the predictors (exog).
model = sm.OLS(endog=y, exog=x).fit()

# The printed summary is what to use when the p-values of the regression are asked for.
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.351
Model:                            OLS   Adj. R-squared:                  0.346
Method:                 Least Squares   F-statistic:                     78.94
Date:                Fri, 01 Dec 2023   Prob (F-statistic):           7.77e-41
Time:                        14:09:46   Log-Likelihood:                -2451.6
No. Observations:                 442   AIC:                             4911.
Df Residuals:                     438   BIC:                             4928.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        152.1335      2.964     51.321      0.0