# 단순 회귀분석
## 단순 회귀분석의 특징
- 연속형 종속변수와 독립변수 간 선형관계 및 설명력을 확인하는 기법
- 종속변수와 독립변수가 각각 하나인 경우의 단순 선형 회귀 모형
- 설명력과 더불어 오차 평가 지표로 모델의 성능을 평가

## 주요 함수 및 메서드
__statsmodels-ols()__
- ols() 함수 내에 종속변수와 독립변수 선언
- 변수명에 온점 등 특정 특수문자가 있는 경우 오류 발생
- 모델 객체의 predict() 메서드로 예측

__sklearn-LinearRegression__
- fit_intercept로 절편 적합 여부 설정 가능
- 모델 객체의 coef_와 intercept_ 어트리뷰트로 각각 계수와 절편 확인
- 모델 객체의 predict() 메서드로 예측

__sklearn-mean_absolute_error()__
- MAE(Mean Absolute Error) 연산

__sklearn-mean_squared_error()__
- MSE(Mean Squared Error) 연산
- 결과에 제곱근(`** 0.5`)을 취하면 RMSE(Root Mean Squared Error)

In [1]:
import pandas as pd
from statsmodels.formula.api import ols

In [2]:
# Read Data/iris.csv into a DataFrame and preview the first two rows.
df = pd.read_csv("Data/iris.csv")
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [5]:
# Replace the column labels positionally with short names. The original
# labels contain periods (e.g. "Sepal.Length"), which cannot be used as
# variable names in the ols() formulas below.
df.columns = ["SL", "SW", "PL", "PW", "species"]
df.head(2)

Unnamed: 0,SL,SW,PL,PW,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [7]:
# Simple linear regression with SL as the dependent variable and SW as the
# single independent variable; summary() displays the fit statistics.
model = ols("SL ~ SW", data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,SL,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,2.074
Date:,"Sun, 08 May 2022",Prob (F-statistic):,0.152
Time:,22:29:17,Log-Likelihood:,-183.0
No. Observations:,150,AIC:,370.0
Df Residuals:,148,BIC:,376.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5262,0.479,13.628,0.000,5.580,7.473
SW,-0.2234,0.155,-1.440,0.152,-0.530,0.083

0,1,2,3
Omnibus:,4.389,Durbin-Watson:,0.952
Prob(Omnibus):,0.111,Jarque-Bera (JB):,4.237
Skew:,0.36,Prob(JB):,0.12
Kurtosis:,2.6,Cond. No.,24.2


In [8]:
# Refit with PL as the dependent variable and PW as the independent variable.
# This rebinds `model`, so later predict() calls use this fit.
model = ols("PL ~ PW", data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,PL,R-squared:,0.927
Model:,OLS,Adj. R-squared:,0.927
Method:,Least Squares,F-statistic:,1882.0
Date:,"Sun, 08 May 2022",Prob (F-statistic):,4.6800000000000005e-86
Time:,22:30:06,Log-Likelihood:,-101.18
No. Observations:,150,AIC:,206.4
Df Residuals:,148,BIC:,212.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0836,0.073,14.850,0.000,0.939,1.228
PW,2.2299,0.051,43.387,0.000,2.128,2.332

0,1,2,3
Omnibus:,2.438,Durbin-Watson:,1.43
Prob(Omnibus):,0.295,Jarque-Bera (JB):,1.966
Skew:,0.211,Prob(JB):,0.374
Kurtosis:,3.369,Cond. No.,3.7


추정된 회귀식: PL = 2.2299 × PW + 1.0836 (y = 2.2299x + 1.0836, y: PL, x: PW)

In [9]:
# Predict PL for the first six rows using the fitted "PL ~ PW" model.
model.predict(df.head(6))

0    1.529546
1    1.529546
2    1.529546
3    1.529546
4    1.529546
5    1.975534
dtype: float64

In [12]:
# Store the model's prediction for every row in a new "pred" column,
# then preview the result.
df['pred'] = model.predict(df)
df.head()

Unnamed: 0,SL,SW,PL,PW,species,pred
0,5.1,3.5,1.4,0.2,setosa,1.529546
1,4.9,3.0,1.4,0.2,setosa,1.529546
2,4.7,3.2,1.3,0.2,setosa,1.529546
3,4.6,3.1,1.5,0.2,setosa,1.529546
4,5.0,3.6,1.4,0.2,setosa,1.529546


In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
# Single-bracket selection returns the column as a Series.
df["PL"].head()

0    1.4
1    1.4
2    1.3
3    1.5
4    1.4
Name: PL, dtype: float64

In [16]:
# Double-bracket selection keeps the result as a one-column DataFrame;
# this is the form passed as X to LinearRegression.fit() below.
df[["PL"]].head(2)

Unnamed: 0,PL
0,1.4
1,1.4


In [17]:
# A scalar row position returns that row as a Series.
df.iloc[0, ]

SL              5.1
SW              3.5
PL              1.4
PW              0.2
species      setosa
pred       1.529546
Name: 0, dtype: object

In [18]:
# A list of row positions keeps the result as a (one-row) DataFrame.
df.iloc[[0], ]

Unnamed: 0,SL,SW,PL,PW,species,pred
0,5.1,3.5,1.4,0.2,setosa,1.529546


In [19]:
# Fit PW on PL with scikit-learn. X is a one-column DataFrame and y is a
# Series. This rebinds `model` to the scikit-learn estimator.
model = LinearRegression().fit(df[["PL"]], df["PW"])
model

LinearRegression()

In [20]:
# Fitted slope; a one-element array here because X has a single column.
model.coef_

array([0.41575542])

In [21]:
# Fitted intercept term.
model.intercept_

-0.3630755213190291

In [22]:
# Predictions for every row, using the same one-column frame the model
# was fitted on.
model.predict(df[["PL"]])

array([0.21898206, 0.21898206, 0.17740652, 0.2605576 , 0.21898206,
       0.34370869, 0.21898206, 0.2605576 , 0.21898206, 0.2605576 ,
       0.2605576 , 0.30213314, 0.21898206, 0.09425544, 0.13583098,
       0.2605576 , 0.17740652, 0.21898206, 0.34370869, 0.2605576 ,
       0.34370869, 0.2605576 , 0.0526799 , 0.34370869, 0.42685977,
       0.30213314, 0.30213314, 0.2605576 , 0.21898206, 0.30213314,
       0.30213314, 0.2605576 , 0.2605576 , 0.21898206, 0.2605576 ,
       0.13583098, 0.17740652, 0.21898206, 0.17740652, 0.2605576 ,
       0.17740652, 0.17740652, 0.17740652, 0.30213314, 0.42685977,
       0.21898206, 0.30213314, 0.21898206, 0.2605576 , 0.21898206,
       1.59097494, 1.50782385, 1.67412602, 1.29994614, 1.54939939,
       1.50782385, 1.59097494, 1.00891735, 1.54939939, 1.2583706 ,
       1.09206844, 1.38309723, 1.29994614, 1.59097494, 1.13364398,
       1.46624831, 1.50782385, 1.34152169, 1.50782385, 1.2583706 ,
       1.63255048, 1.29994614, 1.67412602, 1.59097494, 1.42467

In [23]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [24]:
# Mean absolute error between the PL and PW columns.
# NOTE(review): this compares two raw columns, not observed vs. predicted
# values. To score the LinearRegression fitted above (X=PL, y=PW) the call
# would be y_true=df["PW"], y_pred=model.predict(df[["PL"]]) -- confirm intent.
mean_absolute_error(y_true = df["PL"], y_pred = df["PW"])

2.558666666666667

In [25]:
# Mean squared error between the PL and PW columns.
# NOTE(review): both arguments are raw columns rather than a target and a
# model prediction, so this does not measure model error -- confirm intent.
mean_squared_error(y_true = df["PL"], y_pred = df["PW"])

7.645466666666667

In [26]:
# RMSE: the square root of the mean squared error.
# NOTE(review): both arguments are raw columns rather than a target and a
# model prediction, so this does not measure model error -- confirm intent.
mean_squared_error(y_true = df["PL"], y_pred = df["PW"]) ** 0.5

2.76504370067937

In [27]:
# ======================================================
# 1. Coefficient of determination (R-squared) with registered as the
#    dependent variable and temp as the independent variable
# Data: bike.csv
# Use the statsmodels function
# Training-data ratio 70%, seed 123

# Answer: 0.106
# ======================================================
df = pd.read_csv("Data/bike.csv")

In [28]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    df,
    train_size=0.7,
    random_state=123,
)
df_train.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4046,2011-09-19 15:00:00,3,0,1,2,24.6,30.305,60,15.0013,44,143,187
9262,2012-09-09 07:00:00,3,0,0,1,22.14,25.76,73,11.0014,20,50,70


In [32]:
# Fit on the training split only; the answer is the R-squared shown in the
# summary table.
model = ols("registered ~ temp", data=df_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Sun, 08 May 2022",Prob (F-statistic):,1.92e-187
Time:,22:47:34,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


In [33]:
# ======================================================
# 2. RMSE with casual as the dependent variable and atemp as the
#    independent variable?
# Data: bike.csv
# Use the statsmodels function
# Training-data ratio 70%, seed 123

# Answer: 44.5
# ======================================================
df = pd.read_csv("Data/bike.csv")

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    df,
    train_size=0.7,
    random_state=123,
)
df_train.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4046,2011-09-19 15:00:00,3,0,1,2,24.6,30.305,60,15.0013,44,143,187
9262,2012-09-09 07:00:00,3,0,0,1,22.14,25.76,73,11.0014,20,50,70


In [34]:
# Fit on the training split, predict the held-out rows, and preview the
# first four predictions.
model = ols("casual ~ atemp", data=df_train).fit()
pred = model.predict(df_test)
pred.iloc[:4]

6495    31.499001
7050    12.626390
558     10.537120
5085    33.588271
dtype: float64

In [35]:
# Test-set RMSE: square root of the MSE between observed and predicted casual.
mean_squared_error(y_true=df_test["casual"], y_pred=pred) ** 0.5

44.462370102714324

In [37]:
# ======================================================
# 3. With casual as the dependent variable and atemp as the independent
#    variable, what is the difference between the summer and winter RMSE?
#    (take the absolute value)
# Data: bike.csv
# Use the statsmodels function
# Training-data ratio 70%, seed 123

# Answer: 8.6
# ======================================================
df = pd.read_csv("Data/bike.csv")

In [38]:
# The exercise compares summer and winter; the rows used are season 2 and
# season 4 respectively.
df_s2 = df.loc[df["season"].eq(2)]
df_s4 = df.loc[df["season"].eq(4)]

In [39]:
# Split each season separately with the same 70% ratio and the same seed.
df_s2_train, df_s2_test = train_test_split(df_s2, train_size=0.7, random_state=123)
df_s4_train, df_s4_test = train_test_split(df_s4, train_size=0.7, random_state=123)

In [40]:
# Season 2: fit on its training rows, predict its held-out rows, score.
model_s2 = ols("casual ~ atemp", data=df_s2_train).fit()
pred_s2 = model_s2.predict(df_s2_test)
RMSE_s2 = mean_squared_error(y_true=df_s2_test["casual"], y_pred=pred_s2) ** 0.5

# Season 4: identical pipeline on the season-4 split.
model_s4 = ols("casual ~ atemp", data=df_s4_train).fit()
pred_s4 = model_s4.predict(df_s4_test)
RMSE_s4 = mean_squared_error(y_true=df_s4_test["casual"], y_pred=pred_s4) ** 0.5

# Absolute gap between the two seasonal RMSE values.
abs(RMSE_s2 - RMSE_s4)

8.648423450414178