In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 선형회귀 관련 sklearn 라이브러리 불러오기
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# 선형회귀 관련 statsmodels 라이브러리 불러오기
import statsmodels.api as sm


In [2]:
# 간단한 맛보기
np.array([[0,1], [1,2], [2,2.5]])

array([[0. , 1. ],
       [1. , 2. ],
       [2. , 2.5]])

In [3]:
np.array([0,1.2, 1.6])

array([0. , 1.2, 1.6])

In [4]:
# 데이터
X = np.array([[0,1], [1,2], [2,2.5]])
y = np.array([0,1.2, 1.6])

In [5]:
# 선형회귀 객체 생성

reg = linear_model.LinearRegression()

In [6]:
# 훈련 데이터 >> fit 모델 적합

reg.fit(X, y)


LinearRegression()

In [8]:
# train data 로 예측
pred_train = reg.predict(X)
pred_train

array([4.4408921e-16, 1.2000000e+00, 1.6000000e+00])

In [9]:
# test data

pred_test = reg.predict([[1.5,2]])
pred_test

array([1.])

In [10]:
# coefficient(기울기, beta1, beta2)

reg.coef_

array([-0.4,  1.6])

In [12]:
# advertising 데이터 불러오기

ad = pd.read_csv('./Advertising.csv', index_col = 0)
ad[:10]

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
6,8.7,48.9,75.0,7.2
7,57.5,32.8,23.5,11.8
8,120.2,19.6,11.6,13.2
9,8.6,2.1,1.0,4.8
10,199.8,2.6,21.2,10.6


In [13]:
# advertising data 선형회귀

train = ad[:-20]
test = ad[-20:]

In [14]:
# train data의 feature의 결과변수 분리

train_X = train.drop('Sales', axis = 1)
train_y = train[['Sales']]

test_X = test.drop('Sales', axis = 1)
test_y = test[['Sales']]

In [15]:
# 선형회귀 객체 생성

reg = linear_model.LinearRegression()

In [18]:
# train_data 이용 >> fit(적합)

reg.fit(train_X, train_y)

LinearRegression()

In [20]:
# train data 활용 예측
train_y_pred = reg.predict(train_X)

In [21]:
# test data 활용  예측
test_y_pred = reg.predict(test_X)

In [22]:
# coefficients

print('Coefficients :', reg.coef_)

Coefficients : [[ 0.04638909  0.18867512 -0.0024597 ]]


In [26]:
# train data MSE
print('Train Data MSE : %.3f'%mean_squared_error(train_y, train_y_pred))

Train Data MSE : 2.827


In [27]:
# test data MSE
print('Test Data MSE : %.3f'%mean_squared_error(test_y, test_y_pred))

Test Data MSE : 2.453


In [32]:
# train data 모델의 설명력(r2_score)
print('Train Data r2_score : %.3f'%r2_score(train_y, train_y_pred))

Train Data r2_score : 0.892


In [30]:
# test data 모델의 설명력(r2_score)
print('Test Data r2_score : %.3f'%r2_score(test_y, test_y_pred))

Test Data r2_score : 0.929


In [33]:
# 결과 해석 : 과소적합 문제 발생 >> n(샘플 수 를 늘려야 함)
# train data 활용 모델 예측 설명도 0.892보다 test data 활용 모델 예측 설명도 0.929가 더높다

In [34]:
# Advertising data 선형회귀(statsmodels)

In [35]:
sm_train_X = train_X.copy()
sm_train_X['X0'] = 1

In [36]:
sm_test_X = test_X.copy()
sm_test_X['X0'] = 1

In [37]:
sm_train_X

Unnamed: 0,TV,Radio,Newspaper,X0
1,230.1,37.8,69.2,1
2,44.5,39.3,45.1,1
3,17.2,45.9,69.3,1
4,151.5,41.3,58.5,1
5,180.8,10.8,58.4,1
...,...,...,...,...
176,276.9,48.9,41.8,1
177,248.4,30.2,20.3,1
178,170.2,7.8,35.2,1
179,276.7,2.3,23.7,1


In [41]:
# 훈련

result = sm.OLS(train_y, sm_train_X).fit()
result.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.892
Model:,OLS,Adj. R-squared:,0.891
Method:,Least Squares,F-statistic:,486.3
Date:,"Tue, 14 Feb 2023",Prob (F-statistic):,6.570000000000001e-85
Time:,15:48:48,Log-Likelihood:,-348.95
No. Observations:,180,AIC:,705.9
Df Residuals:,176,BIC:,718.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TV,0.0464,0.001,31.154,0.000,0.043,0.049
Radio,0.1887,0.009,20.347,0.000,0.170,0.207
Newspaper,-0.0025,0.006,-0.395,0.693,-0.015,0.010
X0,2.8399,0.342,8.293,0.000,2.164,3.516

0,1,2,3
Omnibus:,56.196,Durbin-Watson:,2.104
Prob(Omnibus):,0.0,Jarque-Bera (JB):,140.467
Skew:,-1.343,Prob(JB):,3.15e-31
Kurtosis:,6.394,Cond. No.,467.0


In [45]:
# 다항회귀(Polynomial Feautres) 예제

In [69]:
from sklearn.preprocessing import PolynomialFeatures

In [71]:
X = np.arange(6).reshape(3,2)

In [72]:
# [1, a, b, a^2, ab, b^2] feature 생성

poly = PolynomialFeatures(2)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [73]:
# interaction(상호작용) featrue만 생성

poly = PolynomialFeatures(interaction_only=True)
poly.fit_transform(X)


array([[ 1.,  0.,  1.,  0.],
       [ 1.,  2.,  3.,  6.],
       [ 1.,  4.,  5., 20.]])

In [74]:
# auto data 활용, 다항회귀분석

auto = pd.read_csv('Auto.csv')
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [76]:
# horsepower ? 값 0 대체

auto['horsepower'] = auto['horsepower'].replace(to_replace='?', value = 0)
auto[:35]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
5,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl


In [78]:
# horsepower의 데이터타입 변경(object > numeric 숫자)
auto['horsepower'] = pd.to_numeric(auto['horsepower'])

auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64  
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    int64  
 4   weight        397 non-null    int64  
 5   acceleration  397 non-null    float64
 6   year          397 non-null    int64  
 7   origin        397 non-null    int64  
 8   name          397 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 28.0+ KB


In [79]:
# auto data 다중선형회귀

# horsepower의 다양한 feature 생성

auto['horsepower_2'] = auto['horsepower']**2
auto['horsepower_3'] = auto['horsepower']**3
auto['horsepower_4'] = auto['horsepower']**4
auto['horsepower_5'] = auto['horsepower']**5
auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name,horsepower_2,horsepower_3,horsepower_4,horsepower_5
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,16900,2197000,285610000,37129300000
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,27225,4492125,741200625,122298103125
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,22500,3375000,506250000,75937500000
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,22500,3375000,506250000,75937500000
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,19600,2744000,384160000,53782400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl,7396,636056,54700816,4704270176
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup,2704,140608,7311616,380204032
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage,7056,592704,49787136,4182119424
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger,6241,493039,38950081,3077056399


In [80]:
# train / test data 분리

train = auto[:-40]
test = auto[-40:]

In [81]:
auto.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name', 'horsepower_2',
       'horsepower_3', 'horsepower_4', 'horsepower_5'],
      dtype='object')

In [82]:
# 훈련용 데이터의 feature와 response 분리
train_X = train[['horsepower','horsepower_2','horsepower_3', 'horsepower_4', 'horsepower_5']]
train_y = train[['mpg']]

In [83]:
# 테스트 데이터의 feature와 response 분리
test_X = test[['horsepower','horsepower_2','horsepower_3', 'horsepower_4', 'horsepower_5']]
test_y = test[['mpg']]

In [87]:
# 선형회귀 객체 생성
reg = linear_model.LinearRegression()

# 훈련용 데이터 이용 >> 모델 적합하게 함
reg.fit(train_X[['horsepower']], train_y)

LinearRegression()

In [88]:
# 훈련용 데이터 이용, 예측
train_y_pred = reg.predict(train_X[['horsepower']])

# 테스트 데이터 이용 예측
test_y_pred = reg.predict(test_X[['horsepower']])

In [89]:
# coefficient

print('Coefficents :', reg.coef_)

Coefficents : [[-0.14160083]]


In [90]:
# train data MSE

print('Train_data MSE : %.3f'%mean_squared_error(train_y, train_y_pred))

Train_data MSE : 24.027


In [91]:
# test data MSE
print('Test_data MSE : %.3f'%mean_squared_error(test_y, test_y_pred))

Test_data MSE : 43.997


In [93]:
# train data r2_score
print('Train_data r2_score : %.3f'%r2_score(train_y, train_y_pred))

Train_data r2_score : 0.587


In [94]:
# test data r2_score
print('Test_data r2_score : %.3f'%r2_score(test_y, test_y_pred))

# r2_score가 음수값? 모델이 안맞는것 >> 다항회귀로 해보자!

Test_data r2_score : -0.361


In [95]:
# 선형회귀 객체 생성
reg = linear_model.LinearRegression()

# 훈련용 데이터 이용 >> 모델 적합하게 함
reg.fit(train_X[['horsepower', 'horsepower_2']], train_y)

# 훈련용 데이터 이용, 예측
train_y_pred = reg.predict(train_X[['horsepower', 'horsepower_2']])

# 테스트 데이터 이용 예측
test_y_pred = reg.predict(test_X[['horsepower', 'horsepower_2']])

# coefficient
print('Coefficents :', reg.coef_)
print()

# train data MSE
print('Train_data MSE : %.3f'%mean_squared_error(train_y, train_y_pred))
print()

# test data MSE
print('Test_data MSE : %.3f'%mean_squared_error(test_y, test_y_pred))
print()

# train data r2_score
print('Train_data r2_score : %.3f'%r2_score(train_y, train_y_pred))
print()

# test data r2_score
print('Test_data r2_score : %.3f'%r2_score(test_y, test_y_pred))

Coefficents : [[-0.24477644  0.00043038]]

Train_data MSE : 22.941

Test_data MSE : 44.437

Train_data r2_score : 0.606

Test_data r2_score : -0.374


In [96]:
# 선형회귀 객체 생성
reg = linear_model.LinearRegression()

# 훈련용 데이터 이용 >> 모델 적합하게 함
reg.fit(train_X, train_y)

# 훈련용 데이터 이용, 예측
train_y_pred = reg.predict(train_X)

# 테스트 데이터 이용 예측
test_y_pred = reg.predict(test_X)

# coefficient
print('Coefficents :', reg.coef_)
print()

# train data MSE
print('Train_data MSE : %.3f'%mean_squared_error(train_y, train_y_pred))
print()

# test data MSE
print('Test_data MSE : %.3f'%mean_squared_error(test_y, test_y_pred))
print()

# train data r2_score
print('Train_data r2_score : %.3f'%r2_score(train_y, train_y_pred))
print()

# test data r2_score
print('Test_data r2_score : %.3f'%r2_score(test_y, test_y_pred))

Coefficents : [[ 9.58661050e-01 -2.56072458e-02  2.31442418e-04 -9.21280966e-07
   1.37074693e-09]]

Train_data MSE : 17.283

Test_data MSE : 38.549

Train_data r2_score : 0.703

Test_data r2_score : -0.192


In [None]:
# 점점 양수로 향해 가고있지만 여전히 음수이다 >> 안좋은모델