# 실습 - 다중선형회귀


# **1. 필요한 라이브러리 불러오기**

In [1]:
# 데이터, 시각화 관련 라이브러리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 선형회귀 관련 scikit-learn 라이브러리
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# 선형회귀 관련 statsmodels 라이브러리
import statsmodels.api as sm

# **2. 간단한 예제**

In [2]:
# Toy data: three observations with two features each, and their responses
X = np.array([[0, 1], [1, 2], [2, 2.5]])
y = np.array([0, 1.2, 1.6])

# Create the linear-regression estimator and train it
# (fit() returns the estimator itself, so the two steps can be chained)
reg = linear_model.LinearRegression().fit(X, y)

# Predictions for the training data
pred_train = reg.predict(X)

# Prediction for one new observation
pred_test = reg.predict([[1.5, 2]])

In [None]:
# Predicted value for the test observation (shown as the cell output)
pred_test

In [None]:
# Fitted coefficients of the linear model (shown as the cell output)
reg.coef_

# **3. Advertising 데이터 불러오기**



In [None]:
# Load the data: mount Google Drive so the CSV stored there becomes readable
from google.colab import drive

drive.mount('/content/drive')

# Read the Advertising data, using the first CSV column as the DataFrame index
advertising_csv = "/content/drive/MyDrive/Colab Notebooks/data/Advertising.csv"
ad = pd.read_csv(advertising_csv, index_col=0)

In [None]:
# Display the Advertising DataFrame as the cell output
ad

# **4. Advertising data 선형회귀(scikit-learn)**

In [7]:
# Column names shared by the training and test splits
ad_feature_cols = ["TV", "Radio", "Newspaper"]
ad_response_cols = ["Sales"]

# Hold out the last 20 rows as test data; all earlier rows are training data
train, test = ad[:-20], ad[-20:]

# Separate features and response for the training data
train_X, train_y = train[ad_feature_cols], train[ad_response_cols]

# Separate features and response for the test data
test_X, test_y = test[ad_feature_cols], test[ad_response_cols]

In [8]:
# Predictor columns used for fitting and for both predictions
ad_predictors = ["TV", "Radio", "Newspaper"]

# Create the linear-regression estimator and fit it on the training data
regr = linear_model.LinearRegression()
regr.fit(train_X[ad_predictors], train_y)

# Predictions for the training rows
train_y_pred = regr.predict(train_X[ad_predictors])

# Predictions for the held-out test rows
test_y_pred = regr.predict(test_X[ad_predictors])

In [None]:
# Fitted coefficients
print("Coefficients: \n", regr.coef_)

# Mean squared error on the training data
train_mse = mean_squared_error(train_y, train_y_pred)
print("Training MSE: %.3f" % train_mse)

# Mean squared error on the test data
test_mse = mean_squared_error(test_y, test_y_pred)
print("Test MSE: %.3f" % test_mse)

# R^2 on the training data
train_r2 = r2_score(train_y, train_y_pred)
print("R^2: %.3f" % train_r2)

# **5. Advertising data 선형회귀(statsmodels)**

In [10]:
# statsmodels' OLS needs the intercept to be present in the data as its own
# feature, so add a constant column "X0" of ones.
#
# assign() returns a NEW DataFrame with the extra column. A plain
# `sm_train_X = train_X` would only alias the same object, so writing "X0"
# through it would also add the column to train_X / test_X (and, depending on
# the pandas version, emit a SettingWithCopyWarning because those frames were
# taken from a slice).
sm_train_X = train_X.assign(X0=1)
sm_test_X = test_X.assign(X0=1)

In [None]:
# Display the training features including the added "X0" column
sm_train_X

In [None]:
# Design-matrix columns: the constant column first, then the three predictors
ols_columns = ["X0", "TV", "Radio", "Newspaper"]

# Build the OLS model (response first, design matrix second) and fit it
ols_model = sm.OLS(train_y, sm_train_X[ols_columns])
results = ols_model.fit()

# Show the fitted-model summary
results.summary()

# **6. PolynomialFeatures() 간단한 예제**

In [13]:
from sklearn.preprocessing import PolynomialFeatures

# 3 x 2 matrix holding the integers 0..5, filled row by row
X = np.reshape(np.arange(6), (3, 2))

In [None]:
# Display the example matrix
X

In [None]:
# Degree-2 expansion of the two columns (a, b): [1, a, b, a^2, ab, b^2]
poly = PolynomialFeatures(degree=2)
poly.fit_transform(X)

In [None]:
# Degree-3 expansion of (a, b):
# [1, a, b, a^2, ab, b^2, a^3, a^2b, ab^2, b^3]
poly = PolynomialFeatures(degree=3)
poly.fit_transform(X)

In [None]:
# Interaction terms only (no pure powers such as a^2 or b^2); at the default
# degree of 2 the output columns are [1, a, b, ab]
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit_transform(X)

# **7. Auto 데이터 불러오기**



In [None]:
# Load the data: mount Google Drive so the CSV stored there becomes readable
from google.colab import drive

drive.mount('/content/drive')

# Read the Auto data
auto_csv = "/content/drive/MyDrive/Colab Notebooks/data/Auto.csv"
auto = pd.read_csv(auto_csv)

In [19]:
# Missing horsepower values appear as "?" in the data, which leaves the column
# with object dtype. Convert it to numeric; errors="coerce" turns "?" (and any
# other non-numeric entry) into NaN instead of raising.
auto["horsepower"] = pd.to_numeric(auto["horsepower"], errors="coerce")

# Drop the rows whose horsepower is unknown instead of filling in 0: a 0-hp car
# is not a real observation, and such rows would act as extreme points in the
# regressions of mpg on horsepower and its powers.
# reset_index keeps the row labels contiguous after the drop.
auto = auto.dropna(subset=["horsepower"]).reset_index(drop=True)

In [None]:
# Display the Auto DataFrame after the horsepower conversion
auto

# **8. Auto data 다중선형회귀(scikit-learn)**

In [21]:
# Add polynomial terms of horsepower as new columns:
# horsepower_2, horsepower_3, horsepower_4, horsepower_5
for power in range(2, 6):
    auto["horsepower_%d" % power] = auto["horsepower"] ** power

In [None]:
# Display the Auto DataFrame including the added horsepower power columns
auto

In [23]:
# Feature columns (horsepower and its powers 2..5) and the response column
hp_feature_cols = [
    "horsepower",
    "horsepower_2",
    "horsepower_3",
    "horsepower_4",
    "horsepower_5",
]
mpg_response_cols = ["mpg"]

# Hold out the last 40 rows as test data; all earlier rows are training data
train, test = auto[:-40], auto[-40:]

# Separate features and response for the training data
train_X, train_y = train[hp_feature_cols], train[mpg_response_cols]

# Separate features and response for the test data
test_X, test_y = test[hp_feature_cols], test[mpg_response_cols]

In [24]:
# Predictor columns used by the model. The fit and both predictions read this
# single list, so they always use the same columns. To try another model, edit
# only this line, e.g. ["horsepower", "horsepower_2"] for the quadratic fit.
model_features = ["horsepower"]

# Create the linear-regression estimator and fit it on the training data
regr = linear_model.LinearRegression()
regr.fit(train_X[model_features], train_y)

# Predictions for the training rows
train_y_pred = regr.predict(train_X[model_features])

# Predictions for the held-out test rows
test_y_pred = regr.predict(test_X[model_features])

In [None]:
# Fitted coefficients
print("Coefficients: \n", regr.coef_)

# Mean squared error on the training data
train_mse = mean_squared_error(train_y, train_y_pred)
print("Training MSE: %.3f" % train_mse)

# Mean squared error on the test data
test_mse = mean_squared_error(test_y, test_y_pred)
print("Test MSE: %.3f" % test_mse)

# R^2 on the training data
train_r2 = r2_score(train_y, train_y_pred)
print("R^2: %.3f" % train_r2)