# 실습 - 단순선형회귀


# **1. 필요한 라이브러리 불러오기**

In [1]:
# 데이터, 시각화 관련 라이브러리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 선형회귀 관련 scikit-learn 라이브러리
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# 선형회귀 관련 statsmodels 라이브러리
import statsmodels.api as sm

# **2. 간단한 예제**

In [2]:
# Toy data set: three observations of a single feature and their responses
X = np.arange(3).reshape(-1, 1)
y = np.array([0.0, 1.2, 1.6])

# Create the linear-regression estimator and train it on the toy data
# (fit() hands back the fitted estimator, so the calls can be chained)
reg = linear_model.LinearRegression().fit(X, y)

# Predictions for the training inputs, then for the unseen point x = 1.5
pred_train, pred_test = reg.predict(X), reg.predict([[1.5]])

In [None]:
# Draw on the current axes (created on demand, as the pyplot helpers would do)
ax = plt.gca()

# Observed points in black, the prediction at x = 1.5 in red
ax.scatter(X, y, color="black")
ax.scatter(1.5, pred_test, color="red")

# Fitted regression line through the training inputs
ax.plot(X, pred_train, color="blue", linewidth=3)

ax.set_xlabel("X")
ax.set_ylabel("y")
ax.grid()

plt.show()

In [None]:
# Predicted value for the test point x = 1.5 (displayed as the cell output)
pred_test

In [None]:
# Fitted coefficient(s) of the linear model `reg` (displayed as the cell output)
reg.coef_

# **3. Advertising 데이터 불러오기**



In [None]:
# Mount Google Drive so that the CSV stored there is reachable from Colab
from google.colab import drive

drive.mount('/content/drive')

# Load the Advertising data; the first CSV column is used as the row index
ad = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Colab Notebooks/data/Advertising.csv",
    index_col=0,
)

In [None]:
# Display the loaded Advertising DataFrame as the cell output
ad

# **4. Advertising data 선형회귀(scikit-learn)**

In [8]:
# Hold out the last 20 rows as test data; everything before them is training data
train, test = ad.iloc[:-20], ad.iloc[-20:]

# Training set: feature columns and the response column (kept as a DataFrame)
train_X, train_y = train[["TV", "Radio", "Newspaper"]], train[["Sales"]]

# Test set: same feature/response separation
test_X, test_y = test[["TV", "Radio", "Newspaper"]], test[["Sales"]]

In [9]:
# Create the linear-regression estimator and fit Sales on the TV feature only
regr = linear_model.LinearRegression().fit(train_X[["TV"]], train_y)

# Predictions on the training inputs and on the held-out test inputs
train_y_pred, test_y_pred = (
    regr.predict(train_X[["TV"]]),
    regr.predict(test_X[["TV"]]),
)

In [None]:
# Fitted coefficient(s) of the TV-only model
print("Coefficients: \n", regr.coef_)

# Mean squared error on the training data
print(f"Training MSE: {mean_squared_error(train_y, train_y_pred):.3f}")

# Mean squared error on the held-out test data
print(f"Test MSE: {mean_squared_error(test_y, test_y_pred):.3f}")

# Coefficient of determination computed on the training data
print(f"R^2: {r2_score(train_y, train_y_pred):.3f}")

# **5. Advertising data 선형회귀(statsmodels)**

In [11]:
# statsmodels' OLS needs an explicit column for the intercept in the design
# matrix, so a constant feature X0 = 1 is added for it.
#
# DataFrame.assign returns a NEW frame. A plain `sm_train_X = train_X` would
# only alias the same object, so adding X0 would also modify train_X / test_X
# (the scikit-learn feature frames) and may raise a SettingWithCopyWarning.
sm_train_X = train_X.assign(X0=1)
sm_test_X = test_X.assign(X0=1)

In [None]:
# Display the statsmodels training features (including the X0 column) as the cell output
sm_train_X

In [None]:
# Fit ordinary least squares: Sales as the response, X0 (intercept) and TV as regressors
results = sm.OLS(
    endog=train_y,
    exog=sm_train_X[["X0", "TV"]],
).fit()

# Show the fitted model's summary table as the cell output
results.summary()

In [None]:
# Draw on the current axes (created on demand, as the pyplot helpers would do)
ax = plt.gca()

# Observed data: training points in gray, test points in red
ax.scatter(train_X["TV"], train_y["Sales"], color="gray")
ax.scatter(test_X["TV"], test_y["Sales"], color="red")

# Fitted line over the training inputs, then test predictions as red triangles
ax.plot(train_X["TV"], train_y_pred, color="blue", linewidth=2)
ax.scatter(test_X["TV"], test_y_pred, color="red", marker="^")

ax.set_xlabel("TV")
ax.set_ylabel("Sales")
ax.grid()

plt.show()