In [1]:
# 광고 데이터에서 Sales를 가장 잘 예측하는 feature를 확인

In [2]:
import numpy as np
import pandas as pd

In [3]:
# 1. Load the advertising dataset from the local data directory.
df = pd.read_csv(filepath_or_buffer="./data/advertising.csv")

In [4]:
# 2. Preprocess: every column except the last is a feature,
#    and the last column is the prediction target.
features, target = df.iloc[:, :-1], df.iloc[:, -1]

In [5]:
# 3. 모델 학습

In [6]:
from sklearn.linear_model import LinearRegression

In [18]:
# Inspect the column labels of the feature frame.
features.columns

Index(['TV', 'radio', 'newspaper'], dtype='object')

In [19]:
# Preview the first rows of the feature frame.
features.head()

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [20]:
# Display the target Series (the last column of df).
target

0      22.1
1      10.4
2       9.3
3      18.5
4      12.9
       ... 
195     7.6
196     9.7
197    12.8
198    25.5
199    13.4
Name: sales, Length: 200, dtype: float64

In [7]:
# Fit one single-feature regression per column by hand.
# Each model sees exactly one column as a one-column DataFrame.
model_1 = LinearRegression().fit(X=features.loc[:, ["TV"]], y=target)
model_2 = LinearRegression().fit(X=features.loc[:, ["radio"]], y=target)
model_3 = LinearRegression().fit(X=features.loc[:, ["newspaper"]], y=target)

In [8]:
# Same single-feature fits as a loop: one fitted model per feature column,
# keyed by the column name.
models = {}
for column in features:  # iterating a DataFrame yields its column labels
    models[column] = LinearRegression().fit(X=features[[column]], y=target)

In [21]:
# Show the dict of fitted models keyed by feature name.
models

{'TV': LinearRegression(),
 'radio': LinearRegression(),
 'newspaper': LinearRegression()}

In [9]:
# list comprehension: squares of the odd numbers below 10
datas = [value ** 2 for value in range(10) if value % 2 == 1]
datas

[1, 9, 25, 49, 81]

In [10]:
# dict comprehension: map each feature name to a regression fitted on that
# single column only.
models = {
    name: LinearRegression().fit(features.loc[:, [name]], target)
    for name in features.columns
}

In [11]:
# 4. 모델 성능 평가 : MAE

In [12]:
from sklearn.metrics import mean_absolute_error

In [13]:
# Report the mean absolute error of each single-feature model.
# Predictions are made on the same rows the model was fitted on, so this is
# an in-sample error, not a hold-out estimate.
for column in features.columns:
    pred = models[column].predict(features[[column]])
    # scikit-learn metrics expect (y_true, y_pred). MAE is symmetric, so the
    # value is unchanged, but naming the arguments keeps the call correct if
    # it is later swapped for an asymmetric metric.
    mae = mean_absolute_error(y_true=target, y_pred=pred)
    print(column, np.round(mae, 2))

TV 2.55
radio 3.32
newspaper 4.15


In [14]:
# 5. 각 feature의 결정 계수 출력, 모델의 성능과 비교

In [15]:
# Square every pairwise correlation between the columns of df.
df.corr().pow(2)

Unnamed: 0,TV,radio,newspaper,sales
TV,1.0,0.003004,0.003209,0.611875
radio,0.003004,1.0,0.125389,0.332032
newspaper,0.003209,0.125389,1.0,0.05212
sales,0.611875,0.332032,0.05212,1.0


In [16]:
# Fit a single regression on all feature columns together.
model = LinearRegression().fit(features, target)
model

LinearRegression()

In [17]:
# In-sample mean absolute error of the all-features model.
total_pred = model.predict(features)
# Pass the arguments by name in scikit-learn's (y_true, y_pred) convention.
# MAE is symmetric, so the printed value is the same as before.
mae = mean_absolute_error(y_true=target, y_pred=total_pred)
print(mae)

1.2520112296870685


In [22]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the all-features model's predictions against the target.
mean_squared_error(target, total_pred)

2.784126314510936