# Regression 종합실습 : Car seat sales
유아용 카시트 매출액을 예측해 봅시다.

* 카시트에 대해서 지역 매장 별 매출액을 예측하고자 합니다.

![](https://cdn.images.express.co.uk/img/dynamic/24/590x/child-car-seat-986556.jpg?r=1532946857754)

## 1.환경준비

### (1) Import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

### (2) Data Loading

In [2]:
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)

**변수설명**
> * Sales - 각 지역 판매량(단위 : 1000개) <== Target
* CompPrice - 각 지역 경쟁사 가격
* Income - 각 지역 평균 소득수준(단위 : 1000달러)
* Advertising - 각 지역, 회사의 광고 예산(단위 : 1000달러)
* Population - 지역 인구수(단위 : 1000명)
* Price - 자사 지역별 판매가격
* ShelveLoc - 진열상태
* Age - 지역 인구의 평균 연령
* Education - 각 지역 교육수준 레벨
* Urban - 매장 도시 지역 여부
* US - 매장이 미국에 있는지 여부

## 2.데이터 이해

* 둘러보기

In [3]:
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


## 3.데이터 준비

### (1) 데이터 정리

In [4]:
data.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

### (2) 데이터분할1 : x, y 나누기

In [5]:
target = 'Sales'

In [6]:
x = data.drop(target,axis=1)
y = data.loc[:,target]

### (3) NA 조치

### (4) 가변수화

In [7]:
target = ['ShelveLoc','Urban','US']
x = pd.get_dummies(x,columns=target,drop_first=True)
x.head()

Unnamed: 0,CompPrice,Income,Advertising,Population,Price,Age,Education,ShelveLoc_Good,ShelveLoc_Medium,Urban_Yes,US_Yes
0,138,73,11,276,120,42,17,0,0,1,1
1,111,48,16,260,83,65,10,1,0,1,1
2,113,35,10,269,80,59,12,0,1,1,1
3,117,100,4,466,97,55,14,0,1,1,1
4,141,64,3,340,128,38,13,0,0,1,0


### (5) 데이터분할2 : train : validation 나누기

In [8]:
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.3,random_state=2022)

In [9]:
x_train.shape

(280, 11)

In [10]:
x_val.shape

(120, 11)

### (6) Scaling
KNN 알고리즘을 적용하기 위해서는 스케일링을 해야 합니다.

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_train_mm = scaler.fit_transform(x_train)
x_val_mm = scaler.transform(x_val)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_ss = scaler.fit_transform(x_train)
x_val_ss = scaler.transform(x_val)

In [73]:
x_train_mm = pd.DataFrame(x_train_mm, columns = list(x))
x_val_mm = pd.DataFrame(x_val_mm, columns = list(x))
x_train_ss = pd.DataFrame(x_train_ss, columns = list(x))
x_val_ss = pd.DataFrame(x_val_ss, columns = list(x))

## 4.모델링 : 선형회귀

* 변수를 조절하며 최소 2개 이상의 모델을 생성하고 예측하고 평가해 봅시다.

In [14]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

* 모델1

In [15]:
features = ['Income','Advertising','Price','Education']
x_trainL1 = x_train[features]
x_valL1 = x_val[features]

In [16]:
modelL1 = LinearRegression()

In [17]:
modelL1.fit(x_trainL1,y_train)

LinearRegression()

In [18]:
print(modelL1.coef_,modelL2.intercept_)

NameError: name 'modelL2' is not defined

In [None]:
predL1 = modelL1.predict(x_valL1)

In [None]:
r2_score(y_val,predL1)

In [None]:
mean_squared_error(y_val,predL1,squared=False)

In [None]:
mean_absolute_error(y_val,predL1)

In [None]:
mean_absolute_percentage_error(y_val,predL1)

* 모델2 (전체)

In [19]:
modelL2 = LinearRegression()

In [20]:
modelL2.fit(x_train,y_train)

LinearRegression()

In [21]:
print(modelL2.coef_,modelL2.intercept_)

[ 9.07529162e-02  1.50107692e-02  1.29632695e-01 -2.40199333e-04
 -9.20716371e-02 -4.50122718e-02 -1.10739568e-02  4.82206767e+00
  1.94996860e+00  9.28393738e-02 -2.68465709e-01] 5.6110258378122895


In [22]:
predL2 = modelL2.predict(x_val)

In [23]:
r2_score(y_val,predL2)

0.879427135648263

In [24]:
mean_squared_error(y_val,predL2,squared=False)

1.0241647217116308

In [25]:
mean_absolute_error(y_val,predL2)

0.8163852580798341

In [26]:
mean_absolute_percentage_error(y_val,predL2)

0.2116929520288113

## 5.모델링 : KNN

* 하이퍼파라미터를 조절하며 모델을 최소 3가지 이상 생성하시오.

In [27]:
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import *

* 모델3

In [103]:
feature = ['Population','ComPrice']
x_train_mm3 = x_train_mm[features]
x_val_mm3 = x_val_mm[features]
x_train_ss3 = x_train_ss[features]
x_val_ss3 = x_val_ss[features]

In [104]:
model3m = KNeighborsRegressor()

In [105]:
model3s = KNeighborsRegressor()

In [106]:
model3m.fit(x_train_mm3,y_train)

KNeighborsRegressor()

In [107]:
model3s.fit(x_train_ss3,y_train)

KNeighborsRegressor()

In [108]:
pred_mm3 = model3m.predict(x_val_mm3)

In [109]:
pred_ss3 = model3s.predict(x_val_ss3)

In [110]:
mean_squared_error(y_val,pred_mm3,squared=False)

2.707039194396712

In [111]:
mean_squared_error(y_val,pred_ss3,squared=False)

2.587695744608834

In [112]:
mean_absolute_error(y_val,pred_mm3)

2.141966666666667

In [113]:
mean_absolute_error(y_val,pred_ss3)

2.0332666666666666

In [114]:
mean_absolute_percentage_error(y_val,pred_mm3)

0.7016140105332087

In [115]:
mean_absolute_percentage_error(y_val,pred_ss3)

0.6476844355546009

* 모델4

In [74]:
features = ['Income','Advertising','Price','Education']
x_train_mm1 = x_train_mm[features]
x_val_mm1 = x_val_mm[features]

In [75]:
x_train_ss1 = x_train_ss[features]
x_val_ss1 = x_val_ss[features]

In [76]:
model4m = KNeighborsRegressor()

In [77]:
model4s = KNeighborsRegressor()

In [78]:
model4m.fit(x_train_mm1,y_train)

KNeighborsRegressor()

In [79]:
model4s.fit(x_train_ss1,y_train)

KNeighborsRegressor()

In [81]:
pred_mm4 = model4m.predict(x_val_mm1)

In [82]:
pred_ss4 = model4s.predict(x_val_ss1)

In [83]:
mean_squared_error(y_val,pred_mm4,squared=False)

2.707039194396712

In [84]:
mean_squared_error(y_val,pred_ss4,squared=False)

2.587695744608834

In [85]:
mean_absolute_error(y_val,pred_mm4)

2.141966666666667

In [86]:
mean_absolute_error(y_val,pred_ss4)

2.0332666666666666

In [87]:
mean_absolute_percentage_error(y_val,pred_mm4)

0.7016140105332087

In [88]:
mean_absolute_percentage_error(y_val,pred_ss4)

0.6476844355546009

* 모델5

In [32]:
model_mm3 = KNeighborsRegressor()

In [33]:
model_ss3 = KNeighborsRegressor()

In [34]:
model_mm3.fit(x_train_mm,y_train)

KNeighborsRegressor()

In [35]:
model_ss3.fit(x_train_ss,y_train)

KNeighborsRegressor()

In [36]:
pred_mm3 = model_mm3.predict(x_val_mm)

In [37]:
pred_ss3 = model_ss3.predict(x_val_ss)

In [38]:
mean_squared_error(y_val,pred_mm3,squared=False)

2.45017136135414

In [39]:
mean_squared_error(y_val,pred_ss3,squared=False)

2.189272086029205

In [40]:
mean_absolute_error(y_val,pred_mm3)

2.0191166666666667

In [41]:
mean_absolute_error(y_val,pred_ss3)

1.7926000000000002

In [42]:
mean_absolute_percentage_error(y_val,pred_mm3)

0.7126276869776597

In [43]:
mean_absolute_percentage_error(y_val,pred_ss3)

0.638464956811579

## 6.성능비교