#### 돌아온 auto-mpg
- 목표 : 연비예측
- 데이터 셋 : auto-mpg.csv
- 피쳐/속성 : 연비와 상관 관계를 보고 결정
- 라벨/라벨 : 연비
- 학습방법 : 지도학습 > 회귀
- 알고리즘 : 선형회귀(LinearRegression)

In [74]:
# 다중 선형 회귀
# 점수 = 공부시간*가중치 + 과외횟수*가중치 + 절편
# y= ax + bx + c

In [75]:
import matplotlib.pyplot as plt
import pandas as pd

##### 1. 데이터 준비

In [76]:
# DF 및 시리즈 제작 >> 데이터 준비
file = '../data/auto_mpg.csv'
dataDF = pd.read_csv(file)

In [77]:
dataDF.dropna(inplace=True)

In [78]:
dataDF.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [79]:
dataDF.replace('?', 0,inplace=True)

In [80]:
dataDF['horsepower'].astype('int64')

0      130
1      165
2      150
3      150
4      140
      ... 
393     86
394     52
395     84
396     79
397     82
Name: horsepower, Length: 398, dtype: int64

In [81]:
dataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [82]:
dataDF.drop(columns=['car name','origin','model year'], inplace = True)

In [83]:
dataDF.corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration
mpg,1.0,-0.775396,-0.804203,-0.754276,-0.831741,0.420289
cylinders,-0.775396,1.0,0.950721,0.821656,0.896017,-0.505419
displacement,-0.804203,0.950721,1.0,0.871927,0.932824,-0.543684
horsepower,-0.754276,0.821656,0.871927,1.0,0.837987,-0.67394
weight,-0.831741,0.896017,0.932824,0.837987,1.0,-0.417457
acceleration,0.420289,-0.505419,-0.543684,-0.67394,-0.417457,1.0


##### 피쳐 >>> displacement, weight
##### 타겟 >>> mpg

In [84]:
dataDF1 = dataDF[['displacement','weight']]
dataDF1

Unnamed: 0,displacement,weight
0,307.0,3504
1,350.0,3693
2,318.0,3436
3,304.0,3433
4,302.0,3449
...,...,...
393,140.0,2790
394,97.0,2130
395,135.0,2295
396,120.0,2625


In [85]:
dataDF1.shape

(398, 2)

In [86]:
mpgSR = dataDF['mpg']

In [87]:
from sklearn.preprocessing import MinMaxScaler

In [88]:
mmScaler = MinMaxScaler()

In [89]:
mmScaler.fit(dataDF1)

In [90]:
dataDF1_Scaler = mmScaler.transform(dataDF1)

In [91]:
dataDF1_Scaler

array([[0.61757106, 0.5361497 ],
       [0.72868217, 0.58973632],
       [0.64599483, 0.51686986],
       [0.60981912, 0.51601928],
       [0.60465116, 0.52055571],
       [0.93281654, 0.77346187],
       [0.99741602, 0.77714772],
       [0.96124031, 0.76523958],
       [1.        , 0.79727814],
       [0.83204134, 0.63425007],
       [0.81395349, 0.5528778 ],
       [0.70284238, 0.56592005],
       [0.85788114, 0.60901616],
       [1.        , 0.41763538],
       [0.11627907, 0.21519705],
       [0.33591731, 0.34590303],
       [0.33850129, 0.32917494],
       [0.34108527, 0.27615537],
       [0.0749354 , 0.1465835 ],
       [0.0749354 , 0.06294301],
       [0.10852713, 0.30025517],
       [0.10077519, 0.23164162],
       [0.09302326, 0.21604763],
       [0.1369509 , 0.17607031],
       [0.33850129, 0.29345052],
       [0.75452196, 0.85114828],
       [0.61757106, 0.78338531],
       [0.64599483, 0.78508648],
       [0.60981912, 0.88432095],
       [0.0749354 , 0.1465835 ],
       [0.

In [92]:
dataDF1_Scaler = pd.DataFrame(dataDF1_Scaler)

In [93]:
dataDF1_Scaler.columns = [['displacement','weight']]
dataDF1_Scaler.head()

Unnamed: 0,displacement,weight
0,0.617571,0.53615
1,0.728682,0.589736
2,0.645995,0.51687
3,0.609819,0.516019
4,0.604651,0.520556


In [94]:
from sklearn.linear_model import LinearRegression

In [95]:
model = LinearRegression()

In [96]:
model.fit(dataDF1_Scaler,mpgSR)

In [97]:
# 학습 후 모델 파라미터 > 피쳐가 2개 >>> coef_ = 2개
print(f'model.coef_ : {len(model.coef_)}개, {model.coef_}')
print(f'model.intercept_ : {model.intercept_}')

model.coef_ : 2개, [ -6.33068256 -20.44864376]
model.intercept_ : 33.43633438254717


In [98]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [99]:
scroe = model.score(dataDF1_Scaler,mpgSR)
print(f'score : {scroe}')

score : 0.6979764850769228


In [101]:
pre_mpg = model.predict(dataDF1_Scaler)

In [102]:
mse = mean_squared_error(mpgSR, pre_mpg)
rmse = mean_squared_error(mpgSR, pre_mpg, squared=False)
mea = mean_absolute_error(mpgSR, pre_mpg) 

# 얼마나 장답에 가깝게 값을 예측 했느냐를 나타내는 지표
# ==> 정답과 예측값을 제공: 1에 가까울수록 좋음
r2 = r2_score(mpgSR, pre_mpg)

In [103]:
# 손실 > 비용함수 값은 0에 가까울 수록
# 결정계수 값은 1에 가까울 수록 성능 좋은 모델
print(f'mse : {mse}') # 비용/손실 함수
print(f'rmse : {rmse}') # 비용/손실 함수
print(f'mea : {mea}') # 비용/손실 함수
print(f'r2 : {r2}') # 결제계수

mse : 18.404140933712643
rmse : 4.290004770826327
mea : 3.27299278425503
r2 : 0.6979764850769228
