## 데이터 불러오기

In [36]:
import pandas as pd

# Source CSV for the auto data set, hosted in the course's GitHub repository.
AUTO_CSV_URL = 'https://raw.githubusercontent.com/ralbu85/DataScience_2022S/master/data/auto.csv'

# Read the CSV into a DataFrame and display it as the cell output.
df = pd.read_csv(AUTO_CSV_URL)
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,8,307.0,130.0,3504.0,12.0,70,1,18.0
1,8,350.0,165.0,3693.0,11.5,70,1,15.0
2,8,318.0,150.0,3436.0,11.0,70,1,18.0
3,8,304.0,150.0,3433.0,12.0,70,1,16.0
4,8,302.0,140.0,3449.0,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
387,4,140.0,86.0,2790.0,15.6,82,1,27.0
388,4,97.0,52.0,2130.0,24.6,82,2,44.0
389,4,135.0,84.0,2295.0,11.6,82,1,32.0
390,4,120.0,79.0,2625.0,18.6,82,1,28.0


## 전처리

In [37]:
# One-hot encode the categorical `origin` code into one indicator column per
# value (origin_1, origin_2, origin_3).
# dtype=int pins the indicators to 0/1 integers: without it, pandas >= 2.0
# returns boolean (True/False) columns, so the table shown below would no
# longer match on a fresh run.
df = pd.get_dummies(df, columns=['origin'], dtype=int)
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,mpg,origin_1,origin_2,origin_3
0,8,307.0,130.0,3504.0,12.0,70,18.0,1,0,0
1,8,350.0,165.0,3693.0,11.5,70,15.0,1,0,0
2,8,318.0,150.0,3436.0,11.0,70,18.0,1,0,0
3,8,304.0,150.0,3433.0,12.0,70,16.0,1,0,0
4,8,302.0,140.0,3449.0,10.5,70,17.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
387,4,140.0,86.0,2790.0,15.6,82,27.0,1,0,0
388,4,97.0,52.0,2130.0,24.6,82,44.0,0,1,0
389,4,135.0,84.0,2295.0,11.6,82,32.0,1,0,0
390,4,120.0,79.0,2625.0,18.6,82,28.0,1,0,0


## X, y 나누기

In [38]:
# Target vector: the `mpg` column that the regression will predict.
y = df.loc[:, 'mpg']

In [39]:
# Feature matrix: every column except the target `mpg`.
X = df.drop('mpg', axis=1)

## 회귀분석 모듈 불러오기

In [40]:
from sklearn.linear_model import LinearRegression
# Create an unfitted linear-regression estimator with scikit-learn's default settings.
reg = LinearRegression()

## 학습

In [41]:
# Fit the regression on the full data set (no train/test split is made in this notebook).
reg.fit(X=X, y=y)

LinearRegression()

## 회귀모형의 정확도 평가

## RMSE

<font size="4">
$$ RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2} $$

where:
- $n$ is the total number of data points
- $y_i$ is the true value of the $i$-th data point
- $\hat{y}_i$ is the predicted value of the $i$-th data point
- $(y_i - \hat{y}_i)^2$ is the squared difference between the true and predicted values of the $i$-th data point
- The square root is taken to convert the result back to the same units as the original data.
</font>    


## RMSE from scratch

In [42]:
# In-sample predictions: the model scores the same rows it was fitted on.
y_pred = reg.predict(X)
# Peek at the first five predicted values.
y_pred[0:5]

array([14.95325212, 14.04009845, 15.23055101, 14.99408418, 14.90194083])

In [43]:
# Signed error per row (prediction minus true value); negative means the model under-predicted.
signed_error = y_pred - y
signed_error

0     -3.046748
1     -0.959902
2     -2.769449
3     -1.005916
4     -2.098059
         ...   
387    1.108037
388   -8.534024
389   -0.970261
390    1.100271
391   -2.552462
Name: mpg, Length: 392, dtype: float64

In [44]:
import numpy as np

In [45]:
# RMSE by hand: square the errors, average them over the n observations, take the root.
squared_errors = (y_pred - y) ** 2
np.sqrt(squared_errors.sum() / y.count())

3.2683515153304166

## R2_score

<font size="4">
  $$ 
  R^{2} = \dfrac {TSS-RSS} {TSS} = 1 - \dfrac {RSS} {TSS} \\
  TSS = \sum_{i = 1}^{n}(y_{i}-\overline{y})^{2}\\
  RSS = \sum_{i = 1}^{n}(y_{i}-\hat{y}_{i})^{2} 
  $$

where:
- $n$ is the total number of data points
- $y_i$ is the true value of the $i$-th data point
- $\hat{y}_i$ is the predicted value of the $i$-th data point
- $\bar{y}$ is the mean of the true values
- $(y_i - \hat{y}_i)^2$ is the squared difference between the true and predicted values of the $i$-th data point
- $(y_i - \bar{y})^2$ is the squared difference between the true value and the mean of the true values
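- For a least-squares linear regression with an intercept, evaluated on the data it was fitted on (as in this notebook), the R2 score lies between 0 and 1: 0 indicates that the model does not explain any of the variance in the data, and 1 indicates that the model perfectly explains the variance in the data. In general (for example on held-out data) R2 can be negative, which means the model predicts worse than always predicting the mean $\bar{y}$.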
</font>    

## R2_score from scratch

In [46]:
# Mean of the true target values (the y-bar term used in TSS below).
y.mean()


23.44591836734694

In [47]:
# Total sum of squares: squared deviation of each true value from the mean of y.
deviation_from_mean = y - y.mean()
tss = (deviation_from_mean ** 2).sum()
tss

23818.99346938775

In [48]:
# Residual sum of squares: squared gap between each true value and its prediction.
residual = y - y_pred
rss = (residual ** 2).sum()
rss

4187.391678082951

In [49]:
# R^2 = 1 - RSS / TSS, using the two sums computed in the cells above.
unexplained_share = rss / tss
1 - unexplained_share

0.8241994699119171

## sklearn 패키지를 이용한 평가지표 구하기

In [50]:
from sklearn.metrics import mean_squared_error, r2_score

* https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
* https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

### RMSE

In [51]:
# Mean squared error from scikit-learn (not yet square-rooted).
mean_squared_error(y_true=y, y_pred=y_pred)

10.682121627762632

In [52]:
# RMSE is the square root of scikit-learn's MSE; it matches the hand-computed value above.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
np.sqrt(mse)

3.2683515153304166

### R2_Score

In [53]:
# R^2 from scikit-learn, for comparison with the from-scratch value above.
r2_score(y_true=y, y_pred=y_pred)

0.8241994699119172

## MAE
## MAE from scratch

<font size=4>
$$ MAE = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i| $$


where:
- $n$ is the total number of data points
- $y_i$ is the true value of the $i$-th data point
- $\hat{y}_i$ is the predicted value of the $i$-th data point
- $|y_i - \hat{y}_i|$ is the absolute difference between the true and predicted values of the $i$-th data point
</font>

In [73]:
# MAE by hand: average of the absolute errors.
absolute_errors = (y - y_pred).abs()
absolute_errors.mean()

2.5053892973783345

## sklearn implementation of MAE

In [55]:
from sklearn.metrics import mean_absolute_error

# MAE from scikit-learn, for comparison with the from-scratch value above.
mean_absolute_error(y_true=y, y_pred=y_pred)

2.505389297378334

## MAPE

## MAPE from scratch

<font size=4>
$$ MAPE = \frac{1}{n} \sum_{i=1}^{n} \left|\frac{y_i - \hat{y}_i}{y_i}\right| $$

where:
- $n$ is the total number of data points
- $y_i$ is the true value of the $i$-th data point
- $\hat{y}_i$ is the predicted value of the $i$-th data point
- $\left|\frac{y_i - \hat{y}_i}{y_i}\right|$ is the absolute error relative to the true value of the $i$-th data point
- The formula above, like scikit-learn's `mean_absolute_percentage_error`, returns a fraction (e.g. 0.116); multiply it by $100\%$ to express the result as a percentage
</font>

In [56]:
# MAPE by hand: average absolute error relative to the true value (a fraction, not a percent).
relative_errors = (y - y_pred) / y
relative_errors.abs().mean()

0.11649088492978556

## MAPE with sklearn

In [57]:
from sklearn.metrics import mean_absolute_percentage_error

# MAPE from scikit-learn, for comparison with the from-scratch value above.
mean_absolute_percentage_error(y_true=y, y_pred=y_pred)

0.11649088492978557

## y의 단위가 각 지표에 미치는 영향

In [82]:
# Shrink both the labels and the predictions to 1/100 of their original scale.
UNIT_SCALE = 0.01
z = y * UNIT_SCALE            # true values at 1/100 scale
z_pred = y_pred * UNIT_SCALE  # predicted values at 1/100 scale

In [83]:
# Recompute the four metrics on the rescaled data; each line prints "<label>:  <value>".
labelled_metrics = (
    ('mse', mean_squared_error),
    ('r2_score', r2_score),
    ('mae', mean_absolute_error),
    ('mape', mean_absolute_percentage_error),
)
for label, metric in labelled_metrics:
    print(label + ': ', metric(z, z_pred))

mse:  0.0010682121627762632
r2_score:  0.8241994699119172
mae:  0.02505389297378334
mape:  0.11649088492978557


## MAPE의 값이 이상하게 나오는 경우

In [84]:
# Overwrite the label at index 0 with a value very close to zero.
z.loc[0] = 1e-19

In [85]:
# Print MSE, R^2, MAE and MAPE (in that order) after the near-zero label was injected.
for metric in (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
):
    print(metric(z, z_pred))

0.0011228848863155043
0.8191441792227707
0.025357630326779124
1717945425250.9465
