# 다중 선형 회귀 (Multiple Linear Regression)

## 오존량 예측 모델
* 독립변수 - 온도, 태양광 세기, 바람 세기
* 종속변수 - 오존량

In [31]:
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import stats 
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model

## Data Preprocessing
-----

### 1. Raw Data Loading

In [32]:
# Load the raw ozone measurements and keep only the modelling columns:
# the three independent variables followed by the dependent variable (Ozone).
model_columns = ['Temp', 'Wind', 'Solar.R', 'Ozone']
df = pd.read_csv('data/ozone.csv')
training_data = df.loc[:, model_columns]
display(training_data)  # 153 rows × 4 columns

Unnamed: 0,Temp,Wind,Solar.R,Ozone
0,67,7.4,190.0,41.0
1,72,8.0,118.0,36.0
2,74,12.6,149.0,12.0
3,62,11.5,313.0,18.0
4,56,14.3,,
...,...,...,...,...
148,70,6.9,193.0,30.0
149,77,13.2,145.0,
150,75,14.3,191.0,14.0
151,76,8.0,131.0,18.0


### 2. 결측치(NaN) 제거

In [33]:
# Keep only the rows in which every one of the four columns has a value
# (i.e. drop a row as soon as any column is NaN).
has_all_values = training_data.notna().all(axis=1)
training_data = training_data.loc[has_all_values]
display(training_data)  # 111 rows × 4 columns

Unnamed: 0,Temp,Wind,Solar.R,Ozone
0,67,7.4,190.0,41.0
1,72,8.0,118.0,36.0
2,74,12.6,149.0,12.0
3,62,11.5,313.0,18.0
6,65,8.6,299.0,23.0
...,...,...,...,...
147,63,16.6,20.0,14.0
148,70,6.9,193.0,30.0
150,75,14.3,191.0,14.0
151,76,8.0,131.0,18.0


### 3. 이상치(Outlier) 제거
* 독립변수의 이상치(지대점) - Temp, Wind, Solar.R
* 종속변수의 이상치(outlier) - Ozone

#### 독립변수와 종속변수에 대한 이상치를 한꺼번에 처리

In [34]:
zscore_threshold = 1.8


# Several independent variables and one dependent variable are filtered by the
# same loop, one column at a time.
print(training_data.columns)
for column_name in training_data.columns:
    # The z-scores are computed on the rows that survived the previous columns'
    # filters, so the result depends on the column order.
    column_zscores = stats.zscore(training_data[column_name])
    # Negate the "too far from the mean" test rather than using <=, so a NaN
    # z-score keeps its row.
    is_inlier = np.logical_not(np.abs(column_zscores) > zscore_threshold)
    training_data = training_data.loc[is_inlier]

print(training_data.shape)  # (86, 4)
display(training_data)

Index(['Temp', 'Wind', 'Solar.R', 'Ozone'], dtype='object')
(86, 4)


Unnamed: 0,Temp,Wind,Solar.R,Ozone
0,67,7.4,190.0,41.0
1,72,8.0,118.0,36.0
2,74,12.6,149.0,12.0
3,62,11.5,313.0,18.0
6,65,8.6,299.0,23.0
...,...,...,...,...
146,69,10.3,49.0,7.0
148,70,6.9,193.0,30.0
150,75,14.3,191.0,14.0
151,76,8.0,131.0,18.0


### 4. 정규화 처리

Sklearn 라이브러리의 Min-Max Scaler를 사용하여 정규화 처리를 진행
* 값을 변환시키는 역할을 하는 Min-Max Scaler라고 불리는 객체 생성
* 독립변수와 종속변수에 대해 각각 객체를 생성

#### 독립변수
Fancy Indexing을 통해 여러 개의 독립변수에 대한 정규화 처리 진행

In [35]:
# Min-Max scale the three independent variables together.
# Selecting a list of columns and taking .values yields a 2-D array.
feature_values = training_data[['Temp', 'Wind', 'Solar.R']].values
scaler_x = MinMaxScaler().fit(feature_values)

# scaler_x stays in scope: the prediction cell applies this same fitted scaler
# to its input.
training_data_x = scaler_x.transform(feature_values)
display(training_data_x)
             

array([[0.18181818, 0.25688073, 0.51677852],
       [0.33333333, 0.31192661, 0.27516779],
       [0.39393939, 0.73394495, 0.37919463],
       [0.03030303, 0.63302752, 0.9295302 ],
       [0.12121212, 0.36697248, 0.88255034],
       [0.24242424, 0.46788991, 0.73825503],
       [0.15151515, 0.42201835, 0.85234899],
       [0.21212121, 0.57798165, 0.79865772],
       [0.09090909, 0.63302752, 1.        ],
       [0.15151515, 0.67889908, 0.90939597],
       [0.21212121, 0.63302752, 0.95973154],
       [0.03030303, 0.46788991, 0.02684564],
       [0.        , 0.67889908, 0.18791946],
       [0.60606061, 0.94495413, 0.72483221],
       [0.45454545, 0.25688073, 0.81543624],
       [0.63636364, 0.46788991, 0.30536913],
       [0.87878788, 0.8440367 , 0.8557047 ],
       [0.78787879, 0.63302752, 0.96308725],
       [0.63636364, 0.31192661, 0.37583893],
       [0.48484848, 0.94495413, 0.52013423],
       [0.12121212, 0.42201835, 0.0033557 ],
       [0.36363636, 0.63302752, 0.28187919],
       [0.

#### 종속변수

In [36]:
# Min-Max scale the dependent variable with its own scaler.
# Selecting Ozone as a one-column DataFrame gives the (n_samples, 1) 2-D array
# directly, with no reshape of a 1-D Series needed.
target_values = training_data[['Ozone']].values
scaler_t = MinMaxScaler().fit(target_values)

# scaler_t stays in scope: the prediction cell uses it to map model output back
# to the original Ozone scale.
training_data_t = scaler_t.transform(target_values)
display(training_data_t)

array([[0.37777778],
       [0.32222222],
       [0.05555556],
       [0.12222222],
       [0.17777778],
       [0.1       ],
       [0.04444444],
       [0.07777778],
       [0.07777778],
       [0.3       ],
       [0.25555556],
       [0.04444444],
       [0.27777778],
       [0.42222222],
       [0.33333333],
       [0.24444444],
       [0.71111111],
       [0.35555556],
       [0.17777778],
       [0.15555556],
       [0.14444444],
       [0.05555556],
       [0.06666667],
       [0.46666667],
       [0.27777778],
       [0.63333333],
       [0.36666667],
       [0.77777778],
       [1.        ],
       [1.        ],
       [0.86666667],
       [0.03333333],
       [0.22222222],
       [0.        ],
       [0.45555556],
       [0.31111111],
       [0.6       ],
       [0.8       ],
       [0.62222222],
       [0.81111111],
       [0.14444444],
       [0.5       ],
       [0.83333333],
       [0.47777778],
       [0.63333333],
       [0.57777778],
       [0.35555556],
       [0.1  

## Modeling
-----
### 1. Tensorflow

In [39]:
# Training Data Set
# Rebuild the scaled arrays from the fitted scalers so this cell does not rely
# on whatever training_data_x / training_data_t currently hold (the Sklearn
# cell rebinds both names to unscaled values).
feature_columns = ['Temp', 'Wind', 'Solar.R']
training_data_x = scaler_x.transform(training_data[feature_columns].values)
training_data_t = scaler_t.transform(training_data[['Ozone']].values)

# Placeholders: any number of rows; 3 feature columns in, 1 target column out
X = tf.placeholder(shape=[None, 3], dtype=tf.float32)
T = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# Weight & bias, randomly initialised
W = tf.Variable(tf.random.normal([3, 1]), name='weight')
b = tf.Variable(tf.random.normal([1]), name='bias')

# Hypothesis: linear model X·W + b
H = tf.add(tf.matmul(X, W), b)

# Loss function: mean of the squared difference between hypothesis and target
squared_error = tf.square(H - T)
loss = tf.reduce_mean(squared_error)

# train - Gradient Descent Algorithm
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-4)
train = optimizer.minimize(loss)

# Session and variable initialisation
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Learning: every step feeds the whole training set; progress is printed
# every 30000 steps (ten lines over the run).
total_steps = 300000
log_every = 30000
training_feed = {X: training_data_x, T: training_data_t}
fetches = [train, W, b, loss]
for step in range(total_steps):
    _, W_val, b_val, loss_val = sess.run(fetches, feed_dict=training_feed)

    if step % log_every != 0:
        continue
    print('W: {}, b: {}, loss: {}'.format(W_val, b_val, loss_val))

W: [[-0.19128387]
 [ 2.3624008 ]
 [ 1.0309532 ]], b: [-0.05253943], loss: 2.1682400703430176
W: [[0.1060145]
 [1.1861943]
 [0.544905 ]], b: [-0.591002], loss: 0.2557504177093506
W: [[0.47458383]
 [0.66970867]
 [0.46399206]], b: [-0.48883507], loss: 0.11278313398361206
W: [[0.6757769 ]
 [0.35086977]
 [0.40204966]], b: [-0.40463257], loss: 0.06067613139748573
W: [[0.78021014]
 [0.15025645]
 [0.354778  ]], b: [-0.33491597], loss: 0.040877338498830795
W: [[0.82937515]
 [0.02092053]
 [0.31869274]], b: [-0.276924], loss: 0.03280353173613548
W: [[ 0.8475587 ]
 [-0.06496914]
 [ 0.29108083]], b: [-0.22847974], loss: 0.029147829860448837
W: [[ 0.8488571 ]
 [-0.12396806]
 [ 0.2698459 ]], b: [-0.18786746], loss: 0.027267973870038986
W: [[ 0.8415172 ]
 [-0.16597898]
 [ 0.25344366]], b: [-0.15372781], loss: 0.026175765320658684
W: [[ 0.8299871 ]
 [-0.19696222]
 [ 0.24070056]], b: [-0.12490664], loss: 0.025477861985564232


### 2. Sklearn

In [46]:
# Training Data Set
# training_data has already had its NaN rows dropped and its z-score outliers
# removed by the preprocessing cells. The values taken here are on the
# original scale: no Min-Max scaling is applied.
training_data_x = training_data[['Temp', 'Wind', 'Solar.R']].values
training_data_t = training_data[['Ozone']].values

# Model
# The author marks fitting this model on the Min-Max-scaled arrays as the wrong
# choice. It is fitted on unscaled values instead, and the Sklearn prediction
# cell passes raw inputs straight to predict().
model = linear_model.LinearRegression().fit(training_data_x, training_data_t)
print('W: {}, b: {}'.format(model.coef_, model.intercept_))

W: [[ 1.9400749  -2.7453281   0.05651878]], b: [-97.42698439]


## Prediction
-----
* 온도: 80
* 바람: 10
* 태양광 세기: 150

### 1. Tensorflow
정규화한 데이터로 모델을 학습했다면, 예측할 때 입력값도 학습에 사용한 scaler(`scaler_x`)로 동일하게 정규화해야 한다. <br>
또한 모델이 출력한 예측값은 정규화된 스케일이므로, `scaler_t`로 역변환(디스케일링)하여 원래 스케일로 복원해야 한다.

In [47]:
# WRONG approach (kept as a warning, not executed): feeding the raw values
# [[80.0, 10.0, 150.0]] directly into H. The model was trained on Min-Max-scaled
# inputs; the author recorded [[97.68829]] for that run.

# Proper Prediction Method
# Normalization: scale the query with scaler_x, the scaler fitted on the
# training features, in the same column order (Temp, Wind, Solar.R).
predict_data = np.array([[80.0, 10.0, 150.0]])
scaled_predict_data = scaler_x.transform(predict_data)
prediction_feed = {X: scaled_predict_data}
tensorflow_result = sess.run(H, feed_dict=prediction_feed)
print(tensorflow_result)  # [[0.34895673]]

# Denormalization: scaler_t maps the scaled output back to the Ozone scale.
tensorflow_result = scaler_t.inverse_transform(tensorflow_result)
print(tensorflow_result)  # [[38.406105]]

[[0.34895673]]
[[38.406105]]


### 2. Sklearn

In [42]:
# Query row in the training column order (Temp, Wind, Solar.R). Raw values are
# passed because the sklearn model was fitted on unscaled data.
sklearn_query = [[80, 10, 150]]
sklearn_result = model.predict(sklearn_query)
print(sklearn_result)  # [[38.8035437]]

[[38.8035437]]
