In [1]:
import numpy as np
import joblib
import warnings 
warnings.simplefilter('ignore')
from sklearn.metrics import r2_score ,mean_squared_error
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from sklearn.svm import SVR

def mape(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# 讀取清洗過且經過預處理的資料

In [2]:
data = joblib.load('../data/dataset.pkl')
b = data['stream']
A = np.hstack((data['action'].reshape(len(b),-1),data['state']))

batch_size = b.shape[0]

print(A.shape,b.shape)

(59213, 434) (59213, 1)


# 切分資料

In [3]:
sp1 = -10000
sp2 = -1585
A_train = A[sp1:sp2]
b_train = b[sp1:sp2]
A_test = A[sp2:]
b_test = b[sp2:]

# 用 Lasso 找到一組參數 w

In [4]:
def evaluate(y_real,y_pred,name):
    score = pd.DataFrame(index = ['表現'],columns = ['R2','MSE','MAPE'])
    score['R2'] = r2_score(y_real,y_pred)
    score['MSE'] = mean_squared_error(y_real,y_pred)
    score['MAPE'] = mape(y_real,y_pred)
    score.index.name = name
    return score

In [28]:
params = {}
s = 2.907000e-08
t = s * 0.01
e = s + 10*t
for a in tqdm(np.arange(s,e,t)):
    model = Lasso(a).fit(A_train,b_train)
    y_pred = data['mm_stream'].inverse_transform(model.predict(A_test).reshape(-1,1))
    y_real = data['mm_stream'].inverse_transform(b_test)
    params[a] = evaluate(y_real,y_pred,'bababa')

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




# 觀察不同lambda對r2分數的影響 選一個最好的lambda

In [29]:
temp = pd.DataFrame([ i.R2.values[0] for i in params.values()],index=params.keys(),columns=['R2'])
temp.index.name = 'lambda'
temp

Unnamed: 0_level_0,R2
lambda,Unnamed: 1_level_1
2.907e-08,0.649661
2.93607e-08,0.649688
2.96514e-08,0.649702
2.99421e-08,0.64971
3.02328e-08,0.649711
3.05235e-08,0.649718
3.08142e-08,0.649713
3.11049e-08,0.649696
3.13956e-08,0.64968
3.16863e-08,0.649653


In [35]:
a = temp.index[np.argmax(temp)]
a

3.05235e-08

# 建模

In [36]:
model = Lasso(a).fit(A_train,b_train)

In [37]:
data['action_col']

['MLPAP_FQ-0619.PV',
 'MLPAP_TJC-0627.PV',
 'MLPAP_TRC-0610.PV',
 'MLPAP_FR-0632.PV',
 'MLPAP_TJC-0757A.PV',
 'MLPAP_PIC-0761.PV',
 'MLPAP_TJC-1721.PV',
 'MLPAP_TRC-1734.PV',
 'MLPAP_TRC-0747.PV',
 'MLPAP_TJ-0881.PV',
 'MLPAP_TJ-0757B.PV',
 '濃度']

In [38]:
data['action'].shape

(59213, 36, 12)

In [39]:
action = data['mm_action'].inverse_transform(data['action'].reshape(59213*36,12))
feed = action[:,0]
feed = feed[::36]
feed.shape

(59213,)

In [40]:
y_pred = data['mm_stream'].inverse_transform(model.predict(A_train).reshape(-1,1)).reshape(-1)
y_real = data['mm_stream'].inverse_transform(b_train).reshape(-1)
evaluate(feed[:8415]/y_real,feed[:8415]/y_pred,'train(蒸氣單耗)')

Unnamed: 0_level_0,R2,MSE,MAPE
train(蒸氣單耗),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
表現,0.998436,5.73289e-07,0.270739


In [41]:
y_pred = data['mm_stream'].inverse_transform(model.predict(A_test).reshape(-1,1)).reshape(-1)
y_real = data['mm_stream'].inverse_transform(b_test).reshape(-1)
evaluate(feed[-1585:]/y_real,feed[-1585:]/y_pred,'test(蒸氣單耗)')

Unnamed: 0_level_0,R2,MSE,MAPE
test(蒸氣單耗),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
表現,0.684766,5.142293e-07,0.2947


# 保存權重

In [42]:
joblib.dump(model.coef_,'../model/stream_lasso_weight.pkl')

['../model/stream_lasso_weight.pkl']