In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import plotly.express as px

1. Z1-Temp값을 목표변수로 회귀 모델을 구성하고자 한다. 각 소입로(Tube Furnace) 온도계수(Temp)와 열전도율(OP), CP, ST값을 입력했을 때, Z1-Temp를 예측할 수 있는 모델을 **다중 선형 회귀 모델**로 구성하시오.
    - X : 'Tube Furnace CP', 'Tube Furnace1 OP',
    'Tube Furnace1 Temp', 'Tube Furnace2 OP', 'Tube Furnace2 Temp',
    'Tube Furnace3 OP', 'Tube Furnace3 Temp', 'Tube Furnace4 OP',
    'Tube Furnace4 Temp', 'ST'
    - 먼저 다중 선형회귀 모델을 구성한 뒤, 성능을 확인하시오.
    - 다항회귀분석을 이용해, 3차원 다항 회귀 모델을 구성한 뒤, 성능을 확인하시오.
    - 규제선형회귀 모델 Lasso 활용해 회귀모델을 구성한 뒤, 성능을 확인하시오.
    - 본인이 만든 모델 중 가장 성능이 좋은 모델을 Best.sav로 저장하시오.
    - 가장 성능이 좋은 모델의 회귀 계수를 확인하시오.
    - 현재 만든 모델을 "08_Data_Test.csv"파일을 넣어, Test Set Score (R을 예측하는 함수를 구성하시오.

In [2]:
df1 = pd.read_csv(r'C:\Users\UserK\Desktop\Ranee\data\ML\08_Data.csv')

In [30]:
print(df1.shape)
df1.info()

(93043, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93043 entries, 0 to 93042
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          93043 non-null  int64  
 1   Code_Num            93043 non-null  object 
 2   Datetime            93043 non-null  object 
 3   Process_Type        93043 non-null  object 
 4   ST                  93028 non-null  float64
 5   Tube Furnace CP     93043 non-null  float64
 6   Tube Furnace1 OP    92864 non-null  float64
 7   Tube Furnace1 Temp  93039 non-null  float64
 8   Tube Furnace2 OP    93043 non-null  float64
 9   Tube Furnace2 Temp  93041 non-null  float64
 10  Tube Furnace3 OP    93043 non-null  float64
 11  Tube Furnace3 Temp  93036 non-null  float64
 12  Tube Furnace4 OP    93043 non-null  float64
 13  Tube Furnace4 Temp  93038 non-null  float64
 14  Z1-OP1              93043 non-null  float64
 15  Z1-OP2              93043 non-null  float

In [31]:
df1.isnull().sum()

Unnamed: 0              0
Code_Num                0
Datetime                0
Process_Type            0
ST                     15
Tube Furnace CP         0
Tube Furnace1 OP      179
Tube Furnace1 Temp      4
Tube Furnace2 OP        0
Tube Furnace2 Temp      2
Tube Furnace3 OP        0
Tube Furnace3 Temp      7
Tube Furnace4 OP        0
Tube Furnace4 Temp      5
Z1-OP1                  0
Z1-OP2                  0
Z1-Temp                15
dtype: int64

In [35]:
df2 = df1.dropna(subset=['Z1-Temp'])
df2.shape

(93028, 17)

In [36]:
X = df2[['Tube Furnace CP', 'Tube Furnace1 OP', 'Tube Furnace1 Temp', 'Tube Furnace2 OP', 'Tube Furnace2 Temp',
     'Tube Furnace3 OP', 'Tube Furnace3 Temp', 'Tube Furnace4 OP', 'Tube Furnace4 Temp', 'ST']]
Y = df2['Z1-Temp']

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

In [38]:
X_train, X_test, Y_train, Y_test= train_test_split(X,Y,random_state=1234)

In [39]:
prepro_pipe = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
model_pipe = make_pipeline( prepro_pipe, LinearRegression() )
grid_model = GridSearchCV(model_pipe, param_grid={}, cv=3)
grid_model.fit(X_train,Y_train)

best_model = grid_model.best_estimator_

In [40]:
best_model['linearregression'].intercept_

100.50931113512884

In [41]:
best_model['linearregression'].coef_ # 회귀 모델의 회귀 계수를 계산
df_coef = pd.DataFrame()
df_coef['coef'] = best_model['linearregression'].coef_  # 회귀 모델의 회귀 계수를 계산
df_coef['X'] = X.columns

In [42]:
df_coef.sort_values(by='coef') 

Unnamed: 0,coef,X
0,-0.023549,Tube Furnace CP
7,-0.015409,Tube Furnace4 OP
4,-0.001692,Tube Furnace2 Temp
8,-0.000421,Tube Furnace4 Temp
3,-6.8e-05,Tube Furnace2 OP
6,0.000355,Tube Furnace3 Temp
2,0.000764,Tube Furnace1 Temp
5,0.001888,Tube Furnace3 OP
1,0.004619,Tube Furnace1 OP
9,0.12519,ST


In [44]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [47]:
Y_train_pred = best_model.predict(X_train)
Y_test_pred = best_model.predict(X_test)

In [53]:
print('학습 결정계수 : ', r2_score(Y_test, Y_test_pred))
print('일반화 결정계수 : ', r2_score(Y_train, Y_train_pred))
print('학습 MSE : ', mean_squared_error(Y_test, Y_test_pred))
print('일반화 MSE : ', mean_squared_error(Y_train, Y_train_pred))
print('학습 MAE : ', mean_absolute_error(Y_test, Y_test_pred))
print('일반화 MAE : ', mean_absolute_error(Y_train, Y_train_pred))

학습 결정계수 :  0.08247320323191787
일반화 결정계수 :  0.08043810458110523
학습 MSE :  0.18922059253324164
일반화 MSE :  0.18909528020467534
학습 MAE :  0.3287498866472179
일반화 MAE :  0.3288878463344254


In [52]:
from sklearn.preprocessing import PolynomialFeatures

In [54]:
model_pipe2 = make_pipeline(prepro_pipe, PolynomialFeatures(degree=3), LinearRegression())
grid_model2 = GridSearchCV(model_pipe2, param_grid={}, cv=3)
grid_model2.fit(X_train,Y_train)
best_model2 = grid_model2.best_estimator_
Y_train_pred = best_model2.predict(X_train)
Y_test_pred = best_model2.predict(X_test)

In [55]:
print('학습 결정계수 : ', r2_score(Y_test, Y_test_pred))
print('일반화 결정계수 : ', r2_score(Y_train, Y_train_pred))
print('학습 MSE : ', mean_squared_error(Y_test, Y_test_pred))
print('일반화 MSE : ', mean_squared_error(Y_train, Y_train_pred))
print('학습 MAE : ', mean_absolute_error(Y_test, Y_test_pred))
print('일반화 MAE : ', mean_absolute_error(Y_train, Y_train_pred))

학습 결정계수 :  0.16131851074830095
일반화 결정계수 :  0.16798778066293907
학습 MSE :  0.1729604071530792
일반화 MSE :  0.17109189118540608
학습 MAE :  0.31958748879983095
일반화 MAE :  0.31898167503751657


In [56]:
from sklearn.linear_model import Lasso