ref
1. Kaggle decision tree
https://www.kaggle.com/code/marklvl/decision-tree-regressor-on-bike-sharing-dataset
2. Avoid using $R^2$ to evaluate regression models
https://towardsdatascience.com/avoid-r-squared-to-judge-regression-model-performance-5c2bc53c8e2e

syntax ref
1. https://stackoverflow.com/questions/33643843/cant-drop-nan-with-dropna-in-pandas

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the data
# folder configured further down can be read like a local directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [82]:
import os
import glob
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor as rfr


import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [6]:
# Folder (under the mounted Google Drive) that holds the LEV3 CSV files to load.
FOLDERNAME_LEV3 = "/content/drive/MyDrive/서울대/2022-1/기계시스템설계1/data/LEV3"


# Utility functions

In [7]:
#utility functions

def load_data(FOLDRENAME):
    """Load every ``.csv`` file directly inside a folder into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is the same on every run and every filesystem.

    Parameters
    ----------
    FOLDRENAME : str or path-like
        Directory to scan (not recursive). The parameter spelling is kept
        as-is so existing keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of all CSV files concatenated, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` file.
    """
    directory = os.fsdecode(FOLDRENAME)

    # os.listdir returns entries in arbitrary order; sort so the concatenated
    # row order (and any seeded split done on it later) is reproducible.
    csv_paths = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".csv")
    )

    # Fail with a message that names the folder instead of pandas' generic
    # "No objects to concatenate".
    if not csv_paths:
        raise ValueError(f"No .csv files found in {directory!r}")

    return pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

# Loading Data

In [8]:
# Read every CSV in the LEV3 folder into one combined DataFrame.
df = load_data(FOLDERNAME_LEV3)

In [9]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,IDX0002SantafeTMOOO19_KMHS281BBKU151711_LSMITI137M000D01995FF,CAL_Time_sec,CAL_SerialTime_none,AUX_RTCDate_YYYYMMDD,AUX_DeviceDate_YYMMDD,AUX_DeviceDate_hhmmss,OBD_CalEngLoad_perc,OBD_EngCoolantTemp_degC,OBD_InManiAbsPress_kPa,OBD_EngineSpeed_rpm,...,AUX_MagneticFieldZ_uT,AUX_sensSCVPosi_V,AUX_sensEGRValvePosi_V,CAL_ExhFlowrate_gps,CAL_CO2Flowrate_gphr,CAL_NOxFlowrateLNTInletValid_gps,CAL_NOxFlowrateLNTOutletValid_gps,CAL_NOxFlowrateSCROutletValid_gps,CAL_sensBaroPressSpdCorr_mbar,CAL_AltFromBarometer_m
0,1,0,737902.41228,2020-04-21 09:53:41.000,20200421,95340.99999,,,,,...,7.296222,3.930081,0.260084,,0.0,0.0,0.0,0.0,1003.749634,0.0
1,2,1,737902.412292,2020-04-21 09:53:42.000,20200421,95342.0,0.0,8.0,99.0,0.0,...,6.651423,3.931051,0.261396,0.0,0.0,0.0,0.0,0.0,1003.756198,-0.046019
2,3,2,737902.412303,2020-04-21 09:53:43.000,20200421,95343.0,0.0,8.0,99.0,0.0,...,7.2,3.928769,0.261396,0.0,0.0,0.0,0.0,0.0,1003.758468,-0.062014
3,4,3,737902.412315,2020-04-21 09:53:44.000,20200421,95344.0,0.0,8.0,99.0,0.0,...,7.150495,3.926828,0.262994,0.0,0.0,0.0,0.0,0.0,1003.756747,-0.050109
4,5,4,737902.412326,2020-04-21 09:53:45.000,20200421,95345.0,0.0,8.0,99.0,0.0,...,6.850735,3.932021,0.261054,0.0,0.0,0.0,0.0,0.0,1003.751961,-0.016767


In [10]:
# Row/column counts, dtypes and memory usage of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24595 entries, 0 to 24594
Columns: 132 entries, IDX0002SantafeTMOOO19_KMHS281BBKU151711_LSMITI137M000D01995FF to CAL_AltFromBarometer_m
dtypes: float64(124), int64(6), object(2)
memory usage: 24.8+ MB


In [11]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

Unnamed: 0,IDX0002SantafeTMOOO19_KMHS281BBKU151711_LSMITI137M000D01995FF,CAL_Time_sec,CAL_SerialTime_none,AUX_DeviceDate_YYMMDD,AUX_DeviceDate_hhmmss,OBD_CalEngLoad_perc,OBD_EngCoolantTemp_degC,OBD_InManiAbsPress_kPa,OBD_EngineSpeed_rpm,OBD_VehicleSpeed_kmph,...,AUX_MagneticFieldZ_uT,AUX_sensSCVPosi_V,AUX_sensEGRValvePosi_V,CAL_ExhFlowrate_gps,CAL_CO2Flowrate_gphr,CAL_NOxFlowrateLNTInletValid_gps,CAL_NOxFlowrateLNTOutletValid_gps,CAL_NOxFlowrateSCROutletValid_gps,CAL_sensBaroPressSpdCorr_mbar,CAL_AltFromBarometer_m
count,24595.0,24595.0,24595.0,24595.0,24595.0,24515.0,24515.0,24515.0,24515.0,24515.0,...,24595.0,24595.0,24595.0,24516.0,24595.0,24595.0,24595.0,24595.0,24595.0,24595.0
mean,3076.056516,3075.056516,737903.954084,20200420.0,103798.164749,45.664127,79.060902,114.334194,1317.166908,49.398653,...,1.765689,3.382409,2.690374,20.113594,7966.967142,0.011319,0.006664,0.000812,1004.539622,4.688337
std,1777.09848,1777.09848,1.118941,1.124449,5467.682993,26.907614,16.976225,24.594252,364.340053,32.273799,...,9.088363,0.459987,0.926204,13.161787,8609.559403,0.022665,0.01742,0.003676,4.047683,26.970959
min,1.0,0.0,737902.41228,20200420.0,92810.999991,0.0,7.0,56.0,0.0,0.0,...,-106.169141,1.528638,0.25599,0.0,0.0,0.0,0.0,0.0,985.031832,-35.255621
25%,1538.0,1537.0,737903.407749,20200420.0,100903.0,27.058824,83.0,101.0,1211.89686,25.189344,...,2.638573,3.148333,2.445085,11.734617,1907.104591,0.000989,5.3e-05,0.0,1002.84357,-13.348989
50%,3075.0,3074.0,737904.407512,20200420.0,103440.0,49.411765,85.0,109.0,1376.38965,52.0,...,3.70295,3.364418,3.12273,18.309924,5409.601116,0.002251,0.000621,6e-06,1005.387653,-3.165688
75%,4612.0,4611.0,737905.396916,20200420.0,110017.0,67.896083,86.0,120.603942,1526.546235,71.0,...,4.515304,3.860122,3.253924,25.813431,11951.359177,0.010323,0.004574,0.000253,1007.368964,16.986087
max,6352.0,6351.0,737905.468079,20200420.0,113544.0,99.929661,90.0,255.0,3201.986664,129.0,...,57.3,3.948102,4.115916,149.687361,89600.459149,0.278177,0.384286,0.106603,1011.316936,135.750057


# Preprocessing

In [111]:
#set input, output features

# Columns used as model inputs. Commented-out entries are other candidate
# columns that are currently excluded.
input_features = [
    'AUX_NOxLNInlet_ppm', 
#     'AUX_NOxLNTOutlet_ppm',
#    'CAL_Time_sec', 
#    'AUX_RTCDate_YYYYMMDD', 
   'OBD_CalEngLoad_perc',
    'OBD_EngCoolantTemp_degC', 
    'OBD_EngineSpeed_rpm', 
#    'OBD_VehicleSpeed_kmph', 
#    'OBD_ActEngPerTorque_perc',
#    'OBD_EngRefTorque_Nm', 
#    'OBD_MAFSensor_gps', 
#    'OBD_EGT11_degC', 
#    'OBD_EGT12_degC',
#    'OBD_DPFDiffPress_kPa', 
#    'AUX_NOxSCROutlet_ppm', 
#    'AUX_lambdaLNTInlet_none', 
#    'AUX_lambdaLNTOutlet_none',
#    'AUX_lambdaSCROutlet_none', 
    'AUX_O2FracLNTInlet_volPerc', 
    'AUX_O2FracLNTOutlet_volPerc',
#    'AUX_O2FracSCROutlet_volPerc',
#    'AUX_SensAmbTemp_degC',
#    'AUX_SensAmbRH_perc',
#    'AUX_SensBaroPress_kPa',
    'AUX_SensTempTurbinOutlet_degC',
#    'AUX_SensTempInManiInlet_degC',
#    'AUX_SensTempTurbinInlet_degC',
    'AUX_SensTempLNTOutlet_degC',
#    'AUX_SensTempSCROutlet_degC',
#    'AUX_SensTempCompressorOutlet_degC',
#    'AUX_SensPressEGRCoolerInlet_absBar',
#    'AUX_SensDiffPressDPF_kPa',
#    'AUX_SensDiffPressSCR_kPa',
    'CAL_ExhFlowrate_gps',
#    'CAL_CO2Flowrate_gphr',
   'CAL_NOxFlowrateLNTInletValid_gps',
#    'CAL_NOxFlowrateLNTOutletValid_gps',
#    'CAL_NOxFlowrateSCROutletValid_gps'
]

# Columns used as model targets.
# NOTE(review): 'AUX_O2FracLNTOutlet_volPerc' and 'AUX_SensTempLNTOutlet_degC'
# are also listed in input_features above, so two of the three targets are
# handed to the model as inputs - confirm this overlap is intentional, since
# it inflates any score computed on those two targets.
output_features = [
    'CAL_NOxFlowrateLNTOutletValid_gps',
    'AUX_O2FracLNTOutlet_volPerc',
    'AUX_SensTempLNTOutlet_degC',
]

In [112]:
# Drop rows that have a missing value in ANY column (dropna's default),
# rebinding the name instead of mutating the loaded frame in place.
df = df.dropna()

# Cleaning thresholds
RPM_min = 100                      # excluding engine-stop
NOx_max = 1649                     # excluding clipped NOx data (since the sensor maxed out at 1650 ppm)

# Keep only rows where the engine is running and the inlet NOx reading is not clipped.
engine_running = df['OBD_EngineSpeed_rpm'] >= RPM_min
nox_not_clipped = df['AUX_NOxLNInlet_ppm'] <= NOx_max
df = df.loc[engine_running & nox_not_clipped]

In [113]:
# Split the cleaned frame into the model-input columns (X) and target columns (y).
X, y = df[input_features], df[output_features]

In [114]:
# Summary statistics of the selected input columns after cleaning.
X.describe()

Unnamed: 0,AUX_NOxLNInlet_ppm,OBD_CalEngLoad_perc,OBD_EngCoolantTemp_degC,OBD_EngineSpeed_rpm,AUX_O2FracLNTInlet_volPerc,AUX_O2FracLNTOutlet_volPerc,AUX_SensTempTurbinOutlet_degC,AUX_SensTempLNTOutlet_degC,CAL_ExhFlowrate_gps,CAL_NOxFlowrateLNTInletValid_gps
count,23833.0,23833.0,23833.0,23833.0,23833.0,23833.0,23833.0,23833.0,23833.0,23833.0
mean,241.385083,46.36428,80.316065,1339.192085,9.910926,9.941003,264.01101,246.636286,20.329752,0.011278
std,303.47914,26.349974,14.127071,320.095356,5.541563,5.49707,97.84965,112.170658,12.834047,0.022184
min,0.0,0.0,7.0,730.740352,-5.135187,-5.954211,10.9375,7.524541,4.938342,0.0
25%,64.4,27.611758,83.0,1230.723071,5.838882,5.838882,190.125,186.451848,11.968339,0.001034
50%,112.044811,50.006462,85.0,1378.145994,8.866294,8.846786,243.911923,215.573093,18.452233,0.00236
75%,286.3,67.989687,86.0,1531.652762,12.74746,12.875421,319.3125,285.454121,25.799537,0.010628
max,1648.988204,99.929661,90.0,3201.986664,22.402287,21.366533,616.816691,645.811458,149.687361,0.278177


In [115]:
# Split into train (80%) and test (20%) sets with a fixed seed, then give each
# part a fresh 0..n-1 index. reset_index returns a new frame, so nothing is
# mutated in place. train_test_split yields X_train, X_test, y_train, y_test
# in that order, which the unpacking below relies on.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

# Modeling

## Decision Tree Regressor

In [None]:
# The grid search below took too long, so only a few settings were tried by hand.
# DTR = DecisionTreeRegressor()

# param_grid = {"criterion": ["squared_error", "absolute_error"],
#               "min_samples_split": [10, 20, 40],
#               "max_depth": [2, 6, 8],
#               "min_samples_leaf": [20, 40, 100],
#               "max_leaf_nodes": [5, 20, 100],
#               }

# grid_cv_DTR = GridSearchCV(DTR, param_grid, cv=5)

# grid_cv_DTR.fit(X_train, y_train)

In [127]:
# Tried the same evaluation criterion as last year's paper; it seems to give slightly better results.
# Last year's paper probably scored well because its inputs and outputs overlapped...?
dtm = DecisionTreeRegressor(max_depth=1000000,
                            min_samples_split=2,
                            max_leaf_nodes=100000000,
                            # The tree permutes features randomly at each split, so fix
                            # the seed to make the fit and the scores below reproducible.
                            random_state=42)

dtm.fit(X_train, y_train)

# score() returns R^2; with several target columns scikit-learn reports the
# uniform average over them.
print("R-Squared on train dataset={}".format(round(dtm.score(X_train,y_train), 4)))
print("R-Squared on test dataset={}".format(round(dtm.score(X_test,y_test), 4)))

R-Squared on train dataset=1.0
R-Squared on test dataset=0.7485


## Random Forest Regressor
- data가 더 많을때는 이게 더 효과적이라길래 이걸로 해봄

# Model evaluation

In [5]:
# % pip install dtreeviz

In [123]:
# List the target column names, one per line.
for i in output_features:
    print(i)

CAL_NOxFlowrateLNTOutletValid_gps
AUX_O2FracLNTOutlet_volPerc
AUX_SensTempLNTOutlet_degC
