### 기본 라이브러리 import

In [None]:
# Mount Google Drive at /content/drive/; the dataset and the saved model files
# used later in this notebook live under that path.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [None]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGB
from lightgbm import LGBMRegressor as LGBM
from catboost import CatBoostRegressor as CBR

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

### csv 파일 불러오기 후 가변수화 및 x,y 나누기

In [None]:
# Load the repair-payment table; the first CSV column is read as the index.
repair_data = pd.read_csv(
    '/content/drive/MyDrive/빅프로젝트/repair_payment.csv',
    index_col=0,
)

In [None]:
# Throw away the index that came from the CSV and renumber the rows from 0.
repair_data = repair_data.reset_index(drop=True)

In [None]:
# Column dtypes and non-null counts.
repair_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62244 entries, 0 to 62243
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          62244 non-null  object
 1   product_date  62244 non-null  int64 
 2   part          62244 non-null  object
 3   payment       62244 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [None]:
# Summary statistics of the `payment` column (the regression target below).
repair_data['payment'].describe()

count     62244.000000
mean     111964.081630
std       33951.328589
min       40000.000000
25%       82410.000000
50%      112120.000000
75%      136460.000000
max      199900.000000
Name: payment, dtype: float64

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Name of the column the models are trained to predict.
target = 'payment'

In [None]:
# Separate the label column from the predictor columns.
y = repair_data[target]
x = repair_data.drop(columns=[target])

x.shape, y.shape

((62244, 3), (62244,))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2023,
)

tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((49795, 3), (12449, 3), (49795,), (12449,))

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every feature column into a dense array. The encoder learns its
# categories from the training rows only and is then applied unchanged to the
# test rows.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(x_train)
x_train = encoder.transform(x_train)
x_test = encoder.transform(x_test)

In [None]:
# First encoded training row.
x_train[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0.])

In [None]:
# Row count and width of the encoded training matrix.
x_train.shape

(49795, 38)

### 모델 성능 비교

In [None]:
# Candidate regressors, all with library-default hyperparameters.
# The stochastic learners get the same seed as the train/test split (2023) so
# that a Restart & Run All reproduces the same scores.
model_KNN = KNeighborsRegressor()  # has no random_state parameter
model_DTR = DTR(random_state=2023)
model_RFR = RFR(random_state=2023)
model_GBR = GBR(random_state=2023)
model_XGB = XGB(random_state=2023)
model_LGBM = LGBM(random_state=2023)
# verbose=0 silences CatBoost's per-iteration training log.
model_CBR = CBR(random_state=2023, verbose=0)

In [None]:
# Models scored together in the comparison loop, keyed by the label used in the
# results table. model_CBR is not included here; it is fitted and scored in its
# own cell.
model_list = {
    'KNN': model_KNN,
    'DTR': model_DTR,
    'RFR': model_RFR,
    'GBR': model_GBR,
    'XGB': model_XGB,
    'LGBM': model_LGBM,
}

In [None]:
# Empty results table; one row per evaluated model is appended to it later.
model_perform = pd.DataFrame(columns=['model', 'rmse', 'mae', 'r2'])

In [None]:
# Fit every candidate on the training split and score it on the held-out split.
# Scores are collected in a list and appended to model_perform in one step,
# rather than re-concatenating the DataFrame on every iteration.
score_rows = []
for model_name, model in model_list.items():
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    # Take the square root of the unrounded MSE, then round the RMSE itself
    # (rounding the MSE first left the RMSE with unbounded decimals).
    rmse = round(float(np.sqrt(mse(y_test, y_pred))), 2)
    mae = round(float(mean_absolute_error(y_test, y_pred)), 2)
    r2 = round(float(r2_score(y_test, y_pred)), 2)

    score_rows.append({'model': model_name, 'rmse': rmse, 'mae': mae, 'r2': r2})

model_perform = pd.concat([model_perform, pd.DataFrame(score_rows)], ignore_index=True)

In [None]:
# Fit CatBoost, score it on the held-out split and append its row to the
# results table.
model_CBR.fit(x_train, y_train)

y_pred = model_CBR.predict(x_test)

# RMSE is the square root of the unrounded MSE; rounding is applied to the
# final value so it is reported at the same precision as MAE.
rmse = round(float(np.sqrt(mse(y_test, y_pred))), 2)
mae = round(float(mean_absolute_error(y_test, y_pred)), 2)
r2 = round(float(r2_score(y_test, y_pred)), 5)

cbr_row = pd.DataFrame([{'model': 'CBR', 'rmse': rmse, 'mae': mae, 'r2': r2}])
model_perform = pd.concat([model_perform, cbr_row], ignore_index=True)

In [None]:
# Scores obtained with one-hot-encoded features
model_perform

Unnamed: 0,model,rmse,mae,r2
0,KNN,24597.416546,19019.99,0.47
1,DTR,22903.545961,18247.8,0.54
2,RFR,22871.239265,18237.89,0.54
3,GBR,23231.516648,18686.25,0.53
4,XGB,22861.423217,18232.6,0.54
5,LGBM,22843.217768,18267.43,0.55
6,CBR,22844.98132,18228.46,0.54519


In [None]:
# One hand-written sample, with the same three columns as the training
# features, encoded by the already-fitted encoder.
sample_row = {
    'name': ['그랜저'],
    'product_date': [2017],
    'part': ['앞범퍼'],
}
test = encoder.transform(pd.DataFrame(sample_row))

In [None]:
# Predicted payment for the encoded sample, using the LightGBM model that was
# fitted in the comparison loop.
pred = model_LGBM.predict(test)
pred

array([118112.04638366])

### 딥러닝 모델

In [None]:
# The second dimension is the feature count used as the network's input width.
x_train.shape

(49795, 38)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Early-stopping callback: watches 'mse' with a patience of 5 epochs and
# restore_best_weights enabled. 'mse' is the metric name registered in
# compile(metrics=['mse']); the fit calls in this notebook pass no validation
# data, so the value being watched is the training-set MSE.
es = EarlyStopping(monitor='mse', patience=5, min_delta=0, restore_best_weights=True)

In [None]:
# Baseline network: two 256-unit ReLU layers feeding a single linear output.
keras.backend.clear_session()

model_DNN = keras.models.Sequential()

# `shape` has to be a tuple. `(n)` without a trailing comma is just the int n,
# which newer Keras releases reject, so the one-element tuple is spelled out.
model_DNN.add(keras.layers.Input(shape=(x_train.shape[1],)))
model_DNN.add(keras.layers.Dense(256, activation='relu'))
model_DNN.add(keras.layers.Dense(256, activation='relu'))
model_DNN.add(keras.layers.Dense(1))

# Optimised on MAE; MSE is reported as an additional metric.
model_DNN.compile(optimizer='adam', loss='mae', metrics=['mse'])

model_DNN.summary()

In [None]:
# Train for at most 20 epochs; the `es` callback may end training earlier.
model_DNN.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=20,
    callbacks=[es],
)

In [None]:
# Network prediction for the single encoded sample (rebinds y_pred).
y_pred = model_DNN.predict(test)



In [None]:
# Show the network's prediction for the sample.
y_pred

array([[109458.09]], dtype=float32)

In [None]:
# Same architecture as the baseline network, with 20% dropout after each
# hidden layer.
keras.backend.clear_session()

model_2 = keras.models.Sequential()

# `shape` has to be a tuple. `(n)` without a trailing comma is just the int n,
# which newer Keras releases reject, so the one-element tuple is spelled out.
model_2.add(keras.layers.Input(shape=(x_train.shape[1],)))
model_2.add(keras.layers.Dense(256, activation='relu'))
model_2.add(keras.layers.Dropout(0.2))
model_2.add(keras.layers.Dense(256, activation='relu'))
model_2.add(keras.layers.Dropout(0.2))
model_2.add(keras.layers.Dense(1))

# Optimised on MAE; MSE is reported as an additional metric.
model_2.compile(optimizer='adam', loss='mae', metrics=['mse'])

model_2.summary()

In [None]:
# Train for at most 20 epochs; the `es` callback may end training earlier.
model_2.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=20,
    callbacks=[es],
)

### 고도화

LGBM이 괜찮은 성능을 보임


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Fresh, unfitted LightGBM regressor used as the grid-search base estimator.
# This rebinds the name model_LGBM, which previously held the fitted model.
model_LGBM = LGBM()

In [None]:
# Hyperparameter values to search over: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    learning_rate=[0.01, 0.1, 0.5],
)

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, ranking candidates
# by negated mean absolute error (closer to zero is better).
grid_search = GridSearchCV(
    model_LGBM,
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)


In [None]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(x_train, y_train)

In [None]:
# Winning parameter combination and its mean cross-validated score
# (negated MAE, because of the scoring option chosen above).
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Best Parameters:  {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}
Best Score:  -18235.940858340447


In [None]:
# The estimator GridSearchCV refit with the winning parameters.
best_model = grid_search.best_estimator_

In [None]:
# Predictions of the tuned model on the held-out test split
# (the variable is called "val", but the input is x_test).
val_predictions = best_model.predict(x_test)

In [None]:
# Held-out scores of the tuned model. The RMSE is the square root of the
# unrounded MSE and is rounded afterwards, to the same precision as MAE.
rmse = round(float(np.sqrt(mse(y_test, val_predictions))), 2)
mae = round(float(mean_absolute_error(y_test, val_predictions)), 2)
r2 = round(float(r2_score(y_test, val_predictions)), 5)

In [None]:
# RMSE, MAE and R^2 of the tuned model on the test split.
print(rmse, mae, r2)

22842.423020392343 18241.51 0.54529


In [None]:
import joblib

In [None]:
# Persist the fitted encoder and the tuned LightGBM model.
# best_model (the grid-search winner) is what gets saved. The bare name `model`
# is only the loop variable left over from the comparison cell, i.e. the
# default-parameter LGBM that was last in model_list.
joblib.dump(encoder, '/content/drive/MyDrive/빅프로젝트/model/encoder.pkl')
joblib.dump(best_model, '/content/drive/MyDrive/빅프로젝트/model/LGBM.pkl')

['/content/drive/MyDrive/빅프로젝트/model/LGBM.pkl']