In [317]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR,SVC
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error 
from sklearn.preprocessing import OneHotEncoder
from pycaret.regression import *

In [284]:
df = pd.read_csv('train.csv')

In [285]:
df = df.drop(['filename','ID','date'],axis=1)

In [286]:
label_encoder = LabelEncoder()
encoded_colors = label_encoder.fit_transform(df['sex'])
df['sex'] = encoded_colors

In [287]:
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(df[['operator']])
df = pd.concat([df, pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['operator']))], axis=1)
df = df.drop('operator',axis=1)

In [288]:
df = df.dropna()

In [289]:
x_train = df.drop('time_min', axis=1)
y_train = df['time_min']

In [290]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=41)

In [291]:
train = pd.concat([x_train, y_train], axis=1)
val = pd.concat([x_val, y_val], axis=1)

In [292]:
ms = MinMaxScaler()
x_train_ms = ms.fit_transform(x_train)
x_val_ms = ms.transform(x_val)

In [331]:
model_xgb = XGBRegressor(n_estimators=500, max_depth=3)
model_xgb.fit(x_train_ms, y_train)

model_rf = RandomForestRegressor(n_estimators=500, max_depth=3)
model_rf.fit(x_train_ms, y_train)

model_hb = HuberRegressor()
model_hb.fit(x_train_ms, y_train)

model_cb = CatBoostRegressor(iterations=50, depth=4)
model_cb.fit(x_train_ms, y_train)

Learning rate set to 0.394541
0:	learn: 7.2412991	total: 1.26ms	remaining: 61.7ms
1:	learn: 6.8588726	total: 3.4ms	remaining: 81.7ms
2:	learn: 6.5458504	total: 5.35ms	remaining: 83.8ms
3:	learn: 6.4542294	total: 6.05ms	remaining: 69.5ms
4:	learn: 6.3124100	total: 7.57ms	remaining: 68.1ms
5:	learn: 6.2085870	total: 9.28ms	remaining: 68ms
6:	learn: 6.1362322	total: 10.9ms	remaining: 66.9ms
7:	learn: 6.0716105	total: 12.4ms	remaining: 65.2ms
8:	learn: 5.9507910	total: 13.9ms	remaining: 63.4ms
9:	learn: 5.8410148	total: 16ms	remaining: 63.9ms
10:	learn: 5.7771414	total: 17.8ms	remaining: 63.2ms
11:	learn: 5.7323500	total: 19.5ms	remaining: 61.8ms
12:	learn: 5.6779202	total: 21.3ms	remaining: 60.8ms
13:	learn: 5.6021385	total: 23.3ms	remaining: 59.9ms
14:	learn: 5.5611823	total: 25.2ms	remaining: 58.9ms
15:	learn: 5.5208200	total: 26.6ms	remaining: 56.4ms
16:	learn: 5.4542068	total: 28.5ms	remaining: 55.4ms
17:	learn: 5.3838415	total: 30.4ms	remaining: 54ms
18:	learn: 5.3538295	total: 32.3m

<catboost.core.CatBoostRegressor at 0x22514ef1f70>

In [332]:
xgb_pred_train = model_xgb.predict(x_train_ms)
print(mean_absolute_percentage_error(y_train, xgb_pred_train))

rf_pred_train = model_rf.predict(x_train_ms)
print(mean_absolute_percentage_error(y_train, rf_pred_train))

hb_pred_train = model_hb.predict(x_train_ms)
print(mean_absolute_percentage_error(y_train, hb_pred_train))

cb_pred_train = model_cb.predict(x_train_ms)
print(mean_absolute_percentage_error(y_train, cb_pred_train))

0.03619540782623542
0.4800199667759742
0.4673340939786657
0.3144942141977625


In [333]:
xgb_pred = model_xgb.predict(x_val_ms)
print(mean_absolute_percentage_error(y_val, xgb_pred))

rf_pred = model_rf.predict(x_val_ms)
print(mean_absolute_percentage_error(y_val,rf_pred))

hb_pred = model_hb.predict(x_val_ms)
print(mean_absolute_percentage_error(y_val,hb_pred))

cb_pred = model_cb.predict(x_val_ms)
print(mean_absolute_percentage_error(y_val,cb_pred))

0.4099325223462076
0.3849580568407149
0.370201103310999
0.37074937243524125


In [327]:
ensemble_pred = (xgb_pred + rf_pred + hb_pred + cb_pred)/4
print(mean_absolute_percentage_error(y_val, ensemble_pred))


0.3675898161103057


In [283]:
s = setup(df, target = 'time_min', session_id = 123, train_size = 0.8)
best = compare_models( n_select=5)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,time_min
2,Target type,Regression
3,Original data shape,"(427, 12)"
4,Transformed data shape,"(427, 12)"
5,Transformed train set shape,"(341, 12)"
6,Transformed test set shape,"(86, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,5.0451,48.4319,6.8412,0.3186,0.4784,0.5647,0.02
ridge,Ridge Regression,5.0422,48.4371,6.8434,0.3161,0.4774,0.5594,0.036
catboost,CatBoost Regressor,5.0098,47.5715,6.8059,0.3159,0.4717,0.5269,1.16
lar,Least Angle Regression,5.0427,48.4788,6.847,0.3149,0.4773,0.5581,0.021
lr,Linear Regression,5.0427,48.4791,6.847,0.3149,0.4773,0.5581,1.098
rf,Random Forest Regressor,5.035,47.9664,6.8404,0.2987,0.4638,0.5362,0.142
huber,Huber Regressor,4.9332,50.4173,7.0081,0.2898,0.4662,0.4918,0.035
gbr,Gradient Boosting Regressor,5.1525,49.481,6.9584,0.2842,0.4714,0.551,0.049
et,Extra Trees Regressor,5.2024,48.6912,6.9315,0.271,0.4843,0.555,0.119
omp,Orthogonal Matching Pursuit,5.4172,52.3373,7.1451,0.2577,0.507,0.6128,0.021


Processing:   0%|          | 0/89 [00:00<?, ?it/s]

In [114]:
feature_importance = model_xgb.feature_importances_

# 각 특성의 중요도 출력
for i, importance in enumerate(feature_importance):
    print(f'Feature {i+1}: {importance}')

Feature 1: 0.01455379743129015
Feature 2: 0.0338318832218647
Feature 3: 0.06608369201421738
Feature 4: 0.016791395843029022
Feature 5: 0.01786457933485508
Feature 6: 0.0366649255156517
Feature 7: 0.010511876083910465
Feature 8: 0.0
Feature 9: 0.0
Feature 10: 0.0
Feature 11: 0.8036978244781494


In [23]:
print(df[df['operator_Y']==1]['time_min'].describe())
print(df[df['operator_A']==1]['time_min'].describe())
print(df[df['operator_K']==1]['time_min'].describe())
print(df[df['operator_P']==1]['time_min'].describe())

count    93.000000
mean     21.580645
std       8.876352
min       5.000000
25%      15.000000
50%      20.000000
75%      25.000000
max      50.000000
Name: time_min, dtype: float64
count    194.000000
mean       9.097938
std        5.246829
min        1.000000
25%        6.000000
50%        8.000000
75%       11.000000
max       30.000000
Name: time_min, dtype: float64
count    92.000000
mean     13.434783
std       6.347630
min       4.000000
25%       9.750000
50%      12.000000
75%      16.000000
max      50.000000
Name: time_min, dtype: float64
count    48.000000
mean     14.854167
std       7.322334
min       3.000000
25%      10.750000
50%      14.000000
75%      18.000000
max      40.000000
Name: time_min, dtype: float64
