In [None]:
import pandas as pd
import numpy as np
import joblib

In [17]:
df = pd.read_csv('2020.csv')

In [18]:
# 카테고리 분류
drop_col = ['카테고리', '낙찰가율']
df[['대분류', '중분류']] = (df['카테고리'] .str.strip('[]') .str.split(' / ', expand=True))

# 데이터타입 전환
df['개찰일시'] = pd.to_datetime(df['개찰일시'], format='%Y-%m-%d %H:%M')
price_cols = ['최저입찰가 (예정가격)(원)', '낙찰가(원)']
for col in price_cols:
    df[col] = (df[col] .astype(str).str.replace(',', '', regex=False).replace({'비공개': np.nan, '-': np.nan}))
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 그룹별 집계함수 정의
agg_dict = {
    '대분류':                   'first',
    '중분류':                   'first',
    '물건정보':                 'first',
    '최저입찰가 (예정가격)(원)': 'max',
    '낙찰가(원)':               'max',
    '개찰일시':                 'min',
    '입찰결과':                 'first'
}

# 그룹별 집계 수행
new_df = (df.groupby('일련번호', as_index=False).agg(agg_dict))

# 컬럼명 간소화
new_df = new_df.rename(columns={'최저입찰가 (예정가격)(원)': '최저입찰가', '낙찰가(원)': '낙찰가'})

# 온전한 정보만 유지
new_df = new_df[new_df['입찰결과'] == '낙찰']
new_df = new_df.dropna()
new_df.drop(columns=['입찰결과'], inplace=True)
new_df = new_df[new_df['최저입찰가'] != 0]
df = new_df

# 낙찰가율 계산
df['낙찰가율'] = (df['낙찰가'] / df['최저입찰가']) * 100

del new_df

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11691 entries, 16 to 20564
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   일련번호    11691 non-null  object        
 1   대분류     11691 non-null  object        
 2   중분류     11691 non-null  object        
 3   물건정보    11691 non-null  object        
 4   최저입찰가   11691 non-null  float64       
 5   낙찰가     11691 non-null  float64       
 6   개찰일시    11691 non-null  datetime64[ns]
 7   낙찰가율    11691 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 822.0+ KB


In [20]:
print(df.head())

                일련번호    대분류       중분류                물건정보       최저입찰가  \
16    2015-03797-004  미분류기타     미분류기타  서울특별시 강남구 삼성동 ****  50000000.0   
18  2015-0931-001004    회원권  콘도미니엄회원권  윌리힐리파크 1309 (제일저축)  25900000.0   
19  2015-0931-001005    회원권  콘도미니엄회원권  윌리힐리파크 1309 (제일저축)  25900000.0   
20  2015-0931-001006    회원권  콘도미니엄회원권  윌리힐리파크 1309 (제일저축)  25900000.0   
34    2017-06205-003  미분류기타     미분류기타    서울특별시 종로구 종로****  28000000.0   

           낙찰가                개찰일시        낙찰가율  
16  35000000.0 2020-05-07 11:00:00   70.000000  
18  25900000.0 2020-02-03 10:00:00  100.000000  
19  26350000.0 2020-02-03 10:00:00  101.737452  
20  26100000.0 2020-02-03 10:00:00  100.772201  
34  29960000.0 2020-07-16 11:00:00  107.000000  


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

# 1) 데이터·타겟 분리
X = df.drop(columns=['일련번호','낙찰가','낙찰가율','물건정보'])
y = df['낙찰가율']

# 2) 전처리기 정의 (한 번만)
def extract_date_feats(df):
    d = pd.DatetimeIndex(df['개찰일시'])
    return pd.DataFrame({
        'year':    d.year,
        'month':   d.month,
        'day':     d.day,
        'weekday': d.weekday,
        'hour':    d.hour
    }, index=df.index)

date_pipe = Pipeline([
    ('extract', FunctionTransformer(extract_date_feats, validate=False))
])

preprocessor = ColumnTransformer([
    ('date', date_pipe, ['개찰일시']),
    ('cat',  OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['대분류','중분류']),
    ('num',  FunctionTransformer(lambda df: df[['최저입찰가']].to_numpy(), validate=False), ['최저입찰가'])
])

In [22]:
import optuna
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 튜닝할 모델 리스트
models = {
    'GBR': GradientBoostingRegressor,
    'RFR': RandomForestRegressor,
    'LGBM': LGBMRegressor
}

best_params = {}
best_pipelines = {}

for name, ModelClass in models.items():
    def objective(trial):
        # 하이퍼파라미터 공간 정의
        if name == 'GBR':
            params = {
                'n_estimators':    trial.suggest_int('n_estimators', 50, 300),
                'learning_rate':   trial.suggest_loguniform('learning_rate', 1e-3, 0.2),
                'max_depth':       trial.suggest_int('max_depth', 2, 8),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'subsample':       trial.suggest_uniform('subsample', 0.5, 1.0),
                'max_features':    trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
            }
        elif name == 'RFR':
            params = {
                'n_estimators':    trial.suggest_int('n_estimators', 100, 500),
                'max_depth':       trial.suggest_int('max_depth', 5, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'max_features':    trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
            }
        else:  # LGBM
            params = {
                'n_estimators':    trial.suggest_int('n_estimators', 100, 500),
                'learning_rate':   trial.suggest_loguniform('learning_rate', 1e-3, 0.2),
                'num_leaves':      trial.suggest_int('num_leaves', 31, 128),
                'max_depth':       trial.suggest_int('max_depth', 5, 20),
                'subsample':       trial.suggest_uniform('subsample', 0.5, 1.0),
                'colsample_bytree':trial.suggest_uniform('colsample_bytree', 0.5, 1.0)
            }

        model = ModelClass(random_state=42, **params)
        pipe = Pipeline([
            ('pre', preprocessor),
            ('model', model)
        ])

        # 3-fold CV, neg_root_mean_squared_error
        scores = cross_val_score(
            pipe, X_train, y_train,
            cv=3,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1
        )
        return -scores.mean()

    # Optuna 스터디 생성 및 최적화
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50)

    print(f"[{name}] best RMSE: {study.best_value:.3f}")
    print(f"[{name}] best params: {study.best_trial.params}\n")

    # 최적 파라미터로 전체 학습
    best_params[name] = study.best_trial.params
    best_model = ModelClass(random_state=42, **study.best_trial.params)
    best_pipe = Pipeline([
        ('pre', preprocessor),
        ('model', best_model)
    ])
    best_pipe.fit(X_train, y_train)
    best_pipelines[name] = best_pipe

# 검증 세트 성능 비교
for name, pipe in best_pipelines.items():
    preds = pipe.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)
    mae  = mean_absolute_error(y_val, preds)
    print(f"[{name}] Validation RMSE: {rmse:.3f}, MAE: {mae:.3f}")


[I 2025-05-14 03:29:24,313] A new study created in memory with name: no-name-907c5cd0-dc79-4711-8c8f-8f9a21194b23
  'learning_rate':   trial.suggest_loguniform('learning_rate', 1e-3, 0.2),
  'subsample':       trial.suggest_uniform('subsample', 0.5, 1.0),
[I 2025-05-14 03:29:37,968] Trial 0 finished with value: 482.0761730149922 and parameters: {'n_estimators': 98, 'learning_rate': 0.002523793666486755, 'max_depth': 6, 'min_samples_split': 8, 'subsample': 0.967622246507671, 'max_features': 'sqrt'}. Best is trial 0 with value: 482.0761730149922.
[I 2025-05-14 03:29:58,356] Trial 1 finished with value: 572.3407014877436 and parameters: {'n_estimators': 142, 'learning_rate': 0.018265164141140423, 'max_depth': 4, 'min_samples_split': 6, 'subsample': 0.7125782514313822, 'max_features': None}. Best is trial 0 with value: 482.0761730149922.
[I 2025-05-14 03:29:59,783] Trial 2 finished with value: 481.0173143014584 and parameters: {'n_estimators': 194, 'learning_rate': 0.012297071865629639, 'm

[GBR] best RMSE: 474.824
[GBR] best params: {'n_estimators': 168, 'learning_rate': 0.01022068953806962, 'max_depth': 7, 'min_samples_split': 2, 'subsample': 0.6293605448302795, 'max_features': 'sqrt'}



[I 2025-05-14 03:34:45,564] A new study created in memory with name: no-name-997a8101-f212-4033-921f-0b164fb909ae
[I 2025-05-14 03:34:48,723] Trial 0 finished with value: 475.4225567557721 and parameters: {'n_estimators': 139, 'max_depth': 12, 'min_samples_split': 9, 'max_features': 'log2'}. Best is trial 0 with value: 475.4225567557721.
[I 2025-05-14 03:35:07,742] Trial 1 finished with value: 484.95638994019123 and parameters: {'n_estimators': 130, 'max_depth': 7, 'min_samples_split': 2, 'max_features': None}. Best is trial 0 with value: 475.4225567557721.
[I 2025-05-14 03:35:17,738] Trial 2 finished with value: 474.4812069746626 and parameters: {'n_estimators': 262, 'max_depth': 13, 'min_samples_split': 9, 'max_features': 'sqrt'}. Best is trial 2 with value: 474.4812069746626.
[I 2025-05-14 03:36:43,629] Trial 3 finished with value: 493.72741794885354 and parameters: {'n_estimators': 356, 'max_depth': 14, 'min_samples_split': 3, 'max_features': None}. Best is trial 2 with value: 474.

KeyboardInterrupt: 

In [33]:
if 'LGBM' in best_pipelines:
    try:
        joblib.dump(best_pipelines['LGBM'], 'best_lgbm_pipeline.pkl')
        print("best_lgbm_pipeline.pkl 저장 완료")
    except Exception as e:
        print(f"모델 저장에 실패했습니다: {e}")
else:
    print("best_pipelines에 'LGBM' 모델이 없습니다. 튜닝 중 오류가 있었을 수 있습니다.")


best_pipelines에 'LGBM' 모델이 없습니다. 튜닝 중 오류가 있었을 수 있습니다.
