# 전기차 가격 예측

## import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from catboost import CatBoostRegressor
import optuna

## 1. 데이터 로드

In [None]:
# Read the training and test tables from the local data directory.
train_data, test_data = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

## 2. 데이터 전처리

In [None]:
# 2-1. Impute missing battery capacity with the mean capacity of the same
# car model, computed separately inside each frame. (The original author's
# note states that a given model always has the same battery capacity.)
def _fill_with_group_mean(values):
    """Replace NaNs in one group's values with that group's mean."""
    return values.fillna(values.mean())


train_capacity_by_model = train_data.groupby('모델')['배터리용량']
train_data['배터리용량'] = train_capacity_by_model.transform(_fill_with_group_mean)

test_capacity_by_model = test_data.groupby('모델')['배터리용량']
test_data['배터리용량'] = test_capacity_by_model.transform(_fill_with_group_mean)

In [None]:
# 2-2. 사고이력 라벨 인코딩
label_encoder = LabelEncoder()
train_data['사고이력'] = label_encoder.fit_transform(train_data['사고이력'])
test_data['사고이력'] = label_encoder.transform(test_data['사고이력'])

In [None]:
# 2-3. 모델별 사고 없는 차량 평균 가격 기반 가중치 생성
avg_price_no_accidents = train_data[train_data['사고이력'] == 0].groupby('모델')['가격(백만원)'].mean()
model_weights = avg_price_no_accidents / avg_price_no_accidents.mean()
train_data['모델가중치'] = train_data['모델'].map(model_weights)
test_data['모델가중치'] = test_data['모델'].map(model_weights).fillna(1)

In [None]:
# 2-4. 원핫 인코딩 (제조사, 차량상태, 구동방식, 모델)
train_data = pd.get_dummies(train_data, columns=['제조사', '차량상태', '구동방식', '모델'])
test_data = pd.get_dummies(test_data, columns=['제조사', '차량상태', '구동방식', '모델'])

In [None]:
# 2-5. 학습/테스트 데이터 간 컬럼 맞춤
missing_cols = set(train_data.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[train_data.columns.drop('가격(백만원)')]
test_data = test_data.drop(['ID'], axis=1)

y = train_data['가격(백만원)']
X = train_data.drop(['ID', '가격(백만원)'], axis=1)

In [None]:
# 2-6. 특징 엔지니어링 (새로운 변수 생성)
X['주행거리_연식_비율'] = X['주행거리(km)'] / (X['연식(년)'] + 1e-5)
test_data['주행거리_연식_비율'] = test_data['주행거리(km)'] / (test_data['연식(년)'] + 1e-5)

## 3. 교차 검증 설정

In [None]:
# Shuffled 5-fold splitter with a fixed seed, used for hyper-parameter
# evaluation.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## 4. 모델 학습 및 앙상블

In [None]:
def train_model(X, y, test_data, model_name):
    """Tune a gradient-boosting regressor with Optuna and predict the test set.

    Each of the 50 Optuna trials is scored by the mean RMSE over the splits
    produced by the module-level ``kf`` splitter. The best hyper-parameters
    are then used to fit a final model on all of ``X``/``y``.

    Args:
        X: Training feature frame (indexed positionally via ``.iloc``).
        y: Training target series aligned with ``X``.
        test_data: Feature frame to predict; passed to ``predict`` as-is.
        model_name: ``'xgb'`` selects ``xgb.XGBRegressor``; any other value
            selects ``CatBoostRegressor``.

    Returns:
        The final model's predictions for ``test_data``.
    """
    use_xgb = model_name == 'xgb'

    # Settings that are not tuned. They are kept separate from the trial
    # parameters because ``study.best_trial.params`` only contains suggested
    # values; without re-applying these, the final model would be trained
    # with a different configuration than the one that was cross-validated.
    fixed_params = {'device': 'gpu'} if use_xgb else {'task_type': 'GPU'}

    def suggest_params(trial):
        """Sample the tunable hyper-parameters for one trial."""
        if use_xgb:
            return {
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            }
        return {
            'depth': trial.suggest_int('depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0, 10.0),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        }

    def build_model(tuned_params):
        """Create an unfitted regressor from the fixed plus tuned parameters."""
        if use_xgb:
            return xgb.XGBRegressor(**fixed_params, **tuned_params, random_state=42)
        return CatBoostRegressor(**fixed_params, **tuned_params, random_state=42, verbose=0)

    def objective(trial):
        """Return the mean validation RMSE across the folds for one trial."""
        model = build_model(suggest_params(trial))
        val_rmse = []
        for train_idx, val_idx in kf.split(X):
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold)
            preds = model.predict(X_val_fold)
            val_rmse.append(np.sqrt(mean_squared_error(y_val_fold, preds)))
        return np.mean(val_rmse)

    # Seed the sampler so repeated runs propose the same sequence of trials,
    # in line with the fixed random_state used for the models and folds.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=50)

    # Refit on the full training data with the best parameters, using the
    # same factory (and therefore the same fixed settings) as during tuning.
    final_model = build_model(study.best_trial.params)
    final_model.fit(X, y)
    return final_model.predict(test_data)

# Tune and fit each model in turn (XGBoost first, then CatBoost).
xgb_preds, catboost_preds = [
    train_model(X, y, test_data, name) for name in ('xgb', 'catboost')
]

# Ensemble: simple average of the two prediction vectors.
final_preds = 0.5 * (xgb_preds + catboost_preds)

In [None]:
# 제출 파일 생성
submission = pd.read_csv('data/sample_submission.csv')
submission['가격(백만원)'] = final_preds
submission.to_csv('data/submission.csv', index=False)