In [1]:
import numpy as np 
import pandas as pd 
import pandas_ta as ta
import ccxt
from datetime import datetime
from xgboost import XGBRegressor
import optuna 
from tqdm.auto import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv("train.csv") 
submission = pd.read_csv("sample_submission.csv") 

In [3]:
train['일시'] = pd.to_datetime(train['일시'])

# Fill missing values with ffill and bfill
train.fillna(method='ffill', inplace=True)
train.fillna(method='bfill', inplace=True)

In [4]:
train['month'] = train['일시'].dt.month
train['day'] = train['일시'].dt.day
train['week_of_month'] = train['일시'].dt.day.apply(lambda x: (x - 1) // 7 + 1)

In [5]:
X = train[["최고기온", "최저기온", "일교차", "강수량", "평균습도", "평균풍속", "일조합", "일사합", "일조율", "month", "day", "week_of_month"]]
y = train["평균기온"] 

In [6]:
grouped = X.groupby(['month', 'week_of_month']).agg(['mean', 'median', 'std'])
grouped = grouped.drop(columns=[('day', 'mean'), ('day', 'median'), ('day', 'std')])

In [7]:
merged_df = pd.merge(X, grouped, on=['month', 'week_of_month'], how='left')

  merged_df = pd.merge(X, grouped, on=['month', 'week_of_month'], how='left')


In [8]:
merged_df.drop(columns={"최고기온", "최저기온", "일교차", "강수량", "평균습도", "평균풍속", "일조합", "일사합", "일조율"}, inplace=True)  

In [9]:
def stratified_month_split(X, y, test_size=0.2): 
    X_train = pd.DataFrame(columns=X.columns) 
    X_val = pd.DataFrame(columns=X.columns)
    y_train = pd.Series(dtype=float)
    y_val = pd.Series(dtype=float) 

    for month in X["month"].unique():
        X_month = X[X["month"] == month] 
        y_month = y[X["month"] == month] 
        X_train_month, X_val_month, y_train_month, y_val_month = train_test_split(
            X_month, y_month, test_size=test_size, random_state=42
        ) 

        # Append to train and test sets
        X_train = pd.concat([X_train, X_train_month])
        X_val = pd.concat([X_val, X_val_month])
        y_train = pd.concat([y_train, y_train_month])
        y_val = pd.concat([y_val, y_val_month])

    return X_train, X_val, y_train, y_val 

In [10]:
X_train, X_val, y_train, y_val = stratified_month_split(merged_df, y, test_size=0.2)

In [11]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((18406, 30), (4605, 30), (18406,), (4605,))

In [12]:
X_train['month'] = X_train['month'].astype(int)
X_train['day'] = X_train['day'].astype(int)
X_train['week_of_month'] = X_train['week_of_month'].astype(int)

X_val['month'] = X_val['month'].astype(int)
X_val['day'] = X_val['day'].astype(int)
X_val['week_of_month'] = X_val['week_of_month'].astype(int)

In [13]:
def objective(trial): 
    param = {
        "objective": "reg:squarederror",  # Objective for regression
        "n_estimators": 100, 
        "random_state": 42, 
        "tree_method": "gpu_hist",  # Assuming you have a compatible GPU
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True), 
        "subsample": trial.suggest_float("subsample", 0.6, 1.0), 
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0), 
        "max_depth": trial.suggest_int("max_depth", 3, 10), 
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300), 
        "gamma": trial.suggest_float("gamma", 0, 0.4), 
        "alpha": trial.suggest_float("alpha", 0, 10), 
        "lambda": trial.suggest_float("lambda", 1, 10),
        "enable_categorical": True, 
    }
    reg_xgb = XGBRegressor(**param)
    reg_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=30, verbose=False)
    predictions = reg_xgb.predict(X_val) 
    mae = mean_absolute_error(y_val, predictions)
    return mae 

In [14]:
study = optuna.create_study(direction="minimize") 
study.optimize(objective, n_trials=500) 

[I 2023-12-31 11:49:00,637] A new study created in memory with name: no-name-cf5be5e0-fe64-4704-9a1d-b68d92787081
[I 2023-12-31 11:49:01,463] Trial 0 finished with value: 2.5398050983187046 and parameters: {'learning_rate': 0.09200164999904892, 'subsample': 0.8736777146092519, 'colsample_bytree': 0.8331830629599828, 'max_depth': 7, 'min_child_weight': 234, 'gamma': 0.1250894946144888, 'alpha': 5.456697573734499, 'lambda': 4.697753710313035}. Best is trial 0 with value: 2.5398050983187046.
[I 2023-12-31 11:49:01,718] Trial 1 finished with value: 3.2291445122259574 and parameters: {'learning_rate': 0.02028178389852096, 'subsample': 0.6545715669861824, 'colsample_bytree': 0.9973444111549655, 'max_depth': 6, 'min_child_weight': 127, 'gamma': 0.03504490196146422, 'alpha': 3.5028422428476533, 'lambda': 5.3703996098140525}. Best is trial 0 with value: 2.5398050983187046.
[I 2023-12-31 11:49:01,844] Trial 2 finished with value: 2.546224823068589 and parameters: {'learning_rate': 0.129403171097

In [15]:
best_params = study.best_params 
best_params["n_estimators"] = 100  
best_params["random_state"] = 42 
best_params["objective"] = "reg:squarederror" 
best_params["tree_method"] = "gpu_hist" 
print(f"best params = {best_params}")

best params = {'learning_rate': 0.2987633195855668, 'subsample': 0.7618827977160534, 'colsample_bytree': 0.7975880613795246, 'max_depth': 9, 'min_child_weight': 244, 'gamma': 0.1400177957602812, 'alpha': 4.2890899014765935, 'lambda': 5.8108967740757125, 'n_estimators': 100, 'random_state': 42, 'objective': 'reg:squarederror', 'tree_method': 'gpu_hist'}


In [16]:
# refit full 
full_x = np.concatenate([X_train, X_val], axis=0) 
full_y = np.concatenate([y_train, y_val], axis=0)  

In [18]:
reg_xgb = XGBRegressor(**best_params) 
reg_xgb.fit(full_x, full_y, eval_set=[(full_x, full_y)], early_stopping_rounds=30, verbose=20)

[0]	validation_0-rmse:11.39392
[20]	validation_0-rmse:3.23177
[40]	validation_0-rmse:3.22530




[60]	validation_0-rmse:3.22346
[80]	validation_0-rmse:3.22266
[99]	validation_0-rmse:3.22233


In [20]:
# inference 
submission['일시'] = pd.to_datetime(submission['일시'])
submission['month'] = submission['일시'].dt.month
submission['day'] = submission['일시'].dt.day
submission['week_of_month'] = submission['일시'].dt.day.apply(lambda x: (x - 1) // 7 + 1) 

In [31]:
submission

Unnamed: 0,일시,평균기온,month,day,week_of_month
0,2023-01-01,0,1,1,1
1,2023-01-02,0,1,2,1
2,2023-01-03,0,1,3,1
3,2023-01-04,0,1,4,1
4,2023-01-05,0,1,5,1
...,...,...,...,...,...
353,2023-12-20,0,12,20,3
354,2023-12-21,0,12,21,3
355,2023-12-22,0,12,22,4
356,2023-12-23,0,12,23,4


In [23]:
merged_submission = pd.merge(submission, grouped, on=['month', 'week_of_month'], how='left') 

  merged_submission = pd.merge(submission, grouped, on=['month', 'week_of_month'], how='left')


In [28]:
merged_submission.drop(columns={"일시", "평균기온"}, inplace=True) 

In [32]:
predictions = reg_xgb.predict(merged_submission)

In [36]:
answers = pd.read_csv("sample_submission.csv") 
answers["평균기온"] = predictions

In [37]:
answers

Unnamed: 0,일시,평균기온
0,2023-01-01,-2.293260
1,2023-01-02,-2.345223
2,2023-01-03,-2.367261
3,2023-01-04,-3.357687
4,2023-01-05,-3.200403
...,...,...
353,2023-12-20,-0.003871
354,2023-12-21,-0.104263
355,2023-12-22,0.147887
356,2023-12-23,-0.382040


In [38]:
answers.to_csv("xgboost_optuna.csv", index=False)