In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("공공/훈련데이터셋.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181408 entries, 0 to 181407
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   지역코드           181408 non-null  int64  
 1   최저기온(°C)       181408 non-null  float64
 2   3.0m 지중온도(°C)  181408 non-null  float64
 3   평균 현지기압(hPa)   181408 non-null  float64
 4   가조시간(hr)       181408 non-null  float64
 5   평균 상대습도(%)     181408 non-null  float64
 6   풍정합(100m)      181408 non-null  float64
 7   합계 소형증발량(mm)   181408 non-null  float64
 8   파워             181408 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 12.5 MB


In [3]:
target = "파워"
y = df[target]

X = df.drop(columns=[target])

# 숫자형 컬럼만 사용
X = X.select_dtypes(include="number")


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [5]:
X_train.columns[:20]

Index(['지역코드', '최저기온(°C)', '3.0m 지중온도(°C)', '평균 현지기압(hPa)', '가조시간(hr)',
       '평균 상대습도(%)', '풍정합(100m)', '합계 소형증발량(mm)'],
      dtype='object')

In [6]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=20000,        # 크게 잡고
    learning_rate=0.03,        # 조금 천천히
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=200,
    verbose=200
)

print("best_iteration:", model.best_iteration)

[0]	validation_0-rmse:371613.60760




[200]	validation_0-rmse:289661.39618
[400]	validation_0-rmse:288530.00543
[521]	validation_0-rmse:289827.65801
best_iteration: 322


In [7]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R2:", r2)

RMSE: 288182.51236895716
R2: 0.3190037074356883


In [8]:
# 기온이 진짜 사용되었는지
import pandas as pd

importance = pd.Series(
    model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importance.head(10)

지역코드             0.415610
3.0m 지중온도(°C)    0.159016
가조시간(hr)         0.104906
합계 소형증발량(mm)     0.069923
최저기온(°C)         0.065459
평균 현지기압(hPa)     0.064984
풍정합(100m)        0.061938
평균 상대습도(%)       0.058165
dtype: float32

In [9]:
print("best_iteration:", model.best_iteration)
print("best_score (valid rmse):", model.best_score)

best_iteration: 322
best_score (valid rmse): 288182.51163823914


In [10]:
model = XGBRegressor(
    n_estimators=20000,
    learning_rate=0.05,        # <- 여기만 변경
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=200,
    verbose=200
)

print("best_iteration:", model.best_iteration)
print("best_score (valid rmse):", model.best_score)

[0]	validation_0-rmse:369330.15563




[200]	validation_0-rmse:288320.17493
[383]	validation_0-rmse:290591.45369
best_iteration: 183
best_score (valid rmse): 287961.17980909545


In [11]:
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=20000,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=10,
    gamma=1.0,                # <- 여기만 추가
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=200,
    verbose=200
)

print("best_iteration:", model.best_iteration)
print("best_score (valid rmse):", model.best_score)

[0]	validation_0-rmse:369330.15563
[200]	validation_0-rmse:287630.73166
[390]	validation_0-rmse:289226.00624
best_iteration: 191
best_score (valid rmse): 287462.00932567543


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# X/y
X = df.drop(columns=["파워"])
y = df["파워"]

# 지역코드 원핫
X = pd.get_dummies(X, columns=["지역코드"], drop_first=False)

# split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# model (지금 쓰던 설정 그대로 시작)
model = XGBRegressor(
    n_estimators=20000,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=10,
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=200,
    verbose=200
)

pred = model.predict(X_test)
rmse = mean_squared_error(y_test, pred, squared=False)
r2 = r2_score(y_test, pred)

print("RMSE:", rmse)
print("R2:", r2)
print("best_iteration:", model.best_iteration)
print("best_score:", model.best_score)



[0]	validation_0-rmse:364646.76512
[200]	validation_0-rmse:186571.56840
[400]	validation_0-rmse:176261.59817
[600]	validation_0-rmse:171245.13257
[800]	validation_0-rmse:168067.27829
[1000]	validation_0-rmse:165693.83826
[1200]	validation_0-rmse:164226.65392
[1400]	validation_0-rmse:164404.99175
[1459]	validation_0-rmse:164015.55576
RMSE: 163921.1214249129
R2: 0.7796671083451707
best_iteration: 1260
best_score: 163921.12121277035


In [13]:
final_model = XGBRegressor(
    n_estimators=1260,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=10,
    gamma=1.0,
    reg_lambda=5.0,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

final_model.fit(X_train, y_train)

In [14]:

pred = final_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)

print("RMSE:", rmse)
print("R2  :", r2)

RMSE: 161099.79533307685
R2  : 0.7871863509500427
