In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("공공/훈련데이터셋.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181408 entries, 0 to 181407
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   지역코드           181408 non-null  int64  
 1   최저기온(°C)       181408 non-null  float64
 2   3.0m 지중온도(°C)  181408 non-null  float64
 3   평균 현지기압(hPa)   181408 non-null  float64
 4   가조시간(hr)       181408 non-null  float64
 5   평균 상대습도(%)     181408 non-null  float64
 6   풍정합(100m)      181408 non-null  float64
 7   합계 소형증발량(mm)   181408 non-null  float64
 8   파워             181408 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 12.5 MB


In [3]:
features = [
    "지역코드",
    "최저기온(°C)",
    "3.0m 지중온도(°C)",
    "평균 현지기압(hPa)",
    "가조시간(hr)",
    "평균 상대습도(%)",
    "풍정합(100m)"
]

target = "파워"

X = df[features]
y = df[target]

In [4]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [5]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

base_model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1          
)

param_grid = {
    "n_estimators": [150, 250],
    "max_depth": [8, 10],
    "min_samples_split": [5, 10],
    "min_samples_leaf": [5, 10]
}

grid = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,              
    n_jobs=1,          
    verbose=2
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=5, n_estimators=150; total time=   6.2s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=5, n_estimators=150; total time=   6.1s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=5, n_estimators=150; total time=   6.3s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=5, n_estimators=250; total time=   9.6s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=5, n_estimators=250; total time=  10.2s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=5, n_estimators=250; total time=   9.8s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=10, n_estimators=150; total time=   6.8s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=10, n_estimators=150; total time=   6.7s
[CV] END max_depth=8, min_samples_leaf=5, min_samples_split=10, n_estimators=150; total time=   6.8s
[CV] END max_depth=8, min_samples_le

In [10]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Best Params:", grid.best_params_)
print(f"RMSE: {rmse:,.2f}")
print(f"R2  : {r2:.4f}")

Best Params: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 150}
RMSE: 239,305.74
R2  : 0.5304


In [11]:
importance = pd.Series(
    best_model.feature_importances_,
    index=features
).sort_values(ascending=False)

importance

지역코드             0.783270
3.0m 지중온도(°C)    0.081618
최저기온(°C)         0.047171
가조시간(hr)         0.037078
풍정합(100m)        0.018139
평균 현지기압(hPa)     0.017433
평균 상대습도(%)       0.015292
dtype: float64