In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np

  from pandas import MultiIndex, Int64Index


In [3]:
import os
import random
from sklearn.ensemble import RandomForestRegressor
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation and data-quality
# warnings from pandas / scikit-learn — consider narrowing it to a category.
warnings.filterwarnings(action='ignore') 

In [4]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to every source.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [5]:
# Read the three CSV inputs from the current working directory.
train_df, test_df, building_info = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'building_info.csv')
)

In [6]:
# Preview the first rows of the raw training frame.
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


In [7]:
# Replace every missing value with 0. The raw preview shows NaN in the
# precipitation, sunshine and solar-radiation columns.
# Rebinding the name (instead of inplace=True) keeps the step explicit and
# chainable; the resulting frame holds the same values.
train_df = train_df.fillna(0)

In [8]:
# Split the timestamp into month, day and hour so the model can learn
# time-series (calendar) patterns.
# '일시' is formatted as 'YYYYMMDD HH' (e.g. '20220601 00'). Parsing it once
# with an explicit format is vectorized and raises on malformed values,
# instead of slicing character positions row by row.
train_timestamp = pd.to_datetime(train_df['일시'], format='%Y%m%d %H')
# Cast to int64 so the feature dtypes match what int() slicing produced.
train_df['month'] = train_timestamp.dt.month.astype('int64')
train_df['day'] = train_timestamp.dt.day.astype('int64')
train_df['time'] = train_timestamp.dt.hour.astype('int64')

In [9]:
# Display the frame with the derived month/day/time columns
# (pandas truncates the middle rows in the rendered output).
train_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh),month,day,time
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,6,1,0
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,6,1,1
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,0.0,0.0,974.88,6,1,2
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,0.0,0.0,953.76,6,1,3
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,0.0,0.0,986.40,6,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,0.0,0.9,86.0,0.5,0.0,881.04,8,24,19
203996,100_20220824 20,100,20220824 20,22.4,0.0,1.3,86.0,0.0,0.0,798.96,8,24,20
203997,100_20220824 21,100,20220824 21,21.3,0.0,1.0,92.0,0.0,0.0,825.12,8,24,21
203998,100_20220824 22,100,20220824 22,21.0,0.0,0.3,94.0,0.0,0.0,640.08,8,24,22


In [50]:
# Separate the regression target from the feature matrix.
target_col = '전력소비량(kWh)'
# Row id and raw timestamp string are not model inputs. The sunshine and
# solar-radiation columns are dropped as well — presumably because test.csv
# does not provide them; verify against the test schema.
non_feature_cols = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)']

train_y = train_df[target_col]
train_x = train_df.drop(columns=non_feature_cols + [target_col])

In [51]:
# Preview the feature matrix to confirm which columns go into the model.
train_x.head()

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time
0,1,18.6,0.0,0.9,42.0,6,1,0
1,1,18.0,0.0,1.1,45.0,6,1,1
2,1,17.7,0.0,1.5,45.0,6,1,2
3,1,16.7,0.0,1.4,48.0,6,1,3
4,1,18.4,0.0,2.8,43.0,6,1,4


In [52]:
# Preview the target series (power consumption in kWh).
train_y.head()

0    1085.28
1    1047.36
2     974.88
3     953.76
4     986.40
Name: 전력소비량(kWh), dtype: float64

In [53]:
# Fit a random forest with otherwise-default hyperparameters on the full
# training set (no hold-out split is made in this notebook).
# random_state pins the forest's internal sampling so the fit does not depend
# on the global NumPy RNG state, which changes whenever cells are re-run or
# run out of order. n_jobs=-1 builds the trees on all available cores and
# does not change the fitted result.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(train_x, train_y)

RandomForestRegressor()

In [54]:
# Derive the same month / day / hour features for the test rows.
# '일시' is formatted as 'YYYYMMDD HH'. Parsing it once with an explicit
# format is vectorized and raises on malformed values, instead of slicing
# character positions row by row.
test_timestamp = pd.to_datetime(test_df['일시'], format='%Y%m%d %H')
# Cast to int64 so the feature dtypes match what int() slicing produced.
test_df['month'] = test_timestamp.dt.month.astype('int64')
test_df['day'] = test_timestamp.dt.day.astype('int64')
test_df['time'] = test_timestamp.dt.hour.astype('int64')

In [55]:
# Build the test feature matrix by removing the row id and the raw timestamp
# string, which are not model inputs.
test_x = test_df.drop(['num_date_time', '일시'], axis='columns')

In [57]:
# Predict power consumption for every test row with the fitted model.
preds = model.predict(test_x)

In [58]:
# Show the prediction array (NumPy abbreviates the rendered output).
preds

array([2127.2688, 2090.5008, 2009.9712, ...,  748.2216,  654.1728,
        503.1408])

In [61]:
# Load the submission template from the working directory; its 'answer'
# column is filled with the predictions in a later cell.
submission = pd.read_csv('sample_submission.csv')

In [62]:
# Write the predictions into the 'answer' column.
# NOTE(review): the assignment is positional, so this assumes the template
# rows are in the same order as test.csv — confirm.
submission['answer'] = preds
# Display the filled-in frame as a final visual check.
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2127.2688
1,1_20220825 01,2090.5008
2,1_20220825 02,2009.9712
3,1_20220825 03,1981.9440
4,1_20220825 04,1946.7744
...,...,...
16795,100_20220831 19,893.0712
16796,100_20220831 20,784.7448
16797,100_20220831 21,748.2216
16798,100_20220831 22,654.1728


In [64]:
# Write the submission file; index=False leaves the DataFrame index out of it.
submission.to_csv('submission.csv', index=False)