<a href="https://colab.research.google.com/github/silver-or/dacon/blob/main/sklearn%EC%9D%84_%ED%99%9C%EC%9A%A9%ED%95%98%EC%97%AC_%EB%8B%B5%EC%9D%84_%EC%A0%9C%EC%B6%9C%ED%95%B4%EB%B3%B4%EA%B8%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## sklearn의 RandomForest를 이용한 풀이 방법

In [None]:
import os

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt

# Default figure size and font family for every plot drawn in this notebook.
# NOTE(review): 'AppleGothic' is presumably a macOS font - confirm it is
# available in the environment this notebook runs in.
plt.rcParams.update({
    'figure.figsize': (10, 10),
    'font.family': 'AppleGothic',
})

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

### 1. 데이터 불러오기

In [None]:
def read_csv_by_dir(path, index_col=None):
    """Read every ``.csv`` file directly inside ``path`` into one DataFrame.

    Args:
        path: Directory to scan (not recursive).
        index_col: Forwarded unchanged to ``pd.read_csv`` for each file.

    Returns:
        The row-wise concatenation of all CSV files, visited in sorted
        file-name order, or an empty DataFrame when the directory contains
        no CSV file.
    """
    # Only entries ending in '.csv' contribute a frame. Filtering first keeps
    # a non-CSV entry from re-appending the previously read file (or from
    # hitting an undefined variable when such an entry is listed first).
    # Sorting makes the row order independent of os.listdir()'s ordering.
    csv_names = sorted(name for name in os.listdir(path)
                       if name.endswith('.csv'))
    frames = [pd.read_csv('/'.join([path, name]), index_col=index_col)
              for name in csv_names]
    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing the result per file.
    return pd.concat(frames, axis=0)

In [None]:
# Mount Google Drive at /content/drive; the data path used by the loading
# cell below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the competition files on the mounted Drive.
path = '/content/drive/MyDrive/colab/competition_data'

# rf_data/ and water_data/ are read as folders of CSV files; the sample
# submission is a single CSV. In all three the first column becomes the index.
_df_rf_raw = read_csv_by_dir(f'{path}/rf_data', index_col=0)
_df_water_raw = read_csv_by_dir(f'{path}/water_data', index_col=0)
_submission_raw = pd.read_csv(f'{path}/sample_submission.csv', index_col=0)

In [None]:
# Keep the frames loaded from disk untouched and work on copies instead.
df_rf = _df_rf_raw.copy()
df_water = _df_water_raw.copy()
submission = _submission_raw.copy()

# Attach a display label to each working copy as a plain attribute;
# check_datetime() prints it.
df_rf.name, df_water.name, submission.name = "rain_data", "water_data", "submission"

### 2. 데이터 전처리

In [None]:
# Show the index of each frame as loaded (still plain strings at this point).
for _frame in (df_rf, df_water, submission):
    print(_frame.index)

Index(['2018-05-01 00:00', '2018-05-01 00:10', '2018-05-01 00:20',
       '2018-05-01 00:30', '2018-05-01 00:40', '2018-05-01 00:50',
       '2018-05-01 01:00', '2018-05-01 01:10', '2018-05-01 01:20',
       '2018-05-01 01:30',
       ...
       '2017-10-31 22:20', '2017-10-31 22:30', '2017-10-31 22:40',
       '2017-10-31 22:50', '2017-10-31 23:00', '2017-10-31 23:10',
       '2017-10-31 23:20', '2017-10-31 23:30', '2017-10-31 23:40',
       '2017-10-31 23:50'],
      dtype='object', name='ymdhm', length=276336)
Index(['2012-05-01 00:00', '2012-05-01 00:10', '2012-05-01 00:20',
       '2012-05-01 00:30', '2012-05-01 00:40', '2012-05-01 00:50',
       '2012-05-01 01:00', '2012-05-01 01:10', '2012-05-01 01:20',
       '2012-05-01 01:30',
       ...
       '2017-10-31 22:20', '2017-10-31 22:30', '2017-10-31 22:40',
       '2017-10-31 22:50', '2017-10-31 23:00', '2017-10-31 23:10',
       '2017-10-31 23:20', '2017-10-31 23:30', '2017-10-31 23:40',
       '2017-10-31 23:50'],
      dtype='

In [None]:
# Use pd.to_datetime() to turn a time-formatted string index into datetimes.
def index_to_datetime(df,format):
    """Parse ``df``'s index into datetimes and return the same object.

    Args:
        df: Frame whose index holds timestamp strings. It is modified in
            place.
        format: strftime-style pattern passed to ``pd.to_datetime``.

    Returns:
        ``df`` itself, now carrying the parsed index.
    """
    parsed_index = pd.to_datetime(df.index, format=format)
    df.index = parsed_index
    return df

In [None]:
# Parse the string index of all three frames with the same timestamp pattern.
# index_to_datetime() returns the object it was given, so each name keeps
# pointing at the same (now converted) frame.
df_rf, df_water, submission = (
    index_to_datetime(df=frame, format='%Y-%m-%d %H:%M')
    for frame in (df_rf, df_water, submission)
)

In [None]:
# Put every frame in chronological order. The sort is done in place so each
# frame object (and the `name` attribute attached to it earlier) is kept.
for _frame in (df_rf, df_water, submission):
    _frame.sort_index(inplace=True)

In [None]:
# Inspect the time span covered by a frame.
def check_datetime(df):
    """Print ``df.name`` followed by the first and last index label of ``df``.

    Args:
        df: Frame carrying a ``name`` attribute. The first and last labels
            bound the covered period only when the index is already sorted.

    Returns:
        None. The function only prints.
    """
    print(df.name)
    if len(df.index) == 0:
        # An empty index has no first/last label to show.
        print('(empty)')
        return None
    # select_dtypes() filters columns and never changes the index, so reading
    # the index directly yields the same labels without building two
    # intermediate frames.
    print(df.index[0])
    print(df.index[-1])
    return None

# Report the label and covered time span of each of the three frames.
for _frame in (df_rf, df_water, submission):
    check_datetime(_frame)

rain_data
2012-05-01 00:00:00
2022-07-18 23:50:00
water_data
2012-05-01 00:00:00
2022-07-18 23:50:00
submission
2022-06-01 00:00:00
2022-07-18 23:50:00


In [None]:
# Separate targets from inputs: the columns named in the submission file are
# the targets; df_rf plus the remaining df_water columns are the inputs.
target = df_water.loc[:, submission.columns]
data = pd.concat(
    [df_rf, df_water.drop(columns=submission.columns)],
    axis=1,
)

In [None]:
# Shift the inputs by one row relative to the targets, because each target
# must be predicted from past data.
#
# reset_index:
# - rebuilds the index from the start after preprocessing has left it
#   out of order
# - with drop=True the old index is discarded instead of kept as a column
_target = target.reset_index(drop=True)
_data = data.reset_index(drop=True)

# Moving the input positions up by one pairs input row i with target row i+1.
_data.index = _data.index + 1

# Join side by side on the integer positions, order them ascending, then drop
# the first and the last row (each of them is missing one of the two sides).
tot = pd.concat([_data, _target], axis=1).sort_index().iloc[1:-1]

target = tot.loc[:, submission.columns]
data = tot.drop(columns=submission.columns)

In [None]:
# Hold out the final len(submission) rows as the test part; everything before
# them is used for training.
_n_test = len(submission)

train_target, test_target = target.iloc[:-_n_test, :], target.iloc[-_n_test:, :]
train_data, test_data = data.iloc[:-_n_test, :], data.iloc[-_n_test:, :]

In [None]:
# DataFrame.mean() averages along each column. Missing values in both the
# training and the test part are filled with the training-part column means.
#
# The four frames are iloc slices of `target` / `data`, so the filled result
# is assigned back instead of using inplace=True: that avoids mutating a slice
# of another frame and does not rely on warnings being silenced.
train_target = train_target.fillna(train_target.mean())
test_target = test_target.fillna(train_target.mean())
train_data = train_data.fillna(train_data.mean())
test_data = test_data.fillna(train_data.mean())

In [None]:
# Print the (rows, columns) shape of the train and test part for inputs and
# targets, each pair under its own heading.
for _title, _train, _test in (('--data--', train_data, test_data),
                              ('--target--', train_target, test_target)):
    print(_title)
    print(_train.shape)
    print(_test.shape)

--data--
(269423, 13)
(6912, 13)
--target--
(269423, 4)
(6912, 4)


### 3. 모델링

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

# One fixed seed for both the shuffled fold assignment and the forest, so a
# fresh run of the notebook reproduces the same search result.
SEED = 42

# Two shuffled folds for cross-validation.
kfold = KFold(n_splits=2, shuffle=True, random_state=SEED)

# Base estimator; n_jobs=-1 asks for all available cores.
rf = RandomForestRegressor(n_jobs=-1, random_state=SEED)

# Candidate forest sizes tried by the grid search.
params = {
    "n_estimators" : (100, 150, 200)
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`, scored with the `kfold` splitter.
grid_cv = GridSearchCV(estimator=rf, param_grid=params, cv=kfold, n_jobs=-1)

In [None]:
# Run the search on the training part (inputs X, targets y).
grid_cv.fit(X=train_data, y=train_target)

GridSearchCV(cv=KFold(n_splits=2, random_state=None, shuffle=True),
             estimator=RandomForestRegressor(n_jobs=-1), n_jobs=-1,
             param_grid={'n_estimators': (100, 150, 200)})

In [None]:
# Display the estimator configuration selected by the grid search.
grid_cv.best_estimator_

RandomForestRegressor(n_estimators=150, n_jobs=-1)

In [None]:
# grid_cv was created with the default refit=True, so best_estimator_ has
# already been trained on the complete training part; fitting it a second time
# would only repeat that work.
model = grid_cv.best_estimator_
y_pred = model.predict(test_data)

# The prediction rows follow the chronologically sorted frames, whereas
# _submission_raw keeps the row order of the file. Label the predictions with
# the sorted submission timestamps and look them up in the file's own order,
# so every row receives its own timestamp's prediction even if the file is
# not sorted. (.loc raises KeyError if a timestamp is missing.)
pred = pd.DataFrame(y_pred, index=submission.index, columns=submission.columns)
raw_times = pd.to_datetime(_submission_raw.index, format='%Y-%m-%d %H:%M')

# Build the answer as a new frame that reuses the raw index (original
# timestamp strings) and leaves the loaded raw submission unmodified.
answer = pd.DataFrame(pred.loc[raw_times].to_numpy(),
                      index=_submission_raw.index,
                      columns=_submission_raw.columns)
answer.to_csv('/'.join([path, 'ans.csv']))