In [1]:
!pip install h2o
!pip install pandasql



In [2]:
#=============================================================
# 패키지 로딩
#============================================================
import os
import sys
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import h2o
import numpy as np
import itertools 
import math
import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
from pandasql import sqldf

In [3]:
#============================================================
# 분석 환경 셋팅
#============================================================
sys.stdout.flush() #Python 메모리에 생성된 모든 객체 삭제(초기화)

#============================================================
# 작업 디렉토리 경로 확인
#============================================================
currentPath=os.getcwd()
print('Current working dir : %s' % currentPath)

Current working dir : C:\Users\user\content


In [4]:
#============================================================
# 기상 데이터 읽어오기 
#============================================================
ASOS = pd.read_csv(currentPath + "/input/ASOS_imput.csv", encoding='UTF-8') #loading weather data
BUOY_DP = pd.read_csv(currentPath + "/input/BUOY_DP_imput.csv", encoding='UTF-8') #loading weather data
HYCOM = pd.read_csv(currentPath + "/input/SEA_IVs_hycom_all.csv", encoding='UTF-8') #loading hycom data

ASOS = ASOS.rename(columns = {"WS_MAX":"WS_MAX_ASOS"})
BUOY_DP = BUOY_DP.rename(columns = {"WS_MAX":"WS_MAX_BD"})

#=============================================================
# 불러온 데이터 구조 확인하기
#=============================================================
ASOS
BUOY_DP
HYCOM

Unnamed: 0,YYMMDD,HAEGU_NUM,year,month,AVG_EMP,MAX_SSH,AVG_SURFACE_SALINITY_TREND,STDEV_SURFACE_TEMPERATURE_TREND,AVG_SALINITY_01,AVG_SALINITY_02,...,MAX_V_VELOCITY_03,MAX_V_VELOCITY_04,MIN_V_VELOCITY_01,MIN_V_VELOCITY_02,MIN_V_VELOCITY_03,MIN_V_VELOCITY_04,STDEV_V_VELOCITY_01,STDEV_V_VELOCITY_02,STDEV_V_VELOCITY_03,STDEV_V_VELOCITY_04
0,2008-09-20,97,2008,9,-0.000068,0.259959,0.019379,0.131165,31.6075,31.6081,...,-0.017062,-999.000000,-0.121754,-0.035654,-0.017062,-999.000000,0.032370,0.014972,-999.000000,-999.000000
1,2008-09-20,98,2008,9,0.000088,0.240687,0.061153,0.880088,32.8018,32.8243,...,0.072945,0.031431,-0.036622,-0.033818,-0.038480,-0.041347,0.034178,0.022088,0.040056,0.027100
2,2008-09-20,99,2008,9,0.000333,0.258393,-0.242847,0.178685,32.8512,32.8829,...,0.407543,0.444247,-0.272650,-0.173565,-0.014867,-0.079175,0.244142,0.187899,0.136900,0.143455
3,2008-09-20,213,2008,9,0.000091,0.249247,-0.293269,1.727980,32.4543,32.5524,...,0.131133,0.130061,-0.122278,-0.072258,-0.061687,-0.031458,0.066656,0.040829,0.068074,0.050827
4,2008-09-20,214,2008,9,0.000153,0.239578,-0.443171,1.554340,32.7638,32.8194,...,0.062962,0.065815,-0.143893,-0.088814,-0.074735,-0.009163,0.057059,0.039952,0.046266,0.029428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12750,2016-12-31,97,2016,12,-0.000072,0.141733,0.017642,0.016048,33.9187,33.9196,...,0.000251,-999.000000,-0.042890,-0.005009,0.000251,-999.000000,0.012369,0.008401,-999.000000,-999.000000
12751,2016-12-31,98,2016,12,-0.000191,0.151005,1.913620,1.995750,33.9292,33.9454,...,0.029539,0.023992,-0.060395,-0.038716,-0.015458,0.005838,0.029858,0.026927,0.016806,0.007840
12752,2016-12-31,99,2016,12,-0.000115,0.203799,0.427979,2.624260,33.8959,33.9022,...,0.422083,0.378267,-0.075958,-0.020391,-0.007830,0.003689,0.119245,0.115518,0.108240,0.098283
12753,2016-12-31,213,2016,12,-0.000265,0.186330,0.040789,0.050849,33.0969,33.0963,...,0.076891,0.060830,-0.091773,-0.044862,-0.021534,-0.015512,0.039020,0.034755,0.028141,0.022072


In [5]:
#=============================================================
# 테이블 결합 및 확인 
#=============================================================
HYCOM['YYMMDD'] = pd.to_datetime(HYCOM['YYMMDD'], format='%Y-%m-%d')
HYCOM[['year', 'month']] = HYCOM[['year', 'month']].apply(pd.to_numeric)
HYCOM.dtypes

ASOS['YYMMDD'] = pd.to_datetime(ASOS['YYMMDD'], format='%Y-%m-%d')
ASOS[['year', 'month']] = ASOS[['year', 'month']].apply(pd.to_numeric)
ASOS.dtypes

BUOY_DP['YYMMDD'] = pd.to_datetime(BUOY_DP['YYMMDD'], format='%Y-%m-%d')
BUOY_DP[['year', 'month']] = BUOY_DP[['year', 'month']].apply(pd.to_numeric)
BUOY_DP.dtypes

DT = pd.merge(HYCOM, ASOS, how='left', on=['HAEGU_NUM', 'YYMMDD', 'year', 'month'])
DT = pd.merge(DT, BUOY_DP, how='left', on=['HAEGU_NUM', 'YYMMDD', 'year', 'month'])

DT.columns.values

array(['YYMMDD', 'HAEGU_NUM', 'year', 'month', 'AVG_EMP', 'MAX_SSH',
       'AVG_SURFACE_SALINITY_TREND', 'STDEV_SURFACE_TEMPERATURE_TREND',
       'AVG_SALINITY_01', 'AVG_SALINITY_02', 'AVG_SALINITY_03',
       'AVG_SALINITY_04', 'MAX_SALINITY_01', 'MAX_SALINITY_02',
       'MAX_SALINITY_03', 'MAX_SALINITY_04', 'MIN_SALINITY_01',
       'MIN_SALINITY_02', 'MIN_SALINITY_03', 'MIN_SALINITY_04',
       'STDEV_SALINITY_01', 'STDEV_SALINITY_02', 'STDEV_SALINITY_03',
       'STDEV_SALINITY_04', 'AVG_TEMP_01', 'AVG_TEMP_02', 'AVG_TEMP_03',
       'AVG_TEMP_04', 'MAX_TEMP_01', 'MAX_TEMP_02', 'MAX_TEMP_03',
       'MAX_TEMP_04', 'MIN_TEMP_01', 'MIN_TEMP_02', 'MIN_TEMP_03',
       'MIN_TEMP_04', 'STDEV_TEMP_01', 'STDEV_TEMP_02', 'STDEV_TEMP_03',
       'STDEV_TEMP_04', 'AVG_U_VELOCITY_01', 'AVG_U_VELOCITY_02',
       'AVG_U_VELOCITY_03', 'AVG_U_VELOCITY_04', 'MAX_U_VELOCITY_01',
       'MAX_U_VELOCITY_02', 'MAX_U_VELOCITY_03', 'MAX_U_VELOCITY_04',
       'MIN_U_VELOCITY_01', 'MIN_U_VELOCITY_02'

In [6]:
#============================================================
# 적조 데이터 읽어오기 
#============================================================
redtide = pd.read_csv(currentPath + "/input/redtide.csv", sep='\t', encoding='UTF-8') #loading redtide data

# column 수정
redtide.rename(columns={redtide.columns[0]:"YYMMDD"}, inplace = True)
redtide.rename(columns={redtide.columns[5]:"LAT_r"}, inplace = True)
redtide.rename(columns={redtide.columns[6]:"LON_r"}, inplace = True)

redtide['YYMMDD'] = pd.to_datetime(redtide['YYMMDD'], format='%Y%m%d')
redtide['month'] = pd.to_numeric(redtide['YYMMDD'].dt.month)
redtide['year'] = pd.to_numeric(redtide['YYMMDD'].dt.year)

#============================================================
# 불러온 데이터 구조 확인하기 
#============================================================
redtide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2324 entries, 0 to 2323
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   YYMMDD     2324 non-null   datetime64[ns]
 1   Cochlo_YN  2324 non-null   int64         
 2   type       1697 non-null   object        
 3   Cell_min   2303 non-null   float64       
 4   Cell_max   2262 non-null   float64       
 5   LAT_r      2324 non-null   float64       
 6   LON_r      2324 non-null   float64       
 7   month      2324 non-null   int64         
 8   year       2324 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(3), object(1)
memory usage: 163.5+ KB


In [7]:
#=============================================================
# 해구 데이터 읽어오기
#=============================================================
HAEGU = pd.read_csv(currentPath + "/input/SEA_latlon.csv", sep='\t', encoding='UTF-8') #loading redtide data
HAEGU = HAEGU.sort_values(by=['HAEGU_NUM'])
HAEGU['row_h'] = HAEGU.index + 1

HAEGU.rename(columns={HAEGU.columns[1]:"LAT_h"}, inplace = True)
HAEGU.rename(columns={HAEGU.columns[2]:"LON_h"}, inplace = True)

#============================================================
# 불러온 데이터 구조 확인하기 
#============================================================
HAEGU.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1318 entries, 0 to 1317
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   HAEGU_NUM  1318 non-null   int64  
 1   LAT_h      1318 non-null   float64
 2   LON_h      1318 non-null   float64
 3   row_h      1318 non-null   int64  
dtypes: float64(2), int64(2)
memory usage: 51.5 KB


In [8]:
#=============================================================
# 테이블 결합 및 확인
#=============================================================
match=redtide[["LAT_r","LON_r"]].drop_duplicates()
match["row_r"]=match.index+1

def expand_grid(data_dict):
    rows = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(rows, columns=data_dict.keys())
tmp = expand_grid({'row_h' : HAEGU['row_h'], 'row_r' : match['row_r']})
np.shape(tmp)[0]# nrow(match)*nrow(HAEGU)= 668*1318

tmp = pd.merge(tmp, HAEGU, how='left', on=['row_h'])
tmp = pd.merge(tmp, match, how='left', on=['row_r'])

tmp['dist'] = np.hypot(tmp['LAT_h'].sub(tmp['LAT_r']), tmp['LON_h'].sub(tmp['LON_r']))

pysqldf = lambda q: sqldf(q, globals())
tmp = pysqldf("select HAEGU_NUM, LAT_h, LON_h, LAT_r, LON_r, min(dist) as dist from tmp group by row_r;")
redtide = pd.merge(redtide, tmp, how='left', on=['LAT_r','LON_r'])
              
redtide.info()
redtide.head(5)

#=============================================================
# HAEGU, period 선택
#=============================================================
#start = datetime.datetime.strptime("2008-09-19", "%Y-%m-%d").date()
#end = datetime.datetime.strptime("2016-12-31", "%Y-%m-%d").date()
start = np.datetime64("2008-09-19")
end = np.datetime64("2016-12-31")

redtide = redtide.loc[pd.to_datetime(redtide['YYMMDD']) >= start]
redtide = redtide.loc[pd.to_datetime(redtide['YYMMDD']) <= end]

redtide = redtide[redtide['month'].isin([5, 6, 7, 8, 9, 10, 11, 12])]
redtide = redtide[redtide['HAEGU_NUM'].isin([97, 98, 99, 213, 214])]

# 해구Num, 같은 날짜 여러 상태의 경우 Cochlo_YN값 MAX로 표출
redtide = redtide.groupby(['HAEGU_NUM', 'YYMMDD']).max()['Cochlo_YN'].reset_index()
redtide.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2324 entries, 0 to 2323
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   YYMMDD     2324 non-null   datetime64[ns]
 1   Cochlo_YN  2324 non-null   int64         
 2   type       1697 non-null   object        
 3   Cell_min   2303 non-null   float64       
 4   Cell_max   2262 non-null   float64       
 5   LAT_r      2324 non-null   float64       
 6   LON_r      2324 non-null   float64       
 7   month      2324 non-null   int64         
 8   year       2324 non-null   int64         
 9   HAEGU_NUM  2324 non-null   int64         
 10  LAT_h      2324 non-null   float64       
 11  LON_h      2324 non-null   float64       
 12  dist       2324 non-null   float64       
dtypes: datetime64[ns](1), float64(7), int64(4), object(1)
memory usage: 254.2+ KB


Unnamed: 0,HAEGU_NUM,YYMMDD,Cochlo_YN
0,97,2009-08-17,0
1,97,2009-10-28,1
2,97,2009-10-30,1
3,97,2010-07-19,0
4,97,2010-07-21,0


In [9]:
#=============================================================
# 기상 데이터, 적조 데이터 결합
#=============================================================
AB = pd.merge(DT, redtide, how='left', on=['YYMMDD', 'HAEGU_NUM'])
AB['Cochlo_YN'] = AB['Cochlo_YN'].fillna(0)

# 데이터 결측치 확인
AB.isnull().sum()

YYMMDD       0
HAEGU_NUM    0
year         0
month        0
AVG_EMP      0
            ..
WS_MAX_BD    0
WS_MIN       0
PS_MIN       0
WH_MAX       0
Cochlo_YN    0
Length: 87, dtype: int64

In [10]:
#=============================================================
# 데이터 전처리
#=============================================================
AB.describe()

# 기상변수에서 NA값이 -999로 처리된 경우 확인
AB[AB==-999]=np.nan

# 결측치 처리
AB = AB.fillna(AB.mean(numeric_only=True))

# 결측치 확인
AB.isnull().sum()

AB = AB.sort_values(by=['HAEGU_NUM', 'year', 'YYMMDD'], axis=0)

In [11]:
#=============================================================
# 파생변수 생성
#=============================================================
# 7일 평균값을 구함
AB['mean_AVG_EMP'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_EMP'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_SSH'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_SSH'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_SURFACE_SALINITY_TREND'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_SURFACE_SALINITY_TREND'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_SURFACE_TEMPERATURE_TREND'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_SURFACE_TEMPERATURE_TREND'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_SALINITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_SALINITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_SALINITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_SALINITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_SALINITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_SALINITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_SALINITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_SALINITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_SALINITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_SALINITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_SALINITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_SALINITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_SALINITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_SALINITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_SALINITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_SALINITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_SALINITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_SALINITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_SALINITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_SALINITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_SALINITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_SALINITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_SALINITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_SALINITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_SALINITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_SALINITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_SALINITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_SALINITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_SALINITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_SALINITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_SALINITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_SALINITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_TEMP_01'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_TEMP_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_TEMP_02'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_TEMP_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_TEMP_03'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_TEMP_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_TEMP_04'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_TEMP_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_TEMP_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_TEMP_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_TEMP_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_TEMP_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_TEMP_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_TEMP_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_TEMP_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_TEMP_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_TEMP_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_TEMP_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_TEMP_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_TEMP_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_TEMP_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_TEMP_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_TEMP_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_TEMP_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_TEMP_01'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_TEMP_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_TEMP_02'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_TEMP_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_TEMP_03'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_TEMP_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_TEMP_04'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_TEMP_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_U_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_U_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_U_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_U_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_U_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_U_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_U_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_U_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_U_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_U_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_U_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_U_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_U_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_U_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_U_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_U_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_U_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_U_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_U_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_U_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_U_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_U_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_U_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_U_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_U_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_U_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_U_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_U_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_U_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_U_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_U_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_U_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_V_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_V_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_V_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_V_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_V_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_V_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_AVG_V_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_V_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_V_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_V_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_V_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_V_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_V_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_V_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MAX_V_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_V_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_V_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_V_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_V_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_V_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_V_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_V_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_MIN_V_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['MIN_V_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_V_VELOCITY_01'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_V_VELOCITY_01'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_V_VELOCITY_02'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_V_VELOCITY_02'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_V_VELOCITY_03'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_V_VELOCITY_03'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
AB['mean_STDEV_V_VELOCITY_04'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_V_VELOCITY_04'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7

# 전처리 이전 변수 삭제
AB.drop(['AVG_EMP', 'MAX_SSH', 'AVG_SURFACE_SALINITY_TREND', 'STDEV_SURFACE_TEMPERATURE_TREND', 'AVG_SALINITY_01', 'AVG_SALINITY_02'
, 'AVG_SALINITY_03', 'AVG_SALINITY_04', 'MAX_SALINITY_01', 'MAX_SALINITY_02', 'MAX_SALINITY_03', 'MAX_SALINITY_04', 'MIN_SALINITY_01'
, 'MIN_SALINITY_02', 'MIN_SALINITY_03', 'MIN_SALINITY_04', 'STDEV_SALINITY_01', 'STDEV_SALINITY_02', 'STDEV_SALINITY_03'
, 'STDEV_SALINITY_04', 'AVG_TEMP_01', 'AVG_TEMP_02', 'AVG_TEMP_03', 'AVG_TEMP_04', 'MAX_TEMP_01', 'MAX_TEMP_02', 'MAX_TEMP_03'
, 'MAX_TEMP_04', 'MIN_TEMP_01', 'MIN_TEMP_02', 'MIN_TEMP_03', 'MIN_TEMP_04', 'STDEV_TEMP_01', 'STDEV_TEMP_02', 'STDEV_TEMP_03'
, 'STDEV_TEMP_04', 'AVG_U_VELOCITY_01', 'AVG_U_VELOCITY_02', 'AVG_U_VELOCITY_03', 'AVG_U_VELOCITY_04', 'MAX_U_VELOCITY_01'
, 'MAX_U_VELOCITY_02', 'MAX_U_VELOCITY_03', 'MAX_U_VELOCITY_04', 'MIN_U_VELOCITY_01', 'MIN_U_VELOCITY_02', 'MIN_U_VELOCITY_03'
, 'MIN_U_VELOCITY_04', 'STDEV_U_VELOCITY_01', 'STDEV_U_VELOCITY_02', 'STDEV_U_VELOCITY_03', 'STDEV_U_VELOCITY_04'
, 'AVG_V_VELOCITY_01', 'AVG_V_VELOCITY_02', 'AVG_V_VELOCITY_03', 'AVG_V_VELOCITY_04', 'MAX_V_VELOCITY_01', 'MAX_V_VELOCITY_02'
, 'MAX_V_VELOCITY_03', 'MAX_V_VELOCITY_04', 'MIN_V_VELOCITY_01', 'MIN_V_VELOCITY_02', 'MIN_V_VELOCITY_03', 'MIN_V_VELOCITY_04'
, 'STDEV_V_VELOCITY_01', 'STDEV_V_VELOCITY_02', 'STDEV_V_VELOCITY_03', 'STDEV_V_VELOCITY_04'], axis='columns', inplace=True)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_AVG_EMP'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_EMP'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_MAX_SSH'] = AB.groupby(['HAEGU_NUM', 'year'])['MAX_SSH'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_AVG_SURFACE_SALINITY_TREND'] = AB.groupby(['HAEGU_NUM', 'year'])['AVG_SURFACE_SALINITY_TREND'].apply(lambda x : x.rolling(7).sum().shift(1)) / 7
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_STDEV_SURFACE_TEMPERATURE_TREND'] = AB.groupby(['HAEGU_NUM', 'year'])['STDEV_SURFACE_TEMPERATURE_TREND'].apply(lambda x : x.rolling(7).su

In [12]:
# 14일 평균값을 구함
AB['mean_TA_MAX'] = AB.groupby(['HAEGU_NUM', 'year'])['TA_MAX'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_WS_MAX_ASOS'] = AB.groupby(['HAEGU_NUM', 'year'])['WS_MAX_ASOS'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_WS_MAX_BD'] = AB.groupby(['HAEGU_NUM', 'year'])['WS_MAX_BD'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_WS_INS'] = AB.groupby(['HAEGU_NUM', 'year'])['WS_INS'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_WS_MIN'] = AB.groupby(['HAEGU_NUM', 'year'])['WS_MIN'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_HM_AVG'] = AB.groupby(['HAEGU_NUM', 'year'])['HM_AVG'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_EV_L'] = AB.groupby(['HAEGU_NUM', 'year'])['EV_L'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_PA_AVG'] = AB.groupby(['HAEGU_NUM', 'year'])['PA_AVG'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_PS_MIN'] = AB.groupby(['HAEGU_NUM', 'year'])['PS_MIN'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
AB['mean_WH_MAX'] = AB.groupby(['HAEGU_NUM', 'year'])['WH_MAX'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14

# 전처리 이전 변수 삭제
AB.drop(['TA_MAX', 'WS_MAX_ASOS', 'WS_MAX_BD', 'WS_INS', 'WS_MIN', 'HM_AVG', 'EV_L', 'PA_AVG', 'PS_MIN', 'WH_MAX'], axis='columns', inplace=True)

# 14일 누적치를 구함
AB['sum_SS_DAY'] = AB.groupby(['HAEGU_NUM', 'year'])['SS_DAY'].apply(lambda x : x.rolling(14).sum().shift(1))
AB['sum_RN_DAY'] = AB.groupby(['HAEGU_NUM', 'year'])['RN_DAY'].apply(lambda x : x.rolling(14).sum().shift(1))
AB['sum_SI_DAY'] = AB.groupby(['HAEGU_NUM', 'year'])['SI_DAY'].apply(lambda x : x.rolling(14).sum().shift(1))
AB['sum_RN_DUR'] = AB.groupby(['HAEGU_NUM', 'year'])['RN_DUR'].apply(lambda x : x.rolling(14).sum().shift(1))
AB.drop(['SS_DAY', 'RN_DAY', 'SI_DAY', 'RN_DUR'], axis='columns', inplace=True)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_TA_MAX'] = AB.groupby(['HAEGU_NUM', 'year'])['TA_MAX'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_WS_MAX_ASOS'] = AB.groupby(['HAEGU_NUM', 'year'])['WS_MAX_ASOS'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_WS_MAX_BD'] = AB.groupby(['HAEGU_NUM', 'year'])['WS_MAX_BD'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  AB['mean_WS_INS'] = AB.groupby(['HAEGU_NUM', 'year'])['WS_INS'].apply(lambda x : x.rolling(14).sum().shift(1)) / 14
To preserve the previous behavior, use

	>>> .groupb

In [13]:
# time interval create
AB['Cochlo_YN'] = AB.groupby(['HAEGU_NUM', 'year'])['Cochlo_YN'].shift(-7)

#start = datetime.datetime.strptime("2008-10-04", "%Y-%m-%d").date()
#end = datetime.datetime.strptime("2016-12-31", "%Y-%m-%d").date()
start = np.datetime64("2008-10-04")
end = np.datetime64("2016-12-31")

AB = AB.loc[pd.to_datetime(AB['YYMMDD']) >= start]
AB = AB.loc[pd.to_datetime(AB['YYMMDD']) <= end]

AB = AB[AB['month'].isin([5, 6, 7, 8, 9, 10, 11])]

#======================================================================================================
#메모리 용량 줄이기
#======================================================================================================
del(start, end)
del(ASOS,BUOY_DP,DT,HAEGU,HYCOM,redtide,tmp,match)

AB.info()
AB.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8850 entries, 70 to 12599
Data columns (total 87 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   YYMMDD                                8850 non-null   datetime64[ns]
 1   HAEGU_NUM                             8850 non-null   int64         
 2   year                                  8850 non-null   int64         
 3   month                                 8850 non-null   int64         
 4   Cochlo_YN                             8850 non-null   float64       
 5   mean_AVG_EMP                          8850 non-null   float64       
 6   mean_MAX_SSH                          8850 non-null   float64       
 7   mean_AVG_SURFACE_SALINITY_TREND       8850 non-null   float64       
 8   mean_STDEV_SURFACE_TEMPERATURE_TREND  8850 non-null   float64       
 9   mean_AVG_SALINITY_01                  8850 non-null   float64       
 10

Unnamed: 0,YYMMDD,HAEGU_NUM,year,month,Cochlo_YN,mean_AVG_EMP,mean_MAX_SSH,mean_AVG_SURFACE_SALINITY_TREND,mean_STDEV_SURFACE_TEMPERATURE_TREND,mean_AVG_SALINITY_01,...,mean_WS_MIN,mean_HM_AVG,mean_EV_L,mean_PA_AVG,mean_PS_MIN,mean_WH_MAX,sum_SS_DAY,sum_RN_DAY,sum_SI_DAY,sum_RN_DUR
70,2008-10-04,97,2008,10,0.0,-0.000148,0.208562,0.07917,0.137183,31.689443,...,3.190673,66.689079,2.931534,1001.556916,1010.343008,3.358109,54.180084,21.338384,1.080137,8.321869
75,2008-10-05,97,2008,10,0.0,-0.00014,0.201662,0.107633,0.216339,31.711071,...,3.100917,65.40103,2.908604,1001.814943,1010.726256,3.304844,57.397615,20.683448,1.097392,7.218716
80,2008-10-06,97,2008,10,0.0,-0.000136,0.21066,0.106639,0.213024,31.730586,...,3.107218,65.120337,2.817988,1001.988595,1010.962944,2.762914,55.369954,1.10148,1.080196,3.840809
85,2008-10-07,97,2008,10,0.0,-0.000132,0.200848,0.130929,0.261637,31.746943,...,3.037457,63.842112,2.86857,1002.129137,1011.180098,2.720642,52.81663,1.102438,1.051451,3.841749
90,2008-10-08,97,2008,10,0.0,-0.000137,0.208403,0.159245,0.304827,31.778571,...,3.190235,62.286809,2.902726,1002.464811,1011.523714,2.703223,55.182198,0.891279,1.076749,3.822488


In [14]:
#=============================================================
# 분석
#=============================================================
#2o.cluster().shutdown()
h2o.init(max_mem_size = "4G", nthreads = 1, port=54321)
#h2o.init()
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.19+9-LTS-224, mixed mode)
  Starting server from C:\Users\user\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\user\AppData\Local\Temp\tmprwvs103q
  JVM stdout: C:\Users\user\AppData\Local\Temp\tmprwvs103q\h2o_user_started_from_python.out
  JVM stderr: C:\Users\user\AppData\Local\Temp\tmprwvs103q\h2o_user_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.1
H2O_cluster_version_age:,23 days
H2O_cluster_name:,H2O_from_python_user_qfefht
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,1


In [15]:
# setting AB data
dataXY = AB
dataXY = dataXY.drop(['HAEGU_NUM', 'YYMMDD', 'month', 'year'], axis=1)
dataXY = dataXY.rename(columns = {"Cochlo_YN":"Y"})
dataXY.reset_index(drop=True, inplace=True)
dataXY.info()

# train, valid, test 데이터 분리
## split to train, valid, test 
dataXY = h2o.H2OFrame(dataXY)
train_data, valid_data, test_data = dataXY.split_frame(ratios=[0.7,0.15], seed=1111)

## 독립변수, 종속변수 설정(x: 독립변수, y: 종속변수)
x = dataXY.columns
x.remove('Y')
y='Y'

train_data['Y'] = train_data['Y'].asfactor()
valid_data['Y'] = valid_data['Y'].asfactor()
test_data['Y'] = test_data['Y'].asfactor()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8850 entries, 0 to 8849
Data columns (total 83 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Y                                     8850 non-null   float64
 1   mean_AVG_EMP                          8850 non-null   float64
 2   mean_MAX_SSH                          8850 non-null   float64
 3   mean_AVG_SURFACE_SALINITY_TREND       8850 non-null   float64
 4   mean_STDEV_SURFACE_TEMPERATURE_TREND  8850 non-null   float64
 5   mean_AVG_SALINITY_01                  8850 non-null   float64
 6   mean_AVG_SALINITY_02                  8850 non-null   float64
 7   mean_AVG_SALINITY_03                  8850 non-null   float64
 8   mean_AVG_SALINITY_04                  8850 non-null   float64
 9   mean_MAX_SALINITY_01                  8850 non-null   float64
 10  mean_MAX_SALINITY_02                  8850 non-null   float64
 11  mean_MAX_SALINITY

In [16]:
# -----------------------------------------------------------------------------
# 모형 튜닝 자동화
# -----------------------------------------------------------------------------
# cartesian grid search 
# -----------------------------------------------------------------------------
hyper_parameters = {'max_depth': [4, 6, 8, 12, 16, 20]}

# 조합 모형 돌리기
m = H2OGridSearch(H2ORandomForestEstimator,
                  hyper_params=hyper_parameters,
                  search_criteria={'strategy': "Cartesian"},
                  grid_id='RF_depth_grid')

m.train(x = x,
        y = y,
        training_frame = train_data,
        validation_frame = valid_data,
        ntrees = 10000,
        stopping_rounds = 5,
        stopping_tolerance = 1e-4,
        stopping_metric = 'AUC',
        score_tree_interval = 5,
        seed=1111)

# AUC가 높은 순으로 정렬하기
sortedGrid = m.get_grid(sort_by='auc', decreasing=True)

print('===== sortedGrid =====')
print(sortedGrid)

drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%
===== sortedGrid =====
Hyper-Parameter Search Summary: ordered by decreasing auc
    max_depth    model_ids              auc
--  -----------  ---------------------  --------
    12           RF_depth_grid_model_4  0.983323
    8            RF_depth_grid_model_3  0.976329
    16           RF_depth_grid_model_5  0.976284
    20           RF_depth_grid_model_6  0.973914
    6            RF_depth_grid_model_2  0.970839
    4            RF_depth_grid_model_1  0.960837


In [17]:
## 모형 튜닝 자동화
minDepth = 12
maxDepth = 16

# options for grid search 
max_runtime_secs = 60*10
max_models = 100

# random grid search 
hyper_params = {
    'max_depth': list(range(minDepth, maxDepth + 1)),
    'sample_rate': [i * 0.01 for i in range(20, 100 + 1)],
    'col_sample_rate_per_tree': [i * 0.01 for i in range(20, 100 + 1)],
    'col_sample_rate_change_per_level': [i * 0.01 for i in range(90, 110 + 1)],
    'min_rows': [1,5,10,20,50,100],
    'min_split_improvement': [0,1e-8,1e-6,1e-4],
    'histogram_type': ['UniformAdaptive', 'QuantilesGlobal', 'RoundRobin']
}

search_criteria = {
    'strategy': "RandomDiscrete",
    'max_runtime_secs': max_runtime_secs,
    'max_models': max_models
}

grid = H2OGridSearch(H2ORandomForestEstimator
                     , hyper_params=hyper_parameters
                     , search_criteria=search_criteria
                     , grid_id='RF_grid')
grid.train(
      x = x
    , y = y
    , training_frame = train_data
    , validation_frame = valid_data
    , ntrees = 10000
    , stopping_rounds = 5
    , stopping_tolerance = 1e-4
    , stopping_metric = 'AUC'
    , score_tree_interval = 5
    , seed = 1111
)

# AUC가 높은 순으로 정렬하기
sortedGrid = grid.get_grid(sort_by='auc', decreasing=True)
RF_AB_Tune = h2o.get_model(sortedGrid.model_ids[1])
print(RF_AB_Tune)

drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%
Model Details
H2ORandomForestEstimator : Distributed Random Forest
Model Key: RF_grid_model_5


Model Summary: 
    number_of_trees    number_of_internal_trees    model_size_in_bytes    min_depth    max_depth    mean_depth    min_leaves    max_leaves    mean_leaves
--  -----------------  --------------------------  ---------------------  -----------  -----------  ------------  ------------  ------------  -------------
    60                 60                          34577                  8            8            8             22            60            41.1667

ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.009404560154279543
RMSE: 0.09697711149688644
LogLoss: 0.041451591285602274
Mean Per-Class Error: 0.3121981036436281
AUC: 0.9285000099389746
AUCPR: 0.45444833270136004
Gini: 0.8570000198779493

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.27116272666237573
  

In [18]:
#======================================================================================================
# forecast
#======================================================================================================
pred= RF_AB_Tune.predict(test_data)
pred=pred.as_data_frame()
test_data=test_data.as_data_frame()

pred = pd.concat([pred['predict'], pred['p1'], test_data['Y']], axis=1)
pred.columns = ['Yhat','p1','Y']

#confusion matrix
print(confusion_matrix(pred['Yhat'],pred['Y']),
      classification_report(pred['Yhat'],pred['Y']))

drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
[[1295    7]
 [  14    9]]               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1302
           1       0.56      0.39      0.46        23

    accuracy                           0.98      1325
   macro avg       0.78      0.69      0.73      1325
weighted avg       0.98      0.98      0.98      1325

