# load data

In [None]:
# import package
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GRU, LSTM, Dense, Dropout
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
import datetime
import inspect
from pickle import dump, load

In [None]:
# Load the training data (cp949-encoded CSV) and drop the 'Unnamed: 0' column.
train_raw_df = pd.read_csv(
    'e:/kma/data/TRAIN_tree_with_파생변수_euc-kr0804 (1).csv',
    encoding='cp949',
)
train_raw_df.drop(columns=['Unnamed: 0'], inplace=True)
col_list = train_raw_df.columns

# astype() returns a new frame, so train_raw_df itself keeps its original
# 'yyyymmdd' dtype; only full_raw_df gets the parsed datetime column.
full_raw_df = train_raw_df.astype({'yyyymmdd': 'str'})
full_raw_df['yyyymmdd'] = pd.to_datetime(full_raw_df['yyyymmdd'])

# Distinct values of the 'add' column, and every column other than
# 'yyyymmdd', 'add', 'sex' and 'frequency'.
add_list = set(full_raw_df['add'])
indep_cols = full_raw_df.columns.difference(['yyyymmdd', 'add', 'sex', 'frequency'])

# Function definitions

#### standard scaling

In [None]:
def scaling(full_data):
    """Standardize the feature columns and the target of one data subset.

    Fits one ``StandardScaler`` on ``full_data[indep_cols]`` and a second one
    on ``full_data['frequency']``, pickles both scalers under
    ``e:/kma/scaler/`` and returns them together with the scaled data.

    Relies on the module-level names ``indep_cols``, ``model_nm``, ``sido``,
    ``sex`` and ``nowDatetime`` being defined before the call; the last four
    are only used to build the pickle file names.

    Args:
        full_data: DataFrame containing the ``indep_cols`` columns and a
            ``frequency`` column.

    Returns:
        Tuple ``(scaler_x, scaler_y, scaled_df)``. ``scaled_df`` holds the
        scaled feature columns plus the scaled ``frequency`` column and is
        built from the transformed array, so it carries a fresh default index.
    """
    scaler_x = StandardScaler()
    scaled_df = pd.DataFrame(
        scaler_x.fit_transform(full_data[indep_cols]),
        columns=indep_cols,
    )

    # The target gets its own scaler so predictions can be inverse-transformed
    # independently of the features.
    scaler_y = StandardScaler()
    scaled_df['frequency'] = scaler_y.fit_transform(
        full_data['frequency'].values.reshape(-1, 1)
    )

    # Write the pickles through context managers so each file is flushed and
    # closed right away instead of leaving the handle open.
    suffix = '{0}_{1}_{2}_{3}.pkl'.format(model_nm, sido, sex, nowDatetime)
    with open('e:/kma/scaler/scaler_x_' + suffix, 'wb') as fh:
        dump(scaler_x, fh)
    with open('e:/kma/scaler/scaler_y_' + suffix, 'wb') as fh:
        dump(scaler_y, fh)

    return scaler_x, scaler_y, scaled_df

#### lstm에 맞게 데이터 구조 변환

In [None]:
def make_sequene_dataset(feature, label, window_size):
    """Cut a series into sliding windows paired with the next-step label.

    Used by ``pre_processing``. Window ``i`` is ``feature[i:i + window_size]``
    and its label is ``label[i + window_size]``, i.e. the entry immediately
    after the window.

    Args:
        feature: Sliceable sequence of feature rows (a NumPy array in the
            notebook).
        label: Indexable sequence of labels aligned with ``feature``.
        window_size: Number of consecutive rows in each window.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays holding
        ``len(feature) - window_size`` windows and labels; both are empty when
        that count is not positive.
    """
    n_windows = len(feature) - window_size

    windows = [feature[start:start + window_size] for start in range(n_windows)]
    targets = [label[start + window_size] for start in range(n_windows)]

    return np.array(windows), np.array(targets)

In [None]:
def pre_processing(scaled_df, window_size):
    """Turn a scaled DataFrame into windowed arrays for the sequence model.

    Takes the ``indep_cols`` columns (module-level name) as features and the
    ``frequency`` column as the label, converts both to NumPy and windows them
    with ``make_sequene_dataset``. The array shapes are printed before and
    after windowing.

    Args:
        scaled_df: DataFrame with the ``indep_cols`` columns and a
            ``frequency`` column.
        window_size: Number of consecutive rows per input window.

    Returns:
        Tuple ``(X, Y)`` as returned by ``make_sequene_dataset``.
    """
    # Features and label go straight to NumPy; the label keeps a trailing
    # axis of length 1 because it is selected as a one-column frame.
    feature_np = pd.DataFrame(scaled_df, columns=indep_cols).to_numpy()
    label_np = scaled_df[['frequency']].to_numpy()
    print(feature_np.shape, label_np.shape)

    # Build the time-series windows.
    X, Y = make_sequene_dataset(feature_np, label_np, window_size)
    print(X.shape, Y.shape)

    return X, Y

#### lstm 모델 생성 및 compile

In [None]:
from tensorflow.keras import backend as K
def root_mean_squared_error(y_true, y_pred):
    """Keras loss: square root of the mean squared difference.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Predicted tensor.

    Returns:
        Tensor produced by ``K.sqrt(K.mean(K.square(y_pred - y_true)))``; the
        mean is taken over all elements because no axis is given.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))


def make_model(train_x_data):
    """Build and compile the LSTM regression network.

    The stack is LSTM(256) -> Dropout(0.3), followed by Dense/Dropout pairs of
    128/0.2, 64/0.2, 32/0.2 and 16/0.1 (all ReLU), and a single linear output
    unit. The model is compiled with ``root_mean_squared_error`` as the loss,
    the ``rmsprop`` optimizer and ``mse`` as a reported metric.

    Args:
        train_x_data: Training inputs; the shape of its first sample is used
            as the LSTM ``input_shape``.

    Returns:
        The compiled ``Sequential`` model.
    """
    sample_shape = train_x_data[0].shape

    layers = [
        LSTM(256, activation='relu', input_shape=sample_shape),
        Dropout(0.3),
    ]
    # Fully connected head: (units, dropout rate) for each hidden block.
    for units, drop_rate in ((128, 0.2), (64, 0.2), (32, 0.2), (16, 0.1)):
        layers.append(Dense(units, activation='relu'))
        layers.append(Dropout(drop_rate))
    layers.append(Dense(1, activation='linear'))

    model = Sequential(layers)
    model.compile(loss=root_mean_squared_error, optimizer='rmsprop', metrics=['mse'])

    return model

# 시도별 성별 lstm 모델 생성

In [None]:
# Train one LSTM per (sido, sex) pair and record its RMSE on the held-out tail.
now = datetime.datetime.now()
nowDatetime = now.strftime('%m%d_%H%M')  # run tag used in every output file name

LB_result = pd.DataFrame(columns=['yyyymmdd', 'sido', 'sex', 'frequency'])

model_nm = "lstm"

# Rows are collected in a list and turned into a DataFrame once, after the
# loop. Concatenating onto an empty frame inside the loop re-copied the data
# each time and left every row with index 0.
result_rows = []

for sido in add_list:
    for sex in (1, 2):
        print("============================", sido, sex, "============================")

        # Boolean indexing already returns a new frame, so the full data set
        # does not need to be copied first. The stable sort by date is a no-op
        # for data that is already chronological and guarantees the order the
        # sliding windows and the tail split below depend on.
        condition = (full_raw_df['add'] == sido) & (full_raw_df['sex'] == sex)
        tmp = (
            full_raw_df[condition]
            .sort_values('yyyymmdd', kind='stable')
            .reset_index(drop=True)
        )

        # Scale, then reshape into 7-step windows for the LSTM.
        # NOTE(review): the scalers are fitted on the whole subset, including
        # the rows later used for validation.
        scaler_x, scaler_y, scaled_df = scaling(tmp)
        X, Y = pre_processing(scaled_df, 7)

        # Hold out the last 365 windows for validation (the original note
        # describes this as 2012-2014 train / 2015 valid).
        x_train, y_train = X[:-365], Y[:-365]
        x_valid, y_valid = X[-365:], Y[-365:]

        # Drop Keras global state left over from the previous model so memory
        # does not keep growing while many models are built in one process.
        tf.keras.backend.clear_session()
        model = make_model(x_train)

        # NOTE(review): without restore_best_weights=True the model keeps the
        # weights of the last epoch run, not those of the best val_loss epoch.
        early_stop = EarlyStopping(monitor='val_loss', patience=30)
        history = model.fit(x_train, y_train,
                            validation_data=(x_valid, y_valid),
                            epochs=500, batch_size=512,
                            verbose=1,
                            callbacks=[early_stop])
        # Uncomment to persist each trained model:
        # model.save('e:/kma/model/{0}_{1}_{2}_{3}.h5'.format(model_nm, sido, sex, nowDatetime))

        # Evaluate on the validation windows in the original target scale.
        pred_valid = model.predict(x_valid)
        rescaled_y_valid = scaler_y.inverse_transform(np.array(y_valid).reshape(-1, 1))
        rescaled_pred_valid = scaler_y.inverse_transform(np.array(pred_valid).reshape(-1, 1))

        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` argument is deprecated and absent from recent scikit-learn.
        rmse = np.sqrt(mean_squared_error(rescaled_y_valid, rescaled_pred_valid))
        print(rmse)
        result_rows.append([sido, sex, rmse])

# Save the validation RMSE of every model.
result = pd.DataFrame(result_rows, columns=['sido', 'sex', 'rmse'])
result.to_csv('e:/kma/rmse/{0}_{1}.csv'.format(model_nm, nowDatetime), encoding = 'utf-8-sig')

In [None]:
def train(model, custom_loss, x_concat_data, y_concat_data, epoch=200, batch_size=24, n_splits=5, kf_shuffle=True):
    """Fit ``model`` fold by fold with K-fold splits and collect fold scores.

    The model is compiled once with the mean-absolute-percentage-error loss
    and Adam (learning rate 0.001), then fitted on each training split and
    evaluated on the matching validation split.

    NOTE(review): ``custom_loss`` is accepted but never used; the loss is
    hard-coded. The same model instance is also trained across all folds
    without being re-initialized, so later folds start from weights that have
    already seen their validation rows.

    Args:
        model: Keras model to compile and train (modified in place).
        custom_loss: Currently ignored.
        x_concat_data: Input array, indexable by an integer index array.
        y_concat_data: Target array aligned with ``x_concat_data``.
        epoch: Epochs per fold.
        batch_size: Batch size for fitting and evaluation.
        n_splits: Number of K-fold splits.
        kf_shuffle: Whether ``KFold`` shuffles before splitting.

    Returns:
        Tuple ``(model, accuracy)`` where ``accuracy`` is the list of
        ``model.evaluate`` results, one per fold (loss values, since no extra
        metrics are compiled in).
    """
    from sklearn.model_selection import KFold
    # Adam was referenced without being imported anywhere in view; import it
    # locally so the function does not fail with a NameError.
    from tensorflow.keras.optimizers import Adam

    skf = KFold(n_splits=n_splits, shuffle=kf_shuffle)

    with tf.device('/gpu:0'):
        accuracy = []
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(loss='mean_absolute_percentage_error', optimizer=Adam(learning_rate=0.001))
        # Index names chosen so they do not shadow this function's own name.
        for train_idx, valid_idx in skf.split(x_concat_data, y_concat_data):
            print('train valid rate :', len(train_idx), len(valid_idx))
            model.fit(x_concat_data[train_idx], y_concat_data[train_idx],
                      batch_size=batch_size, epochs=epoch, verbose=2, shuffle=True)

            score = model.evaluate(x_concat_data[valid_idx], y_concat_data[valid_idx],
                                   batch_size=batch_size)
            accuracy.append(score)

        print('\nK-fold cross validation score: {}'.format(accuracy))

    return model, accuracy

In [None]:
def objective_xgb(trial):
    """Return the mean 5-fold validation RMSE of an XGBoost regressor.

    Reads ``x_full_Gangwon_1``, ``y_full_Gangwon_1``, ``params`` and ``xgb``
    from the enclosing module scope; none of them is defined in this function.

    NOTE(review): ``trial`` is not used, so every call evaluates the same
    ``params`` -- confirm whether hyper-parameters were meant to be drawn from
    it. ``StratifiedKFold`` stratifies on class labels; confirm that the
    target is discrete, otherwise ``KFold`` is the appropriate splitter.

    Args:
        trial: Tuning trial object (currently unused).

    Returns:
        Mean of the per-fold RMSE values, each rounded to 6 decimals.
    """
    str_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)

    rmses = []
    for train_index, test_index in str_kf.split(x_full_Gangwon_1, y_full_Gangwon_1):
        x_train, x_val = x_full_Gangwon_1.iloc[train_index], x_full_Gangwon_1.iloc[test_index]
        y_train, y_val = y_full_Gangwon_1.iloc[train_index], y_full_Gangwon_1.iloc[test_index]

        model = xgb.XGBRegressor(**params, random_state=6, use_label_encoder=False)
        model.fit(x_train, y_train)

        y_pred = model.predict(x_val)
        # np.sqrt (NumPy is imported at file level) replaces a bare `sqrt`
        # that is not imported anywhere in view.
        rmses.append(round(np.sqrt(mean_squared_error(y_val, y_pred)), 6))

    # Average once after all folds instead of recomputing it in every fold.
    return np.mean(rmses)

In [1]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split