In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import tensorflow as tf

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from tensorflow.keras.optimizers import Adam, Adamax, RMSprop, SGD

# Early stoping
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

# Pustaka untuk  evaluasi model prediksi
from math import sqrt
from sklearn.metrics import mean_squared_error

# pustaka untuk waktu komputasi
import time
from datetime import datetime

In [2]:
# Set waktu komputasi
start = time.time()
# fix random seed for reproducibility
np.random.seed(1)

### Read dataset

In [3]:
# baca dataset bmkg
df_bmkg = pd.read_csv(r'D:\UIN\BigData\UAS\hasil\bmkg_2001_2020_sumsel_clipped.csv')
df_bmkg = df_bmkg.drop(columns=['Unnamed: 0'])
bmkg_values = df_bmkg[['Suhu', 'Kelembapan', 'Curah_Hujan', 'Radiasi_Matahari', 'Kecepatan_Angin', 'hotspot']].values
bmkg_values = bmkg_values.astype('float64')
# print(bmkg_values)

# df yang disimpan memiliki 3 buah titik(long, lat) untuk tiap tanggal
# tiap param akan dikalkulasi meannya
# sehingga tidak ada duplikasi tanggal

# df_ceda = pd.read_csv(r'D:\UIN\BigData\UAS\hasil\ceda_2001_2020_sumsel_clipped.csv')
# df_ceda = df_ceda.drop(columns=['field_1', 'latitude', 'longitude'])
# df_ceda = df_ceda.groupby('date').mean()

# baca dataset ceda
df_ceda = pd.read_csv(r'D:\UIN\BigData\UAS\hasil\ceda_2001_2020_sumsel_clipped2.csv')
# copy hotspot karena variabel terikat sama
df_ceda['hotspot'] = df_bmkg['hotspot']
ceda_values = df_ceda[['sld', 'dtr', 'frs', 'pet', 'pre', 'tmn', 'tmp', 'tmx', 'vap', 'wet', 'hotspot']].values
ceda_values = ceda_values.astype('float64')

### Normalisasi

In [4]:
# # membuat fungsi max-min scaler
# # menggunakan pustaka scikit-learn
# def normalisasi_max_min(df):
#     # memanggil fungsi max min scaler
#     hasil = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    
#     # proses max min scaler
#     hasil = hasil.fit_transform(df)
    
#     # pengembalian nilai
#     return hasil

In [5]:
# dataset bmkg
scaler = MinMaxScaler(feature_range=(-1, 1))
bmkg_scaled = scaler.fit_transform(bmkg_values)
# df_bmkg[['Suhu', 'Kelembapan', 'Curah_Hujan', 'Radiasi_Matahari', 'Kecepatan_Angin']] = scaler.fit_transform(df_bmkg[['Suhu', 'Kelembapan', 'Curah_Hujan', 'Radiasi_Matahari', 'Kecepatan_Angin']])
# print(df_bmkg)

# dataset ceda
ceda_scaled = scaler.fit_transform(ceda_values)
print(ceda_scaled)
# df_ceda[['sld', 'dtr', 'frs', 'pet', 'pre', 'tmn', 'tmp', 'tmx', 'vap', 'wet']] = scaler.fit_transform(df_ceda[['sld', 'dtr', 'frs', 'pet', 'pre', 'tmn', 'tmp', 'tmx', 'vap', 'wet']])
# print(df_ceda)

[[ 0.91267695 -0.47547545 -1.         ... -0.27302359  0.54204029
  -1.        ]
 [ 0.57151459 -0.39939941 -1.         ... -0.1230644   0.11261673
  -0.99972291]
 [ 0.30532982  0.13713718 -1.         ...  0.10676448  0.48491489
  -0.9990302 ]
 ...
 [ 0.38361943  0.24924927 -1.         ... -0.62673188  0.85702707
  -0.99487393]
 [ 0.85727195 -0.11311313 -1.         ... -0.17522418  0.69471125
  -0.99625935]
 [ 0.79855474 -0.63963967 -1.         ... -0.60065193  0.67956636
  -0.99930729]]


### Supervised

In [6]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    
    # return value
    return agg

In [7]:
# supervised learning

# bmkg
bmkg_reframed = series_to_supervised(bmkg_scaled, 1, 1)
# drop kolom yg tdk diprediksi
bmkg_reframed.drop(bmkg_reframed.columns[[7, 8, 9, 10, 11]], axis=1, inplace=True)
bmkg_values = bmkg_reframed.values
print('bmkg\n',bmkg_reframed.head())

# ceda
ceda_reframed = series_to_supervised(ceda_scaled, 1, 1)
# drop kolom yg tdk diprediksi
ceda_reframed.drop(ceda_reframed.columns[[12, 13, 14, 15, 16, 17, 18, 19, 20, 21]], axis=1, inplace=True)
ceda_values = ceda_reframed.values
print('ceda\n',ceda_reframed.head())

bmkg
    var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  var5(t-1)  var6(t-1)   var1(t)
1  -0.834586   0.718575   0.114865  -0.415736   0.187500  -1.000000 -0.553859
2  -0.553859   0.448491  -0.372299  -0.094113   0.176339  -0.999723 -0.385607
3  -0.385607   0.295158  -0.123606  -0.009608  -0.625000  -0.999030 -0.053921
4  -0.053921   0.324631   0.120753   0.197369  -0.827778  -0.997645  0.013963
5   0.013963   0.488186  -0.619203   0.294729  -0.333333  -0.995012 -0.193770
ceda
    var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  var5(t-1)  var6(t-1)  \
1   0.912677  -0.475475       -1.0  -0.556291   0.573809  -0.544056   
2   0.571515  -0.399399       -1.0  -0.231788  -0.000987  -0.454545   
3   0.305330   0.137137       -1.0   0.072848   0.337736  -0.401399   
4   0.239386   0.209209       -1.0  -0.238411   0.529485  -0.255944   
5   0.062030   0.167167       -1.0  -0.496689  -0.522678   0.135664   

   var7(t-1)  var8(t-1)  var9(t-1)  var10(t-1)  var11(t-1)   var1(t)  
1  -0.716129  -0.59325

### Pembagian data

In [8]:
# Split Validasi(train 80%, test 20%)

## dataset bmkg
# split into train and test sets
train_size = int(len(bmkg_values) * 0.8)
test_size = len(bmkg_values) - train_size
train_bmkg, test_bmkg = bmkg_values[0:train_size,:], bmkg_values[train_size:len(bmkg_values),:]

# split into input and outputs
x_train_bmkg, y_train_bmkg = train_bmkg[:, :-1], train_bmkg[:, -1]
x_test_bmkg, y_test_bmkg = test_bmkg[:, :-1], test_bmkg[:, -1]

# reshape input to be 3D [samples, timesteps, features]
x_train_bmkg = x_train_bmkg.reshape((x_train_bmkg.shape[0], 1, x_train_bmkg.shape[1]))
x_test_bmkg = x_test_bmkg.reshape((x_test_bmkg.shape[0], 1, x_test_bmkg.shape[1]))
print(x_train_bmkg.shape, x_test_bmkg.shape, y_train_bmkg.shape, y_test_bmkg.shape)


## dataset ceda
train_ceda, test_ceda = ceda_values[0:train_size,:], ceda_values[train_size:len(ceda_values),:]
# split into input and outputs
x_train_ceda, y_train_ceda = train_ceda[:, :-1], train_ceda[:, -1]
x_test_ceda, y_test_ceda = test_ceda[:, :-1], test_ceda[:, -1]

# reshape input to be 3D [samples, timesteps, features]
x_train_ceda = x_train_ceda.reshape((x_train_ceda.shape[0], 1, x_train_ceda.shape[1]))
x_test_ceda = x_test_ceda.reshape((x_test_ceda.shape[0], 1, x_test_ceda.shape[1]))
print(x_train_ceda.shape, x_test_ceda.shape, y_train_ceda.shape, y_test_ceda.shape)

(191, 1, 6) (48, 1, 6) (191,) (48,)
(191, 1, 11) (48, 1, 11) (191,) (48,)


### Pembuatan arsitektur NN : Hyperparameter GridSearchCV LTSM-RNN

In [9]:
# parameter
parameters = {'neurons' : [8, 16],
              'activation' : ['sigmoid', 'tanh', 'relu', 'selu', 'elu', 'softplus'],
              'optimizer' : ['adam', 'adamax', 'rmsprop', 'sgd'],
              'dropout_rate' : [0.1],
              'epochs' : [500, 1000, 1500, 2000],
              'batch_size' : [8, 16, 32, 64],
              'verbose' : [0]}
# parameters = {'neurons' : [16],
#               'activation' : ['softplus'],
#               'optimizer' : ['adam', 'adamax'],
#               'dropout_rate' : [0.1],
#               'epochs' : [1500],
#               'batch_size' : [16],
#               'verbose' : [0]}

In [10]:
keys = parameters.keys()
values = (parameters[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]
print(len(combinations), str('kombinasi hyperparameter'))

768 kombinasi hyperparameter


In [11]:
def build_classifier_bmkg(neurons='', activation='', optimizer='', dropout_rate=''):
    tf.keras.backend.clear_session()
    
    # design network
    grid_model = Sequential()
    grid_model.add(LSTM(units=neurons, activation=activation, input_shape=(x_train_bmkg.shape[1], x_train_bmkg.shape[2])))
    grid_model.add(Dropout(dropout_rate))
    grid_model.add(Dense(1))

    # model compile
    grid_model.compile(loss='mae', optimizer=optimizer)
    
    # return value
    return grid_model

def build_classifier_ceda(neurons='', activation='', optimizer='', dropout_rate=''):
    tf.keras.backend.clear_session()
    
    # design network
    grid_model = Sequential()
    grid_model.add(LSTM(units=neurons, activation=activation, input_shape=(x_train_ceda.shape[1], x_train_ceda.shape[2])))
    grid_model.add(Dropout(dropout_rate))
    grid_model.add(Dense(1))

    # model compile
    grid_model.compile(loss='mae', optimizer=optimizer)
    
    # return value
    return grid_model

In [12]:
grid_model = KerasRegressor(build_fn=build_classifier_bmkg)
# grid_model = KerasRegressor(build_fn=build_classifier_ceda)

  grid_model = KerasRegressor(build_fn=build_classifier_bmkg)


In [13]:
grid_search = GridSearchCV(estimator=grid_model, param_grid=parameters, n_jobs=-1, cv=2, scoring= 'neg_root_mean_squared_error')

In [14]:
grid_result = grid_search.fit(x_train_bmkg, y_train_bmkg)
# grid_result = grid_search.fit(x_train_ceda, y_train_ceda)

In [15]:
# summarize results
print("Best parameters: %f using %s\n" % (grid_search.best_score_, grid_search.best_params_))

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best parameters: -0.336187 using {'activation': 'relu', 'batch_size': 32, 'dropout_rate': 0.1, 'epochs': 1000, 'neurons': 8, 'optimizer': 'rmsprop', 'verbose': 0}

-0.364571 (0.042132) with: {'activation': 'sigmoid', 'batch_size': 8, 'dropout_rate': 0.1, 'epochs': 500, 'neurons': 8, 'optimizer': 'adam', 'verbose': 0}
-0.367896 (0.032739) with: {'activation': 'sigmoid', 'batch_size': 8, 'dropout_rate': 0.1, 'epochs': 500, 'neurons': 8, 'optimizer': 'adamax', 'verbose': 0}
-0.363559 (0.034718) with: {'activation': 'sigmoid', 'batch_size': 8, 'dropout_rate': 0.1, 'epochs': 500, 'neurons': 8, 'optimizer': 'rmsprop', 'verbose': 0}
-0.390199 (0.039517) with: {'activation': 'sigmoid', 'batch_size': 8, 'dropout_rate': 0.1, 'epochs': 500, 'neurons': 8, 'optimizer': 'sgd', 'verbose': 0}
-0.369742 (0.041973) with: {'activation': 'sigmoid', 'batch_size': 8, 'dropout_rate': 0.1, 'epochs': 500, 'neurons': 16, 'optimizer': 'adam', 'verbose': 0}
-0.364444 (0.035761) with: {'activation': 'sigmoid', 'ba

In [16]:
result = pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["score"])],axis=1)

In [17]:
result.sort_values(by="score", ascending=False)

Unnamed: 0,activation,batch_size,dropout_rate,epochs,neurons,optimizer,verbose,score
330,relu,32,0.1,1000,8,rmsprop,0,-0.336187
368,relu,64,0.1,1500,8,adam,0,-0.337997
434,selu,16,0.1,1500,8,rmsprop,0,-0.339102
269,relu,8,0.1,1000,16,adamax,0,-0.340585
399,selu,8,0.1,1000,16,sgd,0,-0.341890
...,...,...,...,...,...,...,...,...
536,elu,8,0.1,2000,8,adam,0,-0.506142
572,elu,16,0.1,2000,16,adam,0,-0.535365
346,relu,32,0.1,2000,8,rmsprop,0,-0.537755
302,relu,16,0.1,1000,16,rmsprop,0,-0.555107


In [18]:
result.to_excel('D:/UIN/BigData/UAS/hasil/lstm_result/gridsearch_lstm_bmkg.xlsx',index=False)
# result.to_excel('D:/UIN/BigData/UAS/hasil/lstm_result/gridsearch_lstm_ceda.xlsx',index=False)

### Evaluasi

In [19]:
# Set akhir waktu komputasi 
end = time.time()
# Proses menghitung waktu komputasi
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
# Hasil waktu komputasi
print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

01:34:18.01
