In [None]:
def loguniform(low=0, high=1):
    """Draw one random float between ``low`` and ``high``, skewed towards ``low``.

    A uniform sample ``u`` from [0, 1) is exponentiated, which puts it in
    [1, e), and that interval is then mapped *linearly* onto [low, high).
    Note that this is not a draw whose logarithm is uniform on
    [log(low), log(high)]; it is a rescaled ``exp(u)``.

    Consumes exactly one value from NumPy's global random state.
    """
    exp_lo, exp_hi = np.exp(0), np.exp(1)
    draw = np.exp(np.random.uniform(0, 1, None))
    return (draw - exp_lo) * (high - low) / (exp_hi - exp_lo) + low

def loguniform_int(low=0, high=1):
    """Integer variant of the exp-rescaled random draw.

    A uniform sample from [0, 1) is exponentiated, mapped linearly from
    [1, e) onto [low, high) and finally truncated with ``int()``, so the
    fractional part is dropped rather than rounded.

    Consumes exactly one value from NumPy's global random state.
    """
    exp_lo, exp_hi = np.exp(0), np.exp(1)
    draw = np.exp(np.random.uniform(0, 1, None))
    rescaled = (draw - exp_lo) * (high - low) / (exp_hi - exp_lo) + low
    return int(rescaled)

def uniform(low=0, high=1):
    """Return a single draw from ``np.random.uniform(low, high)``.

    Thin wrapper that uses NumPy's global random state and always asks for
    a scalar (``size=None``).
    """
    return np.random.uniform(low=low, high=high, size=None)

In [None]:
def generate_3d_data(data, max_len, n_meta_cols=8):
    """Turn a long-format event table into padded sequence tensors.

    Rows are grouped by ``'unique_id'``; within each group they are ordered by
    ``'complete_timestamp'`` (stable sort) and only the first ``max_len`` rows
    are kept. Shorter sequences are padded with zeros at the *front*, which is
    the same layout Keras ``pad_sequences`` produces with its defaults.

    Args:
        data: DataFrame containing the columns ``'unique_id'``,
            ``'complete_timestamp'`` and ``'Releasetreue'``. The first
            ``n_meta_cols`` columns (by position) are skipped; all remaining
            columns are used as features and must be convertible to float32.
        max_len: Number of time steps per case in the output tensor.
        n_meta_cols: How many leading columns are not features. Defaults to
            8, the value that was previously hard-coded.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a float32 array of shape
        ``(n_cases, max_len, n_features)`` and ``y`` is a float32 array of
        shape ``(n_cases, 2)`` holding a one-hot encoding of the label found
        in the first row of each case. Cases appear in the order in which
        ``groupby('unique_id')`` iterates them.
    """
    # A single stable sort suffices: groupby keeps the row order inside
    # every group, so no per-group re-sorting is needed.
    ordered = data.sort_values('complete_timestamp', ascending=True, kind="mergesort")
    grouped = ordered.groupby('unique_id')

    data_dim = data.shape[1] - n_meta_cols
    n_cases = grouped.ngroups

    X = np.zeros((n_cases, max_len, data_dim), dtype=np.float32)
    y = np.zeros((n_cases, 2), dtype=np.float32)

    for idx, (_, group) in enumerate(grouped):
        # Truncate to max_len (not to a fixed 30) so longer windows work too.
        steps = group.iloc[:max_len, n_meta_cols:].to_numpy(dtype=np.float32)
        if len(steps):
            # X is zero-initialised, so writing at the tail pre-pads with zeros.
            X[idx, max_len - len(steps):] = steps
        # int() keeps float or boolean labels from being misread as an index/mask.
        label = int(group['Releasetreue'].iloc[0])
        y[idx, label] = 1
    return (X, y)

In [None]:
class AUCHistory(keras.callbacks.Callback):
    """Keras callback that scores the model on a fixed data set after each epoch.

    After every epoch the model predicts on ``X_val`` and three numbers are
    appended to the history lists:

    * ``aucs``      - ROC AUC of output column 0
    * ``aucs_pr_0`` - area under the precision-recall curve of column 0
    * ``aucs_pr_1`` - area under the precision-recall curve of column 1

    ``aucs_pr`` is kept for backward compatibility; it is reset together with
    the other lists but nothing is ever appended to it.

    Args:
        X_val: Inputs passed to ``model.predict``.
        y_val: Targets; columns 0 and 1 are compared against the matching
            prediction columns, so it needs at least two columns.
    """

    def __init__(self, X_val, y_val):
        # Let the Keras base class set up its own bookkeeping attributes.
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self._reset_history()

    def _reset_history(self):
        """Start all metric histories from empty lists."""
        self.aucs = []
        self.aucs_pr = []
        self.aucs_pr_0 = []
        self.aucs_pr_1 = []

    def on_train_begin(self, logs=None):
        """Clear the histories so a reused callback does not mix several fits."""
        self._reset_history()

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored data and record ROC AUC and PR AUC values."""
        y_pred = self.model.predict(self.X_val)
        self.aucs.append(roc_auc_score(self.y_val[:, 0], y_pred[:, 0]))

        precision, recall, _ = precision_recall_curve(self.y_val[:, 0], y_pred[:, 0])
        self.aucs_pr_0.append(auc(recall, precision))

        precision, recall, _ = precision_recall_curve(self.y_val[:, 1], y_pred[:, 1])
        self.aucs_pr_1.append(auc(recall, precision))

    # on_train_end, on_epoch_begin, on_batch_begin and on_batch_end are not
    # overridden: the base Callback already provides no-op versions of them.

In [None]:
# Hyperparameter tuning: random search over LSTM size, depth, dropout, batch
# size, optimizer and learning rate. Each configuration is trained on the
# training share of the 'EC batch' groups and scored on the held-out share.
best_auc = 0
best_param = {}
all_auc = []
all_param = []
best_aucpr = 0
all_aucpr = []
best_parampr = {}
time = []
confusion = []

activation = "sigmoid"
nb_epoch = 10
start = datetime.now()
max_len = 30
n_layers_values = [1, 2, 3]
batch_size_values = [8, 16, 32, 64]
optimizer_values = ["rmsprop", "nadam"]

for i in range(16):
    for k in range(5):

        print('run ' + str(i) + '.' + str(k) + ' started at ' + str(datetime.now()))
        # The seed depends on i only, so every k repetition of a run reuses
        # the same split and the same sampled hyperparameters.
        np.random.seed(i)
        train_lstm_split = train_lstm_ready.reindex(np.random.permutation(train_lstm_ready.index))
        # Hold out the last val_ratio share of the shuffled batch ids; whole
        # batches go to either train or validation, never both.
        shuffled_batches = list(train_lstm_split['EC batch'].unique())
        val_ids = shuffled_batches[-int(val_ratio*len(shuffled_batches)):]
        val_df_pre = train_lstm_ready[train_lstm_ready['EC batch'].isin(val_ids)]
        train_df_pre = train_lstm_ready[~train_lstm_ready['EC batch'].isin(val_ids)]

        del train_lstm_split

        X_train, y_train = generate_3d_data(train_df_pre, max_len)
        X_val, y_val = generate_3d_data(val_df_pre, max_len)
        # Use the feature width of the tensor that is actually fed to the
        # network, so the Input layer cannot disagree with generate_3d_data.
        data_dim = X_train.shape[2]

        del val_df_pre
        del train_df_pre

        # Sampling order matters for reproducibility: do not reorder these.
        lstmsize = loguniform_int(10, 150)
        dropout = uniform(0, 0.3)
        n_layers = n_layers_values[np.random.randint(0, len(n_layers_values))]
        batch_size = batch_size_values[np.random.randint(0, len(batch_size_values))]
        optimizer = optimizer_values[np.random.randint(0, len(optimizer_values))]
        learning_rate = loguniform(low=0.000001, high=0.0001)

        # Stack n_layers LSTM layers. All but the last return full sequences
        # and are followed by BatchNormalization(axis=1); the last returns
        # only its final state and uses the default normalisation axis.
        main_input = Input(shape=(max_len, data_dim), name='main_input')
        hidden = main_input
        for depth in range(n_layers):
            is_top = depth == n_layers - 1
            first_layer_kwargs = {'input_shape': (max_len, data_dim)} if depth == 0 else {}
            hidden = LSTM(lstmsize, implementation=2, kernel_initializer='glorot_uniform',
                          return_sequences=not is_top, dropout=dropout, **first_layer_kwargs)(hidden)
            hidden = (BatchNormalization() if is_top else BatchNormalization(axis=1))(hidden)

        outcome_output = Dense(2, activation=activation, kernel_initializer='glorot_uniform', name='outcome_output')(hidden)

        model = Model(inputs=[main_input], outputs=[outcome_output])

        if optimizer == "nadam":
            opt = Nadam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)
        elif optimizer == "rmsprop":
            opt = RMSprop(learning_rate=learning_rate, rho=0.9, epsilon=1e-08, decay=0.0)

        model.compile(loss={'outcome_output':'binary_crossentropy'}, optimizer=opt, metrics=[tf.keras.metrics.AUC(curve='PR')])

        auc_cb = AUCHistory(X_val, y_val)
        history = model.fit({'main_input': X_train}, {'outcome_output':y_train}, validation_data=(X_val, y_val),
                            verbose=2, callbacks=[auc_cb], batch_size=batch_size, epochs=nb_epoch)

        # Score the configuration by its last-epoch validation metrics.
        pr_auc = auc_cb.aucs_pr_1[-1]
        lr_auc = auc_cb.aucs[-1]

        # Stored as a dict (not a set) so each value stays attached to its
        # name and equal values, e.g. lstmsize == batch_size, are not merged.
        run_params = {
            'run': str(i) + '.' + str(k),
            'lstmsize': lstmsize,
            'dropout': dropout,
            'n_layers': n_layers,
            'batch_size': batch_size,
            'optimizer': optimizer,
            'learning_rate': learning_rate,
        }

        if lr_auc > best_auc:
            best_auc = lr_auc
            best_param = dict(run_params)

        if pr_auc > best_aucpr:
            best_aucpr = pr_auc
            best_parampr = dict(run_params)

        all_auc.append(lr_auc)
        all_aucpr.append(pr_auc)
        all_param.append(['lstmsize= ' +str(lstmsize),'dropout= ' +str(dropout),'n_layers= ' +str(n_layers),'batch_size= ' +str(batch_size),
                          'optimizer= ' +str(optimizer),'learning_rate= ' +str(learning_rate)])

        print('run ' + str(i) + ' ended at ' + str(datetime.now()))

print('cv ended at ' + str(datetime.now()))

end = datetime.now()
time.append(end-start)
 

In [None]:
# Final test runs: re-create the configuration sampled for seed 13, train it
# for more epochs and collect ROC / precision-recall curve data on the test set.

best_auc = 0
best_param = {}
all_auc = []
all_param = []
best_aucpr = 0
all_aucpr = []
best_parampr = {}
time = []
confusion = []

precision_list = []
recall_list = []
thres_pr_list = []
fpr_list = []
tpr_list = []
thres_roc_list = []

activation = "sigmoid"
nb_epoch = 50
start = datetime.now()
max_len = 30
n_layers_values = [1, 2, 3]
batch_size_values = [8, 16, 32, 64]
optimizer_values = ["rmsprop", "nadam"]

X_test, y_test = generate_3d_data(test_lstm_ready, max_len)

for i in range(13, 14):
    for k in range(1):

        print('run ' + str(i) + '.' + str(k) + ' started at ' + str(datetime.now()))
        # Seeding with i reproduces the split and the hyperparameters that
        # the search drew for this run index.
        np.random.seed(i)
        train_lstm_split = train_lstm_ready.reindex(np.random.permutation(train_lstm_ready.index))
        shuffled_batches = list(train_lstm_split['EC batch'].unique())
        val_ids = shuffled_batches[-int(val_ratio*len(shuffled_batches)):]
        val_df_pre = train_lstm_ready[train_lstm_ready['EC batch'].isin(val_ids)]
        train_df_pre = train_lstm_ready[~train_lstm_ready['EC batch'].isin(val_ids)]

        # Carve a calibration share (cal_ratio of the batch ids) out of the
        # validation rows; the remainder stays as the validation set.
        val_split = val_df_pre.reindex(np.random.permutation(val_df_pre.index))
        shuffled_val_batches = list(val_split['EC batch'].unique())
        cal_ids = shuffled_val_batches[-int(cal_ratio*len(shuffled_val_batches)):]
        cal_df_pre = val_df_pre[val_df_pre['EC batch'].isin(cal_ids)]
        val_df_pre = val_df_pre[~val_df_pre['EC batch'].isin(cal_ids)]

        del train_lstm_split, val_split

        X_train, y_train = generate_3d_data(train_df_pre, max_len)
        X_val, y_val = generate_3d_data(val_df_pre, max_len)
        X_cal, y_cal = generate_3d_data(cal_df_pre, max_len)
        # Use the feature width of the tensor that is actually fed to the
        # network, so the Input layer cannot disagree with generate_3d_data.
        data_dim = X_train.shape[2]

        del val_df_pre
        del train_df_pre
        del cal_df_pre

        # Sampling order matters for reproducibility: do not reorder these.
        lstmsize = loguniform_int(10, 150)
        dropout = uniform(0, 0.3)
        n_layers = n_layers_values[np.random.randint(0, len(n_layers_values))]
        batch_size = batch_size_values[np.random.randint(0, len(batch_size_values))]
        optimizer = optimizer_values[np.random.randint(0, len(optimizer_values))]
        learning_rate = loguniform(low=0.000001, high=0.0001)

        # Reseed after sampling so NumPy's state differs between repetitions.
        np.random.seed(i+k)

        # Stack n_layers LSTM layers. All but the last return full sequences
        # and are followed by BatchNormalization(axis=1); the last returns
        # only its final state and uses the default normalisation axis.
        main_input = Input(shape=(max_len, data_dim), name='main_input')
        hidden = main_input
        for depth in range(n_layers):
            is_top = depth == n_layers - 1
            first_layer_kwargs = {'input_shape': (max_len, data_dim)} if depth == 0 else {}
            hidden = LSTM(lstmsize, implementation=2, kernel_initializer='glorot_uniform',
                          return_sequences=not is_top, dropout=dropout, **first_layer_kwargs)(hidden)
            hidden = (BatchNormalization() if is_top else BatchNormalization(axis=1))(hidden)

        outcome_output = Dense(2, activation=activation, kernel_initializer='glorot_uniform', name='outcome_output')(hidden)

        model = Model(inputs=[main_input], outputs=[outcome_output])

        if optimizer == "nadam":
            opt = Nadam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)
        elif optimizer == "rmsprop":
            opt = RMSprop(learning_rate=learning_rate, rho=0.9, epsilon=1e-08, decay=0.0)

        model.compile(loss={'outcome_output':'binary_crossentropy'}, optimizer=opt, metrics=[tf.keras.metrics.AUC(curve='PR')])

        auc_cb = AUCHistory(X_val, y_val)
        history = model.fit({'main_input': X_train}, {'outcome_output':y_train}, validation_data=(X_val, y_val),
                            verbose=2, callbacks=[auc_cb], batch_size=batch_size, epochs=nb_epoch)

        y_pred = model.predict(X_test)

        # Data to plot the precision-recall curve (output column 1)
        precision, recall, thres_pr = precision_recall_curve(y_test[:,1], y_pred[:,1])
        precision_list.append(precision)
        recall_list.append(recall)
        thres_pr_list.append(thres_pr)

        # Data to plot the ROC curve (output column 1)
        fpr, tpr, thres_roc = roc_curve(y_test[:,1], y_pred[:,1])
        fpr_list.append(fpr)
        tpr_list.append(tpr)
        thres_roc_list.append(thres_roc)

        # Area under the precision-recall curve
        all_aucpr.append(auc(recall, precision))

        # Area under the ROC curve
        all_auc.append(roc_auc_score(y_test[:,1], y_pred[:,1]))

        print('run ' + str(i) + ' ended at ' + str(datetime.now()))

print('cv ended at ' + str(datetime.now()))

end = datetime.now()
time.append(end-start)

np.set_printoptions(threshold=100000)
# Fixed slot order: ROC data in 0-3, precision-recall data in 4-7.
curve_set = [
    tpr_list,
    fpr_list,
    thres_roc_list,
    all_auc,
    precision_list,
    recall_list,
    thres_pr_list,
    all_aucpr,
]


 

In [None]:
# Testing with buckets: train and evaluate separately for each combination of
# the 'is_commercial' and 'is_quality' flags, writing one curve file and one
# timing file per bucket.

best_auc = 0
best_param = {}
all_auc = []
all_param = []
best_aucpr = 0
all_aucpr = []
best_parampr = {}
time = []
confusion = []

precision_list = []
recall_list = []
thres_pr_list = []
fpr_list = []
tpr_list = []
thres_roc_list = []

activation = "sigmoid"
nb_epoch = 20
start = datetime.now()
max_len = 30
n_layers_values = [1, 2, 3]
batch_size_values = [8, 16, 32, 64]
optimizer_values = ["rmsprop", "nadam"]

for n in range(2):
    for m in range(2):

        # Start every bucket with empty result lists. The AUC lists are reset
        # together with the curve lists so each exported file only contains
        # the scores that belong to its own bucket.
        precision_list = []
        recall_list = []
        thres_pr_list = []
        fpr_list = []
        tpr_list = []
        thres_roc_list = []
        all_auc = []
        all_aucpr = []

        train_df_bucket = train_lstm_ready[((train_lstm_ready['is_commercial']==n) & (train_lstm_ready['is_quality']==m))]
        test_df_bucket = test_lstm_ready[((test_lstm_ready['is_commercial']==n) & (test_lstm_ready['is_quality']==m))]

        X_test, y_test = generate_3d_data(test_df_bucket, max_len)

        for i in range(13, 14):
            for k in range(20):

                print('run ' + str(i) + '.' + str(k) + ' started at ' + str(datetime.now()))
                # Seeding with i gives every repetition the same split and
                # the same sampled hyperparameters.
                np.random.seed(i)
                train_lstm_split = train_df_bucket.reindex(np.random.permutation(train_df_bucket.index))
                shuffled_batches = list(train_lstm_split['EC batch'].unique())
                val_ids = shuffled_batches[-int(val_ratio*len(shuffled_batches)):]
                val_df_pre = train_df_bucket[train_df_bucket['EC batch'].isin(val_ids)]
                train_df_pre = train_df_bucket[~train_df_bucket['EC batch'].isin(val_ids)]

                del train_lstm_split

                X_train, y_train = generate_3d_data(train_df_pre, max_len)
                X_val, y_val = generate_3d_data(val_df_pre, max_len)
                # Use the feature width of the tensor that is actually fed to
                # the network, so the Input layer cannot disagree with
                # generate_3d_data.
                data_dim = X_train.shape[2]

                del val_df_pre
                del train_df_pre

                # Sampling order matters for reproducibility: do not reorder.
                lstmsize = loguniform_int(10, 150)
                dropout = uniform(0, 0.3)
                n_layers = n_layers_values[np.random.randint(0, len(n_layers_values))]
                batch_size = batch_size_values[np.random.randint(0, len(batch_size_values))]
                optimizer = optimizer_values[np.random.randint(0, len(optimizer_values))]
                learning_rate = loguniform(low=0.000001, high=0.0001)

                # Reseed after sampling so NumPy's state differs per repetition.
                np.random.seed(i+k)

                # Stack n_layers LSTM layers. All but the last return full
                # sequences and are followed by BatchNormalization(axis=1);
                # the last returns only its final state and uses the default
                # normalisation axis.
                main_input = Input(shape=(max_len, data_dim), name='main_input')
                hidden = main_input
                for depth in range(n_layers):
                    is_top = depth == n_layers - 1
                    first_layer_kwargs = {'input_shape': (max_len, data_dim)} if depth == 0 else {}
                    hidden = LSTM(lstmsize, implementation=2, kernel_initializer='glorot_uniform',
                                  return_sequences=not is_top, dropout=dropout, **first_layer_kwargs)(hidden)
                    hidden = (BatchNormalization() if is_top else BatchNormalization(axis=1))(hidden)

                outcome_output = Dense(2, activation=activation, kernel_initializer='glorot_uniform', name='outcome_output')(hidden)

                model = Model(inputs=[main_input], outputs=[outcome_output])

                if optimizer == "nadam":
                    opt = Nadam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004, clipvalue=3)
                elif optimizer == "rmsprop":
                    opt = RMSprop(learning_rate=learning_rate, rho=0.9, epsilon=1e-08, decay=0.0)

                model.compile(loss={'outcome_output':'binary_crossentropy'}, optimizer=opt, metrics=[tf.keras.metrics.AUC(curve='PR')])

                auc_cb = AUCHistory(X_val, y_val)
                history = model.fit({'main_input': X_train}, {'outcome_output':y_train}, validation_data=(X_val, y_val),
                                    verbose=2, callbacks=[auc_cb], batch_size=batch_size, epochs=nb_epoch)

                y_pred = model.predict(X_test)

                # Data to plot the precision-recall curve (output column 1)
                precision, recall, thres_pr = precision_recall_curve(y_test[:,1], y_pred[:,1])
                precision_list.append(precision)
                recall_list.append(recall)
                thres_pr_list.append(thres_pr)

                # Data to plot the ROC curve (output column 1)
                fpr, tpr, thres_roc = roc_curve(y_test[:,1], y_pred[:,1])
                fpr_list.append(fpr)
                tpr_list.append(tpr)
                thres_roc_list.append(thres_roc)

                # Area under the precision-recall curve
                all_aucpr.append(auc(recall, precision))

                # Area under the ROC curve
                all_auc.append(roc_auc_score(y_test[:,1], y_pred[:,1]))

                print('run ' + str(i) + ' ended at ' + str(datetime.now()))

        print('cv ended at ' + str(datetime.now()))

        # NOTE(review): `start` is set once before the bucket loop and `time`
        # is never cleared, so the durations are cumulative and two entries
        # are added per bucket. Kept as is; confirm whether that is intended.
        end = datetime.now()
        time.append(end-start)

        np.set_printoptions(threshold=100000)
        # Fixed slot order: ROC data in 0-3, precision-recall data in 4-7.
        curve_set = [
            tpr_list,
            fpr_list,
            thres_roc_list,
            all_auc,
            precision_list,
            recall_list,
            thres_pr_list,
            all_aucpr,
        ]

        fileexport_curve = 'Data/curve_lstm_' + str(n) + '_' + str(m) + '.csv'
        pd.DataFrame(curve_set).to_csv(fileexport_curve, index=False, header=False)

        end = datetime.now()
        time.append(end-start)

        fileexport_time = 'Data/time_lstm_' + str(n) + '_' + str(m) + '.csv'
        pd.DataFrame(time).to_csv(fileexport_time, index=False, header=False)

        del train_df_bucket, test_df_bucket


 