In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os

from data_util import *

from data_generator import DataGenerator

import model as model_util

from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
import tensorflow.keras as keras

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix

from numpy import array
import numpy as np



ModuleNotFoundError: No module named 'tensorflow'

In [117]:
def reset_seed():
    seed_value= 0
    # 1. Set `PYTHONHASHSEED` environment variable at a fixed value
    import os
    os.environ['PYTHONHASHSEED']=str(seed_value)
    # 2. Set `python` built-in pseudo-random generator at a fixed value
    import random
    random.seed(seed_value)
    # 3. Set `numpy` pseudo-random generator at a fixed value
    import numpy as np
    np.random.seed(seed_value)
    # 4. Set the `tensorflow` pseudo-random generator at a fixed value
    import tensorflow as tf
    tf.random.set_seed(seed_value)

reset_seed()

path = "./data/"
model_path = path + "stock.h5"

In [118]:
from imblearn.over_sampling import SMOTE

trainX, trainY = load_data("", "train", path)

print("{} {}".format(trainX.shape, trainY.shape))

x, y = create_dataset(trainX, trainY, time_steps=20, null_value = 0)

print("{} {}".format(x.shape, y.shape))

#oversample = SMOTE()
#X, y = oversample.fit_resample(trainX, trainY)


(363163, 7) (363163,)
(363143, 20, 7) (363143,)


In [119]:
trainX, trainY = load_data("", "train", path)
valX, valY = load_data("", "Val", path)

trainX, trainY = get_balanced_set(trainX, trainY)

print("{} {}".format(trainX.shape, valX.shape))

(639192, 7) (38036, 7)


In [120]:
neg, pos = np.bincount(trainY)
total = neg + pos
print("{} {} {}".format(neg, pos, total))

319596 319596 639192


In [121]:
features = trainX.shape[-1:]

In [122]:

normalizer = tf.keras.layers.experimental.preprocessing.Normalization()
normalizer.adapt(trainX)
tmp = normalizer(trainX)


In [123]:
trainX[2000]

array([ 8.48955492e-01, -2.70751320e-04,  9.99940892e-01,  4.40922436e-01,
       -1.21917050e+00,  4.00000000e+01,  1.60000000e+01])

In [126]:
#model.summary()
import tensorflow as tf


use_regular = True #@param {type:"boolean"}

checkpoint = tf.keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss',
                             verbose=1,
                             save_best_only=True,
                             load_weights_on_restart=True)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='prc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

callbacks_list = [early_stopping]

epochs = 1000

BATCH_SIZE = 64

if (use_regular):
    
    weight_for_0 = (1 / neg) * (total / 2.0)
    weight_for_1 = (1 / pos) * (total / 2.0)

    class_weight = {0: weight_for_0, 1: weight_for_1}
    
    history = model.fit(
        x=trainX,
        y=trainY,
        validation_data=(valX, valY),
        epochs=epochs,
        batch_size=BATCH_SIZE,
        verbose=1,
        shuffle = True,
        callbacks=callbacks_list
    )
else:
    model_1 = tfdf.keras.RandomForestModel(num_trees=30)

    # Optionally, add evaluation metrics.
    model_1.compile(metrics=["accuracy"])
    history = model.fit(
        x=trainX,
        y=trainY,
        validation_data=(valX, valY)
    )



Epoch 1/1000
  45/9988 [..............................] - ETA: 36:03 - loss: 0.6931 - accuracy: 0.4986 - precision: 0.5046 - recall: 0.5973 - auc: 0.5000 - prc: 0.5069

KeyboardInterrupt: 

In [None]:
def show_metric(history, metric): 
    plt.plot(history.history[metric])
    plt.plot(history.history["val_{}".format(metric)])
    plt.title(metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

#show_metric(history, "loss")
#show_metric(history, "precision")

In [None]:
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

def plot_metrics(history):
  metrics = ['loss', 'prc', 'precision', 'recall']
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(2,2,n+1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    if metric == 'loss':
      plt.ylim([0, plt.ylim()[1]])
    elif metric == 'auc':
      plt.ylim([0.8,1])
    else:
      plt.ylim([0,1])

    plt.legend()
    
plot_metrics(history)

In [None]:
def plot_cm(labels, predictions, p=0.5):
    cm = confusion_matrix(labels, predictions > p)
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    print('True Negatives: ', cm[0][0])
    print('True Positives: ', cm[1][1])

    print('False Positives: ', cm[0][1])
    print('False Negatives: ', cm[1][0])
    print('Total Fraudulent Transactions: ', np.sum(cm[1]))

In [None]:


baseline_results = model.evaluate(valX, valY, verbose=1)


for name, value in zip(model.metrics_names, baseline_results):
    print(name, ': ', value)



In [None]:
plot_cm(valY, model.predict(valX))

In [70]:
loss :  0.678965151309967
accuracy :  0.6153643727302551
precision :  0.1558239907026291
recall :  0.5212189555168152
auc :  0.6046226024627686
prc :  0.1751813441514969
True Negatives:  21097
True Positives:  2309
False Positives:  12509
False Negatives:  2121
Total Fraudulent Transactions:  4430

SyntaxError: invalid syntax (<ipython-input-70-961d9cb6fd18>, line 7)

In [115]:
#Model 

from tensorflow.keras.layers import BatchNormalization, GlobalMaxPool1D, Bidirectional, Dense, Flatten, Conv2D, LeakyReLU, Dropout, LSTM, GRU, Input
from tensorflow.keras import Model, Sequential
from tensorflow.keras import datasets, layers, models

import tensorflow as tf
#import tensorflow_addons as tfa
 
#Model 

dim = 50

def add_deep_layers(x, units = dim):
    #x = Dropout(0.2)(x)
    x = Dense(units, activation='relu')(x)
    #x = Dropout(0.5)(x)
    x = Dense(units, activation='relu')(x)
    return x

def get_model(layers, features):
    reset_seed()
    inputX = Input(shape=features)
    
    x = normalizer(inputX)
    
    for lay in range(int(layers)):
        x = add_deep_layers(x, dim)

    x = add_deep_layers(x, 20)
    
    x = Dense(10, activation='relu')(x)
    
    x = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inputX], outputs=x)
    
    return model


def add_deep_layers_seq(model, units = dim):
    model.add(Dense(units, activation='relu'))

def get_model_seq(layers, features):
    reset_seed()
    model = Sequential()
    model.add(Input(shape=features))
    #model.add(normalizer)
    
    for lay in range(int(layers)):
        add_deep_layers_seq(model, dim)
        
    model.add(Dense(1, activation='sigmoid'))
    
    return model

path = "./data/"


In [125]:


#model = load_model(filepath = "drive/My Drive/model/stock.h5")

#!rm "drive/My Drive/model/encoder.h5"

model = Sequential()
reset_seed()
model.add(Input(shape=features))
model.add(normalizer)
model.add(Dense(dim, activation='relu'))
model.add(Dense(dim, activation='relu'))
model.add(Dense(dim, activation='relu'))
model.add(Dense(dim, activation='relu'))
model.add(Dense(dim, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#model = get_model_seq(10, features)

model = get_model(20, features)


METRICS = [
      #keras.metrics.TruePositives(name='truePositives'),
      #keras.metrics.FalsePositives(name='falsePositives'),
      #keras.metrics.TrueNegatives(name='trueNegatives'),
      #keras.metrics.FalseNegatives(name='falseNegatives'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]


model.compile(
    #optimizer=ranger,
    optimizer=tf.keras.optimizers.Adadelta(learning_rate=0.0001, rho=0.95, epsilon=1e-07),
    #optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=METRICS
)

model.summary()



Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        [(None, 7)]               0         
_________________________________________________________________
normalization_6 (Normalizati (None, 7)                 15        
_________________________________________________________________
dense_256 (Dense)            (None, 500)               4000      
_________________________________________________________________
dense_257 (Dense)            (None, 500)               250500    
_________________________________________________________________
dense_258 (Dense)            (None, 500)               250500    
_________________________________________________________________
dense_259 (Dense)            (None, 500)               250500    
_________________________________________________________________
dense_260 (Dense)            (None, 500)               2505