In [224]:
import os

import numpy as np
from numpy import genfromtxt # generate an array fron a text file

import matplotlib.pyplot as plt
%matplotlib inline

# train_test_split splits the features and the labels into a training set and a test set
#  it also shuffles the rows randomly, so we do not have to worry about the labels happening to be in sorted order
# -> this will automatically shuffle them for us
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report

from keras.models import Sequential
from keras.layers import Dense

# The CSV is comma separated and its first row holds the column names.
# First pass: names=True makes genfromtxt take the field names from that header
# row; only the names are kept, the parsed values are discarded.
NAMES = genfromtxt('./data/labels_features.csv', delimiter=",", names=True).dtype.names

# Second pass: read everything as a plain 2-D array. Without names=True the
# header row is parsed as an ordinary row, so it is sliced off right away.
data = genfromtxt('./data/labels_features.csv', delimiter=",")[1:, :]

In [225]:
# Fixed column layout of `data`:
#   column 0   -> ids
#   column 1   -> class label telling real / fake
#   column 2.. -> the features only (no id, no class)
# The indices are hard-coded; deriving them from the number of columns would be
# more universal, but it does not matter here.
IDS, LABELS, FEATURES = data[:, 0], data[:, 1], data[:, 2:]

In [226]:
# fraction of the samples held out for testing (33%)
test_size = 0.33
# number of feature columns per sample; reused later as the network's input dimension
dim = len(FEATURES[0])

x, y = FEATURES, LABELS

# x = features, y = labels. random_state is the seed of the shuffle, so the
# very same split is produced on every run.
# why 42? => https://news.mit.edu/2019/answer-life-universe-and-everything-sum-three-cubes-mathematics-0910
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=42,
)

In [227]:
# Min-max scaling forces all feature values into a common range, which can
# help the neural network perform better.
#
# fit() learns the per-feature min and max; transform() then rescales an array
# using exactly those learned values. The scaler is fitted on X_train ONLY:
# letting it peek at the test data would be data leakage (essentially
# cheating). So we fit on the training data but transform both sets.
scaler_object = MinMaxScaler().fit(X_train)

scaled_X_train, scaled_X_test = (
    scaler_object.transform(X_train),
    scaler_object.transform(X_test),
)

In [228]:
# Build the network as a plain stack of fully connected layers.
model = Sequential([
    # Input layer: 4 neurons with ReLU activation. input_dim tells Keras how
    # many features each sample has (`dim`, taken from the feature matrix).
    Dense(4, input_dim=dim, activation='relu'),
    # Hidden layer: no input_dim needed because it is not the input layer.
    # The neuron count is something to play around with - too large or too
    # small gives bad results; 1x or 2x the input dimension is a usual start.
    Dense(8, activation='relu'),
    # Output layer: a single neuron; sigmoid squashes its output between 0 and
    # 1 so it can be read as the result for the 0 / 1 class.
    Dense(1, activation='sigmoid'),
])

In [229]:
# Configure training: Adam optimizer, binary cross-entropy loss (two classes,
# one sigmoid output) and accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [230]:
# Train on the scaled training features for 300 epochs; verbose=2 prints one
# summary line per epoch.
model.fit(
    x=scaled_X_train,
    y=y_train,
    epochs=300,
    verbose=2,
)

Epoch 1/300
 - 0s - loss: 0.6999 - acc: 0.4253
Epoch 2/300
 - 0s - loss: 0.6917 - acc: 0.4943
Epoch 3/300
 - 0s - loss: 0.6855 - acc: 0.5747
Epoch 4/300
 - 0s - loss: 0.6791 - acc: 0.6437
Epoch 5/300
 - 0s - loss: 0.6738 - acc: 0.6897
Epoch 6/300
 - 0s - loss: 0.6701 - acc: 0.6897
Epoch 7/300
 - 0s - loss: 0.6646 - acc: 0.7011
Epoch 8/300
 - 0s - loss: 0.6587 - acc: 0.7126
Epoch 9/300
 - 0s - loss: 0.6547 - acc: 0.7126
Epoch 10/300
 - 0s - loss: 0.6502 - acc: 0.7126
Epoch 11/300
 - 0s - loss: 0.6458 - acc: 0.7126
Epoch 12/300
 - 0s - loss: 0.6401 - acc: 0.7126
Epoch 13/300
 - 0s - loss: 0.6375 - acc: 0.7126
Epoch 14/300
 - 0s - loss: 0.6325 - acc: 0.7126
Epoch 15/300
 - 0s - loss: 0.6298 - acc: 0.7126
Epoch 16/300
 - 0s - loss: 0.6260 - acc: 0.7126
Epoch 17/300
 - 0s - loss: 0.6245 - acc: 0.7126
Epoch 18/300
 - 0s - loss: 0.6221 - acc: 0.7126
Epoch 19/300
 - 0s - loss: 0.6202 - acc: 0.7126
Epoch 20/300
 - 0s - loss: 0.6190 - acc: 0.7126
Epoch 21/300
 - 0s - loss: 0.6191 - acc: 0.7126
E

<keras.callbacks.History at 0x2b23021dd68>

In [231]:
# model.predict spits out probabilities (the sigmoid output) by default.
# Thresholding them at 0.5 gives the hard 0 / 1 class per test sample.
# This replaces Sequential.predict_classes, which is deprecated and no longer
# available in newer Keras / TensorFlow releases; for a single sigmoid output
# the thresholding below is its documented equivalent.
(model.predict(scaled_X_test) > 0.5).astype("int32")

array([[0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0]])

In [232]:
# labels of the values the compiled model reports: the loss first, then the
# metrics requested in compile() (accuracy)
model.metrics_names

['loss', 'acc']

In [233]:
# Hard 0 / 1 predictions for the test set: threshold the predicted
# probabilities at 0.5. This is the documented equivalent of the deprecated
# Sequential.predict_classes for a model with a single sigmoid output.
predictions = (model.predict(scaled_X_test) > 0.5).astype("int32")

# we have the answers because we have the y_test vector
# scikit-learn puts the TRUE class on the rows and the PREDICTED class on the
# columns, so for the labels 0 / 1 the matrix reads:
# [True Negative,  False Positive]
# [False Negative, True Positive]
confusion_matrix(y_test, predictions)

array([[ 4,  8],
       [10, 22]], dtype=int64)

In [234]:
# Print the per-class precision, recall, f1-score and support for the test set.
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

        0.0       0.29      0.33      0.31        12
        1.0       0.73      0.69      0.71        32

avg / total       0.61      0.59      0.60        44



In [235]:
# save the model
# Make sure the target folder exists first: depending on the Keras version the
# HDF5 writer does not create missing parent directories and the save would
# fail on a fresh checkout.
os.makedirs('./model', exist_ok=True)
model.save('./model/myTestModel.h5')

In [236]:
# load the model
# from keras.models import load_model
# newmodel = load_model('./model/myTestModel.h5')

# use the loaded model to predict classes
# newmodel.predict_classes(scaled_X_test)