In [1]:
import pandas as pd
import numpy as np

In [2]:
# load dataset
data = pd.read_csv("data.csv", header=None)

In [3]:
# set columns
columns = ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15", "label"]
data.columns = columns

In [4]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,label
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [5]:
# replace ? values with NaN to be handled properly
data = data.replace("?", np.nan)

In [6]:
# check missing values
data.isna().sum()

A1       12
A2       12
A3        0
A4        6
A5        6
A6        9
A7        9
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14      13
A15       0
label     0
dtype: int64

In [7]:
# drop missing values
data.dropna(inplace=True)
# data.fillna(method='bfill', inplace=True)
# check missing values
data.isna().sum()

A1       0
A2       0
A3       0
A4       0
A5       0
A6       0
A7       0
A8       0
A9       0
A10      0
A11      0
A12      0
A13      0
A14      0
A15      0
label    0
dtype: int64

In [8]:
# encode categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data["A1"] = le.fit_transform(data["A1"])
data["A4"] = le.fit_transform(data["A4"])
data["A5"] = le.fit_transform(data["A5"])
data["A6"] = le.fit_transform(data["A6"])
data["A7"] = le.fit_transform(data["A7"])
data["A9"] = le.fit_transform(data["A9"])
data["A10"] = le.fit_transform(data["A10"])
data["A12"] = le.fit_transform(data["A12"])
data["A13"] = le.fit_transform(data["A13"])
data["label"] = le.fit_transform(data["label"])

In [9]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,label
0,1,30.83,0.0,1,0,12,7,1.25,1,1,1,0,0,202,0,0
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,43,560,0
2,0,24.5,0.5,1,0,10,3,1.5,1,0,0,0,0,280,824,0
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,100,3,0
4,1,20.17,5.625,1,0,12,7,1.71,1,0,0,0,2,120,0,0


In [19]:
# # heat map
# import seaborn as sns
# import matplotlib as plt
# from matplotlib import pyplot
# corr = data.corr()
# fig, ax = pyplot.subplots(figsize=(20,10)) 
# sns.heatmap(corr, 
#             annot=True,
#             xticklabels=corr.columns.values,
#             yticklabels=corr.columns.values)

In [20]:
 # create X and y sets
features = ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15"]
X = data[["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15"]]
y = data["label"]
X.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,1,30.83,0.0,1,0,12,7,1.25,1,1,1,0,0,202,0
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,43,560
2,0,24.5,0.5,1,0,10,3,1.5,1,0,0,0,0,280,824
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,100,3
4,1,20.17,5.625,1,0,12,7,1.71,1,0,0,0,2,120,0


In [21]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

In [22]:
# standadizing data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)   

In [23]:
# Merge inputs and targets
inputs = np.concatenate((X_train, X_test), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

In [24]:
# model = Sequential()
# model.add(Dense(15, activation='relu', input_shape=(15,)))
# model.add(Dense(8, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))

In [25]:
from sklearn.model_selection import KFold
folds = 5
f_score_per_fold = []
kfold = KFold(n_splits=folds, shuffle=True)

In [17]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import f1_score
batch_size = 50
no_epochs = 25
verbosity = 2
fold_no = 1
for train, test in kfold.split(inputs, targets):    
    # Define the model architecture
    model = Sequential()
    model.add(Dense(15, activation='relu', input_shape=(15,)))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(loss='binary_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])


    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Fit data to model
    history = model.fit(inputs[train], targets[train],
                  batch_size=batch_size,
                  epochs=no_epochs,
                  verbose=verbosity)

    pred = model.predict(inputs[test])
    y_pred = []
    for i in pred:
        if(i[0] > 0.5):
            y_pred.append(1)
        else:
            y_pred.append(0)
    f_val = f1_score(targets[test], y_pred, average='macro')
    f_score_per_fold.append(f_val)
    print("F score for fold: "+str(fold_no)+" is: "+str(f_val))

    # Increase fold number
    fold_no = fold_no + 1


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
W0725 22:46:55.012428 139882462508864 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0725 22:46:55.029175 139882462508864 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/25
 - 0s - loss: 0.6926 - acc: 0.5594
Epoch 2/25
 - 0s - loss: 0.6692 - acc: 0.5766
Epoch 3/25
 - 0s - loss: 0.6512 - acc: 0.5881
Epoch 4/25
 - 0s - loss: 0.6367 - acc: 0.6111
Epoch 5/25
 - 0s - loss: 0.6240 - acc: 0.6322
Epoch 6/25
 - 0s - loss: 0.6131 - acc: 0.6360
Epoch 7/25
 - 0s - loss: 0.6029 - acc: 0.6686
Epoch 8/25
 - 0s - loss: 0.5937 - acc: 0.6782
Epoch 9/25
 - 0s - loss: 0.5842 - acc: 0.6897
Epoch 10/25
 - 0s - loss: 0.5753 - acc: 0.7031
Epoch 11/25
 - 0s - loss: 0.5665 - acc: 0.7203
Epoch 12/25
 - 0s - loss: 0.5581 - acc: 0.7299
Epoch 13/25
 - 0s - loss: 0.5501 - acc: 0.7414
Epoch 14/25
 - 0s - loss: 0.5422 - acc: 0.7490
Epoch 15/25
 - 0s - loss: 0.5343 - acc: 0.7529
Epoch 16/25
 - 0s - loss: 0.5265 - acc: 0.7625
Epoch 17/25
 - 0s - loss: 0.5189 - acc: 0.7682
Epoch 18/25
 - 0s - loss: 0.5114 - acc: 0.7720
Epoch 19/25
 - 0s - loss: 0.5039 - acc: 0.7778
Epoch 20/25
 - 0s -

In [18]:
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('F Score per fold')
for i in range(0, len(f_score_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - F Score: {f_score_per_fold[i]}')
print('------------------------------------------------------------------------')
print(f'> Avg F score: {np.mean(f_score_per_fold)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
F Score per fold
------------------------------------------------------------------------
> Fold 1 - F Score: 0.7504761904761905
------------------------------------------------------------------------
> Fold 2 - F Score: 0.6237617959087978
------------------------------------------------------------------------
> Fold 3 - F Score: 0.8098693759071118
------------------------------------------------------------------------
> Fold 4 - F Score: 0.7281045751633987
------------------------------------------------------------------------
> Fold 5 - F Score: 0.803494769937723
------------------------------------------------------------------------
> Avg F score: 0.7431413414786444
------------------------------------------------------------------------
