In [1]:
from keras.models import Sequential
from keras.layers import Dense

import numpy as np
# fix random seed for reproducibility
np.random.seed(42)

import pandas as pd

Using TensorFlow backend.


In [2]:
# Directory that holds the datasetX_<n>.csv / datasetY_<n>.csv file pairs
datasetFolder = '~/dbpedia2016/dataset/'

# How many numbered file pairs exist in that directory
numberOfFiles = 256

# Fraction of the files held out for testing
testSplit = 0.25

In [3]:
def load_data(datasetFolder, datasetXFile, datasetYFile):
    """Load one feature/label CSV pair and drop the ``id`` column from each.

    Args:
        datasetFolder: Directory containing both files. A trailing path
            separator is optional.
        datasetXFile: File name of the feature CSV inside ``datasetFolder``.
        datasetYFile: File name of the label CSV inside ``datasetFolder``.

    Returns:
        Tuple ``(dataX, dataY)`` of DataFrames with their ``id`` columns removed.

    Raises:
        KeyError: If either file has no ``id`` column.
    """
    import os  # local import so this cell does not depend on the import cell

    print('Loading {} & {}'.format(datasetXFile, datasetYFile))

    frames = []
    for fileName in (datasetXFile, datasetYFile):
        # os.path.join copes with folders given with or without a trailing slash
        path = os.path.join(datasetFolder, fileName)
        frame = pd.read_csv(path, engine='c', na_filter=False)
        # the id field is a row identifier, not a feature/label column
        del frame['id']
        frames.append(frame)

    dataX, dataY = frames
    return dataX, dataY

In [4]:
# Load the first file pair; its column counts are used below to size the models
dataX, dataY = load_data(datasetFolder,'datasetX_1.csv', 'datasetY_1.csv')

Loading datasetX_1.csv & datasetY_1.csv


In [5]:
# Preview the first rows of the input frame
dataX.head()

Unnamed: 0,walk_1,walk_2,walk_3,walk_4,walk_5,walk_6,walk_7,walk_8,walk_9,walk_10,...,walk_5194,walk_5195,walk_5196,walk_5197,walk_5198,walk_5199,walk_5200,walk_5201,walk_5202,walk_5203
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Preview the first rows of the label frame
dataY.head()

Unnamed: 0,class_1,class_2,class_3,class_4,class_5,class_6,class_7,class_8,class_9,class_10,...,class_530,class_531,class_532,class_533,class_534,class_535,class_536,class_537,class_538,class_539
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Report the column counts that determine the models' input and output sizes
featureCount = dataX.shape[1]
classCount = dataY.shape[1]
print("Input Features: {}".format(featureCount))
print("Output Classes: {}".format(classCount))

Input Features: 5203
Output Classes: 539


In [8]:
# Three classifiers of increasing depth. Each takes one input per dataX column
# and ends in a sigmoid layer with one unit per dataY column.
logisticRegression = Sequential(
    [Dense(dataY.shape[1], input_dim=dataX.shape[1], activation='sigmoid')],
    name='Simple Logistic Regression',
)

simpleModel = Sequential(
    [
        Dense(1024, input_dim=dataX.shape[1], activation='relu'),
        Dense(dataY.shape[1], activation='sigmoid'),
    ],
    name='2 Fully Connected Layers',
)

deepModel = Sequential(
    [
        Dense(2048, input_dim=dataX.shape[1], activation='relu'),
        Dense(1024, activation='relu'),
        Dense(768, activation='relu'),
        Dense(512, activation='relu'),
        Dense(dataY.shape[1], activation='sigmoid'),
    ],
    name='Deep Model (5 Dense Layers)',
)

# Every model is compiled, trained and evaluated side by side
models = [logisticRegression, simpleModel, deepModel]

In [9]:
# Compile model
import keras.backend as K

def count_predictions(y_true, y_pred):
    """Return (true positives, predicted positives, possible positives).

    Each quantity is obtained by clipping to [0, 1], rounding, and summing
    over all elements with the Keras backend, so a prediction counts as
    positive once it rounds to 1.
    """
    def _rounded_total(tensor):
        # clip into [0, 1], round to 0/1, then add up every element
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_total(y_true * y_pred)
    predicted = _rounded_total(y_pred)
    actual = _rounded_total(y_true)
    return hits, predicted, actual

def f1score(y_true, y_pred):
    """F1 metric for Keras: harmonic mean of precision and recall.

    Counts come from ``count_predictions``; ``K.epsilon()`` is added to every
    denominator so that empty counts do not divide by zero.
    """
    hits, predicted, actual = count_predictions(y_true, y_pred)
    eps = K.epsilon()
    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2.0 * precision * recall / (precision + recall + eps)

# Same loss, optimizer and metric for every model so their scores are comparable
for net in models:
    net.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[f1score],
    )

In [10]:
def fit_data(model, dataX, dataY):
    """Train ``model`` on one file's worth of data as a single batch.

    Args:
        model: Object exposing ``train_on_batch(x, y)`` (here: a compiled
            Keras model).
        dataX: DataFrame of input features.
        dataY: DataFrame of target labels.

    Returns:
        Whatever ``model.train_on_batch`` returns; the training loop in this
        notebook unpacks it as ``(loss, f1-score)``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same NumPy array on both old and new pandas versions.
    return model.train_on_batch(dataX.values, dataY.values)

In [11]:
def countPredictions(y_true, y_pred):
    """Count prediction outcomes with NumPy.

    Returns a tuple ``(true_positives, predicted_positives,
    possible_positives)``: the rounded element-wise product summed, the
    rounded predictions summed, and the plain sum of ``y_true``.
    """
    overlap = y_pred * y_true
    return (
        np.sum(np.round(overlap)),
        np.sum(np.round(y_pred)),
        np.sum(y_true),
    )

In [12]:
import random

# Seed the stdlib RNG as well: np.random.seed() has no effect on
# random.shuffle(), so without this the train/test split differs on every run.
random.seed(42)

# Shuffle the file numbers; the first (1 - testSplit) share is used for
# training and the remainder for testing.
listOfFiles = list(range(1, numberOfFiles + 1))
random.shuffle(listOfFiles)
splitIndex = int((1 - testSplit) * numberOfFiles)

# One "eon" = one pass over all training files followed by a full test pass
numberOfEons = 5
for eon in range(0, numberOfEons):
    print('\n***********************************\nEon {}/{}'.format(eon+1, numberOfEons))

    # ---- Training: each file is fed to every model as a single batch ----
    for trainIndex in range(0, splitIndex):
        fileNumber = listOfFiles[trainIndex]
        print('==========================================')
        print('Learning for file {} / {} : datasetX/Y_{}'.format(trainIndex+1, splitIndex, fileNumber))
        dataX, dataY = load_data(datasetFolder, 'datasetX_{}.csv'.format(fileNumber), 'datasetY_{}.csv'.format(fileNumber))
        print('Training...')
        for model in models:
            # Deliberately not named `f1score`: that would overwrite the
            # metric function that was passed to model.compile().
            trainLoss, trainF1 = fit_data(model, dataX, dataY)
            print('   Model = {} \t loss = {:.4f} \t f1-score = {:.4f}'.format(model.name, trainLoss, trainF1))

    # ---- Testing: accumulate raw counts over all test files per model ----
    counts = {}
    for model in models:
        counts[model.name] = {'true_positives':0, 'predicted_positives':0, 'possible_positives':0}

    print('==========================================\n')
    for testIndex in range(splitIndex, numberOfFiles):
        fileNumber = listOfFiles[testIndex]
        dataX, dataY = load_data(datasetFolder, 'datasetX_{}.csv'.format(fileNumber), 'datasetY_{}.csv'.format(fileNumber))
        # Report the file actually being tested (indexed by testIndex)
        print ('Testing for file {} / {} : datasetX/Y_{}'.format(testIndex+1, numberOfFiles, fileNumber))

        # DataFrame.as_matrix() was removed in pandas 1.0; convert once per
        # file instead of once per model.
        testX = dataX.values
        testY = dataY.values
        for model in models:
            predY = model.predict_on_batch(testX)
            true_positives, predicted_positives, possible_positives = countPredictions(testY, predY)
            counts[model.name]['true_positives'] += true_positives
            counts[model.name]['predicted_positives'] += predicted_positives
            counts[model.name]['possible_positives'] += possible_positives
    print('==========================================\n')

    # ---- Scores computed from the pooled counts over the whole test set ----
    print('{}. Eon {}/{} - Testing score:'.format(eon+1, eon+1, numberOfEons))
    for model in models:
        count = counts[model.name]
        # the small constant keeps the divisions defined when a count is zero
        precision = (count['true_positives'])/(count['predicted_positives']+0.0001)
        recall = (count['true_positives'])/(count['possible_positives']+0.0001)
        testF1 = 2.0 * precision * recall / (precision+recall+0.0001)
        print(' - Model = {} \t f1-score = {:.4f}\t precision = {:.4f} \t recall = {:.4f}'.format(model.name, testF1, precision, recall))
    print('==========================================\n')


***********************************
Eon 1/5
Learning for file 1 / 192 : datasetX/Y_124
Loading datasetX_124.csv & datasetY_124.csv
Training...
   Model = Simple Logistic Regression 	 loss = 0.6936 	 f1-score = 0.0102
   Model = 2 Fully Connected Layers 	 loss = 0.6929 	 f1-score = 0.0094
   Model = Deep Model (5 Dense Layers) 	 loss = 0.6934 	 f1-score = 0.0121
Learning for file 2 / 192 : datasetX/Y_27
Loading datasetX_27.csv & datasetY_27.csv
Training...
   Model = Simple Logistic Regression 	 loss = 0.6924 	 f1-score = 0.0106
   Model = 2 Fully Connected Layers 	 loss = 0.6878 	 f1-score = 0.0096
   Model = Deep Model (5 Dense Layers) 	 loss = 0.6890 	 f1-score = 0.0138
Learning for file 3 / 192 : datasetX/Y_42
Loading datasetX_42.csv & datasetY_42.csv
Training...
   Model = Simple Logistic Regression 	 loss = 0.6910 	 f1-score = 0.0112
   Model = 2 Fully Connected Layers 	 loss = 0.6817 	 f1-score = 0.0094
   Model = Deep Model (5 Dense Layers) 	 loss = 0.6776 	 f1-score = 0.0157
[... remaining training log truncated ...]

==================================================
# Attribute, Relationship & Incoming Relationship Presence
==================================================
1. Eon 1/5 - Testing score: 
 - Model = Simple Logistic Regression 	 f1-score = 0.2946	 precision = 0.9060 	 recall = 0.1759
 - Model = 2 Fully Connected Layers 	 f1-score = 0.2197	 precision = 0.7597 	 recall = 0.1284
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.5102	 precision = 0.7287 	 recall = 0.3925
2. Eon 2/5 - Testing score: 
 - Model = Simple Logistic Regression 	 f1-score = 0.3546	 precision = 0.8919 	 recall = 0.2213
 - Model = 2 Fully Connected Layers 	 f1-score = 0.5013	 precision = 0.8329 	 recall = 0.3586
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.6710	 precision = 0.8087 	 recall = 0.5735
3. Eon 3/5 - Testing score:
 - Model = Simple Logistic Regression 	 f1-score = 0.3879	 precision = 0.8819 	 recall = 0.2487
 - Model = 2 Fully Connected Layers 	 f1-score = 0.6124	 precision = 0.7887 	 recall = 0.5006
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.6880	 precision = 0.8119 	 recall = 0.5970
4. Eon 4/5 - Testing score:
 - Model = Simple Logistic Regression 	 f1-score = 0.4091	 precision = 0.8753 	 recall = 0.2669
 - Model = 2 Fully Connected Layers 	 f1-score = 0.6401	 precision = 0.8001 	 recall = 0.5335
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.6911	 precision = 0.8148 	 recall = 0.6001
5. Eon 5/5 - Testing score:
 - Model = Simple Logistic Regression 	 f1-score = 0.4262	 precision = 0.8716 	 recall = 0.2821
 - Model = 2 Fully Connected Layers 	 f1-score = 0.6579	 precision = 0.8067 	 recall = 0.5556
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.6630	 precision = 0.8783 	 recall = 0.5325