In [1]:
#Imports
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Input, Dense, Activation,Dropout
from keras import regularizers
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.normalization import BatchNormalization
import numpy as np
import pandas as pd
import random

# Fix the seed of NumPy's global random generator for reproducibility.
# Note: this call seeds NumPy only; the standard-library `random` module imported
# above keeps its own, separate generator state.
np.random.seed(42)

Using TensorFlow backend.


In [2]:
# ----------------- Inputs for the run -----------------
# datasetFolder   - folder holding the dataset generated by InMemoryGraphLoader.java
#                   (file names are appended to it directly, so keep the trailing '/').
# numberOfFiles   - number of batch files in the dataset.
# validationSplit - fraction of the files used for validation.
# testSplit       - fraction of the files held out for the final test.
datasetFolder = '/home/carnd/dbpedia2016/all4_2x125/dataset/'
numberOfFiles = 638

validationSplit = 0.2
testSplit = 0.1

In [4]:
def _read_batch_lines(path):
    '''
        Read one batch file in a single pass.
        Returns a tuple (number of comma-separated header columns, list of data lines).
    '''
    with open(path, "r") as f:
        head = f.readline()
        rows = f.readlines()
    return len(head.split(',')), rows


def _positive_indices(line):
    '''
        Yield the strictly positive integer entries of one sparse row.
        The first column is skipped (it is never used as an index by this loader),
        and empty fields (e.g. after a trailing comma) are ignored.
    '''
    for token in line.split(',')[1:]:
        token = token.strip()
        if not token:
            continue
        index = int(token)
        if index > 0:
            yield index


def load_data(datasetFolder, datasetXFile, datasetYFile, wrap=True, printIt=False):
    '''
        Function to load the data given one file.
        While the extension is .csv, we actually have a sparse representation internally:
        every data line lists, after its first column, the 1-based indices of the
        columns that are set. This function unrolls the sparse representation into
        feature and target numpy arrays, using an encoding strategy if needed.
        - datasetFolder: The dataset folder (the file names are appended to it as-is)
        - datasetXFile: The name of one batch file with feature data
        - datasetYFile: The name of the file with target data for same batch
        - wrap: True if the data should be encoded in a feature vector of length 8384
                (index i is folded onto column (i-1) % 8384 and the stored value
                identifies the fold), False to keep one column per feature
        - printIt: True to print the dimensions found in both files
        Returns (dataX, dataY): an int8 array of shape [rows, feature width] and a
        float16 array of shape [rows, header columns - 1] with 1 at every listed target.
    '''
    # --- Feature file: header gives the number of columns, the rest are rows ---
    numberOfCols, featureLines = _read_batch_lines(datasetFolder + datasetXFile)
    numberOfRows = len(featureLines)
    if(printIt):
        print('Input Features: {} x {}'.format(numberOfRows,numberOfCols-1))

    # Same truth test as before, so callers passing 1/0 keep their behaviour
    wrapped = (wrap == True)

    #Max width of feature vector for encoding
    if wrapped:
        maxY = 8384
        #Scaling factor (shift, actually): fold numbers above it are stored as negatives
        half = (numberOfCols//maxY)*0.5
    else:
        maxY = numberOfCols-1
        half = None  # no folding, hence nothing to shift

    # Initialize the numpy array
    dataX = np.zeros([numberOfRows,maxY],np.int8)
    for rowCounter, line in enumerate(featureLines):
        for index in _positive_indices(line):
            #Create the value based on the encoding (1 for the first fold, 2 for the second, ...)
            val = 1 + ((index-1)//maxY)
            # The shift only makes sense for the folded encoding. Applying it to the
            # un-wrapped layout turned every 1 into -0.5, which int8 stores as 0,
            # so wrap=False used to return an all-zero matrix.
            # NOTE(review): half is a multiple of 0.5, so the shifted value can be
            # fractional and is truncated toward zero when stored into the int8 array.
            if wrapped and val > half:
                val = 0 - (val - half) #Scale if needed
            #Assign the value
            dataX[rowCounter][(index-1)%maxY] = val

    # --- Target file: same sparse layout, unrolled to one column per class ---
    numberOfCols, targetLines = _read_batch_lines(datasetFolder + datasetYFile)
    numberOfRows = len(targetLines)
    if(printIt):
        print('Output Features: {} x {}'.format(numberOfRows,numberOfCols-1))

    #Create the target vector numpy array
    dataY = np.zeros([numberOfRows,(numberOfCols-1)],np.float16)
    #Read each line and set the target class value to 1
    for rowCounter, line in enumerate(targetLines):
        for index in _positive_indices(line):
            dataY[rowCounter][index-1] = 1

    #Return
    return dataX, dataY

In [5]:
#Load the first batch and report the dimensions found in both of its files
dataX, dataY = load_data(
    datasetFolder=datasetFolder,
    datasetXFile='datasetX_1.csv',
    datasetYFile='datasetY_1.csv',
    printIt=True,
)

Input Features: 4995 x 702241
Output Features: 4995 x 525


In [5]:
#Same load again, this time relying on the default printIt=False (no output expected)
dataX, dataY = load_data(
    datasetFolder=datasetFolder,
    datasetXFile='datasetX_1.csv',
    datasetYFile='datasetY_1.csv',
)

In [6]:
#Feature matrix: its shape, followed by the first five rows
print(dataX.shape, dataX[:5], sep='\n')

(4995, 8384)
[[1 1 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [1 1 1 ..., 0 0 0]]


In [7]:
#Target matrix: its shape, followed by the first five rows
print(dataY.shape, dataY[:5], sep='\n')

(4995, 525)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [8]:
#Widths of the loaded arrays: these become the input and output sizes of the network
print("\n".join([
    "Input Features for classification: {}".format(dataX.shape[1]),
    "Output Classes for classification: {}".format(dataY.shape[1]),
]))

Input Features for classification: 8384
Output Classes for classification: 525


In [9]:
#Create the final model after all the refinements.
def addHiddenBlock(model, units, **denseArgs):
    '''
        Append one hidden block to the model:
        Dense(units, glorot_normal init) -> BatchNormalization -> ReLU -> Dropout(0.2).
        Extra keyword arguments (e.g. input_dim for the first block) go to Dense.
    '''
    model.add(Dense(units, init='glorot_normal', **denseArgs))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

deepModel = Sequential(name='Deep Model (5 Dense Layers)')
#Five hidden blocks of decreasing width; only the first one declares the input size
addHiddenBlock(deepModel, 2048, input_dim=dataX.shape[1])
addHiddenBlock(deepModel, 1024)
addHiddenBlock(deepModel, 768)
addHiddenBlock(deepModel, 512)
addHiddenBlock(deepModel, 256)
#Output layer: one sigmoid unit per target class
deepModel.add(Dense(dataY.shape[1], activation='sigmoid', init='glorot_normal'))

In [10]:
# Compile model
import keras.backend as K

deepModel.compile(loss='binary_crossentropy', optimizer='nadam', metrics=[f1score])

In [None]:
#Define important functions for calculating F1-score
#(Based on Keras code, which is no-longer part of the codebase)
def count_predictions(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives, predicted_positives, possible_positives

def f1score(y_true, y_pred):
    true_positives, predicted_positives, possible_positives = count_predictions(y_true, y_pred)
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1score = 2.0 * precision * recall / (precision+recall+ K.epsilon())
    return f1score

In [12]:
#Helper function for counting (numpy counterpart used on predicted arrays)
def countPredictions(y_true, y_pred):
    '''
        Count the quantities needed for precision and recall on numpy arrays.
        Predictions are rounded to the nearest integer before counting.
        - y_true: array of target values
        - y_pred: array of predicted values, same shape as y_true
        Returns (true_positives, predicted_positives, possible_positives).
    '''
    roundedHits = np.round(y_pred*y_true)
    roundedPredictions = np.round(y_pred)
    return np.sum(roundedHits), np.sum(roundedPredictions), np.sum(y_true)

In [14]:
#Randomize the list of numbers so we can split train and test dataset
listOfFiles=list(range(1,numberOfFiles+1))
#Randomize the file order with a dedicated, seeded generator: np.random.seed() does not
#seed the standard-library random module, so a plain random.shuffle() would produce a
#different train/validation/test split on every run.
random.Random(42).shuffle(listOfFiles)
#Split index separates training and validation data
splitIndex=int((1-(testSplit+validationSplit))*numberOfFiles)
#Test split index separates validation and test data
testSplitIndex=int((1-(testSplit))*numberOfFiles)

def batchFiles(fileNumber):
    '''Return the (feature file, target file) names of the batch with the given number.'''
    return 'datasetX_{}.csv'.format(fileNumber), 'datasetY_{}.csv'.format(fileNumber)

def evaluateFiles(startIndex, stopIndex, stage):
    '''
        Predict every file listOfFiles[startIndex:stopIndex] with deepModel and
        accumulate the prediction counts returned by countPredictions.
        - stage: word shown at the start of the progress line ('Validating' / 'Testing')
        Returns a dict with the summed true/predicted/possible positives.
    '''
    totals = {'true_positives':0, 'predicted_positives':0, 'possible_positives':0}
    for testIndex in range(startIndex, stopIndex):
        xFile, yFile = batchFiles(listOfFiles[testIndex])
        evalX, evalY = load_data(datasetFolder, xFile, yFile)
        predY = deepModel.predict_on_batch(evalX)
        true_positives, predicted_positives, possible_positives = countPredictions(evalY, predY)
        totals['true_positives'] += true_positives
        totals['predicted_positives'] += predicted_positives
        totals['possible_positives'] += possible_positives
        print('{} deep model {} / {} : - true +ve:{}  pred +ve:{} possible +ve:{}'.format(stage, testIndex+1, stopIndex, true_positives,predicted_positives,possible_positives), end='\r')
    return totals

def scoreCounts(count):
    '''
        Turn accumulated counts into (f1, precision, recall).
        0.0001 is added to every denominator to avoid division by zero.
    '''
    precision = (count['true_positives'])/(count['predicted_positives']+0.0001)
    recall = (count['true_positives'])/(count['possible_positives']+0.0001)
    f1 = 2.0 * precision * recall / (precision+recall+0.0001)
    return f1, precision, recall

#Eons are similar to epochs, except since the model.fit function also uses epoch,
#we switch to a name with longer time horizon
numberOfEons = 8
for eon in range(0, numberOfEons):
    print('{}. Eon {}/{}'.format(eon+1,eon+1, numberOfEons))
    #Train the data: one pass of fit() per training file
    for trainIndex in range(0,splitIndex):
        xFile, yFile = batchFiles(listOfFiles[trainIndex])
        dataX, dataY = load_data(datasetFolder, xFile, yFile)
        deepModel.fit(dataX, dataY, nb_epoch=1, verbose=0, batch_size=256)
        print('Learning deep model for file {} / {} : datasetX/Y_{}'.format(trainIndex+1, splitIndex, listOfFiles[trainIndex]), end='\r')

    #Validation: hold the counts of the predictions for this eon
    counts = {}
    counts[deepModel.name] = evaluateFiles(splitIndex, testSplitIndex, 'Validating')

    #Metrics calculation
    #The result is kept in f1Value (not f1score) so the Keras metric function
    #named f1score is not overwritten by a float.
    count = counts[deepModel.name]
    f1Value, precision, recall = scoreCounts(count)
    print(' - Model = {} \t f1-score = {:.4f}\t precision = {:.4f} \t recall = {:.4f}'.format(deepModel.name, f1Value, precision, recall))

#Testing, with freshly initialized counters
counts = {}
counts[deepModel.name] = evaluateFiles(testSplitIndex, numberOfFiles, 'Testing')

#Metrics reporting
count = counts[deepModel.name]
f1Value, precision, recall = scoreCounts(count)
print(' - Final Test Score for {} \t f1-score = {:.4f}\t precision = {:.4f} \t recall = {:.4f}'.format(deepModel.name, f1Value, precision, recall))

#Save the model
deepModel.save('deepModelDBpediaOntologyTypes.h5')

1. Eon 1/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9112	 precision = 0.9350 	 recall = 0.8886
2. Eon 2/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9174	 precision = 0.9359 	 recall = 0.8997
3. Eon 3/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9195	 precision = 0.9365 	 recall = 0.9032
4. Eon 4/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9194	 precision = 0.9340 	 recall = 0.9054
5. Eon 5/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9205	 precision = 0.9368 	 recall = 0.9048
6. Eon 6/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9202	 precision = 0.9359 	 recall = 0.9051
7. Eon 7/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9203	 precision = 0.9351 	 recall = 0.9061
8. Eon 8/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9199	 precision = 0.9350 	 recall = 0.9053
 - Final Test Score for Deep Model (5 Dense Layers) 	 f1-score = 0.9200	 precision = 0.9352 	 recall = 0.9054


1. Eon 1/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9112	 precision = 0.9350 	 recall = 0.8886
2. Eon 2/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9174	 precision = 0.9359 	 recall = 0.8997
3. Eon 3/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9195	 precision = 0.9365 	 recall = 0.9032
4. Eon 4/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9194	 precision = 0.9340 	 recall = 0.9054
5. Eon 5/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9205	 precision = 0.9368 	 recall = 0.9048
6. Eon 6/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9202	 precision = 0.9359 	 recall = 0.9051
7. Eon 7/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9203	 precision = 0.9351 	 recall = 0.9061
8. Eon 8/8
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9199	 precision = 0.9350 	 recall = 0.9053
 - Final Test Score for Deep Model (5 Dense Layers) 	 f1-score = 0.9200	 precision = 0.9352 	 recall = 0.9054