In [9]:
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense, Activation,Dropout
from keras import regularizers
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.normalization import BatchNormalization

import numpy as np
# fix random seed for reproducibility
np.random.seed(42)

import pandas as pd

In [10]:
#Folder for the dataset
datasetFolder = '/home/carnd/dbpedia2016/all3_1xall/dataset/'

#Number of files
numberOfFiles = 638

#Test split
testSplit=0.2

In [11]:
def load_data(datasetFolder, datasetXFile, datasetYFile, printIt=False):
    #print('Loading X')
    # load file
    with open(datasetFolder + datasetXFile, "r") as f:
        head = f.readline()
        cols = head.split(',')
        numberOfCols = len(cols)
        #print(numberOfCols)
        numberOfRows=0
        for line in f:
            numberOfRows+=1
        f.close()
    if(printIt):
        print('Input Features: {} x {}'.format(numberOfRows,numberOfCols))
    
    maxY = numberOfCols-1
    dataX = np.zeros([numberOfRows,maxY],np.float16)
    with open(datasetFolder + datasetXFile, "r") as f:
        head = f.readline()
        rowCounter=0
        for line in f:
            row=line.split(',')
            for i in range(1, len(row)):
                if(int(row[i])<=0):
                    continue;
                dataX[rowCounter][(int(row[i])-1)]= 1
                #if((1 + ((int(row[i])-1)//maxY))>1):
                #    print("{} data[{}][{}] = {}".format(int(row[i])-1, rowCounter,(int(row[i])-1)%maxY,1 + ((int(row[i])-1)//maxY)))
            rowCounter+=1
        f.close()
   
    #print('Loading Y')
    # load file
    with open(datasetFolder + datasetYFile, "r") as f:
        head = f.readline()
        cols = head.split(',')
        numberOfCols = len(cols)
        #print(numberOfCols)
        numberOfRows=0
        for line in f:
            numberOfRows+=1
        f.close()

    if(printIt):
        print('Output Features: {} x {}'.format(numberOfRows,numberOfCols))
    dataY = np.zeros([numberOfRows,(numberOfCols-1)],np.float16)
    with open(datasetFolder + datasetYFile, "r") as f:
        head = f.readline()
        rowCounter=0
        for line in f:
            row=line.split(',')
            for i in range(1, len(row)):
                if(int(row[i])<=0):
                    continue;
                dataY[rowCounter][(int(row[i])-1)]=1
            rowCounter+=1
        f.close()
        

    return dataX, dataY

In [12]:
dataX, dataY = load_data(datasetFolder,'datasetX_1.csv', 'datasetY_1.csv', printIt=True)

Input Features: 4995 x 4169
Output Features: 4995 x 526


In [13]:
dataX, dataY = load_data(datasetFolder,'datasetX_1.csv', 'datasetY_1.csv')

In [14]:
print(dataX.shape)
print(dataX[0:5])

(4995, 4168)
[[ 1.  1.  1. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 1.  1.  1. ...,  0.  0.  0.]]


In [15]:
print(dataY.shape)
print(dataY[0:5])

(4995, 525)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [16]:
print("Input Features for classification: {}".format(dataX.shape[1]))
print("Output Classes for classification: {}".format(dataY.shape[1]))

Input Features for classification: 4168
Output Classes for classification: 525


In [17]:
deepModel = Sequential(name='Deep Model (5 Dense Layers)')
deepModel.add(Dense(1024, input_dim=dataX.shape[1], init='glorot_normal'))
deepModel.add(BatchNormalization())
deepModel.add(Activation('relu'))
deepModel.add(Dropout(0.2))
deepModel.add(Dense(1024, init='glorot_normal'))
deepModel.add(BatchNormalization())
deepModel.add(Activation('relu'))
deepModel.add(Dropout(0.2))
deepModel.add(Dense(1024, init='glorot_normal'))
deepModel.add(BatchNormalization())
deepModel.add(Activation('relu'))
deepModel.add(Dropout(0.2))
deepModel.add(Dense(1024, init='glorot_normal'))
deepModel.add(BatchNormalization())
deepModel.add(Activation('relu'))
deepModel.add(Dropout(0.2))
deepModel.add(Dense(1024, init='glorot_normal'))
deepModel.add(BatchNormalization())
deepModel.add(Activation('relu'))
deepModel.add(Dropout(0.2))
deepModel.add(Dense(dataY.shape[1], activation='sigmoid', init='glorot_normal'))
models = [deepModel]

In [18]:
# Compile model
import keras.backend as K

def count_predictions(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives, predicted_positives, possible_positives

def f1score(y_true, y_pred):
    true_positives, predicted_positives, possible_positives = count_predictions(y_true, y_pred)
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1score = 2.0 * precision * recall / (precision+recall+ K.epsilon())
    return f1score

def fBetaScore(y_true, y_pred, beta):
    true_positives, predicted_positives, possible_positives = count_predictions(y_true, y_pred)
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1score = (1+(beta*beta)) * precision * recall / ((beta*beta*precision)+recall+ K.epsilon())
    return f1score

for model in models:
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=[f1score])

In [19]:
def fit_data(model, dataX, dataY):
    # Fit the model
    #model.fit(dataX, dataY, nb_epoch=5, verbose=2, batch_size=256)
    return model.train_on_batch(dataX, dataY)

In [20]:
def countPredictions(y_true, y_pred):
    true_positives = np.sum(np.round(y_pred*y_true))
    predicted_positives = np.sum(np.round(y_pred))
    possible_positives = np.sum(y_true)
    return true_positives, predicted_positives, possible_positives

In [22]:
#Randomize the list of numbers so we can split train and test dataset
listOfFiles=list(range(1,numberOfFiles+1))
import random
random.shuffle(listOfFiles)
splitIndex=int((1-testSplit)*numberOfFiles)

numberOfEons = 10
for model in models:
    for eon in range(0, numberOfEons):
        print('{}. Eon {}/{}'.format(eon+1,eon+1, numberOfEons))
        for trainIndex in range(0,splitIndex):
            dataX, dataY = load_data(datasetFolder,'datasetX_{}.csv'.format(listOfFiles[trainIndex]), 'datasetY_{}.csv'.format(listOfFiles[trainIndex]))
            #print('Model = {}'.format(model.name))
            #model.fit(dataX, dataY, nb_epoch=1, verbose=0, batch_size=512)
            #sc=model.test_on_batch(dataX,dataY)
            #loss = sc[0]
            #f1score = sc[1]
            loss, f1score=fit_data(model,dataX, dataY)
            print('Learning for file {} / {} : datasetX/Y_{}\t\tloss={:.4f} f1score={:.4f}'.format(trainIndex+1, splitIndex, listOfFiles[trainIndex], loss, f1score), end='\r')

        counts = {} 
    
        counts[model.name] = {'true_positives':0, 'predicted_positives':0, 'possible_positives':0}
    
        for testIndex in range(splitIndex, numberOfFiles):
            dataX, dataY = load_data(datasetFolder,'datasetX_{}.csv'.format(listOfFiles[testIndex]), 'datasetY_{}.csv'.format(listOfFiles[testIndex]))
            predY=model.predict_on_batch(dataX)
            true_positives, predicted_positives, possible_positives = countPredictions(dataY, predY)
            counts[model.name]['true_positives'] += true_positives
            counts[model.name]['predicted_positives'] += predicted_positives
            counts[model.name]['possible_positives'] += possible_positives
            print ('Testing for file {} / {} : datasetX/Y_{} - true +ve:{}  pred +ve:{} possible +ve:{}'.format(testIndex+1, numberOfFiles, listOfFiles[testIndex], true_positives,predicted_positives,possible_positives), end='\r')
    
        count = counts[model.name]
        precision = (count['true_positives'])/(count['predicted_positives']+0.0001)
        recall = (count['true_positives'])/(count['possible_positives']+0.0001)
        f1score = 2.0 * precision * recall / (precision+recall+0.0001)
        print(' - Model = {} \t f1-score = {:.4f}\t precision = {:.4f} \t recall = {:.4f}'.format(model.name, f1score, precision, recall))

1. Eon 1/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9163	 precision = 0.9400 	 recall = 0.8940008.0
2. Eon 2/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9229	 precision = 0.9380 	 recall = 0.9082008.0
3. Eon 3/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9238	 precision = 0.9386 	 recall = 0.9096008.0
4. Eon 4/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9243	 precision = 0.9391 	 recall = 0.9099008.0
5. Eon 5/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9245	 precision = 0.9390 	 recall = 0.9106008.0
6. Eon 6/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9249	 precision = 0.9394 	 recall = 0.9109008.0
7. Eon 7/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9251	 precision = 0.9387 	 recall = 0.9119008.0
8. Eon 8/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9253	 precision = 0.9389 	 recall = 0.9122008.0
9. Eon 9/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9254	 precision = 0.93

KeyboardInterrupt: 

==================================================
# all3_1xall (final)
==================================================
1. Eon 1/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9163	 precision = 0.9400 	 recall = 0.8940008.0
2. Eon 2/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9229	 precision = 0.9380 	 recall = 0.9082008.0
3. Eon 3/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9238	 precision = 0.9386 	 recall = 0.9096008.0
4. Eon 4/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9243	 precision = 0.9391 	 recall = 0.9099008.0
5. Eon 5/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9245	 precision = 0.9390 	 recall = 0.9106008.0
6. Eon 6/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9249	 precision = 0.9394 	 recall = 0.9109008.0
7. Eon 7/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9251	 precision = 0.9387 	 recall = 0.9119008.0
8. Eon 8/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9253	 precision = 0.9389 	 recall = 0.9122008.0
9. Eon 9/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9254	 precision = 0.9393 	 recall = 0.9119008.0
10. Eon 10/15
 - Model = Deep Model (5 Dense Layers) 	 f1-score = 0.9254	 precision = 0.9406 	 recall = 0.9109008.0