In [14]:
from keras.models import Sequential
from keras.layers import Dense

import numpy as np
# fix random seed for reproducibility
np.random.seed(42)

import pandas as pd

In [15]:
# Directory holding the datasetX_<n>.csv / datasetY_<n>.csv file pairs.
# It is concatenated directly with the file name when opening, so it must
# end with a path separator.
datasetFolder = '/Users/rparundekar/dataspace/dbpedia2016/dataset/'

# Number of dataset file pairs to use (files are numbered 1..numberOfFiles).
# NOTE(review): the trailing 256 is presumably the full file count - confirm.
numberOfFiles = 4#256

# Fraction of the files (not of the rows) held out for testing.
testSplit=0.25

In [18]:
def load_data(datasetFolder, datasetXFile, datasetYFile):
    """Load one feature/label file pair into dense NumPy arrays.

    The feature file is read in two passes: the first counts header columns
    and data rows so the matrix can be allocated up front, the second fills it.

    NOTE(review): each data row is assumed to be ``id,idx,idx,...`` where every
    ``idx`` is a 1-based feature column number to switch on (this is what the
    ``- 1`` indexing implies) - confirm against the code that writes the files.

    Args:
        datasetFolder: Directory prefix; concatenated as-is with the file names.
        datasetXFile: Name of the feature CSV (first header column is the id).
        datasetYFile: Name of the label CSV; must contain an 'id' column.

    Returns:
        Tuple ``(dataX, dataY)``: ``dataX`` is a float32 array of shape
        (rows, header columns - 1); ``dataY`` is the label table without its
        'id' column, as a NumPy array.

    Raises:
        ValueError: If a non-blank feature token is not an integer.
        IndexError: If a feature index is beyond the number of header columns.
    """
    print('Loading {} & {}'.format(datasetXFile, datasetYFile))

    # Pass 1: size the matrix from the header width and the number of rows.
    with open(datasetFolder + datasetXFile, "r") as f:
        head = f.readline()
        numberOfCols = len(head.split(','))
        print(numberOfCols)
        numberOfRows = sum(1 for _ in f)

    print('{} x {}'.format(numberOfRows,numberOfCols))
    # The first header column is the id, so it gets no feature column.
    dataX = np.zeros([numberOfRows,numberOfCols-1],np.float32)

    # Pass 2: switch on the listed feature columns of every row.
    with open(datasetFolder + datasetXFile, "r") as f:
        f.readline()  # skip the header
        for rowCounter, line in enumerate(f):
            # Field 0 is the row id; the remaining fields are feature indices.
            for token in line.split(',')[1:]:
                token = token.strip()
                if not token:
                    # Tolerate trailing commas / empty fields.
                    continue
                # Tokens are text: convert before using them as an index.
                dataX[rowCounter, int(token)-1] = 1.0

    dataY=pd.read_csv(datasetFolder + datasetYFile)

    # The id column is not a label.
    del dataY['id']

    # .values is available on every pandas version; as_matrix() was removed.
    return dataX, dataY.values

In [19]:
# Load the first file pair; the model definitions further down read their
# input and output sizes from the shapes of these two arrays.
dataX, dataY = load_data(datasetFolder,'datasetX_1.csv', 'datasetY_1.csv')

Loading datasetX_1.csv & datasetY_1.csv
5806
6184 x 5806


In [31]:
# Sanity check: show the first five feature rows.
print(dataX[0:5])

[[ 1.  1.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  1.  0. ...,  0.  1.  0.]
 [ 0.  1.  0. ...,  0.  1.  0.]
 [ 1.  1.  1. ...,  0.  1.  0.]]


In [32]:
# Sanity check: show the first five label rows.
print(dataY[0:5])

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [33]:
# Column counts of the loaded arrays: features per row and labels per row.
print("Input Features: {}".format(dataX.shape[1]))
print("Output Classes: {}".format(dataY.shape[1]))

Input Features: 5805
Output Classes: 539


In [35]:
# Three architectures of increasing depth. Each one takes dataX.shape[1]
# inputs and ends in one sigmoid unit per column of dataY.

# Baseline: a single sigmoid layer straight from the inputs to the outputs.
logisticRegression = Sequential(
    [Dense(dataY.shape[1], input_dim=dataX.shape[1], activation='sigmoid')],
    name='Simple Logistic Regression',
)

# One hidden ReLU layer in front of the sigmoid output layer.
simpleModel = Sequential(
    [
        Dense(1024, input_dim=dataX.shape[1], activation='relu'),
        Dense(dataY.shape[1], activation='sigmoid'),
    ],
    name='2 Fully Connected Layers',
)

# Four hidden ReLU layers of shrinking width, then the sigmoid output layer.
deepModel = Sequential(
    [
        Dense(2048, input_dim=dataX.shape[1], activation='relu'),
        Dense(1024, activation='relu'),
        Dense(768, activation='relu'),
        Dense(512, activation='relu'),
        Dense(dataY.shape[1], activation='sigmoid'),
    ],
    name='Deep Model (5 Dense Layers)',
)

# Every model in this list is compiled, trained and scored side by side.
models = [logisticRegression, simpleModel, deepModel]

In [36]:
# Compile model
from keras import losses
import keras.backend as K

def count_predictions(y_true, y_pred):
    """Count true, predicted and possible positives with Keras backend ops.

    Each quantity is obtained by clipping a tensor to [0, 1], rounding it and
    summing all of its entries.

    Returns:
        Tuple ``(true_positives, predicted_positives, possible_positives)``
        computed from ``y_true * y_pred``, ``y_pred`` and ``y_true``.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round each entry, then add everything up.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_total(y_true * y_pred)
    predicted = _rounded_total(y_pred)
    actual = _rounded_total(y_true)
    return hits, predicted, actual

def f1score(y_true, y_pred):
    """F1 score built from the counts returned by count_predictions.

    K.epsilon() is added to every denominator so that empty counts do not
    cause a division by zero.
    """
    hits, predicted, actual = count_predictions(y_true, y_pred)
    eps = K.epsilon()
    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    # Harmonic mean of precision and recall.
    return 2.0 * precision * recall / (precision + recall + eps)

# Compile every model with the same loss, optimizer and metric so their
# training logs are directly comparable.
for model in models:
    model.compile(loss=losses.binary_crossentropy, optimizer='adam', metrics=[f1score])

In [37]:
def fit_data(model, dataX, dataY, epochs=1, batch_size=256, verbose=2):
    """Fit ``model`` on one chunk of data.

    The defaults reproduce the previous hard-coded behaviour: a single epoch
    per call with batches of 256 rows and verbosity level 2.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        dataX: Input features for this chunk.
        dataY: Target labels for this chunk.
        epochs: Number of passes over the chunk.
        batch_size: Rows per gradient update.
        verbose: Verbosity level forwarded to ``fit``.

    Returns:
        Whatever ``model.fit`` returns, so callers can inspect the results
        instead of only reading the printed log.
    """
    return model.fit(dataX, dataY, epochs=epochs, verbose=verbose, batch_size=batch_size)

In [38]:
def countPredictions(y_true, y_pred):
    """NumPy version of the positive counts, used when scoring predictions.

    Args:
        y_true: Array of ground-truth labels.
        y_pred: Array of predicted scores, multiplied element-wise with y_true.

    Returns:
        Tuple ``(true_positives, predicted_positives, possible_positives)``:
        the sum of the rounded products ``y_pred * y_true``, the sum of the
        rounded predictions, and the plain sum of ``y_true``.
    """
    # A product rounds to 1 only where the label is set and the score rounds up.
    matched = np.round(y_pred * y_true)
    flagged = np.round(y_pred)
    return matched.sum(), flagged.sum(), np.sum(y_true)

In [40]:
# Train every model file by file, then score all of them on the held-out
# files after each full pass over the training files.
import random

# Shuffle the file numbers so the train / test split is random across files.
# np.random.seed() does not seed the stdlib `random` module, so a dedicated
# seeded generator is used to keep the split identical between runs.
listOfFiles=list(range(1,numberOfFiles+1))
random.Random(42).shuffle(listOfFiles)
# The first splitIndex shuffled files are for training, the rest for testing.
splitIndex=int((1-testSplit)*numberOfFiles)

numberOfEpochs = 15
for epoch in range(0, numberOfEpochs):
    print('\n***********************************\nMain Epoch {}/{}'.format(epoch+1, numberOfEpochs))
    for trainIndex in range(0,splitIndex):
        print('==========================================')
        print('Learning for file {} / {} : datasetX/Y_{}'.format(trainIndex+1, splitIndex, listOfFiles[trainIndex]))
        # Files are re-read from disk on every epoch, so only one is in memory at a time.
        dataX, dataY = load_data(datasetFolder,'datasetX_{}.csv'.format(listOfFiles[trainIndex]), 'datasetY_{}.csv'.format(listOfFiles[trainIndex]))
        print('Training...')
        for model in models:
            print('Model = {}'.format(model.name))
            fit_data(model,dataX, dataY)

        print('==========================================\n')

    # Running totals per model, accumulated over all test files of this epoch.
    counts = {}
    for model in models:
        counts[model.name] = {'true_positives':0, 'predicted_positives':0, 'possible_positives':0}

    for testIndex in range(splitIndex, numberOfFiles):
        dataX, dataY = load_data(datasetFolder,'datasetX_{}.csv'.format(listOfFiles[testIndex]), 'datasetY_{}.csv'.format(listOfFiles[testIndex]))
        # Report the test file actually being scored (indexed by testIndex).
        print ('Testing for file {} / {} : datasetX/Y_{}'.format(testIndex+1, numberOfFiles, listOfFiles[testIndex]))
        for model in models:
            predY=model.predict_on_batch(dataX)
            true_positives, predicted_positives, possible_positives = countPredictions(dataY, predY)
            counts[model.name]['true_positives'] += true_positives
            counts[model.name]['predicted_positives'] += predicted_positives
            counts[model.name]['possible_positives'] += possible_positives
    print('==========================================\n')


    for model in models:
        count = counts[model.name]
        # The small constant keeps the ratios defined when a count is zero.
        precision = (count['true_positives'])/(count['predicted_positives']+0.0001)
        recall = (count['true_positives'])/(count['possible_positives']+0.0001)
        # Deliberately not named `f1score`: that would overwrite the Keras
        # metric function of the same name defined at module level.
        testF1 = 2.0 * precision * recall / (precision+recall+0.0001)
        print('Testing score: ')
        print('   Model = {} \t f1-score = {:.4f}\t precision = {:.4f} \t recall = {:.4f}'.format(model.name, testF1, precision, recall))
    print('==========================================\n')


***********************************
Main Epoch 1/15
Learning for file 1 / 3 : datasetX/Y_1
Loading datasetX_1.csv & datasetY_1.csv
5806
6184 x 5806
Training...
Model = Simple Logistic Regression
Epoch 1/1
4s - loss: 0.5328 - f1score: 0.3478
Model = 2 Fully Connected Layers
Epoch 1/1
7s - loss: 0.0352 - f1score: 0.0000e+00
Model = Deep Model (5 Dense Layers)
Epoch 1/1
18s - loss: 0.0587 - f1score: 0.0151

Learning for file 2 / 3 : datasetX/Y_3
Loading datasetX_3.csv & datasetY_3.csv
5806
6272 x 5806
Training...
Model = Simple Logistic Regression
Epoch 1/1
4s - loss: 0.4823 - f1score: 0.3426
Model = 2 Fully Connected Layers
Epoch 1/1
7s - loss: 0.0286 - f1score: 2.7851e-04
Model = Deep Model (5 Dense Layers)
Epoch 1/1
18s - loss: 0.0466 - f1score: 6.5422e-04

Learning for file 3 / 3 : datasetX/Y_4
Loading datasetX_4.csv & datasetY_4.csv
5806
5831 x 5806
Training...
Model = Simple Logistic Regression
Epoch 1/1
3s - loss: 0.4384 - f1score: 0.3462
Model = 2 Fully Connected Layers
Epoch 1/1

KeyboardInterrupt: 