In [None]:
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Input, Dense, Activation,Dropout
from keras import regularizers
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.normalization import BatchNormalization
import random
import numpy as np
# fix random seeds for reproducibility: numpy and the stdlib `random` module
# keep independent generator states, and random.sample is used further down
np.random.seed(42)
random.seed(42)

import pandas as pd  

In [None]:
#Number of datasetX_<n>.csv / datasetY_<n>.csv file pairs that are scanned
numberOfFiles = 638

#Root folder of the data; the trailing slash matters because every path
#below is built by plain string concatenation
rootFolder = '/home/carnd/dbpedia2016/all4_2x125/'
#Sub-folder that holds the numbered dataset files
datasetFolder = ''.join((rootFolder, 'dataset/'))

In [None]:
def _readNameLines(path):
    '''Yield the data lines of a header file, without their line terminator.'''
    with open(path, "r") as f:
        f.readline()  # skip header
        f.readline()  # skip id
        for line in f:
            yield line.rstrip('\r\n')


def loadFeaturesAndTargets(folder=None):
    '''
        Load the feature names and the target class names.

        The first two lines of headerX.csv and headerY.csv are skipped. Every
        remaining line contributes exactly one name, so the position of a name
        in the returned list equals its line position in the file.

        Args:
            folder: path prefix (including the trailing separator) of the folder
                    that holds the two header files. Defaults to the
                    notebook-level rootFolder.

        Returns:
            (features, targetClasses) as two lists of strings:
              * a feature is the text before the LAST comma of its line, so a
                name may itself contain commas; a line without any comma is
                kept whole;
              * a target class is the text before the FIRST comma of its line.

        Raises:
            OSError if one of the header files cannot be opened.
    '''
    if folder is None:
        folder = rootFolder

    features = []
    for line in _readNameLines(folder + 'headerX.csv'):
        cut = line.rfind(',')
        # rfind returns -1 when there is no comma: keep the full line instead of
        # slicing with -1, which would drop the last character of the name
        features.append(line[:cut] if cut >= 0 else line)
    print('headerX.csv : {}'.format(len(features)))

    # the line terminator is already removed, so a single-column line does not
    # leave a trailing newline in the class name
    targetClasses = [line.split(',')[0] for line in _readNameLines(folder + 'headerY.csv')]
    print('headerY.csv : {}'.format(len(targetClasses)))

    return features, targetClasses
# Load the name tables once when the cell runs; findFeaturesAndTargets reads
# `features` and `targetClasses` as notebook-level globals
features, targetClasses = loadFeaturesAndTargets()

In [None]:
def _positiveIds(fields):
    '''Parse the given CSV fields as integers and keep the strictly positive ones.'''
    return [value for value in map(int, fields) if value > 0]


def findFeaturesAndTargets(individuals, wrap=True):
    '''
        This function will search for the individuals in all the dataset files.
        It will then load the features and targets from the data for those individuals.

        It reads the notebook-level globals features, targetClasses,
        numberOfFiles and datasetFolder. For n = 1..numberOfFiles it scans
        datasetX_<n>.csv and then datasetY_<n>.csv; the first line of each file
        is skipped. Every other line is "identifier,id,id,...", where the ids
        are 1-based positions in features (X files) or targetClasses (Y files).
        Ids <= 0 are ignored.

        Args:
            individuals: identifiers to look for (hashable, e.g. strings).
            wrap: when True, feature ids are folded onto 8384 columns
                  (presumably the input width of the trained model - confirm):
                  id k is written to column (k-1) % 8384 with the value
                  1 + (k-1)//8384, or with half - value when that value
                  exceeds half = ((len(features)+1)//8384) * 0.5. The result is
                  stored in an int8 array. When not True, column k-1 is simply
                  set to 1.

        Returns:
            index: identifiers found in the X files, in order of appearance.
            dataX: int8 array with len(individuals) rows.
            dataY: float16 array, len(individuals) x len(targetClasses).
            featuresLists: per found individual, the names of its features.
            targetClassesLists: per found individual, the names of its classes.

        Notes:
            * Rows are filled in order of appearance, independently for the X and
              Y files, so row i of dataX and dataY only describe the same
              individual if both file sets list individuals in the same order
              (assumed - verify against the dataset export).
            * The arrays have one row per entry of `individuals`: rows of
              individuals that are never found stay zero, and more matches than
              rows raise an IndexError.
    '''
    wanted = frozenset(individuals)   # constant-time membership test per CSV row
    wrapped = (wrap == True)          # same test as before: only True (or 1) enables wrapping
    numberOfCols = len(features)+1
    if wrapped:
        maxY = 8384
        # only needed for wrapping; computing it here avoids a division by zero
        # when the feature table is empty and wrap is off
        half = (numberOfCols//maxY)*0.5
    else:
        maxY = numberOfCols-1
    #Initialize arrays
    dataX = np.zeros([len(individuals),maxY],np.int8)
    dataY = np.zeros([len(individuals),len(targetClasses)],np.float16)
    index = []
    featuresLists = []
    targetClassesLists = []
    xRowCounter = 0
    yRowCounter = 0
    print('Loading...')
    for fileIndex in range(0, numberOfFiles):
        datasetXFile = 'datasetX_{}.csv'.format(fileIndex+1)
        datasetYFile = 'datasetY_{}.csv'.format(fileIndex+1)
        print('Searching in {}'.format(datasetXFile), end='\r')
        with open(datasetFolder + datasetXFile, "r") as f:
            f.readline()  # skip header
            for line in f:
                # drop the line terminator so an identifier on a line without
                # any id column can still be matched
                row = line.rstrip('\r\n').split(',')
                identifier = row[0]
                if identifier not in wanted:
                    continue
                index.append(identifier)
                featuresList = []
                for featureId in _positiveIds(row[1:]):
                    position = featureId-1
                    if wrapped:
                        val = 1 + (position//maxY)
                        if val > half:
                            val = 0 - (val - half)
                        # val may be a float here; numpy converts it to int8 on assignment
                        dataX[xRowCounter][position % maxY] = val
                    else:
                        dataX[xRowCounter][position] = 1
                    featuresList.append(features[position])
                featuresLists.append(featuresList)
                xRowCounter += 1

        with open(datasetFolder + datasetYFile, "r") as f:
            f.readline()  # skip header
            for line in f:
                row = line.rstrip('\r\n').split(',')
                identifier = row[0]
                if identifier not in wanted:
                    continue
                targetClassesList = []
                for classId in _positiveIds(row[1:]):
                    dataY[yRowCounter][classId-1] = 1
                    targetClassesList.append(targetClasses[classId-1])
                targetClassesLists.append(targetClassesList)
                yRowCounter += 1

    #Return the index, the features, targets and example features
    return index, dataX, dataY, featuresLists, targetClassesLists

In [None]:
#Look up a few well-known instances in the dataset
individuals = ['United_States', 'Washington__D_C_', 'Aristotle']
index, dataX, dataY, featuresLists, targetClassesLists = findFeaturesAndTargets(individuals)

print('Found: {}'.format(index))
#Show the shape and the first rows of the feature matrix, then of the target matrix
for caption, matrix in (('Feature: {}', dataX), ('Targets: {}', dataY)):
    print(caption.format(matrix.shape))
    print(matrix[0:5])
print('')

In [None]:
#For each individual, display the sample features & targets
def _printExamples(caption, items, limit):
    '''Print the caption line, then at most `limit` randomly chosen entries of `items`.'''
    print(caption.format(limit, len(items)))
    # random.sample cannot draw more entries than the list holds, so lists that
    # are not longer than the limit are printed in full, in their stored order
    shown = random.sample(items, limit) if len(items) > limit else items
    for item in shown:
        print('\t\t{}'.format(item))


numberOfExamples = 5
for i, individual in enumerate(index):
    print('Examples for {}: '.format(individual))
    featureList = featuresLists[i]
    targetList = targetClassesLists[i]
    _printExamples('\tFeatures: (upto {} of {})', featureList, numberOfExamples)
    _printExamples('\tTypes: (upto {} of {})', targetList, numberOfExamples)
    print('')

In [None]:
#Load the model
# The .h5 file name is a relative path, so it is resolved against the kernel's
# current working directory.
# NOTE(review): presumably this is the trained type-prediction network whose
# input width matches the wrapped feature matrix - confirm before reuse.
from keras.models import load_model
model = load_model('deepModelDBpediaOntologyTypes.h5')

In [None]:
#Run the model once on the whole feature matrix
predY = model.predict_on_batch(dataX)

##Print the predicted classes for each individual
for position, individual in enumerate(index):
    print('Predicted classes for {}: '.format(individual))
    scores = predY[position]
    # a class is reported when its predicted score reaches 0.5
    predicted = [targetClasses[c] for c in range(len(scores)) if scores[c] >= 0.5]
    for className in predicted:
        print('\t\t{}'.format(className))
    print('')