## This code is used for getting entities for CEA task 

In [32]:
# ENTITY look up code, referenced from GitHub(2020) ernestojimenezruiz/tabular-data-semantics-py.

from enum import Enum

class KG(Enum):
        DBpedia = 0
        Wikidata = 1
        Google = 2        
        All = 3
class URI_KG(object):
    dbpedia_uri_resource = 'http://dbpedia.org/resource/'
    dbpedia_uri_property = 'http://dbpedia.org/property/'
    
    dbpedia_uri = 'http://dbpedia.org/ontology/'
    wikidata_uri ='http://www.wikidata.org/'
    schema_uri = 'http://schema.org/' 

    uris = list()
    uris.append(dbpedia_uri)
    uris.append(wikidata_uri)
    uris.append(schema_uri)
    
    uris_resource = list()
    uris_resource.append(dbpedia_uri_resource)
    uris_resource.append(wikidata_uri)
          
    avoid_predicates=set()
    avoid_predicates.add("http://dbpedia.org/ontology/wikiPageDisambiguates")
    avoid_predicates.add("http://dbpedia.org/ontology/wikiPageRedirects")
    avoid_predicates.add("http://dbpedia.org/ontology/wikiPageWikiLink")
    avoid_predicates.add("http://dbpedia.org/ontology/wikiPageID")
    
    def __init__(self):
        ''''
        '''
class KGEntity(object):
    
    def __init__(self, enity_id, label, description, types, source):
        
        self.ident = enity_id
        self.label = label
        self.desc = description #sometimes provides a very concrete type or additional semantics
        self.types = types  #set of semantic types
        self.source = source  #KG of origin such as dbpedia, wikidata or google KG
        
    def __repr__(self):
        return "<id: %s , label: %s, description: %s, types: %s, source: %s>" % (self.ident, self.label,self.desc, self.types, self.source)
    def __str__(self):
        return "<id: %s , label: %s, description: %s, types: %s, source: %s>" % (self.ident, self.label, self.desc, self.types, self.source)
    def getId(self):
        return self.ident
    
    '''
    One can retrieve all types or filter by KG: DBpedia, Wikidata and Google (Schema.org)
    '''
    def getTypes(self, kgfilter=KG.All):
        if kgfilter==KG.All:
            return self.types
        else:
            kg_uri = URI_KG.uris[kgfilter.value]
            filtered_types = set()
            for t in self.types:
                if t.startswith(kg_uri):
                    filtered_types.add(t)
            
            return filtered_types 
    
    def getLabel(self):
        return self.label
    def getDescription(self):
        return self.desc
    def getSource(self):
        return self.sourcec
    def addType(self, cls):
        self.types.add(cls)
    def addTypes(self, types):
        self.types.update(types)
if __name__ == '__main__':
    print(URI_KG.uris[KG.Wikidata.value])
    print(KG.DBpedia.value)
    

http://www.wikidata.org/
0


In [33]:
# WIKIDATA LOOKUP

# Import the libraries 
import json
from pprint import pprint
import time
from urllib import parse, request

class KGLookup(object):
    '''
    classdocs
    '''
    def __init__(self, lookup_url):
        self.service_url = lookup_url
    def getJSONRequest(self, params, attempts=3):
        
        try:
            #urllib has been split up in Python 3. 
            #The urllib.urlencode() function is now urllib.parse.urlencode(), 
            #and the urllib.urlopen() function is now urllib.request.urlopen().
            #url = service_url + '?' + urllib.urlencode(params)
            url = self.service_url + '?' + parse.urlencode(params)
            #print(url)
            #response = json.loads(urllib.urlopen(url).read())

            req = request.Request(url)
            
            req.add_header('Accept', 'application/json')
            
            response = json.loads(request.urlopen(req).read())
            
            return response
        
        except:
            
            print("Lookup '%s' failed. Attempts: %s" % (url, str(attempts)))
            time.sleep(60) #to avoid limit of calls, sleep 60s
            attempts-=1
            if attempts>0:
                return self.getJSONRequest(params, attempts)
            else:
                return None
'''
Wikidata web search API
'''
class WikidataAPI(KGLookup):
    '''
    classdocs
    
    '''
    def __init__(self):
        '''
        Constructor
        '''
        super().__init__(self.getURL())

    def getURL(self):
        return "https://www.wikidata.org/w/api.php"

    def __createParams(self, query, limit, type='item'):
        
        params = {
            'action': 'wbsearchentities',
            'format' : 'json',
            'search': query,
            'type': type,
            'limit': limit,
            'language' : 'en'
        }
        
        return params

    def getKGName(self):
        return 'Wikidata'
    
    '''
    Returns list of ordered entities according to relevance: wikidata
    '''
    def __extractKGEntities(self, json, filter=''):
        
        entities = list()
        
        for element in json['search']:
            #empty list of type from wikidata lookup
            types = set()

            description=''
            if 'description' in element:
                description = element['description']
            kg_entity = KGEntity(
                element['concepturi'],
                element['label'],
                description,
                types,
                self.getKGName()
                )
            
            #We filter according to givem URI
            if filter=='' or element['concepturi']==filter:
                entities.append(kg_entity)
        #for entity in entities:
        #    print(entity)    
        return entities

    def getKGEntities(self, query, limit, type='item', filter=''):        
        json = self.getJSONRequest(self.__createParams(query, limit, type), 3)     
        
        if json==None:
            print("None results for", query)
            return list()
        return self.__extractKGEntities(json, filter) #Optionally filter by URI

#if __name__ == '__main__':
    
    
        
        

In [34]:
# Import the libraries
import pandas as pd 
import numpy as np
import os
os.getcwd()
df = pd.read_csv("C:\\Users\\01-18-20\\Documents\\City_University\\round3\\CEA_Round3_Targets.csv", header=None, nrows=10) # CEA targets file is used to read
df.columns=["Table_id", "Row_id", "Column_id"] # Assign the header to the dataframe
df

Unnamed: 0,Table_id,Row_id,Column_id
0,88TAWLJF,1,0
1,88TAWLJF,1,2
2,88TAWLJF,2,0
3,88TAWLJF,2,2
4,88TAWLJF,3,0
5,88TAWLJF,3,2
6,88TAWLJF,4,0
7,88TAWLJF,4,2
8,88TAWLJF,5,0
9,88TAWLJF,5,2


In [35]:
df["Cell"] = "" # Adding new column in the dataframe df to fetch the cell value in it

#df.head()

def capitalize_word(word):
    return word.capitalize()

def remove_special_signs(word):
    result = ""
    for w in word:
        if w.isalpha() or w=="'" or w.isspace() or w== "©" or w == "Ã":
            result += w 
    return result

def replace_space(word):
    try:
        return word.replace(" ", "_")
    except:
        print(word)
        return word
    
def get_entity(df,  row_id, column_id): 
    try:
        cell = df.iloc[row_id, column_id]
        #cell = remove_special_signs(cell)
        cell = capitalize_word(cell)
        cell = replace_space(cell)
        return cell
    except:
        return np.nan
    
df = df.sort_values("Table_id").reset_index(drop=True)
first_table = df["Table_id"].unique()[0]
#first_table




In [36]:
def function_for_row(row):
    global first_table
    table_id = row["Table_id"]
    if first_table == table_id:
         df = pd.read_csv(f"C:\\Users\\01-18-20\\Documents\\City_University\\round3\\Tables_Round3\\tables\\{table_id}.csv", header=None)
    else:
        df = pd.read_csv(f"C:\\Users\\01-18-20\\Documents\\City_University\\round3\\Tables_Round3\\tables\\{table_id}.csv", header=None)
        first_table = table_id
    df.head()
    
    row_id = row["Row_id"]
    column_id = row["Column_id"]
    cell = get_entity(df, row_id, column_id,)
    row["Cell"]=cell
    return row

In [37]:
df = df.apply(function_for_row, axis=1)
df["Wikidata_Entity"]=''

In [38]:
# Retrieve the cell entity for the cell value from KG and store in to seperate column
for i in range (0, 10):
    if __name__ == '__main__':
        query = df.loc[i]["Cell"]
        limit=1 # Limit variable is varied  up to 10
        type="item" 

        wikidata = WikidataAPI()
        entities = wikidata.getKGEntities(query, limit, type)
        df["Wikidata_Entity"][i] = entities
        print(entities)    
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


[<id: http://www.wikidata.org/entity/Q11886086 , label: Oulu YMCA, description: Finnish society, types: set(), source: Wikidata>]
[<id: http://www.wikidata.org/entity/Q13567944 , label: Koskikeskus, description: city district in Oulu, Finland, types: set(), source: Wikidata>]
[<id: http://www.wikidata.org/entity/Q10495580 , label: Finnish Art Society, description: Finnish society, types: set(), source: Wikidata>]
[<id: http://www.wikidata.org/entity/Q1792414 , label: Kunsthalle Helsinki, description: art exhibition venue in Helsinki, Finland, types: set(), source: Wikidata>]
[<id: http://www.wikidata.org/entity/Q4208381 , label: BMS World Mission, description: Christian missionary society, types: set(), source: Wikidata>]
[<id: http://www.wikidata.org/entity/Q1009242 , label: Didcot, description: town and civil parish in Oxfordshire, UK; formerly in Berkshire, types: set(), source: Wikidata>]
[<id: http://www.wikidata.org/entity/Q1139380 , label: National Speleological Society, descrip

In [None]:
#This code is for save the file and download from Google Colab platfrom
#from google.colab import files
#df.to_csv('CEA_Round3_result_37.csv') 
#files.download('CEA_Round3_result_37.csv') # This csv file is the result for CEA task in the challenge

In [None]:
# This code is used to check the duplicate values in the outcome file 

df = pd.read_csv("combine_output_final - Copy.csv")

duplicateDFRow = df[df.duplicated()]
print(duplicateDFRow)

df = df.drop_duplicates()
print(df)

#df.to_csv('CEA_result_task1.csv')

### This section of code is for calculating cosine similarity in string matching for CEA task

In [None]:
# this file is taken after getting entities from Wikidata KG 
df = pd.read_csv("C:\\Users\\01-18-20\\Documents\\City_University\\Round2\\filnal_process_data\\file_06.csv", nrows=5)

In [None]:
# convert the text in to lower case in some columns from the dataframe
df["Cell"] = df["Cell"].str.lower()
df["Label1"] = df["Label1"].str.lower()
df["Label2"] = df["Label2"].str.lower()
df["Label3"] = df["Label3"].str.lower()
df["Label4"] = df["Label4"].str.lower()

In [None]:
#remove the _ from the Cell column to match the string
pec_chars = ["_"]
for char in spec_chars:
    df['Cell'] = df['Cell'].str.replace(char, ' ')
#df.fillna('empty cell', inplace=True)
df1=df
df1['Label1']= df1['Label1'].apply(str)

In [None]:
#df1['simi']=''
df1['Similar_label']=''
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
for k in range(0, 1):
    #print(row['c1'], row['c2'])
     
    test=[]

# X = input("Enter first string: ").lower() 
# Y = input("Enter second string: ").lower() 
    X = df1.iloc[k]['Cell']
    Y1 =df1.iloc[k]['Label1']
    Y2 =df1.iloc[k]['Label2']
    Y3 =df1.iloc[k]['Label3']
    Y4 =df1.iloc[k]['Label4']
# tokenization 

    X_list = word_tokenize(X)  
    Y1_list = word_tokenize(Y1) 
    Y2_list = word_tokenize(Y2) 
    Y3_list = word_tokenize(Y3) 
    Y4_list = word_tokenize(Y4) 
  
    # sw contains the list of stopwords 
    sw = stopwords.words('english')  
    l1 =[]
    l2 =[] 
  
    # remove stop words from the string 
    X_set = {w for w in X_list if not w in sw}  
    Y1_set = {w for w in Y1_list if not w in sw} 
    Y2_set = {w for w in Y2_list if not w in sw}
    Y3_set = {w for w in Y3_list if not w in sw}
    Y4_set = {w for w in Y4_list if not w in sw}


    # form a set containing keywords of both strings  
    rvector1 = X_set.union(Y1_set)  
    for w in rvector1: 
        if w in X_set: l1.append(1) # create a vector 
        else: l1.append(0) 
        if w in Y1_set: l2.append(1) 
        else: l2.append(0) 
    c = 0
  
    # cosine formula  
    for i in range(len(rvector1)): 
            c+= l1[i]*l2[i]
    try:       
        cosine1 = c / float((sum(l1)*sum(l2))**0.5) 
    
    except ZeroDivisionError:
        cosine1 = 0
    test.append(cosine1)
    l1 =[]
    l2 =[] 

    rvector2 = X_set.union(Y2_set)  
    for w in rvector2: 
        if w in X_set: l1.append(1) # create a vector 
        else: l1.append(0) 
        if w in Y2_set: l2.append(1) 
        else: l2.append(0) 
    c = 0
  
    # cosine formula  
    for i in range(len(rvector2)): 
            c+= l1[i]*l2[i]
    try:
        cosine2 = c / float((sum(l1)*sum(l2))**0.5) 
    
    except ZeroDivisionError:
        cosine2 = 0
   
    #print("similarity: ", cosine2) 
    test.append(cosine2)

    l1 =[]
    l2 =[] 

    rvector3 = X_set.union(Y3_set)  
    for w in rvector3: 
        if w in X_set: l1.append(1) # create a vector 
        else: l1.append(0) 
        if w in Y3_set: l2.append(1) 
        else: l2.append(0) 
    c = 0
  
    # cosine formula  
    for i in range(len(rvector3)): 
            c+= l1[i]*l2[i]
    try:
        cosine3 = c / float((sum(l1)*sum(l2))**0.5)
    except ZeroDivisionError:
        cosine3 = 0
    ##print("similarity: ", cosine3) 
    test.append(cosine3)

    l1 =[]
    l2 =[] 

    rvector4 = X_set.union(Y4_set)  
    for w in rvector4: 
        if w in X_set: l1.append(1) # create a vector 
        else: l1.append(0) 
        if w in Y4_set: l2.append(1) 
        else: l2.append(0) 
    c = 0
  
    # cosine formula  
    for i in range(len(rvector4)): 
            c+= l1[i]*l2[i]
    try:
        cosine4 = c / float((sum(l1)*sum(l2))**0.5)
    except ZeroDivisionError:
        cosine4 = 0
    test.append(cosine4)
    similarity_max = max(float(sub) for sub in test)

    maxpos = test.index(max(test)) 
    if maxpos == 0:
        df1["Similar_label"][k] = df1["Wikidata_Entity1"][k]
    elif maxpos == 1:
        df1["Similar_label"][k] = df1["Wikidata_Entity2"][k]
    elif maxpos == 2:
        df1["Similar_label"][k] = df1["Wikidata_Entity3"][k]
    else:
        df1["Similar_label"][k] = df1["Wikidata_Entity4"][k]
    
    #df1["simi"][k] = test
        #print(test)
    

In [None]:
df1.to_csv('simi1.csv') # save the dataframe to csv

### This code section is used for getting the entity for missing values. this code is used the DBpedia KG lookup and the DBpedia endpoint to get the same entity in Wikidata KG

In [6]:
# DBpedia ENDPOINT refernced from GitHub(2020) ernestojimenezruiz/tabular-data-semantics-py
import time

from SPARQLWrapper import SPARQLWrapper, JSON
import sys


class SPARQLEndpoint(object):
    '''
    classdocs
    '''
    def __init__(self, endpoint_url):
        '''
        Constructor
        '''
        #"http://dbpedia.org/sparql"
        self.sparqlw = SPARQLWrapper(endpoint_url)
        
        self.sparqlw.setReturnFormat(JSON)
        
        
    def getSameEntities(self, ent):
        
        query = self.createSPARQLQuerySameAsEntities(ent)
        
        
        return self.getQueryResultsArityOne(query)
    
    
    
    def getEnglishLabelsForEntity(self, ent):
        
        query = self.createEnglishLabelsForURI(ent)
        
        return self.getQueryResultsArityOne4Literals(query)
    
    
        
    def getEntitiesForType(self, cls, offset=0, limit=1000):
        
        query = self.createSPARQLEntitiesForClass(cls, offset, limit)
        
        #print(query)
        
        return self.getQueryResultsArityOne(query)
    
    
    def getEntitiesLabelsForType(self, cls, offset=0, limit=1000):
        
        query = self.createSPARQLEntitiesLabelsForClass(cls, offset, limit)
        
        #print(query)
        
        #Second element is a string so we do not filter it
        return self.getQueryResultsArityTwo(query, True, False)
    
        
    def getTypesForEntity(self, entity):
        
        query = self.createSPARQLQueryTypesForSubject(entity)
        
        return self.getQueryResultsArityOne(query)
    

    def getAllTypesForEntity(self, entity):
        
        query = self.createSPARQLQueryAllTypesForSubject(entity)
        
        return self.getQueryResultsArityOne(query)
        

    def getEquivalentClasses(self, uri_class):
        
        query = self.createSPARQLQueryEquivalentClasses(uri_class)
        
        return self.getQueryResultsArityOne(query)
    
    def getAllSuperClasses(self, uri_class):
        
        query = self.createSPARQLQueryAllSuperClassesFoClass(uri_class)
        
        return self.getQueryResultsArityOne(query)
    
    
    def getDistanceToAllSuperClasses(self, uri_class):
        
        query = self.createSPARQLQueryDistanceToAllSuperClassesForClass(uri_class)
             
        super2dist = self.getQueryResultsArityTwo(query, False, False)
        
        #Filter top classes   
        for top_cls in URI_KG.avoid_top_concepts:
            super2dist.pop(top_cls, None)
    
    
        return super2dist
    
    
    
    def getAllSubClasses(self, uri_class):
        
        query = self.createSPARQLQueryAllSubClassesFoClass(uri_class)
        
        return self.getQueryResultsArityOne(query)
    
    
    def getDistanceToAllSubClasses(self, uri_class, max_level=-1):
        
        query = self.createSPARQLQueryDistanceToAllSubClassesForClass(uri_class)
             
        sub2dist = self.getQueryResultsArityTwo(query, False, False)
        
        if max_level>0:
            sub2dist_new = sub2dist.copy()
            for scls in sub2dist.keys():
                #Only one element in set
                if int(sorted(sub2dist_new[scls])[0])>max_level:
                    sub2dist_new.pop(scls)
                
            return sub2dist_new
        else:
            return sub2dist
            
        
    
    
    def getPredicatesForSubject(self, subject_entity, limit=1000):
        
        query = self.createSPARQLQueryPredicatesForSubject(subject_entity, limit)
        
        return self.getQueryResultsArityOne(query)
        
    def getPredicatesForObject(self, obj_entity, limit=1000):
        
        query = self.createSPARQLQueryPredicatesForObject(obj_entity, limit)
        
        return self.getQueryResultsArityOne(query)   
    
    
    
    #Exploits the domain types of the properties
    def getTypesUsingPredicatesForSubject(self, subject_entity, limit=1000):
        
        query = self.createSPARQLQueryDomainTypesOfPredicatesForSubject(subject_entity, limit)
        
        return self.getQueryResultsArityOne(query)
        
    #Exploits the range types of the properties
    def getTypesUsingPredicatesForObject(self, obj_entity, limit=1000):
        
        query = self.createSPARQLQueryRangeTypesOfPredicatesForObject(obj_entity, limit)
        
        return self.getQueryResultsArityOne(query)   
    
    
    
    def getTopTypesUsingPredicatesForSubject(self, subject_entity, limit=5):
        
        query = self.createSPARQLQueryDomainTypesCountOfPredicatesForSubject(subject_entity, limit)
        
        return self.getQueryResultsArityOne(query)
        
    #Exploits the range types of the properties
    def getTopTypesUsingPredicatesForObject(self, obj_entity, limit=5):
        
        query = self.createSPARQLQueryRangeTypesCountOfPredicatesForObject(obj_entity, limit)
        
        return self.getQueryResultsArityOne(query)   
    
    
    
    def getTriplesForSubject(self, subject_entity, limit=1000):
        
        query = self.createSPARQLQueryTriplesForSubject(subject_entity, limit)
        
        #print(query)        
        #print(self.getQueryResultsArityTwo(query, False, False))
        
        return self.getQueryResultsArityTwo(query, False, False)
        
        
    def getTriplesForObject(self, obj_entity, limit=1000):
        
        query = self.createSPARQLQueryTriplesForObject(obj_entity, limit)
        
        return self.getQueryResultsArityTwo(query)    
    
    
    def getSomeValuesForPredicate(self, predicate, limit=100):
        
        query = self.createSPARQLQuerySomeValuesForPredicate(predicate, limit)
        
        return self.getQueryResultsArityOne(query)    
        
        
        
    
    
    def getQueryResults(self, query, attempts=5):
        
        try:
            
            self.sparqlw.setQuery(query)
            
            return self.sparqlw.query().convert()
        
        except:
            
            print("Query '%s' failed. Attempts: %s" % (query, str(attempts)))
            time.sleep(60) #to avoid limit of calls, sleep 60s
            attempts-=1
            if attempts>0:
                return self.getQueryResults(query, attempts)
            else:
                return None


    def getQueryResultsArityOne(self, query, filter_uri=True):
        
        
        results = self.getQueryResults(query, 3)
            
            
        result_set = set()
    
        if results==None:
            print("None results for", query)
            return result_set
            
    
        for result in results["results"]["bindings"]:
            #print(result)
            #print(result["uri"]["value"])
            uri_value = result["uri"]["value"]
            
            if not filter_uri or uri_value.startswith(URI_KG.dbpedia_uri) or uri_value.startswith(URI_KG.wikidata_uri) or uri_value.startswith(URI_KG.schema_uri) or uri_value.startswith(URI_KG.dbpedia_uri_resource) or uri_value.startswith(URI_KG.dbpedia_uri_property): 
                result_set.add(uri_value)
        
        
        return result_set
    
    
    
    def getQueryResultsArityOne4Literals(self, query):
        
        
        results = self.getQueryResults(query, 3)
            
            
        result_set = set()
    
        if results==None:
            print("None results for", query)
            return result_set
            
    
        for result in results["results"]["bindings"]:
            
            value = result["literal"]["value"]
            
            result_set.add(value)
        
        
        return result_set
    
    
    
    
    def getQueryResultsArityTwo(self, query, filter_outA=True, filter_outB=True):
        
        #self.sparqlw.setQuery(query)
        #results = self.sparqlw.query().convert()

        results = self.getQueryResults(query, 3)
    
        result_dict = dict()
        
        if results==None:
            print("None results for", query)
            return result_dict
        
    
        for result in results["results"]["bindings"]:
            #print(result)
            #print(result["uri"]["value"])
            outA_value = result["outA"]["value"]
            outB_value = result["outB"]["value"]
            
            
            if not filter_outA or outA_value.startswith(URI_KG.dbpedia_uri) or outA_value.startswith(URI_KG.wikidata_uri) or outA_value.startswith(URI_KG.schema_uri) or outA_value.startswith(URI_KG.dbpedia_uri_resource) or outA_value.startswith(URI_KG.dbpedia_uri_property): 
                
                    if not filter_outB or outB_value.startswith(URI_KG.dbpedia_uri) or outB_value.startswith(URI_KG.wikidata_uri) or outB_value.startswith(URI_KG.schema_uri) or outB_value.startswith(URI_KG.dbpedia_uri_resource) or outB_value.startswith(URI_KG.dbpedia_uri_property):
                        
                        if outA_value not in result_dict:
                            result_dict[outA_value] = set() 
                        
                        result_dict[outA_value].add(outB_value)
                
        
        return result_dict
        
    
    def createSPARQLQueryTriplesForObject(self, obj, limit=1000):
        
        props_to_filter=""
        for p in URI_KG.avoid_predicates:
            props_to_filter+="<" + p + ">," 
        props_to_filter = props_to_filter[0:len(props_to_filter)-1]
        
        #props_to_filter = ",".join(URI_KG.avoid_predicates)        
        
        return "SELECT DISTINCT ?outA ?outB WHERE { ?outA ?outB <" + obj + "> . FILTER( ?outB NOT IN("+ props_to_filter+")) } limit " + str(limit)
        
    def createSPARQLQueryTriplesForSubject(self, subject, limit=1000):
        
        props_to_filter=""
        for p in URI_KG.avoid_predicates:
            props_to_filter+="<" + p + ">,"
            #props_to_filter+= p + ", " 
        props_to_filter = props_to_filter[0:len(props_to_filter)-1]
        
        #props_to_filter = ",".join(URI_KG.avoid_predicates)
        
        return "SELECT DISTINCT ?outA ?outB WHERE { <" + subject + "> ?outA ?outB  FILTER( ?outA NOT IN("+ props_to_filter+")) } limit " + str(limit)
        #return "SELECT DISTINCT ?outA ?outB WHERE { <" + subject + "> ?outA ?outB . } limit " + str(limit)
    
    
    def createSPARQLQueryPredicatesForSubject(self, subject, limit=1000):
        return "SELECT DISTINCT ?uri WHERE { <" + subject + "> ?uri [] . } limit " + str(limit)
    
    def createSPARQLQueryPredicatesForObject(self, obj, limit=1000):
        return "SELECT DISTINCT ?uri WHERE { [] ?uri <" + obj + "> . } limit " + str(limit)
    
    
    def createSPARQLQueryDomainTypesOfPredicatesForSubject(self, subject, limit=1000):
        return "SELECT DISTINCT ?uri WHERE { <" + subject + "> ?p [] . ?p rdfs:domain ?uri . } limit " + str(limit)
    
    def createSPARQLQueryRangeTypesOfPredicatesForObject(self, obj, limit=1000):
        return "SELECT DISTINCT ?uri WHERE { [] ?p <" + obj + "> . ?p rdfs:range ?uri . } limit " + str(limit)
    
    
    def createSPARQLQuerySomeValuesForPredicate(self, predicate, limit=100):
        return "SELECT DISTINCT ?uri WHERE { ?s <" + predicate + "> ?uri . } limit " + str(limit)
    
       
    #SELECT DISTINCT ?outA COUNT(?outA) as ?outB WHERE { [] ?p <http://dbpedia.org/resource/Scotland> . ?p rdfs:range ?outA . } GROUP BY ?outA ORDER BY DESC(?outB) limit 3
    #SELECT DISTINCT ?outA WHERE { [] ?p <http://dbpedia.org/resource/Scotland> . ?p rdfs:range ?outA . } GROUP BY ?outA ORDER BY DESC(COUNT(?outA)) limit 3
    #SELECT DISTINCT ?outA WHERE { <http://dbpedia.org/resource/Allan_Pinkerton> ?p [] . ?p rdfs:domain ?outA . } GROUP BY ?outA ORDER BY DESC(COUNT(?outA)) limit 3
    #SELECT DISTINCT ?outA COUNT(?outA) as ?outB WHERE { <http://dbpedia.org/resource/Allan_Pinkerton> ?p [] . ?p rdfs:domain ?outA . } GROUP BY ?outA ORDER BY DESC(?outB) limit 3 
    
    def createSPARQLQueryDomainTypesCountOfPredicatesForSubject(self, subject, limit=3):
        return "SELECT DISTINCT ?uri WHERE { <" + subject + "> ?p [] . ?p rdfs:domain ?uri . } GROUP BY ?uri HAVING (COUNT(?uri) > 3) ORDER BY DESC(COUNT(?uri)) limit " + str(limit)
    
    def createSPARQLQueryRangeTypesCountOfPredicatesForObject(self, obj, limit=3):
        return "SELECT DISTINCT ?uri WHERE { [] ?p <" + obj + "> . ?p rdfs:range ?uri . } GROUP BY ?uri HAVING (COUNT(?uri) > 3) ORDER BY DESC(COUNT(?uri)) limit " + str(limit)
    
    
    def createEnglishLabelsForURI(self, uri_subject):
        return "SELECT DISTINCT ?literal WHERE { <" + uri_subject + "> rdfs:label ?literal . FILTER( langMatches(lang(?literal), 'en')) }"
    

class DBpediaEndpoint(SPARQLEndpoint):
    '''
    classdocs
    
    '''
    
    def __init__(self):
        '''
        Constructor
        '''
        super().__init__(self.getEndpoint())
        #"http://dbpedia.org/sparql"
       
        
    def getEndpoint(self):
        return "http://dbpedia.org/sparql"

    def getWikiPageRedirect(self, uri_entity):
        
        query = self.createSPARQLQueryWikiPageRedirects(uri_entity)        
        return self.getQueryResultsArityOne(query)
 
    def getWikiPageRedirectFrom(self, uri_entity):
        
        query = self.createSPARQLQueryWikiPageRedirectsFrom(uri_entity)        
        return self.getQueryResultsArityOne(query)
    
    def createSPARQLEntitiesForClass(self, class_uri, offset=0, limit=1000):
            
        return "SELECT DISTINCT ?uri WHERE { ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <" + class_uri + "> . } ORDER BY RAND() OFFSET " + str(offset) + " limit " + str(limit)
        #return "SELECT DISTINCT ?uri WHERE { ?uri a dbo:Country . } ORDER BY RAND() limit " + str(limit)
    
    def createSPARQLEntitiesLabelsForClass(self, class_uri, offset=0, limit=1000):
            
        return "SELECT DISTINCT ?outA ?outB WHERE { ?outA <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <" + class_uri + "> . ?outA rdfs:label ?outB . FILTER( langMatches(lang(?outB), 'en')) } ORDER BY RAND() OFFSET " + str(offset) + " limit " + str(limit)
        #Lang restriction required
        #return "SELECT DISTINCT ?outA ?outB WHERE { ?outA <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <" + class_uri + "> . ?outA rdfs:label ?outB . } ORDER BY RAND() OFFSET " + str(offset) + " limit " + str(limit)
    #def createEnglishLabelsForURI(self, uri_subject):
    #    return "SELECT DISTINCT ?literal WHERE { <" + uri_subject + "> rdfs:label ?literal . FILTER( langMatches(lang(?literal), 'en')) }"

    def createSPARQLQueryTypesForSubject(self, uri_subject):
            
        return "SELECT DISTINCT ?uri WHERE { <" + uri_subject + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?uri . }"
    
    def createSPARQLQueryWikiPageRedirects(self, uri_subject):
            
        return "SELECT DISTINCT ?uri WHERE { <" + uri_subject + "> <http://dbpedia.org/ontology/wikiPageRedirects> ?uri . }"
    
    def createSPARQLQueryWikiPageRedirectsFrom(self, uri_object):
            
        return "SELECT DISTINCT ?uri WHERE { ?uri <http://dbpedia.org/ontology/wikiPageRedirects> <" + uri_object + "> . }"
        
    def createSPARQLQueryAllTypesForSubject(self, uri_subject):
            
        return "SELECT DISTINCT ?uri " \
        + "WHERE {" \
        + "{<" + uri_subject + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?dt . " \
        + "?dt <http://www.w3.org/2000/01/rdf-schema#subClassOf>* ?uri " \
        + "}" \
        + "UNION {" \
        + "<" + uri_subject + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?uri . " \
        + "}" \
        + "UNION {" \
        + "<" + uri_subject + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?dt . " \
        + "?dt <http://www.w3.org/2002/07/owl#equivalentClass> ?uri " \
        + "}" \
        + "}"
        
        
    def createSPARQLQueryEquivalentClasses(self, uri_class):
        return "SELECT DISTINCT ?uri " \
        + "WHERE {" \
        + "{<" + uri_class + "> <http://www.w3.org/2002/07/owl#equivalentClass> ?uri ." \
        + "} " \
        + "UNION " \
        + "{ ?uri <http://www.w3.org/2002/07/owl#equivalentClass> <" + uri_class + "> ." \
        + "} " \
        + "}";
        
        
        
    def createSPARQLQueryDistanceToAllSuperClassesForClass(self, uri_cls):
        return "SELECT  ?outA (count(?mid) as ?outB) " \
        + "WHERE {" \
        + "<"+uri_cls+"> <http://www.w3.org/2000/01/rdf-schema#subClassOf>* ?mid . " \
        + "?mid <http://www.w3.org/2000/01/rdf-schema#subClassOf>+ ?outA . " \
        + "}" \
        + "GROUP BY ?outA";  
        ##+ "values ?uri_subject { <" + uri_subject + "> }" \
        
            
        
    def createSPARQLQueryAllSuperClassesForClass(self, uri_cls):
            
        return "SELECT DISTINCT ?uri " \
        + "WHERE { " \
        + "{<" + uri_cls + "> <http://www.w3.org/2000/01/rdf-schema#subClassOf>* ?uri ." \
        + "}" \
        + "UNION {" \
        + "<" + uri_cls + "> <http://www.w3.org/2002/07/owl#equivalentClass> ?uri ." \
        + "}" \
        + "}"
    
    
    
    def createSPARQLQueryAllSubClassesForClass(self, uri_cls):
        
        return "SELECT DISTINCT ?uri " \
        + "WHERE { " \
        + "{?uri <http://www.w3.org/2000/01/rdf-schema#subClassOf>* <" + uri_cls + "> ." \
        + "}" \
        + "UNION {" \
        + "<" + uri_cls + "> <http://www.w3.org/2002/07/owl#equivalentClass> ?uri ." \
        + "}" \
        + "}"
    
    
    def createSPARQLQueryDistanceToAllSubClassesForClass(self, uri_cls):
        

        return "SELECT  ?outA (count(?mid) as ?outB) " \
        + "WHERE {" \
        + "?mid <http://www.w3.org/2000/01/rdf-schema#subClassOf>* <"+uri_cls+"> . " \
        + "?outA <http://www.w3.org/2000/01/rdf-schema#subClassOf>+ ?mid . " \
        + "}" \
        + "GROUP BY ?outA";  
        ##+ "values ?uri_subject { <" + uri_subject + "> }" \
    
    
        
    def createSPARQLQuerySameAsEntities(self, uri_entity):
        return "SELECT DISTINCT ?uri " \
        + "WHERE {" \
        + "{<" + uri_entity + "> <http://www.w3.org/2002/07/owl#sameAs> ?uri ." \
        + "} " \
        + "UNION {" \
        + "?uri <http://www.w3.org/2002/07/owl#sameAs> <" + uri_entity + "> ." \
        + "} " \
        + "}";
        

        
        
    ##TODO revise
    def createSPARQLQuerySameAsEntities(self, uri_entity):
        return "SELECT DISTINCT ?uri " \
        + "WHERE {" \
        + "{<" + uri_entity + "> <http://www.wikidata.org/prop/direct/P460> ?uri ." \
        + "} " \
        + "UNION {" \
        + "?uri <http://www.wikidata.org/prop/direct/P460> <" + uri_entity + "> ." \
        + "} " \
        + "}";
        
        
        
    def createSPARQLQueryAllSuperClassesForClass(self, uri_cls):
        
        return "SELECT DISTINCT ?uri " \
        + "WHERE {" \
        + "<" + uri_cls + "> <http://www.wikidata.org/prop/direct/P279>+ ?uri " \
        + "}";
    
    
    def createSPARQLQueryDistanceToAllSuperClassesForClass(self, uri_cls):
        

        return "SELECT  ?outA (count(?mid) as ?outB) " \
        + "WHERE {" \
        + "values ?uri_cls { <" + uri_cls + "> }" \
        + "?uri_cls <http://www.wikidata.org/prop/direct/P279>* ?mid ." \
        + "?mid <http://www.wikidata.org/prop/direct/P279>+ ?outA . " \
        + "}" \
        + "GROUP BY ?uri_cls ?outA";  
    
    
    
    
    def createSPARQLQueryAllSubClassesForClass(self, uri_cls):
        
        return "SELECT DISTINCT ?uri " \
        + "WHERE {" \
        + "?uri <http://www.wikidata.org/prop/direct/P279>+ <" + uri_cls + ">" \
        + "}";
    
    
    def createSPARQLQueryDistanceToAllSubClassesForClass(self, uri_cls):
        

        return "SELECT  ?outA (count(?mid) as ?outB) " \
        + "WHERE {" \
        + "values ?uri_cls { <" + uri_cls + "> }" \
        + "?mid <http://www.wikidata.org/prop/direct/P279>* ?uri_cls ." \
        + "?outA <http://www.wikidata.org/prop/direct/P279>+ ?mid . " \
        + "}" \
        + "GROUP BY ?uri_cls ?outA";  
    
    
    


if __name__ == '__main__':
    
    '''
    TODO: Filter by schema.org, dbpedia or wikidata
    '''

    #ent="http://www.wikidata.org/entity/Q470813" #Prim's algorithm
    #ent="http://www.wikidata.org/entity/Q466575" #middle-square method
    #print(ent)
    #ep = WikidataEndpoint()
    #types = ep.getTypesForEntity(ent)
    #print(len(types), types)
    cls = "http://dbpedia.org/ontology/Country"
    #cls = "http://dbpedia.org/ontology/Person"
    
    
    
    ep = DBpediaEndpoint()
    
    # seconds passed since epoch
    init = time.time()
    
    entities=set()
    entities = ep.getEntitiesForType(cls, 0, 100)
    print("Extracted entities: ", len(entities))
    for ent in entities:
        print(ent)
    
    end = time.time()
    

    #local_time = time.ctime(seconds)
    print("Time:", end-init)
    
    
    
    
    
    
    if True:
        sys.exit(0) 
    


    ent = "http://dbpedia.org/resource/Scotland"
    #ent = "http://dbpedia.org/resource/Allan_Pinkerton"
    #ent = 'http://www.wikidata.org/entity/Q22'
    #"http://dbpedia.org/resource/Hern%C3%A1n_Andrade"
    ent="http://dbpedia.org/resource/Chicago_Bulls"
    ep = DBpediaEndpoint()
    types = ep.getTypesForEntity(ent)
    print(len(types), types)
    
    
    sameas = ep.getSameEntities(ent)
    print(len(sameas), sameas)
    
    
    
    labels = ep.getEnglishLabelsForEntity(ent)
    print(len(labels), labels)
    
    
    
    
    cls = "http://dbpedia.org/ontology/BaseballTeam"
    
    print("Domain types")
    types_domain = ep.getTopTypesUsingPredicatesForSubject(ent, 3)
    for t in types_domain:
        print(t)
    
    print("Range types")
    types_range = ep.getTopTypesUsingPredicatesForObject(ent, 3)
    for t in types_range:
        print(t)
    
    
    
    cls = "http://dbpedia.org/ontology/Country"
    #cls = "http://dbpedia.org/ontology/Person"
    #cls = 'http://www.wikidata.org/entity/Q6256'
    
   
    sup2dist = ep.getDistanceToAllSuperClasses(cls)
    print(len(sup2dist), sup2dist)
    
    sub2dist = ep.getDistanceToAllSubClasses(cls)
    print(len(sub2dist), sub2dist)
    
    
    
    ep = WikidataEndpoint()
    #types = ep.getAllTypesForEntity("http://www.wikidata.org/entity/Q22")
    #print(len(types), types)
    
    
    
    #equiv = ep.getEquivalentClasses(cls)
    #print(len(equiv), equiv)
    
    
    #same = ep.getSameEntities(ent)
    #print(len(same), same)
    
    
    gt_cls="http://www.wikidata.org/entity/Q5"
    
    sup2dist = ep.getDistanceToAllSuperClasses(gt_cls)
    print(len(sup2dist), sup2dist)
    
    
    sub2dist = ep.getDistanceToAllSubClasses(gt_cls, 2)
    print(len(sub2dist), sub2dist)
    
    
    
    
    ent="http://www.wikidata.org/entity/Q22"
    ent="http://www.wikidata.org/entity/Q5"
    #ent="http://www.wikidata.org/entity/Q128109"
    labels = ep.getEnglishLabelsForEntity(ent)
    print(len(labels), labels)
    
    
    

Extracted entities:  100
http://dbpedia.org/resource/James_Dunwoody_Bulloch
http://dbpedia.org/resource/Cambodia
http://dbpedia.org/resource/Dapeng_(state)
http://dbpedia.org/resource/Recherla_Nayaks
http://dbpedia.org/resource/Archaic–Early_Basketmaker_Era
http://dbpedia.org/resource/United_Kingdom_of_Portugal,_Brazil_and_the_Algarves
http://dbpedia.org/resource/Stem_duchy
http://dbpedia.org/resource/Batavian_Republic
http://dbpedia.org/resource/Northern_Han
http://dbpedia.org/resource/Fiscal_year
http://dbpedia.org/resource/Saloum
http://dbpedia.org/resource/Taifa_of_Arjona
http://dbpedia.org/resource/Coins_of_the_Italian_lira
http://dbpedia.org/resource/Los_Angeles_in_the_1920s
http://dbpedia.org/resource/German_Federal_Republic
http://dbpedia.org/resource/Religion_in_Western_Ganga_kingdom
http://dbpedia.org/resource/French_West_Africa
http://dbpedia.org/resource/Tusculan_Papacy
http://dbpedia.org/resource/Reign_of_Terror
http://dbpedia.org/resource/Kamboja_(name)
http://dbpedia.org

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
df2 = pd.read_csv("C:\\Users\\01-18-20\\Documents\\City_University\\round3\\CEA_round3_DB_get.csv")

df2

In [None]:
df2["col_Entity"]=''
for i, row in df2.iterrows():
    #print(i)
    ent1 = df2.loc[i]["Wikidata_Entity"]
    print(ent1)
    ep = DBpediaEndpoint()
    sameas = ep.getSameEntities(ent1) # to get the same entity from the wikidata by using DBpedia entity
    df2["col_Entity"][i]=sameas
    print(sameas)


In [None]:
#df2.to_csv('Round3_cea_missing_DB_WIKI.csv')

## This code is used for getting class for the column in CTA task

In [None]:
# Read the CEA result file
df1 = pd.read_csv("C:\\Users\\01-18-20\\Documents\\City_University\\Project_Masters_Thesis_CTA_CPA_CEA\\CTA_TASK\\CEA_END_RESULT_v03.csv")
rslt_df = df1[df1['Column_id'] == 0]
rslt_df

In [None]:
rslt_df["col_Entity"]='' # add the column in the dataframe

In [None]:
# call the function WikidataEndpoint 
for i, row in rslt_df.iterrows():
    #print(i)
    ent1 = rslt_df.loc[i]["Wikidata_Entity"]
    print(ent1)
    ep = WikidataEndpoint()
    types = ep.getTypesForEntity(ent1)
    rslt_df["col_Entity"][i]=types
    print(types)

In [None]:
#rslt_df.to_csv('final.csv') # for getting the output for CTA task