In [1]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse

In [2]:
# Load the six evaluation tables: {no_inf, with_inf} x {all, 099, 04} sameAs subsets.
# The files carry a .tsv extension but are parsed with a single-space separator.
(
    df_no_inf_all_sameas,
    df_no_inf_099_sameas,
    df_no_inf_04_sameas,
    df_with_inf_all_sameas,
    df_with_inf_099_sameas,
    df_with_inf_04_sameas,
) = (
    pd.read_csv(tsv_path, sep=" ")
    for tsv_path in (
        "results/evaluation/no_inf_all_sameas.tsv",
        "results/evaluation/no_inf_099_sameas.tsv",
        "results/evaluation/no_inf_04_sameas.tsv",
        "results/evaluation/with_inf_all_sameas.tsv",
        "results/evaluation/with_inf_099_sameas.tsv",
        "results/evaluation/with_inf_04_sameas.tsv",
    )
)


In [8]:
# Our benchmark consists of 742 concept alignments
# We compare the impact of adding sameAs with and without rdfs:subClassOf inference (no_inf / with_inf)
# We compare the impact of adding only a subset of sameAs links according to certain error degrees (all_sameAs / err <0.99 / err <0.4)

# Sanity check: print the (rows, columns) shape of every evaluation table,
# pairing the no-inference and with-inference variant of each sameAs subset.
for frame_no_inf, frame_with_inf in (
    (df_no_inf_all_sameas, df_with_inf_all_sameas),
    (df_no_inf_099_sameas, df_with_inf_099_sameas),
    (df_no_inf_04_sameas, df_with_inf_04_sameas),
):
    print(frame_no_inf.shape)
    print(frame_with_inf.shape)

(742, 17)
(742, 17)
(742, 17)
(742, 17)
(742, 17)
(742, 17)


In [10]:
# Example of one row containing 17 columns
# This is the evaluation of adding all sameAs links, without rdfs:subClassOf inference, on the alignment between foaf:Person and dbo:Person (http://dbpedia.org/ontology/Person)
# In this case adding sameAs links decreases the Jaccard index by about 33% (from 0.0473 to 0.0314)
# We notice that this is due to the large decrease of the intersection when sameAs is considered (inter_no_sameas / inter_with_sameas)

df_no_inf_all_sameas.head(1)

Unnamed: 0,row_id,concept1,concept1_nbr_inst,concept1_nbr_unique_instances,concept2,concept2_nbr_inst,concept2_nbr_unique_instances,inter_no_sameas,inter_with_sameas,union_no_sameas,union_with_sameas,jacc_no_sameas,jacc_with_sameas,isSubset_LOD,isSubset_no_sameAs,isSubset_with_sameas,diff_with_sameas
0,1,http://xmlns.com/foaf/0.1/Person,132919327,130504747,http://dbpedia.org/ontology/Person,6568866,4361741,6298105,4103666,133190088,130762822,0.0473,0.0314,False,False,False,-33.62


In [15]:
# Alignments whose Jaccard index drops once sameAs links are added,
# ordered from the largest drop to the smallest.

is_decrease = df_with_inf_all_sameas["diff_with_sameas"] < 0
df_jaccard_decrease = df_with_inf_all_sameas.loc[is_decrease].sort_values(by=['diff_with_sameas'], ascending=True)
print("Jaccard decreases in", len(df_jaccard_decrease), "cases")
df_jaccard_decrease.head(5)



Jaccard decreases in 25 cases


Unnamed: 0,row_id,concept1,concept1_nbr_inst,concept1_nbr_unique_instances,concept2,concept2_nbr_inst,concept2_nbr_unique_instances,inter_no_sameas,inter_with_sameas,union_no_sameas,union_with_sameas,jacc_no_sameas,jacc_with_sameas,isSubset_LOD,isSubset_no_sameAs,isSubset_with_sameas,diff_with_sameas
151,152,http://www4.wiwiss.fu-berlin.de/drugbank/vocab...,9335,4985,http://www4.wiwiss.fu-berlin.de/dailymed/resou...,4308,1222,4308,1222,9335,4985,0.4615,0.2451,False,True,True,-46.89
9,10,http://xmlns.com/foaf/0.1/Person,133365843,130889693,http://dbpedia.org/ontology/Person,6572380,4364980,6298218,4103760,133640005,131150913,0.0471,0.0313,False,False,False,-33.55
14,15,http://xmlns.com/foaf/0.1/Person,133365843,130889693,http://schema.org/Person,6287617,4098156,6287611,4098150,133365849,130889699,0.0471,0.0313,False,False,False,-33.55
28,29,http://schema.org/Place,14818944,12440025,http://dbpedia.org/ontology/Place,4613844,2596723,4481791,2525556,14950997,12511192,0.2998,0.2019,False,False,False,-32.66
24,25,http://purl.org/ontology/bibo/Book,27149348,26481591,http://dbpedia.org/ontology/Book,269350,194910,262001,189698,27156697,26486803,0.0096,0.0072,False,False,False,-25.0


In [17]:
def jaccardImprovement(my_df, from_zero_marker=9999):
    """Count how the Jaccard index reacts to adding sameAs links.

    Every row of ``my_df`` is classified by its ``diff_with_sameas`` value:

    * index 0 -- value equals ``from_zero_marker``: Jaccard went from 0 to a
      positive value after sameAs,
    * index 1 -- value is positive: better Jaccard,
    * index 2 -- value is zero: no change,
    * index 3 -- anything else: worse Jaccard. Missing (NaN) values also end
      up here, exactly as in the former row-by-row implementation.

    A one-line summary with the total and the four percentages (rounded to
    two decimals) is printed as a side effect.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Evaluation table; must contain a ``diff_with_sameas`` column unless it
        has no rows.
    from_zero_marker : number, optional
        Sentinel stored in ``diff_with_sameas`` for the "from 0 to positive"
        case. Defaults to 9999.

    Returns
    -------
    list of int
        ``[from_zero, better, unchanged, worse]`` counts.
    """
    total_rows = my_df.shape[0]

    if total_rows:
        diff = my_df['diff_with_sameas']
        # The sentinel is tested first so that it is never counted as a
        # plain improvement (or as "unchanged" if a caller picks 0).
        is_from_zero = diff == from_zero_marker
        is_better = (diff > 0) & ~is_from_zero
        is_unchanged = (diff == 0) & ~is_from_zero

        from_zero_count = int(is_from_zero.sum())
        better_count = int(is_better.sum())
        unchanged_count = int(is_unchanged.sum())
        # "Worse" is the remainder, which keeps the four counts summing to
        # total_rows even when some values are NaN.
        worse_count = total_rows - from_zero_count - better_count - unchanged_count
        results = [from_zero_count, better_count, unchanged_count, worse_count]
    else:
        # An empty table used to raise ZeroDivisionError in the summary line.
        results = [0, 0, 0, 0]

    def percentage(count):
        # Same two-decimal rounding as before; 0.0 when there are no rows.
        if not total_rows:
            return 0.0
        return float("{:.2f}".format(count * 100 / total_rows))

    print("Total:", total_rows, "| Results:", percentage(results[0]), "%,", percentage(results[1]), "%,", percentage(results[2]), "%,", percentage(results[3]), "%")
    return results

In [20]:
# For instance, when all sameAs links are considered and rdfs:subClassOf subsumption is not considered:
# Jaccard increases from 0 to a positive value in 322 cases
# Jaccard increases from a positive value to a higher positive value in 39 cases
# Jaccard does not change in 356 cases
# Jaccard decreases in 25 cases

# Print the outcome summary of each sameAs subset, first without and then with
# rdfs:subClassOf inference; a blank line separates consecutive subsets.
for position, (heading, frame_no_inf, frame_with_inf) in enumerate((
    ("==== ALL SAMEAS ====", df_no_inf_all_sameas, df_with_inf_all_sameas),
    ("==== SAMEAS <0.99 ====", df_no_inf_099_sameas, df_with_inf_099_sameas),
    ("==== SAMEAS <0.4 ====", df_no_inf_04_sameas, df_with_inf_04_sameas),
)):
    if position > 0:
        print("")
    print(heading)
    print(jaccardImprovement(frame_no_inf))
    print(jaccardImprovement(frame_with_inf))

==== ALL SAMEAS ====
Total: 742 | Results: 43.4 %, 5.26 %, 47.98 %, 3.37 %
[322, 39, 356, 25]
Total: 742 | Results: 45.28 %, 6.06 %, 45.28 %, 3.37 %
[336, 45, 336, 25]

==== SAMEAS <0.99 ====
Total: 742 | Results: 42.86 %, 5.26 %, 48.52 %, 3.37 %
[318, 39, 360, 25]
Total: 742 | Results: 44.74 %, 5.93 %, 45.96 %, 3.37 %
[332, 44, 341, 25]

==== SAMEAS <0.4 ====
Total: 742 | Results: 8.76 %, 2.56 %, 83.42 %, 5.26 %
[65, 19, 619, 39]
Total: 742 | Results: 10.11 %, 3.1 %, 81.54 %, 5.26 %
[75, 23, 605, 39]


In [21]:
def generateBins(my_df, my_attribute, bin_edges=(-1, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)):
    """Count the rows of ``my_df`` per interval of ``my_attribute``.

    The column is cut into right-closed intervals delimited by ``bin_edges``
    (with the default edges the first interval, ``(-1, 0]``, collects the
    values equal to 0). Values outside the edges are assigned to no interval
    and are therefore not counted.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Table containing ``my_attribute`` and a ``row_id`` column.
    my_attribute : str
        Name of the numeric column to bin.
    bin_edges : sequence of numbers, optional
        Increasing interval boundaries passed to ``pandas.cut``. Defaults to
        ``-1, 0, 0.1, ..., 0.9, 1``.

    Returns
    -------
    pandas.DataFrame
        One row per interval (empty intervals included, with a count of 0)
        and a single ``count`` column holding the number of non-null
        ``row_id`` values.
    """
    bins = pd.cut(my_df[my_attribute], list(bin_edges))
    # observed=False is spelled out so that empty intervals stay in the table
    # regardless of the pandas default for categorical groupers.
    return my_df.groupby(bins, observed=False)['row_id'].agg(['count'])

In [22]:
generateBins(df_no_inf_all_sameas, 'jacc_no_sameas')

Unnamed: 0_level_0,count
jacc_no_sameas,Unnamed: 1_level_1
"(-1.0, 0.0]",655
"(0.0, 0.1]",17
"(0.1, 0.2]",6
"(0.2, 0.3]",5
"(0.3, 0.4]",1
"(0.4, 0.5]",5
"(0.5, 0.6]",1
"(0.6, 0.7]",3
"(0.7, 0.8]",0
"(0.8, 0.9]",4


In [23]:
generateBins(df_no_inf_all_sameas, 'jacc_with_sameas')

Unnamed: 0_level_0,count
jacc_with_sameas,Unnamed: 1_level_1
"(-1.0, 0.0]",333
"(0.0, 0.1]",113
"(0.1, 0.2]",63
"(0.2, 0.3]",40
"(0.3, 0.4]",29
"(0.4, 0.5]",38
"(0.5, 0.6]",7
"(0.6, 0.7]",9
"(0.7, 0.8]",13
"(0.8, 0.9]",9


In [24]:
generateBins(df_with_inf_all_sameas, 'jacc_no_sameas')

Unnamed: 0_level_0,count
jacc_no_sameas,Unnamed: 1_level_1
"(-1.0, 0.0]",645
"(0.0, 0.1]",21
"(0.1, 0.2]",5
"(0.2, 0.3]",6
"(0.3, 0.4]",2
"(0.4, 0.5]",4
"(0.5, 0.6]",2
"(0.6, 0.7]",3
"(0.7, 0.8]",3
"(0.8, 0.9]",4


In [25]:
generateBins(df_with_inf_all_sameas, 'jacc_with_sameas')

Unnamed: 0_level_0,count
jacc_with_sameas,Unnamed: 1_level_1
"(-1.0, 0.0]",309
"(0.0, 0.1]",147
"(0.1, 0.2]",59
"(0.2, 0.3]",38
"(0.3, 0.4]",27
"(0.4, 0.5]",31
"(0.5, 0.6]",6
"(0.6, 0.7]",10
"(0.7, 0.8]",15
"(0.8, 0.9]",10


In [26]:
def checkDifferenceSets(df_1, df_2):
    """Return the concepts used in ``df_1`` that never occur in ``df_2``.

    The concepts of a table are all values found in its ``concept1`` and
    ``concept2`` columns.

    Parameters
    ----------
    df_1, df_2 : pandas.DataFrame
        Tables with ``concept1`` and ``concept2`` columns.

    Returns
    -------
    set
        Concepts of ``df_1`` minus concepts of ``df_2``.
    """
    def concepts_of(frame):
        # A table without rows contributes nothing, even if the concept
        # columns are absent (the former row loop simply did not iterate).
        if len(frame) == 0:
            return set()
        # Build the set straight from the two columns instead of walking the
        # table row by row with iterrows().
        return set(frame['concept1']).union(frame['concept2'])

    return concepts_of(df_1).difference(concepts_of(df_2))

In [27]:
def getNameSpace(url):
    """Return the ``scheme://netloc/`` prefix of ``url``.

    Only the scheme and network location reported by ``urlparse`` are kept;
    path, query and fragment are dropped and a trailing slash is appended.
    """
    parts = urlparse(url)
    return "{}://{}/".format(parts.scheme, parts.netloc)

In [28]:
def checkRelationWithNamespace(my_df):
    """Break the sameAs outcome counts down by pair of namespaces.

    For every row, the namespaces of ``concept1`` and ``concept2`` (as
    returned by ``getNameSpace``) are joined, smaller string first, into a
    ``"<ns_a><--><ns_b>"`` key so that both directions of an alignment share
    one entry. The row's ``diff_with_sameas`` value then selects which of the
    four counters of that entry is incremented:

    * index 0 -- value equals 9999: from 0 to positive Jaccard after sameAs,
    * index 1 -- value is positive: better Jaccard,
    * index 2 -- value is zero: no change,
    * index 3 -- anything else: worse Jaccard.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Table with ``concept1``, ``concept2`` and ``diff_with_sameas`` columns.

    Returns
    -------
    dict
        Maps each namespace-pair key to its ``[from_zero, better, unchanged,
        worse]`` list of counts, in order of first appearance.
    """
    mappings_dict = {}
    for index, row in my_df.iterrows():
        namespace1 = getNameSpace(row['concept1'])
        namespace2 = getNameSpace(row['concept2'])
        # Order the pair so that A<-->B and B<-->A collapse into one key.
        mapping = "<-->".join(sorted((namespace1, namespace2)))

        diff = row['diff_with_sameas']
        if diff == 9999:
            bucket = 0
        elif diff > 0:
            bucket = 1
        elif diff == 0:
            bucket = 2
        else:
            bucket = 3

        # Create the counters on first sight of the pair, then bump one.
        mappings_dict.setdefault(mapping, [0, 0, 0, 0])[bucket] += 1
    return mappings_dict

In [29]:
# Outcome counts per namespace pair for the "all sameAs, with inference" evaluation table.
results = checkRelationWithNamespace(df_with_inf_all_sameas)

In [70]:
# One line per entry of `results`: "<key> : <value>".
for namespace_pair, outcome_counts in results.items():
    print(namespace_pair, ":", outcome_counts)

http://www.cyc.com/<-->http://www.w3.org/ : [0, 0, 1, 0]
http://www.cyc.com/<-->http://xmlns.com/ : [0, 0, 1, 0]
http://purl.org/<-->http://xmlns.com/ : [0, 1, 4, 1]
http://web.resource.org/<-->http://xmlns.com/ : [0, 0, 1, 0]
http://rdfs.org/<-->http://www.w3.org/ : [0, 0, 1, 0]
http://www.w3.org/<-->http://xmlns.com/ : [0, 0, 1, 1]
http://www.aktors.org/<-->http://xmlns.com/ : [0, 0, 3, 0]
http://bblfish.net/<-->http://xmlns.com/ : [0, 0, 1, 0]
http://dbpedia.org/<-->http://xmlns.com/ : [0, 0, 0, 1]
http://www.mindswap.org/<-->http://xmlns.com/ : [0, 0, 2, 0]
http://annotation.semanticweb.org/<-->http://xmlns.com/ : [0, 0, 1, 0]
http://umbel.org/<-->http://xmlns.com/ : [0, 1, 1, 0]
http://schema.org/<-->http://xmlns.com/ : [0, 1, 0, 1]
http://mycampus.cs.cmu.edu/<-->http://xmlns.com/ : [0, 0, 1, 0]
http://pervasive.semanticweb.org/<-->http://xmlns.com/ : [0, 0, 1, 0]
http://swrc.ontoware.org/<-->http://xmlns.com/ : [0, 0, 3, 0]
http://www.daml.org/<-->http://xmlns.com/ : [0, 0, 2, 0]

In [31]:
# bins = pd.cut(df_with_inf_all_sameas['jacc_no_sameas'], [-1, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
# df_jacc1 = df_with_inf_all_sameas.groupby('jacc_no_sameas').count()
# df_jacc1 = df_jacc1.sort_values(by='jacc_no_sameas', ascending=False)
# df_jacc1
# #df_jacc1['row_id'].to_csv('jaccard-distribution/with_inf_jaccard_diff.tsv', sep='\t')

In [32]:
# bins = pd.cut(df_no_inf_all_sameas['jacc_with_sameas'], [-1, 0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0])
# df_no_inf_all_sameas.groupby(bins)['jacc_with_sameas'].agg(['count']).to_csv('jaccard-distribution/jacc_no_inf_with_sameas.tsv', sep='\t')



In [33]:
# bins = pd.cut(df_with_inf_all_sameas['jacc_with_sameas'], [-1, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
# df_with_inf_all_sameas.groupby(bins)['jacc_with_sameas'].agg(['count']).to_csv('jaccard-distribution/jacc_with_inf_with_sameas.tsv', sep='\t')

