In [2]:
#!pip install sentence-transformers

In [3]:
import pandas as pd
import gensim
import numpy as np
import random
import json
import os

# Bert Sentence transformer

# Generate word2vec embedding using dbpedia taxonomy

In [159]:
os.chdir('../')

In [160]:
!pwd

/home/jovyan/work/iswc2020-smarttask


In [5]:
from src import evaluate

In [6]:
# training data frm the challenge
dbpedia_df = pd.read_json('datasets/DBpedia/smarttask_dbpedia_train.json')

Generation of DBPedia hierarchy embeddings

In [7]:
#!git clone https://github.com/cnikas/isl-smart-task

In [8]:

import csv
import json


"""Load models &  resources"""
resources_dir = 'isl-smart-task/resources_dir' 
mapping_csv = resources_dir+'/mapping.csv'
hierarchy_json = resources_dir+'/dbpedia_hierarchy.json'


id_to_label = {}
label_to_id = {}
with open(mapping_csv) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        id_to_label[row[1]] = row[0]
        label_to_id[row[0]] = row[1]


hierarchy = {}
with open(hierarchy_json) as json_file:
    hierarchy = json.load(json_file)


In [9]:
# import DBPedia class hierarchy
# flatten the hierarchy to sequences
def import_db_hry(hierarchy):
    #grouped= concepts_df.groupby('Parent')
    sequence = []
    for name in hierarchy:
        sequence.append(hierarchy[name]['children'] + [name])
    return sequence

In [10]:
sequence = import_db_hry(hierarchy)
#sequence =[]

In [11]:
# also added type order from the training set
dbpedia_res_df = dbpedia_df[dbpedia_df.category == 'resource']
type_list = dbpedia_res_df.type.values

In [12]:
type_list

array([list(['dbo:Opera', 'dbo:MusicalWork', 'dbo:Work']),
       list(['dbo:EducationalInstitution', 'dbo:Organisation', 'dbo:Agent']),
       list(['dbo:State', 'dbo:PopulatedPlace', 'dbo:Place', 'dbo:Location']),
       ...,
       list(['dbo:University', 'dbo:EducationalInstitution', 'dbo:Organisation', 'dbo:Agent']),
       list(['dbo:Person', 'dbo:Agent']),
       list(['dbo:River', 'dbo:Stream', 'dbo:BodyOfWater', 'dbo:NaturalPlace', 'dbo:Place', 'dbo:Location'])],
      dtype=object)

In [13]:
for s in type_list.tolist():
    sequence.append(s)

In [14]:
# train Word2Vec model for DBPedia class hierarchy
model= gensim.models.Word2Vec(size=100,min_count=0, window=5, sg=1, iter=5,cbow_mean=1)
model.build_vocab(sequence)

In [15]:
model.train(sequence, total_examples=model.corpus_count, epochs=5)

(63441, 153815)

In [16]:
model.wv.most_similar('dbo:Person',topn=5)

[('dbo:OfficeHolder', 0.9819596409797668),
 ('dbo:SportsTeam', 0.9717636108398438),
 ('dbo:Company', 0.9704587459564209),
 ('dbo:RecordLabel', 0.9689014554023743),
 ('dbo:Band', 0.9679181575775146)]

In [17]:
model.wv.save_word2vec_format('datasets/output/type_embedding.txt')

# Pre-processing on dataframe

In [18]:
dbpedia_df.head()

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."


In [19]:
test_df= pd.read_json('datasets/DBpedia/smarttask_dbpedia_test.json')

In [20]:
answer_types = {}
for answers in dbpedia_df.type:
    for a in answers:
        if a in answer_types:
            answer_types[a]+=1
        else:
            answer_types[a]=1

def types_to_freqtype(type_list):
    for t in sorted(sorted_types, key=lambda item: item[1]):
        if sorted_types[t] < 1000: 
          if t in type_list:
              return t

    for t in sorted(sorted_types, key=lambda item: item[1]):
          if t in type_list:
              return t
    

def types_to_specific_type(type_list):
    if len(type_list) == 0:
      return None 
    return type_list[0]


def types_to_top_type(type_list):
    if len(type_list) == 0:
      return None 
    return type_list[-1]
            
answ_df =pd.DataFrame.from_dict(answer_types, orient='index', columns=['Freq'])
sorted_types = answ_df.sort_values(by='Freq', ascending=False).to_dict()['Freq']

dbpedia_df['frequent_type'] =dbpedia_df.type.apply(types_to_freqtype)
dbpedia_df['bottom_level_type'] =dbpedia_df.type.apply(types_to_specific_type)
dbpedia_df['top_level_type'] =dbpedia_df.type.apply(types_to_top_type)

In [21]:
dbpedia_df.head()

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean],boolean,boolean,boolean
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]",dbo:Work,dbo:Opera,dbo:Work
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date],date,date,date
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean],boolean,boolean,boolean
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,...",dbo:EducationalInstitution,dbo:EducationalInstitution,dbo:Agent


In [22]:
test_df['frequent_type'] =test_df.type.apply(types_to_freqtype)
test_df['bottom_level_type'] =test_df.type.apply(types_to_specific_type)
test_df['top_level_type'] =test_df.type.apply(types_to_top_type)

In [23]:
# cleaning DBpedia dataset

dbpedia_df= dbpedia_df[dbpedia_df.category.notna()]
test_df= test_df[test_df.category.notna()]

dbpedia_df= dbpedia_df[dbpedia_df['frequent_type'].notna()]
test_df= test_df[test_df['frequent_type'].notna()]

dbpedia_df.dropna( subset=['question'], inplace=True)
test_df.dropna( subset=['question'], inplace=True)

In [24]:
dbpedia_df.head()

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean],boolean,boolean,boolean
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]",dbo:Work,dbo:Opera,dbo:Work
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date],date,date,date
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean],boolean,boolean,boolean
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,...",dbo:EducationalInstitution,dbo:EducationalInstitution,dbo:Agent


In [25]:
# only choose sample with 'resource' category
dbpedia_res_df = dbpedia_df[dbpedia_df.category == 'resource']
test_res_df = test_df[test_df.category == 'resource']

In [26]:
from gensim.models import KeyedVectors

In [27]:
sbert_vectors = KeyedVectors.load_word2vec_format('datasets/embedding/sbert_w2v.txt', binary=False)


In [28]:
sbert_vectors['dbpedia_3681']

array([-0.9934367 ,  1.743141  , -0.11504482, ..., -0.00684955,
       -0.8644674 , -0.7142398 ], dtype=float32)

In [29]:
dbpedia_res_df.head()

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]",dbo:Work,dbo:Opera,dbo:Work
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,...",dbo:EducationalInstitution,dbo:EducationalInstitution,dbo:Agent
6,dbpedia_12020,What is the federated state located in the Wei...,resource,"[dbo:State, dbo:PopulatedPlace, dbo:Place, dbo...",dbo:State,dbo:State,dbo:Location
9,dbpedia_10315,What are the opera which start with the letter z,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]",dbo:Work,dbo:Opera,dbo:Work
11,dbpedia_1335,Which is the state and country of the Watergat...,resource,"[dbo:Country, dbo:State, dbo:PopulatedPlace, d...",dbo:Country,dbo:Country,dbo:Location


In [30]:
top_level_concepts = dbpedia_res_df.top_level_type.unique()

In [31]:
len(top_level_concepts)

43

In [32]:
# only generates one positive sample but potentilly we can use all types except the top type
# for the positive samples 
# generate negative samples based on types and top-level type
#   a
#  b        c
# d  e     f   g

# positive: (a, b), (b, d), (a, d)
# negatives: (a, e), (a, c), (a,f) , (a, g)
def negative_sample(row):
    types = list(row['type']) 
    gen_type =  row['top_level_type']
  
    children=[]
    for t in types:
        if t in hierarchy:
            children.extend( hierarchy[t]['children'] ) 

    canditates = []
    for t in types:
        for c in children:
            if c not in canditates and t in hierarchy and c in hierarchy:
                if hierarchy[t]['level'] == hierarchy[c]['level']:
                    #print (t,c)
                    canditates.append(c)
                    break

    if len(canditates) >= len(types):
        neg = random.choices(canditates, k=len(types)) 
    else:
        n_add = len(types)- len(canditates)
        generic_tyeps = list(top_level_concepts)
        generic_tyeps.remove(gen_type)
        additonal_types = random.choices(generic_tyeps , k=n_add)
        canditates.extend(additonal_types )
        #canditates.remove(gen_type)
        neg= canditates
    return neg
  

In [33]:
#hierarchy['dbo:Agent']

In [34]:
row= dbpedia_res_df.loc[9978]
row, negative_sample(row)

(id                                                        dbpedia_7821
 question             Which is the city and state for the twinned ad...
 category                                                      resource
 type                 [dbo:City, dbo:State, dbo:Settlement, dbo:Popu...
 frequent_type                                           dbo:Settlement
 bottom_level_type                                             dbo:City
 top_level_type                                            dbo:Location
 Name: 9978, dtype: object,
 ['dbo:HistoricalSettlement',
  'dbo:Department',
  'dbo:Street',
  'dbo:SkiResort',
  'dbo:CelestialBody',
  'dbo:Flag'])

In [35]:
#len(row['type'])

In [36]:
dbpedia_res_df['neg_spec_types']= dbpedia_res_df.apply(negative_sample,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
dbpedia_res_df[dbpedia_res_df.id=='dbpedia_7821']

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type,neg_spec_types
9978,dbpedia_7821,Which is the city and state for the twinned ad...,resource,"[dbo:City, dbo:State, dbo:Settlement, dbo:Popu...",dbo:Settlement,dbo:City,dbo:Location,"[dbo:HistoricalSettlement, dbo:Department, dbo..."


In [38]:
top_level_concepts

array(['dbo:Work', 'dbo:Agent', 'dbo:Location', 'dbo:TopicalConcept',
       'dbo:Name', 'dbo:EthnicGroup', 'dbo:ChemicalSubstance',
       'dbo:Award', 'dbo:Species', 'dbo:Activity', 'dbo:PersonFunction',
       'dbo:Biomolecule', 'dbo:Event', 'dbo:PoliticalParty',
       'dbo:Disease', 'dbo:MeanOfTransportation', 'dbo:Media',
       'dbo:UnitOfWork', 'dbo:Device', 'dbo:Language',
       'dbo:AnatomicalStructure', 'dbo:MedicalSpecialty', 'dbo:Currency',
       'dbo:OfficeHolder', 'dbo:MusicalArtist', 'dbo:HorseRace',
       'dbo:Country', 'dbo:BaseballTeam', 'dbo:Food', 'dbo:City',
       'dbo:River', 'dbo:SoccerPlayer', 'dbo:TimePeriod',
       'dbo:MountainRange', 'dbo:Holiday', 'dbo:PublicService',
       'dbo:University', 'dbo:Museum', 'dbo:SoccerClub',
       'dbo:CollegeCoach', 'dbo:Flag', 'dbo:Person', 'dbo:RecordLabel'],
      dtype=object)

In [39]:
# typearray = np.asanyarray( dbpedia_res_df.neg_spec_types.values)

In [40]:
from sklearn.model_selection import train_test_split

In [185]:
train_res_df,valid_res_df = train_test_split(dbpedia_res_df, test_size=0.1, random_state=42)

In [186]:
test_res_df.head()

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type,pred_types
6,dbpedia_22599,Where did the war take place where one of the ...,resource,"[dbo:Country, dbo:PopulatedPlace, dbo:Place]",dbo:Country,dbo:Country,dbo:Place,"[dbo:Settlement, dbo:State, dbo:City, dbo:Popu..."
9,dbpedia_19677,Which mountains are contained in Inyo National...,resource,"[dbo:Mountain, dbo:NaturalPlace, dbo:Place]",dbo:NaturalPlace,dbo:Mountain,dbo:Place,"[dbo:RouteOfTransportation, dbo:Mountain, dbo:..."
11,dbpedia_11163,What is the seat of Frankfurter Allgemeine Zei...,resource,"[dbo:City, dbo:Settlement, dbo:PopulatedPlace,...",dbo:Settlement,dbo:City,dbo:Place,"[dbo:Settlement, dbo:City, dbo:State, dbo:Popu..."
16,dbpedia_18792,What did the people buried in Toronto die of?,resource,[dbo:Disease],dbo:Disease,dbo:Disease,dbo:Disease,"[dbo:Settlement, dbo:State, dbo:PopulatedPlace..."
19,dbpedia_11251,What is the league of major league lacrosse?,resource,"[dbo:SportsTeam, dbo:Organisation, dbo:Agent]",dbo:SportsTeam,dbo:SportsTeam,dbo:Agent,"[dbo:Settlement, dbo:State, dbo:City, dbo:Popu..."


In [187]:
# combine embeddings for tokens of question, general and spcific answer type 
def getCombinedEmbedding(db_id, gen_type, specific_type):
    #avg_emb = np.add(embedding(gen_type), embedding(specific_type))  / 2.0
    avg_emb= embedding(specific_type)
    ques_vec = sbert_vectors[db_id]
    return np.concatenate([avg_emb, ques_vec], axis=None)

def embedding(entity):
  #print ('entity',entity)
    return model.wv[entity]


In [188]:
def createCombinedEmbedding(row):
    return getCombinedEmbedding(row['id'], row['generic_type'],row['specific_type'])

def createNegCombinedEmbedding(row, i):
    #print (row['general_type'], row['neg_spec_types'])
    return getCombinedEmbedding(row['id'], row['generic_type'],row['neg_spec_types'] )

In [189]:
def example_generation(train_df):
  train = []
  for i, row in train_df.iterrows():
    types = list(row['type'])

    for i in range(len(types)):
      train.append([row['id'], row['top_level_type'], types[i], 1])
      train.append([row['id'], row['top_level_type'], row['neg_spec_types'][i], 0 ])
  
  return pd.DataFrame(train, columns=['id','generic_type','specific_type','class'])

In [236]:
train_resampled= example_generation(train_res_df)

In [237]:
train_resampled.head(20)

Unnamed: 0,id,generic_type,specific_type,class
0,dbpedia_22967,dbo:Work,dbo:TelevisionShow,1
1,dbpedia_22967,dbo:Work,dbo:Website,0
2,dbpedia_22967,dbo:Work,dbo:Work,1
3,dbpedia_22967,dbo:Work,dbo:PersonFunction,0
4,dbpedia_19382,dbo:Agent,dbo:OfficeHolder,1
5,dbpedia_19382,dbo:Agent,dbo:Architect,0
6,dbpedia_19382,dbo:Agent,dbo:Person,1
7,dbpedia_19382,dbo:Agent,dbo:Family,0
8,dbpedia_19382,dbo:Agent,dbo:Agent,1
9,dbpedia_19382,dbo:Agent,dbo:Food,0


In [238]:
#train_df = train_df[train_df.specific_type != 'owl:Thing']

In [239]:
valid_resampled= example_generation(valid_res_df)

In [240]:
train_resampled['class'].value_counts()

1    24949
0    24949
Name: class, dtype: int64

In [241]:
valid_resampled['class'].value_counts()

1    2862
0    2862
Name: class, dtype: int64

In [242]:
#train_resampled.to_csv('datasets/output/dbpedia_train_resampled.csv', index=False)

In [243]:
#valid_resampled.to_csv('datasets/output/dbpedia_valid_resampled.csv', index=False)

In [244]:
row = valid_resampled.iloc[100]
print (row)
concat_vec = createCombinedEmbedding(row)

id               dbpedia_11008
generic_type         dbo:Agent
specific_type       dbo:Person
class                        1
Name: 100, dtype: object


In [245]:
concat_vec

array([ 3.1876564e-03,  9.3735807e-04, -4.6780482e-02, ...,
       -8.9567590e-01, -1.5002191e+00,  1.1927358e+00], dtype=float32)

In [246]:

#indicies = np.arange(train_resampled.shape[0])
#np.random.shuffle(indicies)
#train_df_sample=train_resampled.iloc[indicies]

#indicies = np.arange(valid_resampled.shape[0])
#np.random.shuffle(indicies)
#valid_df_sample=valid_resampled.iloc[indicies]

In [247]:
def vectorize(train_df, valid_df, vec_size):
    train_X = train_df.apply(createCombinedEmbedding, axis=1)
    train_y = train_df['class'].values
    
    valid_X = valid_df.apply(createCombinedEmbedding, axis=1)
    valid_y = valid_df['class'].values

    return np.stack(train_X.values), train_y, np.stack(valid_X.values), valid_y

In [248]:
len(train_resampled), len(valid_resampled)

(49898, 5724)

In [249]:
train_resampled.specific_type.value_counts()

dbo:Agent            3847
dbo:Family           3741
dbo:Person           2634
dbo:Location         2164
dbo:CelestialBody    2073
                     ... 
dbo:Mollusca            1
dbo:Dancer              1
dbo:HotSpring           1
dbo:Wrestler            1
dbo:Tax                 1
Name: specific_type, Length: 344, dtype: int64

In [250]:
valid_resampled.specific_type.value_counts()

dbo:Agent            447
dbo:Family           437
dbo:Person           319
dbo:Location         263
dbo:CelestialBody    253
                    ... 
dbo:Bank               1
dbo:Gnetophytes        1
dbo:RugbyPlayer        1
dbo:Comedian           1
dbo:Philosopher        1
Name: specific_type, Length: 227, dtype: int64

In [60]:
from imblearn.under_sampling import RandomUnderSampler

In [251]:

rus = RandomUnderSampler(sampling_strategy='all')

In [252]:
train_resampled_under, _= rus.fit_resample(train_resampled, train_resampled['id'])

In [257]:
train_resampled_under.id.value_counts()

dbpedia_17503    2
dbpedia_833      2
dbpedia_16602    2
dbpedia_7172     2
dbpedia_13381    2
                ..
dbpedia_17151    2
dbpedia_9595     2
dbpedia_11121    2
dbpedia_7442     2
dbpedia_22613    2
Name: id, Length: 8601, dtype: int64

In [253]:
train_resampled_under.specific_type.value_counts()

dbo:Agent                     1580
dbo:Family                    1552
dbo:Person                    1179
dbo:Location                   518
dbo:Work                       489
                              ... 
dbo:CanadianFootballPlayer       1
dbo:Locomotive                   1
dbo:Prison                       1
dbo:Tax                          1
dbo:WaterTower                   1
Name: specific_type, Length: 269, dtype: int64

In [255]:
train_resampled_under['class'].value_counts()

0    8758
1    8444
Name: class, dtype: int64

In [258]:
train_resampled = train_resampled_under

In [259]:
#valid_resampled, _= rus.fit_resample(valid_resampled, valid_resampled['generic_type'])

In [260]:
vec_size = concat_vec.shape[0]
print (vec_size)
X, y, X_val, y_val = vectorize(train_resampled, valid_resampled, vec_size)

1124


In [261]:
X.shape, y.shape

((17202, 1124), (17202,))

In [262]:
#train RF calssifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
rf =RandomForestClassifier(n_estimators=200,max_depth=5,n_jobs=5)
rf.fit(X, y)

RandomForestClassifier(max_depth=5, n_estimators=200, n_jobs=5)

In [263]:
y_pred= rf.predict(X_val)

In [264]:
np.sum(y_pred ==y_val)/y_val.shape[0]

0.8857442348008385

In [266]:
rf.feature_importances_[100:].sum()

0.022785991235582722

In [None]:
#dbpedia_types.head()

In [267]:
# look for the best specific type based on RF model for given generic and question text
def get_best_type(row, clf, thers=0.01):
   
    gen_type = row['top_level_type']
    #cand_list = train_df_sample[train_df_sample.generic_type == gen_type]['specific_type'].unique()

    #if len(cand_list) == 0: return [gen_type]
    
    c_mat = []
    cand_list= label_to_id
    for c in label_to_id:
        c_mat.append( getCombinedEmbedding(row['id'], row['top_level_type'], c))
    probs =clf.predict_proba(np.array(c_mat))
    #print (cand_list, probs)
    if type(clf) == RandomForestClassifier:
        probs= probs[:,1]
        #print (probs)
    else:
        probs= probs[:,0]
    #print (probs)
    #best = np.argsort(probs)
    norm = probs
    result_before = np.argsort(norm)[::-1]
    #print('before')
    #for r in result_before:
    #    print(id_to_label[str(r)])
    #reward top class
    initial_top_index = np.argmax(norm)
    initial_top = hierarchy[id_to_label[str(initial_top_index)]]
    if initial_top != {}:
        norm[initial_top_index] = norm[initial_top_index] + int(initial_top['level'])/6
        #reward sub classes of top class
        initial_top_children = initial_top['children']
        for c in initial_top_children:
            if c in label_to_id:
                norm[int(label_to_id[c])] = norm[int(label_to_id[c])] + int(hierarchy[c]['level'])/6
    #classes in descending order
    result = np.argsort(norm)[::-1]
    #print('after')
    #for r in result:
    #    print(id_to_label[str(r)])
    result_mapped = []
    for r in result:
        result_mapped.append(id_to_label[str(r)])
   
    #print (cand_list[best], probs[best])
    return  result_mapped[:5]

In [268]:
valid_res_df.iloc[3]

id                                                       dbpedia_14802
question             What is the version, edition or translation of...
category                                                      resource
type                                                       [dbo:Media]
frequent_type                                                dbo:Media
bottom_level_type                                            dbo:Media
top_level_type                                               dbo:Media
neg_spec_types                                          [dbo:Activity]
Name: 15427, dtype: object

In [269]:
get_best_type(valid_res_df.iloc[3], rf) 

['dbo:Settlement',
 'dbo:RouteOfTransportation',
 'dbo:Mountain',
 'dbo:Stream',
 'dbo:State']

# Prediction 

In [136]:
test_res_df.head()

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type
6,dbpedia_22599,Where did the war take place where one of the ...,resource,"[dbo:Country, dbo:PopulatedPlace, dbo:Place]",dbo:Country,dbo:Country,dbo:Place
9,dbpedia_19677,Which mountains are contained in Inyo National...,resource,"[dbo:Mountain, dbo:NaturalPlace, dbo:Place]",dbo:NaturalPlace,dbo:Mountain,dbo:Place
11,dbpedia_11163,What is the seat of Frankfurter Allgemeine Zei...,resource,"[dbo:City, dbo:Settlement, dbo:PopulatedPlace,...",dbo:Settlement,dbo:City,dbo:Place
16,dbpedia_18792,What did the people buried in Toronto die of?,resource,[dbo:Disease],dbo:Disease,dbo:Disease,dbo:Disease
19,dbpedia_11251,What is the league of major league lacrosse?,resource,"[dbo:SportsTeam, dbo:Organisation, dbo:Agent]",dbo:SportsTeam,dbo:SportsTeam,dbo:Agent


In [137]:
#db_valid_res_df =db_valid_res_df[db_valid_res_df['type'] !='resource']
test_res_df['top_level_type'] = test_res_df.type.apply(types_to_top_type)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [138]:
test_res_df.head()

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type
6,dbpedia_22599,Where did the war take place where one of the ...,resource,"[dbo:Country, dbo:PopulatedPlace, dbo:Place]",dbo:Country,dbo:Country,dbo:Place
9,dbpedia_19677,Which mountains are contained in Inyo National...,resource,"[dbo:Mountain, dbo:NaturalPlace, dbo:Place]",dbo:NaturalPlace,dbo:Mountain,dbo:Place
11,dbpedia_11163,What is the seat of Frankfurter Allgemeine Zei...,resource,"[dbo:City, dbo:Settlement, dbo:PopulatedPlace,...",dbo:Settlement,dbo:City,dbo:Place
16,dbpedia_18792,What did the people buried in Toronto die of?,resource,[dbo:Disease],dbo:Disease,dbo:Disease,dbo:Disease
19,dbpedia_11251,What is the league of major league lacrosse?,resource,"[dbo:SportsTeam, dbo:Organisation, dbo:Agent]",dbo:SportsTeam,dbo:SportsTeam,dbo:Agent


In [139]:
test_res_df = test_res_df[test_res_df.question.notna()]

In [140]:
test_res_df.sample(5)

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type
740,dbpedia_19256,Name the agency of Election Commission of Thai...,resource,"[dbo:Country, dbo:PopulatedPlace, dbo:Place]",dbo:Country,dbo:Country,dbo:Place
2123,dbpedia_696,Who is {club manager} of {actress} of {Wrestle...,resource,"[dbo:Person, dbo:Agent]",dbo:Agent,dbo:Person,dbo:Agent
3036,dbpedia_21860,"What does the bridge go over, which is in Kati...",resource,"[dbo:River, dbo:Stream, dbo:BodyOfWater, dbo:N...",dbo:NaturalPlace,dbo:River,dbo:Place
3251,dbpedia_18009,Who is {scriptwriter} of {bibliography} of {Na...,resource,"[dbo:Person, dbo:Agent]",dbo:Agent,dbo:Person,dbo:Agent
411,dbpedia_58,Which { meansseason starts} in {February} ?,resource,[dbo:Activity],dbo:Activity,dbo:Activity,dbo:Activity


In [141]:
row =test_res_df.iloc[123] 
print (row)
get_best_type(row, rf)

id                                                       dbpedia_20440
question             Name the sea whose inflow is Kemijoki and outf...
category                                                      resource
type                 [dbo:Lake, dbo:Sea, dbo:BodyOfWater, dbo:Natur...
frequent_type                                         dbo:NaturalPlace
bottom_level_type                                             dbo:Lake
top_level_type                                               dbo:Place
Name: 223, dtype: object


['dbo:Settlement',
 'dbo:City',
 'dbo:State',
 'dbo:PopulatedPlace',
 'dbo:Country']

In [142]:
test_res_df['pred_types']= test_res_df.apply(lambda x: get_best_type(x, rf), axis=1)

In [143]:
test_res_df.sample(5)

Unnamed: 0,id,question,category,type,frequent_type,bottom_level_type,top_level_type,pred_types
1343,dbpedia_3439,Who is the {Wikimedia category} for {category ...,resource,[dbo:Media],dbo:Media,dbo:Media,dbo:Media,"[dbo:Settlement, dbo:State, dbo:City, dbo:Popu..."
2518,dbpedia_20331,What is the debut team of the baseball player ...,resource,"[dbo:BaseballTeam, dbo:SportsTeam, dbo:Organis...",dbo:SportsTeam,dbo:BaseballTeam,dbo:Agent,"[dbo:Settlement, dbo:State, dbo:PopulatedPlace..."
2432,dbpedia_14121,What is a professional services firm whose nam...,resource,"[dbo:Company, dbo:Organisation, dbo:Agent]",dbo:Company,dbo:Company,dbo:Agent,"[dbo:RouteOfTransportation, dbo:Mountain, dbo:..."
3896,dbpedia_12409,William McGonagall's birth place has a twin ci...,resource,"[dbo:City, dbo:Settlement, dbo:PopulatedPlace,...",dbo:Settlement,dbo:City,dbo:Place,"[dbo:Settlement, dbo:City, dbo:State, dbo:Popu..."
2572,dbpedia_19980,Where did john o conner study?,resource,"[dbo:University, dbo:EducationalInstitution, d...",dbo:EducationalInstitution,dbo:University,dbo:Agent,"[dbo:Settlement, dbo:PopulatedPlace, dbo:State..."


In [144]:
def patk(actual, pred, k):
  #we return 0 if k is 0 because 
  #   we can't divide the no of common values by 0 
  if k == 0:
    return 0

  #taking only the top k predictions in a class 
  k_pred = pred[:k]

  #print (actual)
  #taking the set of the actual values 
  actual_set = set(actual)

  #taking the set of the predicted values 
  pred_set = set(k_pred)

  #taking the intersection of the actual set and the pred set
    # to find the common values
  common_values = actual_set.intersection(pred_set) 
  return len(common_values)/len(pred[:k])

In [161]:
test_res_df[['id','question','category','type']].to_json('datasets/output/dbpedia_test_gold.json', orient='records')

In [162]:
pred_df= test_res_df[['id','question','category','pred_types']]
pred_df.rename(columns={'pred_types': 'type'},inplace=True)
pred_df.to_json('datasets/output/dbpedia_test_pred.json', orient='records')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [163]:
type_hierarchy, max_depth = evaluate.load_type_hierarchy('datasets/dbpedia_types.tsv')

Loading type hierarchy from datasets/dbpedia_types.tsv... 760 types loaded (max depth: 7)


In [164]:
ground_truth = evaluate.load_ground_truth('datasets/output/dbpedia_test_gold.json', type_hierarchy)

Loading ground truth from datasets/output/dbpedia_test_gold.json... 
   2445 questions loaded


In [169]:
system_output = evaluate.load_system_output('datasets/output/dbpedia_test_pred.json')

Loading system predictions from datasets/output/dbpedia_test_pred.json... 
   2445 predictions loaded


In [170]:
evaluate.evaluate(ground_truth, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 2445 questions)
  Accuracy: 1.000
Type ranking (based on 2439 questions)
  NDCG@5:  0.793
  NDCG@10: 0.712


In [171]:
# RF
evaluate.evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 2445 questions)
  Accuracy: 1.000
Type ranking (based on 2439 questions)
  NDCG@5:  0.095
  NDCG@10: 0.088


In [172]:
# RF
evaluate.evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 2445 questions)
  Accuracy: 1.000
Type ranking (based on 2439 questions)
  NDCG@5:  0.095
  NDCG@10: 0.088


In [None]:
def to_list(t):
    return [t]
test_df.type = test_df.type.apply(to_list)

In [None]:
for i,row in test_res_df.iterrows():
    index =np.where(test_df['id']==row['id'])
    #print(row['id'])
    test_df.iloc[index[0][0]]['type'] =  row['pred_types']