In [18]:
import pandas as pd
import pickle
from tqdm.auto import tqdm
import time

from joblib import Parallel, delayed

%load_ext autoreload
%autoreload 2
from kbqa.wikidata import Entity
from kbqa.candidate_selection import QuestionToRankInstanceOf
from kbqa.logger import get_logger

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
logger = get_logger()

# Dataset to evaluate; supported values: 'mintaka', 'wdsq', 'rubq'.
dataset_version = 'wdsq'

# Per dataset: (reader, path) for the answers frame, then (reader, path) for the mgenre frame.
_dataset_sources = {
    'mintaka': (
        (pd.read_pickle, './mintaka_onehop_answers_no_tree.pkl'),
        (pd.read_pickle, './mintaka_onehop_mgenre_no_tree.pkl'),
    ),
    'wdsq': (
        (pd.read_csv, './filtered_test_with_answers.csv'),
        (pd.read_pickle, './filtered_test_with_mgenre_no_prefix_tree.pkl'),
    ),
    'rubq': (
        (pd.read_csv, 'rubq_test_with_answers_no_prefix.csv'),
        (pd.read_pickle, './test_rubq2_mgenre.pkl'),
    ),
}

if dataset_version not in _dataset_sources:
    raise ValueError('Wrong dataset_version')

# Answers are read first, then the mgenre frame.
(_read_answers, _answers_path), (_read_mgenre, _mgenre_path) = _dataset_sources[dataset_version]
answers_df = _read_answers(_answers_path)
mgenre_df = _read_mgenre(_mgenre_path)

# Keep only the first occurrence of every column name in both frames.
_mgenre_first_seen = ~mgenre_df.columns.duplicated()
mgenre_df = mgenre_df.loc[:, _mgenre_first_seen]
_answers_first_seen = ~answers_df.columns.duplicated()
answers_df = answers_df.loc[:, _answers_first_seen]

mgenre_df.head()

Unnamed: 0,S,P,O,Q,mgenre_results,selected_mgenre_results,selected_entities
0,Q154335,P509,Q12152,what was the cause of death of yves klein?,"[Yves Klein >> en, Yves Klein >> fr, Yves Klei...",[Yves Klein >> en],"[Q154335, Q98233558]"
1,Q62498,P21,Q6581097,how does engelbert zaschka identify?,"[Engelbert Zaschka >> en, Engelbert Zaschka >>...",[Engelbert Zaschka >> en],[Q62498]
2,Q182485,P413,Q1143358,what position does pee wee reese play in baseb...,"[Pee Wee Reese >> en, Peewee Reese >> en, Pee-...",[Pee Wee Reese >> en],[Q182485]
3,Q12152,P509,Q6371569,Which Swiss conductor's cause of death is myoc...,"[Myocardial infarction >> en, Cardiovascular d...",[Myocardial infarction >> en],"[Q28200479, Q66842762, Q67551104, Q67848010, Q..."
4,Q472382,P19,Q23051,what is the place of birth of sam edwards??,"[Sam Edwards >> en, Sam Edwards (musician) >> ...",[Sam Edwards >> en],"[Q472382, Q3470479, Q7407443, Q27925002, Q6459..."


In [20]:
# Every column whose name contains 'answer_' (answer_0 ... answer_N in the frame preview).
answer_cols = [name for name in answers_df.columns if 'answer_' in name]

# Left-join the answers with the mgenre frame; the join key depends on the dataset.
if dataset_version == 'mintaka':
    # Take only the answer columns that mgenre_df does not already have, plus the join key.
    cols_to_use = ['id'] + answers_df.columns.difference(mgenre_df.columns).tolist()
    df = answers_df[cols_to_use].merge(mgenre_df, on='id', how='left')
    # 'O' becomes the list of names taken from each row's answerEntity records.
    df['O'] = df['answerEntity'].apply(lambda records: [record.get('name') for record in records])
elif dataset_version == 'rubq':
    _merged = answers_df.merge(mgenre_df, on='Q', how='left')
    df = _merged[['O', 'Q', 'selected_entities'] + answer_cols]
elif dataset_version == 'wdsq':
    _merged = answers_df[['Q'] + answer_cols].merge(
        mgenre_df[['O', 'Q', 'selected_entities']],
        on='Q',
        how='left',
    )
    # Re-select to put O / Q / selected_entities in front of the answer columns.
    df = _merged[['O', 'Q', 'selected_entities'] + answer_cols]

df.head()

Unnamed: 0,O,Q,selected_entities,answer_0,answer_1,answer_2,answer_3,answer_4,answer_5,answer_6,...,answer_190,answer_191,answer_192,answer_193,answer_194,answer_195,answer_196,answer_197,answer_198,answer_199
0,Q12152,what was the cause of death of yves klein?,"[Q154335, Q98233558]",myocardial infarction,myocardial infarction,myocardial infarction,myocardial infarction,myocardial infarction,tuberculosis,pancreatic cancer,...,septicaemic shock,pancreatic cancer,pneumonia,cancer,pancreatitis,heart failure,lung cancer,leukemia,septic shock,pulmonary artery failure
1,Q6581097,how does engelbert zaschka identify?,[Q62498],Germans,Germans,Germans,Germans,Germans,Germans,Germans,...,Germanic people,Austrian,Germanic Americans,Germanic languages,German Americans,Germanics,Jew,German,Christian,East Germans
2,Q1143358,what position does pee wee reese play in baseb...,[Q182485],catcher,catcher,catcher,outfielder,catcher,catcher,catcher,...,pitching guard,baseman,outfielder.,first basemen,right fielder,outfielder,pitching guard,closer,catcher of record,catcher/catcher
3,Q6371569,Which Swiss conductor's cause of death is myoc...,"[Q28200479, Q66842762, Q67551104, Q67848010, Q...",Franz Liszt,Paul Hindemith,Claudio Abbado,Erich Leinsdorf,Wilhelm Furtwängler,Carlo Maria Giulini,Paul Hindemith,...,Richard Strauss,Jacques Thibaudet,Carlo Maria von Trapp,Herbert Blomstedt,Fritz Reiner,Gustav Mahler,Hans Richter,Heinz Holliger,Heinz Giesler,Paul Hindemith
4,Q23051,what is the place of birth of sam edwards??,"[Q472382, Q3470479, Q7407443, Q27925002, Q6459...",London Borough of Hackney,Hemel Hempstead,London Borough of Hackney,Wolverhampton,Chelmsford,Wolverhampton,Sutton Coldfield,...,Northamptonshire,San Diego,Newbury,Newtown,Wigan,Stroud,St. Louis,Ayrshire,Stroudwater,Chelmsford City


In [21]:
import os
# Set before the joblib workers in the next cell are started.
# NOTE(review): presumably read by a tokenizer library used inside kbqa to turn
# off its own parallelism in worker processes — confirm against kbqa's dependencies.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [22]:
def _row_proc(row):
    """Rank answers for one dataframe row and return the best entity id.

    Args:
        row: A row of ``df``; reads the ``answer_cols`` columns (answer labels),
            ``'selected_entities'`` (question entity ids) and ``'Q'`` (question text).

    Returns:
        The ``idx`` of the entity in the first tuple returned by
        ``QuestionToRankInstanceOf.final_answers()``, or ``None`` when that
        result is empty.
    """
    candidate_entities = []
    for label in row[answer_cols].dropna().unique():
        try:
            # Keep at most two entities per distinct answer label.
            matches = Entity.from_label(label)[:2]
        except ValueError:
            # Best effort: a label whose lookup raises ValueError is skipped.
            continue
        else:
            candidate_entities.extend(matches)

    linked_entities = list(map(Entity, row['selected_entities']))

    ranker = QuestionToRankInstanceOf(
        row['Q'],
        linked_entities,
        candidate_entities,
        only_forward_one_hop=True,
    )

    ranked = ranker.final_answers()
    if len(ranked) == 0:
        return None
    # Each final answer is a tuple whose second element is the answer entity.
    return ranked[0][1].idx

# Run _row_proc over every row with 6 joblib workers; tqdm wraps the row
# iterator, so the bar advances as rows are handed to the workers.
_rows_with_progress = tqdm(df.iterrows(), total=len(df.index))
filtered_answers = Parallel(n_jobs=6)(
    delayed(_row_proc)(row) for _idx, row in _rows_with_progress
)
# Sequential alternative (same result list, no worker processes):
# filtered_answers = [_row_proc(row) for _, row in tqdm(df.iterrows(), total=df.index.size)]

  4%|▍         | 108/2464 [00:57<20:17,  1.94it/s]

2023-01-12 15:22:56,723 [ERROR]: {"msg": "Expecting value: line 1 column 1 (char 0)", "params": {"format": "json", "query": "\n        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n        SELECT * WHERE{\n            ?item rdfs:label \"William \"Bill\" Carter\"@en .\n        }\n        "}, "endpoint": "https://query.wikidata.org/sparql", "response": {"status_code": 400, "headers": {"server": "nginx/1.14.2", "date": "Thu, 12 Jan 2023 15:22:56 GMT", "content-type": "text/plain", "x-served-by": "wdqs1015", "access-control-allow-origin": "*", "content-encoding": "gzip", "vary": "Accept-Encoding", "age": "0", "x-cache": "cp3050 miss, cp3054 pass", "x-cache-status": "pass", "server-timing": "cache;desc=\"pass\", host;desc=\"cp3054\"", "strict-transport-security": "max-age=106384710; includeSubDomains; preload", "report-to": "{ \"group\": \"wm_nel\", \"max_age\": 86400, \"endpoints\": [{ \"url\": \"https://intake-logging.wikimedia.org/v1/events?stream=w3c.reportingapi.network_error&

 55%|█████▌    | 1356/2464 [11:32<07:49,  2.36it/s]

KeyboardInterrupt: 

In [6]:
import re

# Store the raw per-row results (entity id or None) ...
df['filtered_answer'] = filtered_answers
# ... then wrap every id in an Entity, leaving rows without an answer as None.
df['filtered_answer'] = df['filtered_answer'].apply(
    lambda qid: None if qid is None else Entity(qid)
)

def _parse(s, pattern=re.compile(r'Q[0-9]+')):
    if isinstance(s, str):
        try:
            return [Entity(e) for e in re.findall(pattern, s)]
        except:
            return []
    else:
        return [Entity(e) for e in s]


# Gold answers as Entity lists, parsed from column 'O'.
df['target'] = df['O'].apply(_parse)


def _hits_target(record):
    """True when the row's predicted entity is one of its target entities."""
    return record['filtered_answer'] in record['target']


df['is_correct'] = df.apply(_hits_target, axis=1)

# Share of rows whose prediction is among the targets.
len(df[df['is_correct']].index) / len(df.index)

0.18235294117647058

In [17]:
class QuestionToRankInstanceOfHtml(QuestionToRankInstanceOf):
    """``QuestionToRankInstanceOf`` with an HTML report for notebook display.

    The report contains the question and its target entities, the final
    answers with their scores, the one-hop neighbors of every question entity,
    the instanceOf counts of the answer candidates, and the candidates
    themselves. Rows that match a target entity are highlighted.
    """

    def _repr_html_(self) -> str:
        """Build the HTML report (the hook IPython uses for rich display).

        Reads ``question``, ``target``, ``question_entities``,
        ``only_forward_one_hop``, ``answer_instance_of_count``,
        ``_answer_instance_of`` and ``answers_candidates`` from the instance
        and calls ``final_answers()`` once.

        Returns:
            str: A single HTML fragment (style block followed by the tables).
        """
        # Local import: keeps the class usable without relying on another cell.
        from html import escape

        def esc(value) -> str:
            # Question text and labels are external data; escape them so that
            # characters such as '&' or '<' cannot break the markup.
            return escape(str(value))

        def instance_of_list(entity, separator: str) -> str:
            # "<idx> (<label>)" for every instanceOf entity, joined by separator.
            return separator.join(
                f"{esc(io.idx)} ({esc(io.label)})" for io in entity.instance_of
            )

        def header_row(color: str, titles) -> str:
            cells = "".join(f"<th>{title}</th>" for title in titles)
            return (
                '<tr style="font-size:1rem; font-weight: bold; '
                f'background-color: {color}">{cells}</tr>'
            )

        def open_row(color=None) -> str:
            if color is None:
                return '<tr>'
            return f'<tr style="background-color: {color}">'

        target_color = "#3CF30F"
        final_answer_color = "#A9ED2B"
        header_color = "#50ADFF"
        selected_instance_of_color = "#7AE2BC"

        targets = self.target if self.target is not None else []
        # Computed once: the result is needed for the answers table and for
        # highlighting neighbor rows.
        final_answers = self.final_answers()
        # Every final answer is a tuple whose second element is the entity, so
        # membership has to be tested against the entities, not the tuples.
        final_entities = [answer[1] for answer in final_answers]

        parts = ["""
        <style>
        .flex-row-container {
            display: flex;
            flex-wrap: wrap;
        }
        .flex-row-container > .flex-row-item {
            flex: 1 0 29%; /*grow | shrink | basis */
        }

        .flex-row-item {
            margin: 10px;
        }

        th {
            word-wrap: break-word;
        }

        td {
            word-wrap: break-word;
        }
        </style>
        """]
        parts.append(f'<b>Question:</b> {esc(self.question)}')
        for target_entity in targets:
            parts.append(
                '<br><b>Target:</b> <span style="color: green">'
                f'Entity: {esc(target_entity.idx)} ({esc(target_entity.label)})</span> '
                f'(InstanceOf: {instance_of_list(target_entity, "; ")})'
            )
        parts.append('<div class="flex-row-container">')

        # FINAL ANSWERS
        final_html = [
            "<h4>Final answers</h4>",
            '<table style="border: 1px solid #A9ED2B; width: 900px;">',
            header_row(final_answer_color, [
                "Property",
                "P Label",
                "Entity",
                "E Label",
                "InstanceOf",
                "instance of score",
                "forward one hop neighbors score",
                "answers candidates score",
                "property question intersection score",
            ]),
        ]
        for (
            prop,
            entity,
            instance_of_score,
            forward_one_hop_neighbors_score,
            answers_candidates_score,
            property_question_intersection_score,
        ) in final_answers:
            final_html.append(open_row(target_color if entity in targets else None))
            final_html.extend([
                f'<td>{esc(prop.idx) if prop is not None else ""}</td>',
                f'<td>{esc(prop.label) if prop is not None else ""}</td>',
                f'<td>{esc(entity.idx)}</td>',
                f'<td>{esc(entity.label)}</td>',
                f'<td>{instance_of_list(entity, "<br>")}</td>',
                f'<td>{instance_of_score:.5f}</td>',
                f'<td>{forward_one_hop_neighbors_score:.5f}</td>',
                f'<td>{answers_candidates_score:.5f}</td>',
                f'<td>{property_question_intersection_score:.5f}</td>',
                '</tr>',
            ])
        final_html.append("</table>")

        # QUESTION ENTITIES: one neighbors table per entity
        neighbors_html = []
        for qentity in self.question_entities:
            neighbors_html.append(
                f"<h4>One hop neighbors for Entity: {esc(qentity.idx)} ({esc(qentity.label)})</h4>"
            )
            neighbors_html.extend([
                '<table style="width: 700px;">',
                header_row(header_color, [
                    "Dir",
                    "Property",
                    "P Label",
                    "Entity",
                    "E Label",
                    "InstanceOf",
                ]),
            ])

            # Read once per entity instead of once per row.
            forward_neighbors = qentity.forward_one_hop_neighbors
            if self.only_forward_one_hop:
                neighbors = forward_neighbors
            else:
                neighbors = forward_neighbors + qentity.backward_one_hop_neighbors

            for prop, entity in neighbors:
                # A target match takes precedence over a final-answer match.
                if entity in targets:
                    neighbors_html.append(open_row(target_color))
                elif entity in final_entities:
                    neighbors_html.append(open_row(final_answer_color))
                else:
                    neighbors_html.append(open_row())
                direction = "->" if (prop, entity) in forward_neighbors else "<-"
                neighbors_html.extend([
                    f'<td>{esc(direction)}</td>',
                    f'<td>{esc(prop.idx)}</td>',
                    f'<td>{esc(prop.label)}</td>',
                    f'<td>{esc(entity.idx)}</td>',
                    f'<td>{esc(entity.label)}</td>',
                    f'<td>{instance_of_list(entity, "<br>")}</td>',
                    '</tr>',
                ])

            neighbors_html.append('</table>')

        parts.append(
            '<div class="flex-row-item">'
            + "".join(final_html)
            + "".join(neighbors_html)
            + '</div>'
        )

        # ANSWERS_INSTANCE_OF_COUNT
        instance_of_html = [
            '<h4>Answers instanceOf count (<b style="color: green;">selected</b>)</h4>',
            "<table>",
            header_row(header_color, ["InstanceOf", "Label", "Count"]),
        ]
        for instance_of_entity, count in self.answer_instance_of_count:
            is_selected = instance_of_entity in self._answer_instance_of
            instance_of_html.append(
                open_row(selected_instance_of_color if is_selected else None)
            )
            instance_of_html.extend([
                f'<td>{esc(instance_of_entity.idx)}</td>',
                f'<td>{esc(instance_of_entity.label)}</td>',
                f'<td>{count}</td>',
                '</tr>',
            ])
        instance_of_html.append("</table>")

        # ANSWERS candidates
        candidates_html = [
            "<h4>Seq2Seq answers candidates</h4>",
            "<table>",
            header_row(header_color, ["Entity", "E Label", "InstanceOf"]),
        ]
        for entity in self.answers_candidates:
            candidates_html.append(open_row(target_color if entity in targets else None))
            candidates_html.extend([
                f'<td>{esc(entity.idx)}</td>',
                f'<td>{esc(entity.label)}</td>',
                f'<td>{instance_of_list(entity, "<br>")}</td>',
                '</tr>',
            ])
        # Close the table so the enclosing divs are not nested inside it.
        candidates_html.append("</table>")

        parts.append(
            '<div class="flex-row-item">'
            + "".join(instance_of_html)
            + "".join(candidates_html)
            + '</div>'
        )

        # Closes the flex-row-container div opened above.
        return "".join(parts) + '</div>'



# Inspect one wrongly answered question: position 8 among the incorrect rows.
_incorrect_rows = df[~df['is_correct']]
row = _incorrect_rows.iloc[8]
# To inspect an arbitrary row instead: row = df.iloc[161]

# Keep only the first entity found for each distinct answer label.
answers_candidates = []
for label in row[answer_cols].dropna().unique():
    try:
        first_match = Entity.from_label(label)[:1]
    except ValueError:
        # Best effort: a label whose lookup raises ValueError is skipped.
        continue
    else:
        answers_candidates.extend(first_match)
question_entities = list(map(Entity, row['selected_entities']))

qtr = QuestionToRankInstanceOfHtml(
    row['Q'],
    question_entities,
    answers_candidates,
    only_forward_one_hop=True,
    target_entity=row['target'],
)

# Result is discarded here; the trailing expression renders the HTML report.
qtr.final_answers()
qtr

Property,P Label,Entity,E Label,InstanceOf,instance of score,forward one hop neighbors score,answers candidates score,property question intersection score
,,Q2121677,To Catch a Predator,Q1261214 (television special),0.9,0.0,1.0,0.0
,,Q80437,The Game of Their Lives,Q11424 (film),0.9,0.0,0.97959,0.0
,,Q191040,Mr. & Mrs. Smith,Q11424 (film),0.9,0.0,0.95918,0.0
,,Q466781,All the President's Men,Q11424 (film),0.9,0.0,0.91837,0.0
,,Q151714,American Idiot,Q482994 (album),0.9,0.0,0.89796,0.0
,,Q3235376,You Can't Do That on Television,Q5398426 (television series),0.9,0.0,0.87755,0.0
,,Q4022966,You Can't Take It with You,Q506240 (television film),0.9,0.0,0.85714,0.0
,,Q331017,To Catch a Thief,Q11424 (film),0.9,0.0,0.83673,0.0
,,Q3524739,This Is Spinal Tap,Q482994 (album),0.9,0.0,0.81633,0.0
,,Q30321629,All the Money in the World,Q11424 (film),0.9,0.0,0.79592,0.0

InstanceOf,Label,Count
Q11424,film,17.0
Q5398426,television series,11.0
Q15416,television program,5.0
Q482994,album,5.0
Q506240,television film,4.0
Q196600,media franchise,2.0
Q21191270,television series episode,2.0
Q7725634,literary work,2.0
Q1261214,television special,1.0
Q21190411,TV game show series,1.0

Entity,E Label,InstanceOf
Q2121677,To Catch a Predator,Q1261214 (television special)
Q80437,The Game of Their Lives,Q11424 (film)
Q191040,Mr. & Mrs. Smith,Q11424 (film)
Q459720,The Price Is Right,Q5398426 (television series) Q21190411 (TV game show series) Q15416 (television program)
Q466781,All the President's Men,Q11424 (film)
Q151714,American Idiot,Q482994 (album)
Q3235376,You Can't Do That on Television,Q5398426 (television series)
Q4022966,You Can't Take It with You,Q506240 (television film)
Q331017,To Catch a Thief,Q11424 (film)
Q3524739,This Is Spinal Tap,Q482994 (album)
