In [24]:
import sys
sys.path.append('../')


In [29]:
import pandas as pd
import pickle
from tqdm import tqdm
import requests

In [3]:
from curry.km import DBPediaPhysicsResources
from curry.babelfy import Babelfier

In [4]:
# Project helper from curry.km; used below to fetch annotations
# (relevant_annotations) and to filter categories (is_accepted_topics).
phy = DBPediaPhysicsResources()

In [5]:
# Annotations for the 'babel' source; iterated below as key -> iterable of DBpedia resource URLs.
babel = phy.relevant_annotations(annotation_source='babel')

In [6]:
# Same lookup as for 'babel', but for the 'wikifier' annotation source.
wikify = phy.relevant_annotations(annotation_source='wikifier')

In [7]:
# Number of entries returned for the 'babel' source.
len(babel)

539

In [8]:
# Number of entries returned for the 'wikifier' source (compare with the 'babel' count).
len(wikify)

539

In [9]:
# Flatten every annotation list in `babel` and count how often each resource
# occurs, most frequent first.
pd.Series(
    [resource for key in babel for resource in babel[key]]
).value_counts().sort_values(ascending=False)

http://dbpedia.org/resource/Thermodynamic_free_energy      452
http://dbpedia.org/resource/Force                          410
http://dbpedia.org/resource/Physical_body                  390
http://dbpedia.org/resource/Light                          349
http://dbpedia.org/resource/Electric_charge                318
                                                          ... 
http://dbpedia.org/resource/Annotation                       1
http://dbpedia.org/resource/Enzyme                           1
http://dbpedia.org/resource/Cereal                           1
http://dbpedia.org/resource/Motte-and-bailey_castle          1
http://dbpedia.org/resource/Cost-effectiveness_analysis      1
Length: 2852, dtype: int64

## Get DBpedia Resources for Elixier

In [10]:
# Load the Elixier physics export; the first CSV column becomes the DataFrame index.
elixier = pd.read_csv('../data/elixier_physics_all.csv', index_col=0)

In [11]:
# Stored responses for the Elixier records; used below to build a row mask for
# `elixier` (presumably one entry per row — TODO confirm).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../data/elixier_physics_all_responses.pkl', 'rb') as f:
    elixier_responses = pickle.load(f)

In [12]:
# Keep only rows whose stored response is a real response (not a captured
# exception) with HTTP status 200.
success_mask = pd.Series(elixier_responses).apply(
    lambda response: not isinstance(response, Exception)
    and response.status_code == 200
)
elixier = elixier[success_mask]

In [13]:
# Names of the free-text fields (presumably columns of `elixier` — TODO confirm).
text_columns = [
    'description',
    'titel für schüler',
    'lange beschreibung',
    'beschreibung für schüler',
    'schlagwörter',
    'freie schlagwörter',
]

In [14]:
def clean_description(desc):
    """Normalise a raw description string.

    * If the text mentions 'Lernressourcentyp', return only the part before
      the first '; Lernressourcentyp' (the whole text when that exact
      separator does not occur).
    * Otherwise, if the text contains 'margin-', return None.
    * Otherwise return the text unchanged.
    """
    if 'Lernressourcentyp' in desc:
        head, _, _ = desc.partition('; Lernressourcentyp')
        return head
    if 'margin-' in desc:
        return None
    return desc

In [15]:
# Clean every non-missing description, drop the ones the cleaner rejected
# (None), then drop texts shorter than two characters once stripped.
descriptions = elixier.description.dropna().apply(clean_description).dropna()
has_content = descriptions.map(lambda text: len(text.strip())) >= 2
descriptions = descriptions[has_content]

In [16]:
import sparql_dataframe
from functools import lru_cache

def get_rdf_type(resource, depth):
    """Query DBpedia for the broader categories of a resource.

    The query follows ``dbo:wikiPageRedirects`` from ``resource`` (zero or
    more hops), reads the ``dcterms:subject`` categories of every node
    reached, and then walks ``skos:broader`` from those categories. The
    property path consists of ``depth - 1`` optional hops followed by one
    mandatory hop, i.e. results are between 1 and ``depth`` ``skos:broader``
    steps away from a direct category.

    Parameters
    ----------
    resource : str
        Absolute IRI of the resource, e.g.
        ``http://dbpedia.org/resource/Force``.
    depth : int
        Maximum number of ``skos:broader`` hops; must be at least 1.

    Returns
    -------
    The ``.values`` of the ``subcategory`` column of the result frame.

    Raises
    ------
    ValueError
        If ``depth`` is smaller than 1 (the property path would be empty and
        the query malformed) or if ``resource`` contains a character that
        cannot appear inside a SPARQL ``<...>`` IRI.
    """
    if depth < 1:
        raise ValueError(f'depth must be >= 1, got {depth!r}')

    # The resource is spliced into the query text, so reject anything that
    # could terminate the IRI early or is not allowed inside an IRI at all.
    forbidden = frozenset('<>"{}|^`\\')
    if any(ch in forbidden or ch <= ' ' for ch in resource):
        raise ValueError(f'resource is not a valid IRI: {resource!r}')

    endpoint = "http://dbpedia.org/sparql"

    # depth=3 -> "skos:broader?/skos:broader?/skos:broader"
    broader_path = '?/'.join(['skos:broader'] * depth)

    q = """
        prefix dbr: <http://dbpedia.org/resource/>
        prefix dbo: <http://dbpedia.org/ontology/>
        prefix skos: <http://www.w3.org/2004/02/skos/core#>

        select distinct ?subcategory  where {
          <""" + resource + """> dbo:wikiPageRedirects* ?redirect.
          ?redirect <http://purl.org/dc/terms/subject> ?category.
          ?category """ + broader_path + """ ?subcategory
        }
    """
    df = sparql_dataframe.get(endpoint, q)
    return df.subcategory.values

In [17]:
def babelfy_grouped(descriptions, group_size=3, separator=' ', annotator=None):
    """Annotate descriptions in batches and map annotations back to them.

    Descriptions are concatenated ``group_size`` at a time, each followed by
    ``separator``; one ``bab`` call is made per joined text. Every annotation
    returned for a joined text is assigned to the description whose character
    window ``[start, end]`` contains it, and is kept only if its ``'text'``
    occurs in that description.

    Parameters
    ----------
    descriptions : pandas.Series
        Description strings; accessed through ``.index`` and ``.loc``.
    group_size : int, default 3
        Number of descriptions joined into one request; must be at least 1.
    separator : str, default ' '
        Text appended after every description inside a joined text.
    annotator : object, optional
        Object exposing ``bab(text)`` that returns an iterable of dicts with
        ``'start'``, ``'end'`` and ``'text'`` keys. When omitted, the
        notebook-level ``babelfier`` object is used.

    Returns
    -------
    (dict, list)
        ``desc_id -> list of kept annotations`` and a list of messages for
        annotations whose text was not found in their description.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError(f'group_size must be >= 1, got {group_size!r}')
    client = babelfier if annotator is None else annotator

    # Build the joined texts and remember, per description, which joined text
    # it went into and which character window it occupies there. A trailing
    # partial group is handled by the same code path as full groups, and no
    # empty text is produced when the input divides evenly (or is empty).
    to_babelfy = []
    mappings = {}
    desc_ids = list(descriptions.index)
    for offset in range(0, len(desc_ids), group_size):
        joined_index = len(to_babelfy)
        out = ''
        for desc_id in desc_ids[offset:offset + group_size]:
            start = len(out)
            out += descriptions.loc[desc_id] + separator
            mappings[desc_id] = (joined_index, start, len(out))
        to_babelfy.append(out)

    # One annotation request per joined text.
    annotations = [client.bab(text) for text in tqdm(to_babelfy)]

    # Assign annotations to descriptions by character window, then verify
    # that the annotated text really is part of that description.
    desc_id_to_annotation = {}
    errors = []
    for desc_id, (group_id, start, end) in mappings.items():
        description = descriptions.loc[desc_id]
        kept = []
        for ann in annotations[group_id]:
            if ann['start'] < start or ann['end'] > end:
                continue  # belongs to another description of the group
            if ann['text'] in description:
                kept.append(ann)
            else:
                errors.append(f'{ann["text"]} not in {description}')
        desc_id_to_annotation[desc_id] = kept

    return desc_id_to_annotation, errors


In [18]:

# Load previously stored annotations keyed by description id (same name as the
# first return value of babelfy_grouped).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../data/elixier_id_to_annotation.pkl', 'rb') as f:
    desc_id_to_annotation = pickle.load(f)

In [19]:
# The unpickled object is indexed with 5 (presumably the category depth, as in
# get_rdf_type's `depth` — TODO confirm); the result maps entity -> categories.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../data/elixier_annotations_categories.pkl', 'rb') as f:
    entity_to_categories_depth_5 = pickle.load(f)[5]

In [20]:

# For every entity keep only the categories that the physics helper accepts.
filtered_categories = {
    entity: [category for category in categories if phy.is_accepted_topics(category)]
    for entity, categories in entity_to_categories_depth_5.items()
}

In [21]:
import numpy as np

In [22]:
# Per description, keep only annotations that carry a DBpedia URL for which at
# least one accepted category remains.
filtered_annotations = {
    did: [
        ann
        for ann in anns
        if ann.get('DBpediaURL') and len(filtered_categories[ann['DBpediaURL']]) > 0
    ]
    for did, anns in desc_id_to_annotation.items()
}

# Load Leifi

In [23]:
from curry.loader import Loader


In [20]:
# Attach the annotations as a column. The column key must be the string
# 'annotations' (a bare `annotations` name is undefined here). `descriptions`
# only covers a filtered subset of the `elixier` rows, so the values are
# assigned as an index-aligned Series; rows without a usable description get NaN.
elixier['annotations'] = pd.Series(
    [desc_id_to_annotation[did] for did in descriptions.index],
    index=descriptions.index,
    dtype=object,
)

0       None
1       None
2       None
3       None
4       None
        ... 
2893    None
2894    None
2896    None
2897    None
2898    None
Name: description, Length: 2705, dtype: object

In [231]:

# NOTE(review): `entity_to_categories` is not defined in the visible cells
# (only `entity_to_categories_depth_5` is) and this rebinds `filtered_categories`
# from the earlier cell — looks like a stale cell; confirm or remove.
filtered_categories = {
    entity: [category for category in categories if phy.is_accepted_topics(category)]
    for entity, categories in entity_to_categories.items()
}

In [215]:
import numpy as np

In [216]:
# Average number of categories per entity.
np.mean([len(categories) for categories in entity_to_categories.values()])

287.28235579253516