In [1]:
import pandas as pd
import spacy
import time 
import joblib
from IPython.display import HTML
css_str = '<style> \
.jp-icon-warn0 path {fill: var(--jp-warn-color0);} \
.bp3-button-text path { fill: var(--jp-inverse-layout-color3);} \
.jp-icon-brand0 path { fill: var(--jp-brand-color0);} \
text.terms { fill: #616161;} \
</style>'
display(HTML(css_str))

In [60]:
import neuralcoref

In [61]:
!python --version

Python 3.7.12


In [62]:
nlp = spacy.load("en_core_web_sm")

In [None]:
neuralcoref.add_to_pipe(nlp)

# Auxiliary functions

In [None]:

def resolve_coref(text):
    """Return the coreference-resolved string stored on a parsed document.

    ``text`` must expose the ``_.coref_resolved`` extension attribute
    (presumably registered by neuralcoref on the pipeline -- confirm).
    """
    extensions = text._
    return extensions.coref_resolved

def process_chunk_corefs(docs):
    """Resolve coreferences for every text in ``docs``.

    The texts are streamed through the module-level ``nlp`` pipeline in
    batches of 20 and each parsed document is passed to ``resolve_coref``.

    Returns a list with one resolved string per input text, in input order.
    """
    return [resolve_coref(parsed) for parsed in nlp.pipe(docs, batch_size=20)]

def clean(text, lemma=False):
    """Return the tokens of ``text`` accepted by ``token_filter``, space-joined.

    ``text`` is parsed with the module-level ``nlp`` pipeline. When ``lemma``
    is truthy the lemma of each kept token is emitted, otherwise its surface
    text.
    """
    kept = [token for token in nlp(text) if token_filter(token)]
    if lemma:
        return " ".join(token.lemma_ for token in kept)
    return " ".join(token.text for token in kept)

def to_nlp(text):
    """Parse ``text`` with the module-level ``nlp`` pipeline and return the result."""
    parsed = nlp(text)
    return parsed

from joblib import Parallel, delayed

def chunker(iterable, total_length, chunksize):
    """Lazily yield consecutive slices of ``iterable`` of at most ``chunksize`` items.

    Slice start offsets run from 0 up to (but excluding) ``total_length`` in
    steps of ``chunksize``; the last slice may be shorter.
    """
    starts = range(0, total_length, chunksize)
    return (iterable[start:start + chunksize] for start in starts)

def flatten(list_of_lists):
    """Concatenate the items of every sub-sequence into one flat list."""
    combined = []
    for sublist in list_of_lists:
        for item in sublist:
            combined.append(item)
    return combined


def clean(text, lemma=False):
    """Space-join the tokens of ``text`` that pass ``token_filter``.

    The text is parsed with the module-level ``nlp`` pipeline; ``lemma``
    selects between each token's ``lemma_`` (truthy) and ``text`` (falsy).
    """
    attribute = "lemma_" if lemma else "text"
    return " ".join(
        getattr(token, attribute) for token in nlp(text) if token_filter(token)
    )

def preprocess_parallel_corefs(texts, chunksize=100, n_jobs=7):
    """Resolve coreferences for ``texts`` in parallel worker processes.

    ``texts`` is cut into slices of ``chunksize`` items by ``chunker``; each
    slice is handed to ``process_chunk_corefs`` through a joblib ``Parallel``
    executor using the multiprocessing backend, and the per-slice lists are
    concatenated with ``flatten``.

    Parameters
    ----------
    texts : sliceable sequence supporting ``len`` (e.g. a pandas Series).
    chunksize : number of texts per submitted task.
    n_jobs : number of worker processes. Defaults to 7, the value that was
        previously hard-coded; pass ``-1`` to use every core like the other
        ``preprocess_parallel_*`` helpers do.

    Returns
    -------
    list with one resolved text per input text, in input order.
    """
    executor = Parallel(n_jobs=n_jobs, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk_corefs)
    tasks = (do(chunk) for chunk in chunker(texts, len(texts), chunksize=chunksize))
    return flatten(executor(tasks))


# BBC 

In [None]:
bbc = pd.read_csv('../../corpus/bbc-text.csv')
bbc.shape

In [None]:
bbc.head(1)

In [None]:
bbc2 = bbc.copy()

In [None]:
bbc2['coref_text']=preprocess_parallel_corefs(bbc2['text'], chunksize=1000)

In [None]:
bbc2.iloc[6]

In [None]:
import joblib
joblib.dump(bbc2,'bbc_coref_resolved')

# Additional Preprocessing

## Remove special characters and stopwords


In [None]:

from joblib import Parallel, delayed

def token_filter(token):
    """Return True when ``token`` is neither punctuation, whitespace nor a stop word."""
    flags = (token.is_punct, token.is_space, token.is_stop)
    return not any(flags)

def process_chunk_clean(texts):
    """Lemmatise a chunk of texts, keeping only tokens accepted by ``token_filter``.

    The texts are streamed through the module-level ``nlp`` pipeline in
    batches of 200. Returns one space-joined string of lemmas per text.
    """
    return [
        " ".join(token.lemma_ for token in doc if token_filter(token))
        for doc in nlp.pipe(texts, batch_size=200)
    ]
    

def chunker(iterable, total_length, chunksize):
    """Return a generator over ``chunksize``-sized slices of ``iterable``.

    Offsets are taken from ``range(0, total_length, chunksize)``, so the
    final slice holds whatever remains.
    """
    offsets = range(0, total_length, chunksize)
    return (iterable[begin:begin + chunksize] for begin in offsets)

def flatten(list_of_lists):
    """Flatten a list of lists to a combined list.

    The description used to be a ``print`` call, which wrote a noise line on
    every invocation; it is a docstring again so the function is silent.
    """
    return [item for sublist in list_of_lists for item in sublist]

def preprocess_parallel_clean(texts, chunksize=100):
    """Clean ``texts`` in parallel and return one cleaned string per text.

    Slices of ``chunksize`` texts (from ``chunker``) are processed by
    ``process_chunk_clean`` on a joblib multiprocessing pool using all
    cores (``n_jobs=-1``); the per-slice results are merged with ``flatten``.
    """
    pool = Parallel(n_jobs=-1, backend='multiprocessing', prefer="processes")
    worker = delayed(process_chunk_clean)
    chunks = chunker(texts, len(texts), chunksize=chunksize)
    per_chunk = pool(worker(chunk) for chunk in chunks)
    return flatten(per_chunk)

def process_chunk_ner(texts):
    """Collect the set of entity surface strings found in each text.

    The texts are streamed through the module-level ``nlp`` pipeline in
    batches of 200. Returns one set of ``ent.text`` values per text; the
    set is empty when a document has no entities.
    """
    return [
        {ent.text for ent in doc.ents}
        for doc in nlp.pipe(texts, batch_size=200)
    ]
    
    
def preprocess_parallel_ner (texts, chunksize = 100):
    """Run ``process_chunk_ner`` over ``texts`` in parallel.

    ``texts`` is split by ``chunker`` into slices of ``chunksize`` items,
    each slice is processed on a joblib multiprocessing pool (``n_jobs=-1``)
    and the per-slice lists are merged with ``flatten``.
    """
    pool = Parallel(n_jobs=-1, backend='multiprocessing', prefer="processes")
    worker = delayed(process_chunk_ner)
    chunks = chunker(texts, len(texts), chunksize=chunksize)
    per_chunk = pool(worker(chunk) for chunk in chunks)
    return flatten(per_chunk)


In [None]:
%%time

In [None]:
bbc2['cleaned'] = preprocess_parallel_clean(bbc2['coref_text'], chunksize=1000)

In [None]:
joblib.dump(bbc2,'bbc_coref_resolved_cleaned')

# NER

In [None]:
!pip install -U sacremoses

In [None]:
bbc2 = joblib.load('bbc_coref_resolved_cleaned')
bbc2['entidades'] = preprocess_parallel_ner(bbc2['coref_text'], chunksize=250)

In [None]:
bbc2.head()

In [9]:
joblib.dump(bbc2,'bbc_objects/bbc_coref_resolved_cleaned')

In [12]:
bbc2=joblib.load('bbc_objects/bbc_coref_resolved_cleaned')
bbc2

Unnamed: 0,category,text,coref_text,cleaned,entidades
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...,"{nine months to a years , five years time, a..."
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,worldcom boss leave book worldcom boss bernie ...,"{monday, about $180bn, last two months, 20 000..."
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester tiger wary...,"{another three months, five weeks ago}"
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,yeade face newcastle fa cup premiership newcas...,"{sunday, two, the weekend, second, earlier thi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,ocean s raid box office ocean s crime caper se...,"{$184m, 57.2, 2001, one, five, december, $110m..."
...,...,...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,cars pull down us retail figures us retail sal...,car pull retail figure retail sale fall 0.3 ja...,"{0.3%, 3.3%, 2004, the first quarter, 2005, 0...."
2221,politics,kilroy unveils immigration policy ex-chatshow ...,kilroy unveils immigration policy ex-chatshow ...,kilroy unveil immigration policy ex chatshow h...,"{2bn, 14 000}"
2222,entertainment,rem announce new glasgow concert us band rem h...,rem announce new glasgow concert us band rem h...,rem announce new glasgow concert band rem anno...,"{four days later, 29, 21, tuesday 14, 10 000 ..."
2223,politics,how political squabbles snowball it s become c...,how political squabbles snowball it s become c...,political squabble snowball s commonplace argu...,"{two, 283 000, one, the last couple of decades..."


In [13]:
import pandas as pd
d2  = pd.read_csv('bbc_objects/bbc_dataset_raw.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'bbc_objects/bbc_dataset_raw.csv'

In [8]:
d2

NameError: name 'd2' is not defined

# bbc_raw

In [24]:
bbc = pd.read_csv('../../corpus/bbc_dataset_raw.csv', encoding = "ISO-8859-1")
bbc.shape

(2225, 2)

In [25]:
bbc.head(1)

Unnamed: 0,news,type
0,China had role in Yukos split-up\n \n China le...,business


In [None]:
bbc2 = bbc.copy()

In [None]:
bbc2['coref_text']=preprocess_parallel_corefs(bbc2['news'], chunksize=1000)

In [None]:
bbc2.iloc[6]

In [None]:
import joblib
joblib.dump(bbc2,'../../corpus/processed/bbc_raw_coref_resolved')

In [26]:
bbc2 = joblib.load('../../corpus/processed/bbc_raw_coref_resolved')

In [23]:
bbc2

Unnamed: 0,news,type,coref_text
0,China had role in Yukos split-up\n \n China le...,business,China had role in Yukos split-up\n \n China le...
1,Oil rebounds from weather effect\n \n Oil pric...,business,Oil rebounds from weather effect\n \n Oil pric...
2,Indonesia 'declines debt freeze'\n \n Indonesi...,business,Indonesia 'declines debt freeze'\n \n Indonesi...
3,$1m payoff for former Shell boss\n \n Shell is...,business,$1m payoff for former Shell boss\n \n Shell is...
4,US bank in $515m SEC settlement\n \n Five Bank...,business,The SEC have agreed to pay a total of $515m (Â...
...,...,...,...
2220,Microsoft launches its own search\n \n Microso...,tech,Microsoft launches Microsoft own search\n \n M...
2221,Warnings about junk mail deluge\n \n The amoun...,tech,Warnings about junk mail deluge\n \n The amoun...
2222,Microsoft gets the blogging bug\n \n Software ...,tech,Microsoft gets the blogging bug\n \n Software ...
2223,Gamers snap up new Sony PSP\n \n Gamers have b...,tech,Gamers snap up new Sony PSP\n \n Gamers have b...


# Additional Preprocessing

## Remove special characters and stopwords


In [None]:

from joblib import Parallel, delayed

def token_filter(token):
    """Tell whether ``token`` should be kept: not punctuation, space or stop word."""
    punct, space, stop = token.is_punct, token.is_space, token.is_stop
    return not (punct or space or stop)

def process_chunk_clean(texts):
    """Return lower-cased, filtered lemmas for each text in the chunk.

    Documents come from the module-level ``nlp`` pipeline (batches of 200);
    tokens rejected by ``token_filter`` are dropped and the remaining lemmas
    are lower-cased and joined with single spaces.
    """
    cleaned = []
    for doc in nlp.pipe(texts, batch_size=200):
        lemmas = [token.lemma_.lower() for token in doc if token_filter(token)]
        cleaned.append(" ".join(lemmas))
    return cleaned
    

def chunker(iterable, total_length, chunksize):
    """Generate successive ``iterable[pos:pos + chunksize]`` slices.

    ``pos`` walks ``range(0, total_length, chunksize)``.
    """
    positions = range(0, total_length, chunksize)
    return (iterable[first:first + chunksize] for first in positions)

def flatten(list_of_lists):
    """Flatten a list of lists to a combined list.

    The description used to be a ``print`` call, which wrote a noise line on
    every invocation; it is a docstring again so the function is silent.
    """
    return [item for sublist in list_of_lists for item in sublist]

def preprocess_parallel_clean(texts, chunksize=100):
    """Apply ``process_chunk_clean`` to ``texts`` on a multiprocessing pool.

    ``chunker`` provides slices of ``chunksize`` texts; joblib ``Parallel``
    (``n_jobs=-1``) runs one task per slice and ``flatten`` merges the
    per-slice lists into a single list.
    """
    chunks = chunker(texts, len(texts), chunksize=chunksize)
    clean_chunk = delayed(process_chunk_clean)
    jobs = (clean_chunk(chunk) for chunk in chunks)
    runner = Parallel(n_jobs=-1, backend='multiprocessing', prefer="processes")
    return flatten(runner(jobs))

def process_chunk_ner(texts):
    """Collect the lower-cased named entities of each text, minus numeric/time ones.

    The texts are streamed through the module-level ``nlp`` pipeline in
    batches of 200. Entities labelled ORDINAL, CARDINAL or TIME are skipped.

    Returns a list with one set of lower-cased entity strings per text.
    """
    # The entity label is read from ``label_``; ``ent_type_`` is a token-level
    # attribute and is not available on the spans yielded by ``doc.ents``.
    excluded_labels = ('ORDINAL', 'CARDINAL', 'TIME')
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=200):
        entidades = {
            ent.text.lower()
            for ent in doc.ents
            if ent.label_ not in excluded_labels
        }
        preproc_pipe.append(entidades)

    return preproc_pipe
    
    
def preprocess_parallel_ner (texts, chunksize = 100):
    """Extract entities from ``texts`` using a joblib multiprocessing pool.

    One ``process_chunk_ner`` task is submitted per ``chunksize``-sized slice
    produced by ``chunker``; the resulting lists are combined with ``flatten``.
    """
    chunks = chunker(texts, len(texts), chunksize=chunksize)
    ner_chunk = delayed(process_chunk_ner)
    jobs = (ner_chunk(chunk) for chunk in chunks)
    runner = Parallel(n_jobs=-1, backend='multiprocessing', prefer="processes")
    return flatten(runner(jobs))


In [None]:
%%time

In [None]:
bbc2['cleaned'] = preprocess_parallel_clean(bbc2['coref_text'], chunksize=1000)

In [None]:
joblib.dump(bbc2,'../../corpus/processed/bbc_coref_resolved_cleaned')

In [None]:
import joblib
bbc2 = joblib.load('../../corpus/processed/bbc_coref_resolved_cleaned')

## NER: tenemos que hacerlo sobre el original para evitar que no lo coja

In [None]:
!pip install -U sacremoses

In [None]:
bbc2['entidades'] = preprocess_parallel_ner(bbc2['coref_text'], chunksize=250)

In [None]:
joblib.dump(bbc2,'../../corpus/processed/bbc_coref_resolved_cleaned')

In [None]:
bbc2.head()

In [None]:
import joblib
df = joblib.load ('../../corpus/processed/bbc_coref_resolved_cleaned')
df.head()

In [None]:
entidades = df['entidades']

In [None]:
# Union of every per-document entity collection into one set.
entidad_set = set()
for entidad in entidades:
    entidad_set.update(entidad)

In [None]:
entidad_set

In [None]:
joblib.dump(entidad_set,'./bbc_objects/entidades_bbc')

In [27]:
import joblib
ents = joblib.load('./bbc_objects/entidades_bbc')

In [None]:
ents

In [None]:
for ent in doc:
    print(ent.label_.lower())

## Crear tabla hash entidades tipos

In [51]:

from joblib import Parallel, delayed

def token_filter(token):
    """Keep-predicate for tokens: False for punctuation, whitespace and stop words."""
    if token.is_punct | token.is_space | token.is_stop:
        return False
    return True

def process_chunk_clean(texts):
    """Turn each text into a space-joined string of lower-cased lemmas.

    Parsing uses the module-level ``nlp`` pipeline with a batch size of 200;
    only tokens for which ``token_filter`` returns True contribute a lemma.
    """
    def lemma_string(doc):
        return " ".join(token.lemma_.lower() for token in doc if token_filter(token))

    return [lemma_string(doc) for doc in nlp.pipe(texts, batch_size=200)]
    

def chunker(iterable, total_length, chunksize):
    """Split ``iterable`` into a lazy sequence of slices of length ``chunksize``.

    Slices start at 0, ``chunksize``, 2 * ``chunksize``, ... below
    ``total_length``; the trailing slice may be shorter.
    """
    slice_starts = range(0, total_length, chunksize)
    return (iterable[lo:lo + chunksize] for lo in slice_starts)

def flatten(list_of_lists):
    """Flatten a list of lists to a combined list.

    The description used to be a ``print`` call, which wrote a noise line on
    every invocation; it is a docstring again so the function is silent.
    """
    return [item for sublist in list_of_lists for item in sublist]

def preprocess_parallel_clean(texts, chunksize=100):
    """Parallel front end for ``process_chunk_clean``.

    ``texts`` is sliced by ``chunker`` into groups of ``chunksize``; every
    group becomes one joblib task on a multiprocessing pool that uses all
    cores, and ``flatten`` concatenates the task results.
    """
    task_factory = delayed(process_chunk_clean)
    pool = Parallel(n_jobs=-1, backend='multiprocessing', prefer="processes")
    chunk_results = pool(
        task_factory(chunk)
        for chunk in chunker(texts, len(texts), chunksize=chunksize)
    )
    return flatten(chunk_results)

def process_chunk_hash_ner(texts):
    """Map each text's named entities to their entity label.

    The texts are streamed through the module-level ``nlp`` pipeline in
    batches of 200. Entities labelled ORDINAL, CARDINAL or TIME are skipped.

    Returns a list with one ``{entity_text_lowercased: label}`` dict per text.
    A fresh dict is built for every document; previously a single dict was
    shared across the whole chunk and appended once per document, so every
    list entry aliased the same accumulated mapping. Merging the returned
    dicts in order with ``dict.update`` yields the same combined mapping as
    before.
    """
    excluded_labels = ('ORDINAL', 'CARDINAL', 'TIME')
    preproc_pipe = []

    for doc in nlp.pipe(texts, batch_size=200):
        diccionario = {}
        for ent in doc.ents:
            if ent.label_ not in excluded_labels:
                diccionario[ent.text.lower()] = ent.label_
        preproc_pipe.append(diccionario)

    return preproc_pipe
    
    
def preprocess_parallel_hash_ner (texts, chunksize = 100):
    """Run ``process_chunk_hash_ner`` over ``texts`` on a multiprocessing pool.

    ``chunker`` yields slices of ``chunksize`` texts, each slice is one joblib
    task (``n_jobs=-1``), and ``flatten`` merges the per-slice lists.
    """
    task_factory = delayed(process_chunk_hash_ner)
    pool = Parallel(n_jobs=-1, backend='multiprocessing', prefer="processes")
    chunk_results = pool(
        task_factory(chunk)
        for chunk in chunker(texts, len(texts), chunksize=chunksize)
    )
    return flatten(chunk_results)


In [52]:
bbc2 = joblib.load ('../../corpus/processed/bbc_coref_resolved_cleaned')

In [55]:
L = preprocess_parallel_hash_ner(bbc2['coref_text'], chunksize=250)
joblib.dump(L,'./bbc_objects/listado_diccionario_ner')
result = {}
for d in L:
    result.update(d)

In [None]:
# ``os`` is not imported anywhere else in the notebook, so bring it in here;
# without it the os.remove call below raises NameError.
import os

# Persist the merged entity -> label mapping, then delete the intermediate
# per-document listing that was dumped earlier.
joblib.dump(result,'./bbc_objects/diccionario_ner')
os.remove('./bbc_objects/listado_diccionario_ner')

# Spotlight: DBpedia Entities

In [None]:
import joblib
import pandas as pd
import spotlight
df = joblib.load('./bbc_objects/bbc_coref_resolved_cleaned')
def anotar(text):
    """Annotate ``text`` through ``spotlight.annotate`` and return its result.

    The request goes to the hard-coded annotate endpoint below with
    ``confidence=0.5`` and ``support=100``.
    """
    endpoint = "http://172.17.0.1:2222/rest/annotate"
    return spotlight.annotate(address=endpoint, text=text, confidence=0.5, support=100)

In [None]:
import time
%time
resultados = df.coref_text.apply(anotar)

In [None]:
df.coref_text[1000]

In [None]:
resultados[1000]
class hashabledict(dict):
    """``dict`` subclass whose hash is derived from its sorted ``(key, value)`` pairs.

    Hashing requires the items to be sortable and every key and value to be
    hashable.
    """

    def __hash__(self):
        ordered_items = sorted(self.items())
        return hash(tuple(ordered_items))

res = hashabledict(resultados)

In [None]:
joblib.dump(res,'./bbc_objects/entidadesdbpedia_bbc')

In [1]:
import joblib
df = joblib.load('../bbc_objects/entidadesdbpedia_bbc')

AttributeError: module '__main__' has no attribute 'hashabledict'

In [None]:
corpus = joblib.load('./bbc_objects/bbc_processed_final')

In [None]:

resultados = joblib.load('./bbc_objects/entidadesdbpedia_bbc')
resultados[1000]

In [None]:
corpus['entidades_dbpedia'] = resultados
joblib.dump(corpus,'./bbc_objects/bbc_processed_final_semantic')

In [None]:
corpus

In [None]:
df = joblib.load('./bbc_objects/bbc_coref_resolved_cleaned')

In [None]:
resultados[10][1].get('URI')
resultados[10][1].keys()

In [None]:
corpus.iloc[2,6].remove(1)

In [None]:
# Análisis de entidades

In [None]:
corpus1 = joblib.load('./bbc_objects/bbc_processed_final_semantic')
corpus1.head(1)

In [None]:
corpus2 = joblib.load('../../corpus/processed/bbc_coref_resolved_cleaned')
corpus2.head(1)

In [None]:
import editdistance

In [None]:
# For every document in corpus1, record the index of the corpus2 document
# whose 'cleaned' text has the smallest edit distance to it.
# NOTE(review): a candidate sharing the same index label is skipped -- confirm
# this is intended, since it excludes a true match stored at the same index.
listaindices = []
candidatos = list(corpus2['cleaned'].items())
for i1, texto1 in corpus1['cleaned'].items():
    actual = 0
    minimo = 100000000
    for i2, texto2 in candidatos:
        if i1 != i2:
            distancia = editdistance.eval(texto1, texto2)
            # Strict comparison: on ties the first candidate found wins.
            if distancia < minimo:
                actual, minimo = i2, distancia
    listaindices.append(actual)

In [None]:
joblib.dump(listaindices,'correspondencias')

In [None]:
corpus1.cleaned[1982]

In [None]:
corpus2.cleaned[listaindices[1982]]

In [None]:
!pwd

In [None]:
import joblib

In [None]:
corpus1 = joblib.load('./bbc_objects/bbc_processed_final_semantic')

In [None]:
corpus

In [None]:
correspondencias = joblib.load('bbc_objects/correspondencias')

In [None]:
corpus2 = joblib.load('../../corpus/processed/bbc_coref_resolved_cleaned')
corpus2.head(1)

In [None]:
listado_entidades_originales = corpus2.loc[:,'entidades'][correspondencias]
listado_entidades_originales.reset_index(drop=True)

In [None]:
corpus1['entidades']=listado_entidades_originales.reset_index(drop=True)

In [None]:
joblib.dump(corpus1,'./bbc_objects/bbc_processed_final_semantic')

In [None]:
corpus1.head()

In [None]:
corpus1.iloc[1,1]

## DESPUES DE TOPIC MODELING: Añadimos informacion dbpedia 

In [2]:
resultados = joblib.load('./bbc_objects/entidadesdbpedia_bbc-Copy1')


In [3]:
corpus1['entidades_dbpedia']=resultados[correspondencias].reset_index(drop=True)

NameError: name 'correspondencias' is not defined

In [None]:
joblib.dump(corpus1,'./bbc_objects/bbc_processed_final_semantic')

In [4]:
corpus1.head()

NameError: name 'corpus1' is not defined

## Nos quedamos con la informacion util de las entidades dbpedia: uri, surfaceForm y Types

In [14]:
import pandas as pd
import joblib
corpus1 = joblib.load('./bbc_objects/bbc_processed_final_semantic')

In [15]:
corpus1

Unnamed: 0,category,text,coref_text,cleaned,entidades,new_target,entidades_dbpedia
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...,"{Adam Hume, Older, Windows, one, Europe, Tim H...",3,[{'URI': 'http://dbpedia.org/resource/Televisi...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,worldcom boss leave book worldcom boss bernie ...,"{Monday, about $180bn, Reid Weingarten, last t...",1,[{'URI': 'http://dbpedia.org/resource/MCI_Inc....
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester tiger wary...,"{England, BBC Radio Leicester, five weeks ago,...",5,[{'URI': 'http://dbpedia.org/resource/Colin_Fa...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,yeade face newcastle fa cup premiership newcas...,"{Championship, Sheffield United, Sheff Utd, ea...",2,[{'URI': 'http://dbpedia.org/resource/Yeading_...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,ocean s raid box office ocean s crime caper se...,"{$184m, Julia Roberts, Blade:, 2001, Twelve, o...",0,[{'URI': 'http://dbpedia.org/resource/Crime_fi...
...,...,...,...,...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,cars pull down us retail figures us retail sal...,car pull retail figure retail sale fall 0.3 ja...,"{January, Popular Securities, just 0.3%, Chris...",1,"[{'URI': 'http://dbpedia.org/resource/Car', 's..."
2221,politics,kilroy unveils immigration policy ex-chatshow ...,kilroy unveils immigration policy ex-chatshow ...,kilroy unveil immigration policy ex chatshow h...,"{Robert Kilroy-Silk, Â£2bn, Veritas, 14,000, t...",6,[{'URI': 'http://dbpedia.org/resource/Robert_K...
2222,entertainment,rem announce new glasgow concert us band rem h...,rem announce new glasgow concert us band rem h...,rem announce new glasgow concert band rem anno...,"{Tuesday, Europe, Glasgow, Balloch Castle Coun...",4,[{'URI': 'http://dbpedia.org/resource/Glasgow'...
2223,politics,how political squabbles snowball it s become c...,how political squabbles snowball it s become c...,political squabble snowball s commonplace argu...,"{Blair, Google, two, Blair and Brown, America,...",6,[{'URI': 'http://dbpedia.org/resource/Snowball...


In [None]:
cont = 0
def simplificar_entidades(entidad):
    """Reduce one annotation dict to ``{surfaceForm: {'URI': ..., 'tipos': ...}}``.

    Missing keys yield ``None`` (via ``dict.get``); for an empty ``entidad``
    the types entry is ``None`` as well.
    """
    tipos = entidad.get('types') if entidad.keys() else None
    resumen = {'URI': entidad.get('URI'), 'tipos': tipos}
    return {entidad.get('surfaceForm'): resumen}

def devolver_entidades_limpias_df (row):
    """Merge the simplified DBpedia annotations of one row into a single dict.

    Every annotation in ``row['entidades_dbpedia']`` is passed through
    ``simplificar_entidades`` and the resulting one-entry dicts are merged
    in order, so a later annotation with the same surface form overwrites
    an earlier one.

    Compared with the previous version this drops the per-row debug print,
    the unused ``collections`` import and the detour through ``pd.Series``.

    ``row['entidades_dbpedia']`` is presumably a list of annotation dicts --
    verify against the column contents.
    """
    entidades_simplificadas = {}
    for entidad in row['entidades_dbpedia']:
        entidades_simplificadas.update(simplificar_entidades(entidad))
    return entidades_simplificadas

corpus1['entidades_dbpedia_simplificadas'] = corpus1.apply(devolver_entidades_limpias_df,axis=1)

In [None]:
corpus1.head()['entidades_dbpedia_simplificadas'][0]

## Consolidamos entidades vs entidades_dbpedia

In [None]:
#cargamos dbpedia y sumo
from spotlight import *
from SPARQLWrapper import SPARQLWrapper, JSON, CSV

class TextAnalyzer(object):
    """Text normalisation helpers built around a spaCy-style pipeline.

    Instances are callable: calling one with a string applies the stored
    pipeline and returns the lower-cased lemmas of the tokens that are
    neither stop words nor punctuation.
    """

    def __init__(self,nlp):
        # Pipeline used by __call__ and is_present; also exposed by get_nlp().
        self.nlp = nlp 

    @staticmethod      
    def remove_special_lines(texto):
        """Remove markup residue from ``texto`` and return the cleaned string.

        Strips, in order: a leading ``upright=`` line, ``Category:`` /
        ``Cat...:`` runs up to a line terminator, bracketed numbers such as
        ``[12]``, the word ``thumb``, pipe characters and ``<digits>px``.
        """
        import re  # local import: ``re`` is not imported at notebook level

        texto = re.sub(r"^upright=.*[\r|\n]", '', texto)
        texto = re.sub(r"^upright = .*[\r|\n]", '', texto)
        texto = re.sub(r"Category:.*[\r|\n]",'',texto)
        texto = re.sub(r"Cat\D*:.*[\r|\n]",'',texto)
        texto = re.sub(r"\[\d+\]",'',texto)
        texto = re.sub(r"thumb",'',texto)
        texto = re.sub(r"[|]",'',texto)
        texto = re.sub(r"\d+px",'',texto)
        return (texto)

    @staticmethod
    def strip_formatting(string):
        """Lower-case ``string`` and drop URLs and punctuation characters.

        URLs (``http://`` / ``https://`` up to and including the next
        whitespace character) are removed first, because stripping ``/`` and
        ``.`` beforehand would prevent the URL pattern from ever matching.
        """
        import re  # local import: ``re`` is not imported at notebook level

        string = string.lower()
        string = re.sub(r'https?:\/\/.*?[\s]', '', string) 
        # The previous pattern closed its character class too early and failed
        # to compile ("multiple repeat"); every symbol it listed now sits in
        # one class, with '-' and ']' escaped so they are literals.
        string = re.sub(r"[.!?,;\-_'/|()\]=<>+*`]", r"", string)
        return string

    def get_nlp(self):
        """Return the pipeline this analyzer was constructed with."""
        return self.nlp

    def _lemmatize(self, text):
        """Return the lower-cased lemmas of ``text``, skipping stop words and punctuation."""
        return [token.lemma_.lower() for token in self.nlp(text)
                if not (token.is_stop or token.is_punct)]

    def __call__(self, doc):
        """Tokenise ``doc`` with this instance's pipeline and return its lemma list."""
        return self._lemmatize(doc)

    def is_present (self,word,text):
        """Return True when ``word`` is a substring of the normalised ``text``.

        ``text`` is normalised to its space-joined lower-cased lemmas. The
        private helper is used rather than ``self(text)`` so that subclasses
        overriding ``__call__`` do not change this check.
        """
        normalizado = " ".join(self._lemmatize(text))
        return (word in (normalizado))

class SemanticAnalyzer(TextAnalyzer):
    """TextAnalyzer whose call annotates text through ``spotlight.annotate``.

    Parameters
    ----------
    nlp : pipeline forwarded to ``TextAnalyzer``.
    endpoint : URL of the annotate service.
    soporte : value passed as ``support`` to ``spotlight.annotate``.
    confianza : value passed as ``confidence`` to ``spotlight.annotate``.
    umbral : stored as ``self.alfa``; not used by the code in this class.
    """

    def __init__(self,nlp,endpoint="http://172.17.0.1:2222/rest/annotate",soporte=1000,confianza=0.5,umbral=0.1):
        super().__init__(nlp)
        self.endpoint = endpoint
        self.soporte=soporte
        self.confianza = confianza
        self.alfa = umbral
    
    def __call__(self, doc):
        """Annotate ``doc`` and return a ``{surfaceForm: URI}`` dict.

        Any exception raised while annotating or reading the annotations is
        printed and ``None`` is returned (best-effort behaviour kept from the
        original implementation).
        """
        try:
            annotations = spotlight.annotate(self.endpoint,
                                             doc,
                                             confidence=self.confianza,
                                             support=self.soporte,
                                             spotter='Default')
            diccionario = dict()
            for annotation in annotations:
                # Read the fields by key instead of by position in
                # list(annotation.items()), which silently depended on the
                # key order of the service response.
                diccionario[annotation['surfaceForm']] = annotation['URI']
            return diccionario
        except Exception as ex:
            print(ex)
            return None

import spacy
import time 
from owlready2 import *
import json 
from spotlight import *

class OntoManager(object):
    """Resolve a term against the 'dbo' and 'SUMO' ontologies and gather its semantics.

    Parameters
    ----------
    nlp : pipeline handed to the internal ``SemanticAnalyzer``.
    dict_onto : mapping with keys 'dbo' and 'SUMO' whose values expose
        ``search(label=..., _case_sensitive=...)``.
    dict_graph : mapping with keys 'dbo' and 'SUMO' whose values expose
        ``query(sparql_string)``.
    """

    def __init__(self,nlp,dict_onto,dict_graph):
        
        self.dict_onto = dict_onto
        self.dict_graph = dict_graph
        
        # Prefix header prepended to every SPARQL query built by this class.
        self.prefijos = """  PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
                                PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                                PREFIX dbr:    <http://dbpedia.org/resource/>
                                PREFIX dbo:    <http://dbpedia.org/ontology/>
                                PREFIX dct:    <http://purl.org/dc/terms/>
                                PREFIX owl:    <http://www.w3.org/2002/07/owl#>
                                PREFIX rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                                PREFIX rdfs:   <http://www.w3.org/2000/01/rdf-schema#>
                                PREFIX schema: <http://schema.org/>
                                PREFIX skos:   <http://www.w3.org/2004/02/skos/core#>
                                PREFIX xsd:    <http://www.w3.org/2001/XMLSchema#>
                                PREFIX SUMO: <http://www.adampease.org/OP/SUMO.owl#>
                            """
        self.sa = SemanticAnalyzer(nlp)
        
    def getSemanticsOfTerm(self,term):   
        """Return a dict describing ``term``, or ``None`` if neither ontology knows it.

        The dict has the keys 'concepto', 'tipos', 'resources', 'padres',
        'hijos' and 'relaciones'. 'resources' comes from the semantic
        analyzer for 'dbo' terms and is an empty dict for SUMO terms.
        """
        isDbo = self._isDbo(term)
        if (isDbo is None):
            return None
        
        concept = self._getBaseConcept(term,isDbo)
        if (isDbo):
            resources = self.sa(term)
        else:
            resources = {}
        # isSuperclass=False asks for the classes ``concept`` is a subclass of.
        superclasses = self._getHierarchy(str(concept),isDbo,False)
        # isSuperclass=True asks for the classes that are subclasses of ``concept``.
        subclasses = self._getHierarchy(str(concept),isDbo,True)
        relationships = self._getRelationships(str(concept),isDbo)
        types = self._getDBPediaTypes(str(concept))
        
        termino = dict({'concepto':concept, 'tipos':types,'resources':resources,'padres':superclasses,'hijos':subclasses,'relaciones':relationships})
        
        return termino

    # Imported in the class body so that ``JSON`` is resolvable as the default
    # value of ``tipo`` below.
    from SPARQLWrapper import SPARQLWrapper, JSON, CSV


    def ejecutar_consulta_dbpedia(self,query,tipo=JSON):
        """Run ``query`` (without prefixes) against http://dbpedia.org/sparql.

        ``self.prefijos`` is prepended here, so callers must pass the bare
        query body. ``tipo`` selects the return format. The query is printed
        before it is sent.
        """
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setReturnFormat(tipo)

        sparql.setQuery(self.prefijos+query)  # prefixes + query body
        print(query)
        return sparql.query().convert()

    #def getTypesOfURI (self, uri):
        
    def _isDbo (self,term):
        """Return True if ``term`` is a label in 'dbo', False if in 'SUMO', else None."""
        res = self.dict_onto.get('dbo').search(label=term,_case_sensitive=False)
        if (len(res)>0):
            return True
        res = self.dict_onto.get('SUMO').search(label=term,_case_sensitive=False)
        if (len(res)>0):
            return False
        else:
            return None

    def _getBaseConcept(self,term,isDbo):
        """Return the first matching entity as a string with '.' replaced by ':'.

        'dbo' is searched first when ``isDbo`` is truthy; if that yields
        nothing (or ``isDbo`` is falsy) 'SUMO' is searched. Returns ``None``
        when no match is found.
        """
        if (isDbo):
            res = self.dict_onto.get('dbo').search(label=term,_case_sensitive=False)
            if (len(res)>0):
                return str(res[0]).replace('.',':')
        
        res = self.dict_onto.get('SUMO').search(label=term,_case_sensitive=False)
        if (len(res)>0):
            return str(res[0]).replace('.',':')
        else:
            return None
    
    def _getDBPediaTypes(self, term):
        """Return the ``rdf:type`` values of ``term``.

        The local 'dbo' graph is queried first; when it returns nothing the
        public DBpedia endpoint is queried in CSV format.

        ``term`` is either a prefixed name or a ``<...>`` URI. Only a leading
        ``dbo:`` prefix is rewritten to ``dbr:``; the previous blanket
        ``replace('dbo','dbr')`` also rewrote any 'dbo' substring inside a
        resource URI.
        """
        if term.startswith('dbo:'):
            term = 'dbr:' + term[len('dbo:'):]
        cuerpo = """
       select ?o where {"""+term+""" rdf:type ?o}

        """
        consulta = self.prefijos + cuerpo
        print(consulta)
        lista = list(self.dict_graph.get('dbo').query(consulta))
        if len(lista)==0:
            # Pass only the body: ejecutar_consulta_dbpedia adds the prefixes
            # itself (they used to be sent twice).
            res = self.ejecutar_consulta_dbpedia(cuerpo, CSV)
            # NOTE(review): this parses str(res) by stripping double quotes and
            # splitting on a literal backslash-n, dropping the first piece;
            # presumably ``res`` is bytes and the first piece is the CSV
            # header -- confirm.
            lista = str(res).replace("""\"""",'').split('\\n')[1:]
        return lista
        
    def _getHierarchy(self,concept,isDbo,isSuperclass):
        """Return the owl:Class neighbours of ``concept`` in the class hierarchy.

        With ``isSuperclass`` True the query selects classes that are
        ``rdfs:subClassOf`` ``concept``; with False it selects the classes
        ``concept`` is ``rdfs:subClassOf``. The 'SUMO' graph is used when
        ``isDbo`` is False, otherwise the 'dbo' graph.
        """
        if isSuperclass == False:
            consulta = self.prefijos + """SELECT ?x
            WHERE {
                ?x a owl:Class .
                """+concept+""" rdfs:subClassOf ?x 
                }"""
        else:
            consulta = self.prefijos + """SELECT ?x
            WHERE {
                ?x a owl:Class .
                ?x rdfs:subClassOf """+concept+"""
                }"""

        if (isDbo==False):
            return (list(self.dict_graph.get('SUMO').query(consulta)))  
        else:
            return (list(self.dict_graph.get('dbo').query(consulta)))
    
    def _getRelationships(self,concept,isDbo):
        """Return, as a JSON string, the properties of ``concept`` other than rdf:type and rdfs:label.

        The query runs on the 'SUMO' graph when ``isDbo`` is False, otherwise
        on the 'dbo' graph. Rows are loaded into a DataFrame with the columns
        'term', 'property', 'comment', 'label', 'range', 'domain'.
        """
        consulta = self.prefijos + """select distinct * where {
                  """+concept+""" ?property ?value .
                  filter ( ?property not in ( rdf:type ) )
                   filter ( ?property not in ( rdfs:label ) )
                optional {?property rdfs:comment ?comment}
                  optional {?property rdfs:label ?label}
                  optional {?property rdfs:range ?range} 
                  optional {?property rdfs:domain ?domain} 
                }
        """
        
        if (isDbo==False):
            resultado =  (list(self.dict_graph.get('SUMO').query(consulta)))  
        else:
            resultado = (list(self.dict_graph.get('dbo').query(consulta)))
        
        res = pd.DataFrame(data = resultado, columns = ['term','property','comment','label','range','domain'])
        # A former ``res.set_index(res.term)`` call discarded its result and
        # had no effect; it was removed so the JSON layout stays as it was.
        return (res.to_json())
        

### 1º buscar tipos vacíos:

In [None]:
from owlready2 import *
import pandas as pd
nlp = spacy.load("en_core_web_md")
myworld1 = World()
sumo =myworld1.get_ontology("file:///home/raul/doctorado/ontologias/SUMO.owl").load()
graphsumo = myworld1.as_rdflib_graph()
myworld2 = World()
dbpedia = myworld2.get_ontology("file:///home/raul/doctorado/ontologias/dbpedia_3.9.owl/").load()
graphdbo = myworld2.as_rdflib_graph()
#dbpedia.base_iri = "http://dbpedia.org/ontology/"
dbpedia.name='dbo'
dict_onto = dict([('dbo',dbpedia),('SUMO',sumo)])
dict_graph = dict([('dbo',graphdbo),('SUMO',graphsumo)])
alfred = OntoManager(nlp,dict_onto,dict_graph)


In [None]:
import pandas as pd
#corpus1 = joblib.load('./bbc_objects/bbc_processed_final_semantic')
cont = 0

def devolver_nuevos_tipos (row):
    """Fill in missing DBpedia types for the simplified entities of one row.

    For every ``termino -> propiedades`` pair in
    ``row['entidades_dbpedia_simplificadas']`` whose ``'tipos'`` entry is
    empty (``None``, ``''`` or ``[]``), the types are fetched with
    ``alfred._getDBPediaTypes`` using the entity URI wrapped in ``<...>``.

    Returns a ``defaultdict(list)`` mapping each term to a one-element list
    holding its properties dict. The properties dicts are updated in place.
    """
    # Local import: ``defaultdict`` is not imported explicitly in the notebook.
    from collections import defaultdict

    cluster_dict = defaultdict(list)
    for termino, propiedades in row['entidades_dbpedia_simplificadas'].items():
        # ``not`` also covers None, which previously crashed on len(None).
        if not propiedades['tipos']:
            dbr = '<'+propiedades['URI']+'>'
            propiedades['tipos']=alfred._getDBPediaTypes(dbr)
        cluster_dict[termino].append(propiedades)
        
    return cluster_dict


In [None]:
minicorupus = corpus1.iloc[0:3,:]
corpus1['entidades_dbpedia_simplificadas'] = corpus1.apply(devolver_nuevos_tipos,axis=1)

### Ejecutamos `!python3 dbpedia_tipos.py > salida_tipos.out&`

In [95]:
newcorpus = joblib.load('./bbc_objects/bbc_processed_final_semantic_2')

NameError: name 'newcorpus' is not defined

## Guardamos diccionario entidadesdbpedia vs topic

In [1]:
import joblib
newcorpus = joblib.load('../bbc_objects/bbc_processed_final_semantic_2')

In [2]:
newcorpus.head()

Unnamed: 0,category,text,coref_text,cleaned,entidades,new_target,entidades_dbpedia,entidades_dbpedia_simplificadas
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...,"{Adam Hume, TiVo DVR, One, today, Bill Gates, ...",3,[{'URI': 'http://dbpedia.org/resource/Televisi...,{'TV': [{'URI': 'http://dbpedia.org/resource/T...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,worldcom boss leave book worldcom boss bernie ...,"{2002, 11bn, Reid Weingarten, PE, the late 199...",1,[{'URI': 'http://dbpedia.org/resource/MCI_Inc....,{'Worldcom': [{'URI': 'http://dbpedia.org/reso...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester tiger wary...,"{Saracens, another three months, Great Britain...",5,[{'URI': 'http://dbpedia.org/resource/Colin_Fa...,{'Farrell': [{'URI': 'http://dbpedia.org/resou...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,yeade face newcastle fa cup premiership newcas...,"{Milton Keynes Dons, Gillingham, Third, Presto...",2,[{'URI': 'http://dbpedia.org/resource/Yeading_...,{'Yeading': [{'URI': 'http://dbpedia.org/resou...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,ocean s raid box office ocean s crime caper se...,"{Ocean, the New York Times, the 1960s, Andy Ga...",0,[{'URI': 'http://dbpedia.org/resource/Crime_fi...,{'crime': [{'URI': 'http://dbpedia.org/resourc...


In [3]:
# Build, for every topic id in ``new_target``, one dict merging the simplified
# DBpedia entities of all documents assigned to that topic, then persist it.
topic_dbpedia_ent = {}
for target in newcorpus.new_target.unique():
    mascara = newcorpus.new_target==target
    entidades = newcorpus.loc[mascara,'entidades_dbpedia_simplificadas']
    print(len(entidades))
    L = entidades
    result = {}
    for d in L:
        # Later documents overwrite earlier ones on equal surface forms.
        result.update(d)
    topic_dbpedia_ent[target] = result

joblib.dump(topic_dbpedia_ent,'../bbc_objects/diccionario_topic_entidades_dbpedia')

355
508
289
112
167
439
355


['../bbc_objects/diccionario_topic_entidades_dbpedia']

In [91]:
#entidades=joblib.load('./bbc_objects/entidades_bbc')
#topics =joblib.load('./bbc_objects/new_bbc_topics_7')
#corpus = joblib.load('./bbc_objects/bbc_processed_final')

In [88]:
corpus3 = joblib.load('./bbc_objects/entidades_bbc')

In [98]:
corpus.new_target

0       3
1       1
2       5
3       2
4       0
       ..
2220    1
2221    6
2222    4
2223    6
2224    2
Name: new_target, Length: 2225, dtype: int64

In [94]:
newcorups.new_target

NameError: name 'newcorups' is not defined