# Извлечение информации

([Задание для семинара](#scrollTo=_r19P5FYuGk2&line=1&uniqifier=1), 
[Домашнее задание](#scrollTo=PbBbqFUeQPcS&line=1&uniqifier=1))


Задачей извлечения информации является получение структурированного знания из набора неструктурированных текстов. 

## 1. Открытое извлечение информации (Open information extraction)

Задача: извлечь из текста структурированную информацию в виде троек отношений: (объект, предикат, субъект)

Как решается: поиском потомков предиката в синтаксическом дереве согласно некоторым правилам

In [1]:
import os
import time

from urllib import request
from bs4 import BeautifulSoup

from nltk import sent_tokenize

import spacy
from spacy.util import minibatch, compounding
#from spacy.pipeline import SentenceSegmenter
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    return True

url = 'https://www.bbc.com/news/health-54540544'

html = request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.findAll(text=True)
visible_texts = filter(tag_visible, data)
text = u" ".join(t.strip() for t in visible_texts)
print(text[700:1200])

 the brain to the heart and the skin "Long Covid" - the long-lasting impact of coronavirus infection - may be affecting people in four different ways, according to a review. And this could explain why some of those with continuing symptoms are not being believed or treated. There could be a huge psychological impact on people living with long-term Covid-19, the National Institute for Health Research report says. They need more support - and healthcare staff require better information. Life-chang


In [3]:
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights))

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos

def printDeps(toks):
    for tok in toks:
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights])

In [4]:
nlp = spacy.load('en_core_web_sm')

tok = nlp(text)
svos = findSVOs(tok)
svos

[('skip', 'help', 'account'),
 ('weather', 'sounds', 'war'),
 ('long', 'reads', 'coronavirus'),
 ('symptoms', 'affect', 'everything'),
 ('staff', 'require', 'information'),
 ('you', 'enable', 'javascript'),
 ('video', 'enable', 'javascript'),
 ('review', 'found', 'symptoms'),
 ('symptoms', 'affecting', 'everything'),
 ('some', 'had', 'stay'),
 ('who', 'had', 'infection'),
 ('coming', 'help', 'support'),
 ('others', 'reporting', 'experiences'),
 ('that', 'follow', 'infection'),
 ('clinics', 'set', 'covid'),
 ('she', 'assumed', 'those'),
 ('there', 'are', 'people'),
 ('record', 'having', 'covid'),
 ('who', 'suffering', 'more'),
 ('effects', 'put', 'burden'),
 ('sons', 'taken', 'source'),
 ('source', 'experiencing', 'symptoms'),
 ('symptoms', 'having', 'impact'),
 ('partner', 'experiencing', 'symptoms'),
 ('ash', 'experiencing', 'symptoms'),
 ('sons', 'take', 'cooking'),
 ('sons', 'take', 'cleaning'),
 ('we', 'need', 'support'),
 ('jo', 'had', 'pneumonia'),
 ('we', 'made', 'wills'),
 ('i'

### Применение

Существуют готовые более сложные системы открытого извлечения информации. [Пример применения](https://openie.allenai.org/#).

Модуль [CoreNLP OpenIE](https://nlp.stanford.edu/software/openie.html) для открытого извлечения информации, с помощью которого можно обнаруживать триплеты в неструктурированном тексте, основывается на нахождении синтаксических шаблонов в де­реве зависимостей следующим образом: исходное предложение разбивается на множество клауз, затем выделяются предикат и аргументы на основе правил, заданных вручную. В качестве преди­катов рассматриваются токены, находящиеся между двумя аргументами (именными группами) в дереве зависимостей, например, «play with» для клаузы «cats play with yarn». Также извлекаются номинальные отношения из именных групп, например, «’s» для именной группы «IBM’s researchgroup». Данная система демонстрирует качество 28.3% F-­меры на наборе данных [Knowledge Base Population](http://www.surdeanu.info/mihai/papers/kbp2013.pdf). 

**Задание (Семинар, 2 балла):** используйте модуль OpenIE для наивного поиска ответа на вопрос по тексту:

> **Текст:** "Born in Moscow, Pushkin was raised by nursemaids and French tutors, and spoke mostly French until the age of ten. He learned some Russian through his nanny, Arina Rodionovna, who he loved dearly. He published his first poem at the age of 15. When he finished school, as part of the first graduating class of the prestigious Imperial Lyceum in Tsarskoye Selo, near Saint Petersburg, his talent was already widely recognized on the Russian literary scene. At the Lyceum, he was a student of David Mara, a younger brother of French revolutionary Jean-Paul Marat. At After school, Pushkin plunged into the vibrant and raucous intellectual youth culture of St. Petersburg. St. Petersburg was then the capital of the Russian Empire. In 1820, Pushkin published his first long poem, Ruslan And Ludmila, with much controversy about its subject and style."

> **Q:** "Who taught Pushkin to speak Russian?"
>
> **A:** ('He', 'learned Russian through', 'Arina Rodionovna')

> **Q:** "What was the capital of the Russian Empire?"
>
> **A:** ('St. Petersburg', 'was capital of', 'Russian Empire')

Загружаем Stanford CoreNLP:



In [5]:
! wget https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip && unzip -q stanford-corenlp-full-2018-10-05.zip

--2022-12-27 01:38:57--  https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-full-2018-10-05.zip [following]
--2022-12-27 01:38:58--  https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-full-2018-10-05.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 393239982 (375M) [application/zip]
Saving to: ‘stanford-corenlp-full-2018-10-05.zip’


2022-12-27 01:40:11 (5.18 MB/s) - ‘stanford-corenlp-full-2018-10-05.zip’ saved [393239982/393239982]



In [6]:
os.environ["CORENLP_HOME"] = 'stanford-corenlp-full-2018-10-05'

Устанавливаем библиотеку с оберткой CoreNLP:

In [7]:
! pip install stanza

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.4.2-py3-none-any.whl (691 kB)
[K     |████████████████████████████████| 691 kB 36.4 MB/s 
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[K     |████████████████████████████████| 240 kB 68.6 MB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=1a1ed803077c7d7ede0196cc9e44defefdc509441e9972b874573ab72767aade
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stanza-1.4.2


In [8]:
import stanza

stanza.install_corenlp()



Запускаем CoreNLP:

In [9]:
from stanza.server import CoreNLPClient

client = CoreNLPClient(timeout=150000000, be_quiet=True, annotators=['openie'], 
endpoint='http://localhost:9001')
client.start()
time.sleep(10)

INFO:stanza:Writing properties to tmp file: corenlp_server-a750e583b82b4c3a.props
INFO:stanza:Starting server with command: java -Xmx5G -cp stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 150000000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-a750e583b82b4c3a.props -annotators openie -preload -outputFormat serialized


Аннотируем текст, смотрим на триплеты, извлеченные из первого предложения:

In [10]:
paragraph = "Born in Moscow, Pushkin was raised by nursemaids and French tutors, and spoke mostly French until the age of ten. He learned some Russian through his nanny, Arina Rodionovna, who he loved dearly. He published his first poem at the age of 15. When he finished school, as part of the first graduating class of the prestigious Imperial Lyceum in Tsarskoye Selo, near Saint Petersburg, his talent was already widely recognized on the Russian literary scene. At the Lyceum, he was a student of David Mara, a younger brother of French revolutionary Jean-Paul Marat. At After school, Pushkin plunged into the vibrant and raucous intellectual youth culture of St. Petersburg. St. Petersburg was then the capital of the Russian Empire. In 1820, Pushkin published his first long poem, Ruslan And Ludmila, with much controversy about its subject and style."

In [11]:
ann = client.annotate(paragraph)

In [12]:
for triple in ann.sentence[0].openieTriple:
  print(triple)
  print()

subject: "Pushkin"
relation: "was raised by"
object: "nursemaids"
confidence: 1.0
tree {
  node {
    sentenceIndex: 0
    index: 5
  }
  node {
    sentenceIndex: 0
    index: 6
  }
  node {
    sentenceIndex: 0
    index: 7
  }
  node {
    sentenceIndex: 0
    index: 8
  }
  node {
    sentenceIndex: 0
    index: 9
  }
  edge {
    source: 7
    target: 5
    dep: "nsubjpass"
    isExtra: false
    sourceCopy: 0
    targetCopy: 0
    language: UniversalEnglish
  }
  edge {
    source: 7
    target: 6
    dep: "auxpass"
    isExtra: false
    sourceCopy: 0
    targetCopy: 0
    language: UniversalEnglish
  }
  edge {
    source: 7
    target: 9
    dep: "nmod:agent"
    isExtra: false
    sourceCopy: 0
    targetCopy: 0
    language: UniversalEnglish
  }
  edge {
    source: 9
    target: 8
    dep: "case"
    isExtra: false
    sourceCopy: 0
    targetCopy: 0
    language: UniversalEnglish
  }
  root: 7
}
subjectTokens {
  sentenceIndex: 0
  tokenIndex: 4
}
relationTokens {
  senten

In [13]:
triples = []
# ToDO: Здесь заполняем список триплетов значениями (субъект, предикат, объект), чтобы сохранялся вывод следующей ячейки
for sent in ann.sentence:
  d = sent.openieTriple
  for triple in sent.openieTriple:
    triples.append((triple.subject, triple.relation, triple.object))

In [14]:
len(triples), triples[:3]

(55,
 [('Pushkin', 'was raised by', 'nursemaids'),
  ('Pushkin', 'Born in', 'Moscow'),
  ('Pushkin', 'spoke until', 'age of ten')])

Находим ответ - триплет, в котором предикат больше всего похож на вопрос:

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


vectorizer = TfidfVectorizer(ngram_range=(1,2), norm='l2')
t_vectors = vectorizer.fit_transform([tr[1] for tr in triples])

def find_answer(question):
  """ Всем уже знакомое вычисление косинусной близости """
  q_vec = vectorizer.transform([question])
  cos_sim = q_vec @ t_vectors.T
  
  return triples[np.argmax(cos_sim)]

In [16]:
find_answer("Who taught Pushkin to speak Russian?")

('He', 'learned Russian through', 'Arina Rodionovna')

In [17]:
find_answer("What was the capital of the Russian Empire?")

('St. Petersburg', 'was capital of', 'Russian Empire')

In [24]:
# ToDO: Придумайте свой пример, для которого сработает этот метод
find_answer("Where Pushkin was born?")

('Pushkin', 'Born in', 'Moscow')