In [13]:
import spacy
from spacy import displacy
import requests
from bs4 import BeautifulSoup
import wikipediaapi
from wikipediaapi.wikipedia import WikipediaPage, WikipediaPageSection
from uuid import uuid4
import hashlib
from itertools import zip_longest

In [14]:
# Load the English pipeline by its full package name. The bare 'en' shortcut
# link is not supported by spaCy v3+, whereas the package name works wherever
# the model package is installed.
# NOTE(review): assumes 'en' was linked to the default small model
# (en_core_web_sm) — confirm if a different model had been linked.
nlp = spacy.load('en_core_web_sm')

In [141]:
# Sample sentence listing several painting titles; it is parsed with the
# loaded pipeline and used as input by the extraction experiments that follow.
s = """Known works from this period include a small Boy Peeling a Fruit (his earliest known painting), a Boy with a Basket of Fruit, and the Young Sick Bacchus, supposedly a self-portrait done during convalescence from a serious illness that ended his employment with Cesari."""
doc = nlp(s)

In [109]:
# Draw the dependency parse of `doc` inline, using the compact arc layout.
# Colour overrides that were tried and left disabled:
#   'bg': '#09a3d5', 'color': 'white'
options = dict(compact=True, font='Source Sans Pro')
displacy.render(doc, style='dep', options=options, jupyter=True)

In [29]:
import re

In [71]:
# One string per detected sentence, with line breaks replaced by spaces so
# that each sentence fits on a single line.
# `Span.text_with_ws` replaces the deprecated `Span.string` (removed in
# spaCy v3); both return the sentence text including trailing whitespace.
sentences = [re.sub(r"\n|\r", " ", el.text_with_ws) for el in doc.sents]

In [106]:
# First attempt at a title pattern. It is overwritten by the assignment on the
# next statement, so it never takes effect.
pat = r"[A-Z]\w+ (\w+\s+?)+(?= \(\w+\)[.,])?"
# Pattern actually in effect: a capitalised token, followed by up to four more
# space-separated tokens (matched lazily), ending at a space or one of . , ; :
pat = r"[A-Z][\w\'.-]+(?: [\w\'.-]+?){,4}(?= \(\w+\)[.,])?(?:[ \.\,\;\:])"

In [103]:
sentences

['Known works from this period include a small Boy Peeling a Fruit (his earliest known painting), a Boy with a Basket of Fruit, and the Young Sick Bacchus,  supposedly a self-portrait done during convalescence from a serious illness that ended his employment with Cesari.']

In [107]:
# Join the sentences one per line and, thanks to re.M, match each whole line
# that contains one of the artwork keyword stems. For every such line, print
# the candidate title spans found by `pat` followed by the line itself.
for match in re.finditer(r"^.*?(engraving|pictur|drawing|painting|work|imag).*?$", "\n".join(sentences), re.M):
    print(re.findall(pat, match.group()), match.group())

['Known works from this period ', 'Boy Peeling a Fruit ', 'Boy with a Basket of ', 'Fruit,', 'Young Sick Bacchus,', 'Cesari.'] Known works from this period include a small Boy Peeling a Fruit (his earliest known painting), a Boy with a Basket of Fruit, and the Young Sick Bacchus,  supposedly a self-portrait done during convalescence from a serious illness that ended his employment with Cesari.


In [145]:
def process_subtree(token, doc, output=None):
    """Collect a title for ``token`` and for each token conjoined to it.

    The title is built from the texts of ``token.subtree``, read in order and
    cut off at the first token tagged ``PUNCT`` or ``CCONJ``; the texts are
    joined with single spaces. The function then recurses into every direct
    child whose dependency label is ``conj``.

    Args:
        token: Token-like object exposing ``subtree``, ``children`` and, on
            each member, ``pos_``, ``dep_`` and ``text``.
        doc: Not used here; only passed along to the recursive calls.
        output: List to append to. A new list is created when it is ``None``.

    Returns:
        The list holding the collected titles (``output`` itself if given).
    """
    collected = [] if output is None else output

    words = []
    for node in token.subtree:
        if node.pos_ in ("PUNCT", "CCONJ"):
            # Stop at the first separator: the rest belongs to the next item.
            break
        words.append(node.text)
    collected.append(" ".join(words))

    for kid in token.children:
        if kid.dep_ != "conj":
            continue
        process_subtree(kid, doc, collected)
    return collected

        
def rule_4(doc):
    """Extract titles listed as objects of the verb "include".

    For every token in ``doc`` whose lemma is ``"include"``, each direct child
    labelled ``dobj`` or ``pobj`` is handed to ``process_subtree`` and the
    titles it returns are accumulated.

    Args:
        doc: Iterable of token-like objects exposing ``lemma_`` and
            ``children``.

    Returns:
        List of title strings in the order they were found.
    """
    v_lemmas = ["include"]
    object_labels = ("dobj", "pobj")
    paintings = []
    triggers = (tok for tok in doc if tok.lemma_ in v_lemmas)
    for verb in triggers:
        for dependent in verb.children:
            if dependent.dep_ not in object_labels:
                continue
            paintings.extend(process_subtree(dependent, doc))
    return paintings

In [144]:
rule_4(doc)

include
Boy


['a small Boy Peeling a Fruit',
 'a Boy with a Basket of Fruit',
 'the Young Sick Bacchus']

In [172]:
[el.label_ for el in doc.ents]

['PERSON', 'DATE', 'GPE', 'LOC', 'DATE', 'GPE', 'PERSON', 'DATE']

In [18]:
def get_items(token, doc, output=None):
    """Collect a short title for ``token`` and for each token conjoined to it.

    The title consists of the token's left dependents labelled ``compound`` or
    ``det``, the token itself, and its right dependents with those same labels,
    joined with single spaces. Direct children labelled ``conj`` are processed
    recursively.

    Args:
        token: Token-like object exposing ``text``, ``lefts``, ``rights`` and
            ``children``.
        doc: Not used here; only passed along to the recursive calls.
        output: List to append to. A new list is created when it is ``None``.

    Returns:
        The list holding the collected titles (``output`` itself if given).
    """
    modifier_labels = ("compound", "det")
    results = [] if output is None else output

    before = [left.text for left in token.lefts if left.dep_ in modifier_labels]
    after = [right.text for right in token.rights if right.dep_ in modifier_labels]
    results.append(" ".join([*before, token.text, *after]))

    for dependent in token.children:
        if dependent.dep_ != "conj":
            continue
        get_items(dependent, doc, results)
    return results

In [19]:
# Rule: <creation verb> + direct object that is an artwork noun + apposition.
# For each token whose lemma is in v_lemmas, look at its direct objects whose
# lemma is in n_lemmas; every appositive child of such an object (plus its
# conjuncts, via get_items) is collected into `paintings`.
v_lemmas = ["produce", "create", "paint", "reproduce"]
n_lemmas = ["image", "picture", "painting", "work", "engraving", "drawing"]
paintings = []
for token in doc:
    if token.lemma_ in v_lemmas:
        for child in token.children:
            if child.dep_ == "dobj" and child.lemma_ in n_lemmas:
                for grand_child in child.children:
                    if grand_child.dep_ == "appos":
                        rez = get_items(grand_child, doc)
                        paintings.extend(rez)

In [20]:
paintings

['Knight', 'Death', 'the Devil']

In [109]:
# Inspect the parse neighbourhood of the token at index 18 of `doc`.
y=doc[18]
# Ancestors and conjuncts of that token.
list(y.ancestors), list(y.conjuncts)
# Its subtree and the token itself; the output recorded for this cell shows
# only this second pair.
list(y.subtree), y

([Sea], Sea)

In [105]:
# For every noun chunk print: the chunk text, its root token, the root's
# dependency label, and the text of the root's head.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

He He nsubj producing
some spectacular and original images images dobj producing
notably Nemesis Nemesis appos images
The Sea Monster Monster conj Nemesis
Saint Eustace Eustace conj Monster
a highly detailed landscape background background pobj with
animals animals conj background


In [74]:
# Show the direct children and the dependency label of the token at index 3.
x=doc[3]
list(x.children), x.dep_

([He, was, soon, images, with, .], 'ROOT')

In [39]:
# Fetch the parsed HTML of the Wikipedia page named by `title` through the
# MediaWiki "parse" API.
# NOTE: `title` must already exist in the kernel; in this notebook it is only
# assigned in a later cell, so this cell fails on a fresh top-to-bottom run.
url = "https://en.wikipedia.org/w/api.php"
params = {"action": "parse",
          "format": "json",
          "page": title}
# Without a timeout a stalled connection blocks the cell indefinitely, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# failure when the JSON body is unpacked.
r = requests.get(url, params=params, timeout=30)
r.raise_for_status()

In [40]:
# Take the rendered HTML out of the API response (parse -> text -> "*"),
# parse it with the html5lib parser and reduce it to its plain text.
raw_text = r.json()["parse"]["text"]["*"]
soup = BeautifulSoup(raw_text, "html5lib")
text = soup.get_text()

In [61]:
# Title of the Wikipedia page to analyse.
title = "Albrecht_Dürer"
# NOTE(review): `wiki_wiki` is not defined in any visible cell — presumably a
# wikipediaapi.Wikipedia client created elsewhere; confirm before re-running.
page = wiki_wiki.page(title)

In [144]:
def add_section(section=None, path=None, page=None, parent="", output=None):
    """Record where one section's text sits inside the full page text.

    Args:
        section: A ``WikipediaPage`` / ``WikipediaPageSection`` whose ``title``,
            ``text`` and ``level`` are used. Any other value is treated as the
            text of the page summary, with title ``"summary"`` and level 1.
        path: Path of the enclosing section; the new entry's path is
            ``f"{path}/{title}"``.
        page: Full page text in which the section text is looked up.
        parent: Title of the parent section, stored unchanged.
        output: Dict that receives the new entry. A new dict is created when
            it is omitted.

    Returns:
        ``(output, new_path)``: the updated dict and the path of this section.

    Raises:
        ValueError: raised by ``page.index`` when the section text does not
            occur in ``page``.
    """
    if output is None:
        # A literal `{}` default is created once and shared by every call that
        # omits `output`, which would leak entries between unrelated calls.
        output = {}
    if isinstance(section, (WikipediaPage, WikipediaPageSection)):
        title = section.title
        text = section.text
        level = section.level
    else:
        title = "summary"
        text = section
        level = 1
    new_path = f"{path}/{title}"
    # Offsets refer to the first occurrence of the section text in the page.
    start = page.index(text)
    end = start + len(text)
    # Entries are keyed by the MD5 hex digest of the section path.
    hash_value = hashlib.md5(new_path.encode()).hexdigest()
    output[hash_value] = {"parent": parent,
                          "level": level,
                          "start": start,
                          "end": end,
                          "title": title,
                          "path": new_path}
    return output, new_path

In [142]:
def parse_sections(data, page=None, output=None, parent="", path=""):
    """Walk a page's section tree and index every section via ``add_section``.

    When ``data`` is a ``WikipediaPage`` its summary is indexed first. Each
    entry of ``data.sections`` is then indexed, and sections that have
    subsections are walked recursively with the section's title as ``parent``
    and its path as ``path``.

    Args:
        data: Page or section object exposing ``sections`` (and ``summary``
            for a page).
        page: Full page text, forwarded to ``add_section``.
        output: Dict to fill. A new dict is created when it is ``None``.
        parent: Parent title recorded for the direct sections of ``data``.
        path: Path prefix for the direct sections of ``data``.

    Returns:
        The dict of indexed sections (``output`` itself if given).
    """
    index = {} if output is None else output

    if isinstance(data, WikipediaPage):
        index, _ = add_section(data.summary, path, page, parent, output=index)

    for child in data.sections:
        index, child_path = add_section(child, path, page, parent, output=index)
        if not child.sections:
            continue
        parse_sections(child, page=page, output=index,
                       parent=child.title, path=child_path)
    return index

In [145]:
d = parse_sections(page, page=page.text)

In [150]:
# for key, value in d.items():
#     print(value["title"])
#     print(value["start"], value["end"])
#     print(page.text[value["start"]:value["end"]])

In [50]:
# list.remove(x) deletes the first element equal to x, so this removes the
# value 1 rather than the element at index 1.
# NOTE(review): the IndexError recorded under this cell mentions `pop`, so it
# was not produced by the code as written here.
l = [1, 2, 3, 69]
l.remove(1)


IndexError: pop index out of range

In [1]:
from difflib import SequenceMatcher

In [7]:
# Compare two strings, then replace the first sequence and compare again.
# set_seq1() is the documented way to swap the first sequence: unlike a plain
# assignment to `s.a`, it also discards cached matching results, so ratio()
# can never report a stale value.
s = SequenceMatcher(None, "art", "are")
s.set_seq1("are")
s.ratio()
s.b

1.0

'are'

In [12]:
# Order the dictionary's (key, value) pairs by value, largest first.
d = dict(a=1, b=2)
sorted(d.items(), key=lambda pair: pair[1], reverse=True)

[('b', 2), ('a', 1)]