In [1]:
# Specific packages used in this notebook
#!pip install SPARQLWrapper
#!pip install Wikipedia-API

### Еталонна база даних

Будемо знаходити назви альбомів музичних гуртів на відповідних сторінках в Wikipedia. Почнемо з Queen в якості тестового зразка. Для цього формуємо базу даних, яку будемо використовувати для оцінки якості.

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
import re

def query_albums(artist):
    
    def cleanup(item):
        return re.sub(r'\s*\([^)]+\)', '', item)
    
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery("""
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX res:  <http://dbpedia.org/resource/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?albumName, ?releaseDate WHERE
        {
            ?album dbo:artist <http://dbpedia.org/resource/$artist$> .
            ?album dbp:type ?type .
            ?album rdfs:label ?albumName .
            ?album dbo:releaseDate ?releaseDate
            FILTER (lang(?albumName) = 'en') . 
            #FILTER (regex(?type,'[Aa]lbum'))
        } 
    """.replace('$artist$', artist))
    
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    return {(cleanup(result['albumName']['value']), \
           int(re.sub(r'(\d+)-\d+-\d+', '\g<1>',result['releaseDate']['value']))) \
          for result in results['results']['bindings']}

ALBUMS = sorted(query_albums('Queen_(band)').\
                union({('Live Killers', 1979),\
                       ('Live at Wembley Stadium', 1982),\
                       ('The Cosmos Rocks', 2008)}), key = lambda item: item[1])

ALBUMS

[('Queen', 1973),
 ('Sheer Heart Attack', 1974),
 ('Queen II', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('Live Killers', 1979),
 ('The Game', 1980),
 ('Flash Gordon', 1980),
 ('Flash Gordon', 1981),
 ('Greatest Hits', 1981),
 ('Hot Space', 1982),
 ('Live at Wembley Stadium', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('Live Magic', 1986),
 ('At the Beeb', 1989),
 ('The Miracle', 1989),
 ('Innuendo', 1991),
 ('Greatest Hits II', 1991),
 ('Classic Queen', 1992),
 ('Five Live', 1993),
 ('Ultimate Queen', 1995),
 ('Queen Rocks', 1997),
 ('The Crown Jewels', 1998),
 ('Greatest Hits III', 1999),
 ('The Platinum Collection', 2000),
 ('Jewels', 2004),
 ('Queen on Fire – Live at the Bowl', 2004),
 ('Stone Cold Classics', 2006),
 ('The A–Z of Queen, Volume 1', 2007),
 ('The Singles Collection Volume 1', 2008),
 ('The Cosmos Rocks', 2008),
 ('Absolute Greatest', 2009),
 ('Greatest Hits II', 2011),
 ('Flas

### Данні

Завантажуємо сторінку і беремо чистий текст без списку дискографії.

In [3]:
import wikipediaapi

wiki = wikipediaapi.Wikipedia('en')

page = wiki.page('Queen_(band)')

In [4]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [5]:
#exclude discography list
def page_text(page):
    index = page.text.find('Discography') - 1
    return page.text[:index]

Алгоритм по екстракту необхідних данних базуємо на іменованих сутностях. Працюємо в контексті одного речення, для розрізнення назв альбомів використовуємо залежності.

Оскільки текст статті не обов'язково включає інформацію по всім альбомам (для цього є окрема таблиця, а ми працюємо тільки з текстом) то для оцінки якості нам треба зкрозуміти які в тексті є альбоми взагалі.

In [6]:
doc = nlp(page_text(page))

In [7]:
sorted({album for sent in doc.sents for album in ALBUMS if album[0] in sent.text}, key = lambda a: a[1])

[('Queen', 1973),
 ('Sheer Heart Attack', 1974),
 ('Queen II', 1974),
 ('A Night at the Opera', 1975),
 ('A Day at the Races', 1976),
 ('News of the World', 1977),
 ('Jazz', 1978),
 ('Live Killers', 1979),
 ('The Game', 1980),
 ('Flash Gordon', 1980),
 ('Flash Gordon', 1981),
 ('Greatest Hits', 1981),
 ('Hot Space', 1982),
 ('Live at Wembley Stadium', 1982),
 ('The Works', 1984),
 ('A Kind of Magic', 1986),
 ('The Miracle', 1989),
 ('Innuendo', 1991),
 ('Greatest Hits II', 1991),
 ('Classic Queen', 1992),
 ('Queen Rocks', 1997),
 ('Greatest Hits III', 1999),
 ('Queen on Fire – Live at the Bowl', 2004),
 ('The Cosmos Rocks', 2008),
 ('Absolute Greatest', 2009),
 ('Greatest Hits II', 2011),
 ('Flash Gordon', 2011),
 ('Icon', 2013),
 ('Queen Forever', 2014)]

Дивимся також на альбоми які ідентифікуються як сутності і бачимо що якість класифікатора Spacy не дуже гарна в плані визначення типу сутності, що подалі буде впливати на якість всього алгоритму.

In [8]:
{ (album[0], ent.label_) for ent in doc.ents for album in ALBUMS if album[0].lower() in ent.text.lower()}

{('A Kind of Magic', 'WORK_OF_ART'),
 ('A Night at the Opera', 'WORK_OF_ART'),
 ('Absolute Greatest', 'PRODUCT'),
 ('Classic Queen', 'PRODUCT'),
 ('Greatest Hits', 'PERSON'),
 ('Greatest Hits', 'PRODUCT'),
 ('Greatest Hits', 'WORK_OF_ART'),
 ('Greatest Hits II', 'PERSON'),
 ('Greatest Hits II', 'PRODUCT'),
 ('Greatest Hits III', 'PRODUCT'),
 ('Hot Space', 'ORG'),
 ('Icon', 'WORK_OF_ART'),
 ('Jazz', 'GPE'),
 ('Jazz', 'ORG'),
 ('Live Killers', 'WORK_OF_ART'),
 ('Live at Wembley Stadium', 'WORK_OF_ART'),
 ('News of the World', 'ORG'),
 ('News of the World', 'WORK_OF_ART'),
 ('Queen', 'CARDINAL'),
 ('Queen', 'DATE'),
 ('Queen', 'EVENT'),
 ('Queen', 'FAC'),
 ('Queen', 'LOC'),
 ('Queen', 'ORG'),
 ('Queen', 'PERSON'),
 ('Queen', 'PRODUCT'),
 ('Queen', 'WORK_OF_ART'),
 ('Queen Forever', 'WORK_OF_ART'),
 ('Queen II', 'EVENT'),
 ('Queen II', 'PERSON'),
 ('Queen II', 'PRODUCT'),
 ('Queen II', 'WORK_OF_ART'),
 ('Queen on Fire – Live at the Bowl', 'WORK_OF_ART'),
 ('Sheer Heart Attack', 'ORG'),
 ('

Оскільки ми не можемо брати всі сутності будь-якого типу підряд, бо буде багато false positives, то ми обмежимось типами WORK_OF_ART та PRODUCT які найбільше відповідають тому чим являється альбом. Для оцінки якості ми беремо  тільки альбоми з загального переліку які є у тесксті і які розпізнаються як сутність з типом WORK_OF_ART або PRODUCT. Тобто ми не міряємо якість загалом, а за виключенням якості класифікатора Spacy.

In [9]:
def extract_ent_albums(albums, doc):
    return { album[0] for ent in doc.ents for album in albums if album[0].lower() in ent.text.lower() and \
            (ent.label_ == 'WORK_OF_ART' or ent.label_ == 'PRODUCT')}
    
ent_albums = extract_ent_albums(ALBUMS, doc)

ent_albums

{'A Kind of Magic',
 'A Night at the Opera',
 'Absolute Greatest',
 'Classic Queen',
 'Greatest Hits',
 'Greatest Hits II',
 'Greatest Hits III',
 'Icon',
 'Live Killers',
 'Live at Wembley Stadium',
 'News of the World',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'Sheer Heart Attack',
 'The Cosmos Rocks',
 'The Game',
 'The Miracle',
 'The Works'}

### Алгоритм

Далі реалiзуємо алгоритм який знаходить іменовані сутності необхідного типу і намагається за допомогою простого аналізу залежностей розпізнати назву альбому.

In [10]:
def reach(token, word):
    if token.head.lemma_ == word:
        return token.head
    elif token.dep_ == 'ROOT':
        return None
    
    return reach(token.head, word)        

def is_reachable(token, word):    
    return not (reach(token, word) is None)

def depends_on_album(token):
    
    def the_most_probably_album(token):
        return not(is_reachable(token, 'single') or \
                   is_reachable(token, 'song') or \
                   is_reachable(token, 'game') or \
                   is_reachable(token, 'soundtrack') or \
                   is_reachable(token, 'track'))
        
    if is_reachable(token, 'album'):
        return True
    
    release = reach(token, 'release')
    if release:
        return the_most_probably_album(release)
    
    record = reach(token, 'record')
    if release:
        return the_most_probably_album(release) 
    
    return False
        
def depends_on_ent(token, ent):
    for e_token in ent:
        if is_reachable(token, e_token.lemma_) or is_reachable(e_token ,token.lemma_):
            return True
    return False    

def is_album(ent, sent):
    
    def the_most_probably_not_album(token, ent):
        return depends_on_ent(token, ent) or is_reachable(token, 'release') or is_reachable(token, 'record')
    
    proof = False
    
    if ent.label_ == "WORK_OF_ART" or ent.label_ == "PRODUCT":
        for token in ent:
            if depends_on_album(token):
                proof = True
                break
        for token in sent:           
            if (token.lemma_ == 'single' or \
                token.lemma_ == 'song' or \
                token.lemma_ == 'soundtrack' or \
                token.lemma_ == 'track' or \
                token.lemma_ == 'game') and the_most_probably_not_album(token, ent):
                proof = False
                break            
                
            if token.lemma_ == 'album' and depends_on_ent(token, ent):
                proof = True
                break
                            
    return proof

def extract_albums_from_sent(sent):
    
    albums = set([])
    
    for ent in sent.ents:
        
        if 'albums' in ent.text.lower() or \
        'award' in ent.text.lower() or \
        'of all time' in ent.text.lower(): #exclude charts aka 'Greatest Albums' or '... of All Time Ever'
            continue
        
        if is_album(ent, sent):
            albums.add(ent.text.strip())
            
    return albums
    

def extract_albums(text):
    doc = nlp(text)
    
    albums = set([])
    
    for sent in doc.sents:        
        albums.update(extract_albums_from_sent(sent))
    
    return albums

In [11]:
### True positives

def singleton(text):
    return set([text])

assert extract_albums('Queen first charted in the UK with their second album, Queen II, in 1974.') ==\
     singleton('Queen II')

assert extract_albums('Another One Bites the Dust" (1980) became their best-selling single, while their 1981 compilation album Greatest Hits is the best-selling album in the UK and is certified eight times platinum in the US.')\
    == singleton('Greatest Hits')

assert extract_albums('In late 1975, Queen recorded and released A Night at the Opera, taking its name from the popular Marx Brothers movie.')\
    == singleton('A Night at the Opera')

#assert extract_albums('By 1976, Queen were back in the studio recording A Day at the Races, which is often regarded as a sequel album to A Night at the Opera.')\
#    == singleton('A Day at the Races') ##  'A Day at the Races' is not recornized as NER

#Released in 1974, Sheer Heart Attack reached number two in the UK, sold well throughout Europe, and went gold in the US.
# 'Sheer Heart Attack' is not recornized as NER

#false negative
assert extract_albums('They released their first live album, Live Killers, in 1979; it went platinum twice in the US.')\
    == singleton('Live Killers')

assert extract_albums("In October that year, Queen released their first compilation album, titled Greatest Hits, which showcased the group's highlights from 1974 to 1981.")\
    == singleton('Greatest Hits')

#assert extract_albums('In February 1984, Queen released their eleventh studio album, The Works, which included the successful singles "Radio Ga Ga", "Hammer to Fall" and "I Want to Break Free".')\
#    == singleton('The Works') cannot distinguish between singles and album in one sentence

assert extract_albums('The first Queen + Paul Rodgers album, titled The Cosmos Rocks, was released in Europe on 12 September 2008 and in the United States on 28 October 2008.')\
    == singleton('The Cosmos Rocks')

assert extract_albums("A distinctive characteristic of Queen's music are the vocal harmonies which are usually composed of the voices of May, Mercury, and Taylor best heard on the studio albums A Night at the Opera and A Day at the Races.")\
    == singleton('A Night at the')

assert extract_albums('In May 2012, the choir performed "We Are the Champions" in the episode "Nationals", and the song features in The Graduation Album.')\
    == singleton('The Graduation Album')

assert extract_albums("DVD releases of their 1986 Wembley concert (titled Live at Wembley Stadium), 1982 Milton Keynes concert (Queen on Fire – Live at the Bowl), and two Greatest Video Hits (Volumes 1 and 2, spanning the 1970s and 1980s) have seen the band's music remixed into 5.1 and DTS surround sound")\
    == {'Live at Wembley Stadium', 'Queen on Fire – Live at the Bowl'}

assert extract_albums("The band’s 1977 album News of the World contained \"We Will Rock You\" and \"We Are the Champions\", which have become anthems at sporting events.")\
    == singleton('News of the World')

assert extract_albums("In October that year, Queen released their first compilation album, titled Greatest Hits, which showcased the group's highlights from 1974 to 1981.")\
    == singleton('Greatest Hits')

In [12]:
#True negatives
assert not extract_albums("Retrospectively, it is cited as the highlight of the album, and in 2008 Rolling Stone ranked it 31st in the \"100 Greatest Guitar Songs of All Time\", describing it as \"an entire album's worth of riffs crammed into a single song\"")

assert not extract_albums("\"The 50 Best British Albums Ever\" in 2004, and number 11 in Rolling Stone's \"The 100 Greatest Albums of All Time\" as featured in their Mexican edition in 2004.")

assert not extract_albums('Queen also released the very successful single "Crazy Little Thing Called Love", a rockabilly inspired song done in the style of Elvis Presley.')

assert not extract_albums('After attending a Queen concert in Los Angeles, Michael Jackson suggested to Mercury backstage that "Another One Bites the Dust" be released as a single, and in October 1980 it spent three weeks at number one.')

assert not extract_albums('In conjunction with Electronic Arts, Queen released the computer game Queen: The eYe in 1998')

assert not extract_albums("Several of the guest singers recorded new versions of Queen's hits under the Queen + name, such as Robbie Williams providing vocals for \"We Are the Champions\" for the soundtrack of A Knight's Tale (2001)")

# fasle positive
#extract_albums("In the United States, \"Bohemian Rhapsody\" was re-released as a single in 1992 after appearing in the comedy film Wayne's World.")

In [13]:
extracted_albums = extract_albums(doc.text)

extracted_albums

{'A Kind of Magic',
 'A Night at the',
 'A Night at the Opera',
 'Bohemian Rhapsody',
 'Classic Queen',
 'Greatest Hits',
 'Greatest Hits III',
 'Hammer to Fall',
 'Live Killers',
 'Live at Wembley Stadium',
 'News of the World',
 'Opera',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'SingStar',
 'The Cosmos Rocks',
 'The Graduation Album',
 'The Works',
 'as Queen: Live in Rio',
 'the Races to Live Killers\nBy 1976'}

### Метрики

Оскільки наш алгоритм по суті виконує класифікацію, то тут ми можемо застосвати F1 метрики. 

In [14]:
from collections import namedtuple

Metrics = namedtuple('Metrics', 'accuracy precision recall f1')

Results = namedtuple('Results', 'true_positives false_negatives false_positives true_negatives albums_baseline')

def extract_and_measure(text, albums_db):    
    doc = nlp(text)
    
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    
    true_positives = set([])
    false_negatives = set([])
    false_positives = set([])
    true_negatives = set([])
    albums_baseline = set([])
    
    for sent in doc.sents:
        ent_albums = extract_ent_albums(albums_db, sent)
        albums_baseline.update(ent_albums)
        
        extracted_albums = extract_albums_from_sent(sent)
        
        tps = ent_albums.intersection(extracted_albums)
        true_positives.update(tps)        
        tp += len(tps)
        
        fns = ent_albums.difference(extracted_albums)
        false_negatives.update(fns)
        fn += len(fns)
        
        fps = extracted_albums.difference(ent_albums)
        false_positives.update(fps)
        fp += len(fps)        
        
        tns = {ent.text.strip() for ent in sent.ents \
                   if ent.label_ == 'PRODUCT' or ent.label_ == "WORK_OF_ART"}\
                  .difference(ent_albums)
        true_negatives.update(tns)
        tn += len(tns)        
        
    pr = tp/(tp + fp)
    rc = tp/(tp + fn)
    f1 = 2 * (pr * rc)/(pr + rc)    
    
    results = Results(true_positives = true_positives,\
                     false_negatives = false_negatives,\
                     false_positives = false_positives,\
                     true_negatives = true_negatives,\
                     albums_baseline = albums_baseline)
    
    metrics = Metrics(accuracy = (tp + tn)/(tp+fp+fn+tn),\
                      precision = pr,\
                      recall = rc,\
                      f1 = f1)
    
    return (results, metrics)

Рахуємо метрики для Queen.

In [15]:
results, metrics = extract_and_measure(doc.text, ALBUMS)

In [16]:
results.true_positives

{'A Kind of Magic',
 'A Night at the Opera',
 'Classic Queen',
 'Greatest Hits',
 'Greatest Hits III',
 'Live Killers',
 'Live at Wembley Stadium',
 'News of the World',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'The Cosmos Rocks',
 'The Works'}

In [17]:
results.false_negatives

{'A Kind of Magic',
 'A Night at the Opera',
 'Absolute Greatest',
 'Greatest Hits',
 'Greatest Hits II',
 'Icon',
 'Live Killers',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Sheer Heart Attack',
 'The Game',
 'The Miracle',
 'The Works'}

In [18]:
results.false_positives

{'A Night at the',
 'Bohemian Rhapsody',
 'Hammer to Fall',
 'Opera',
 'SingStar',
 'The Graduation Album',
 'as Queen: Live in Rio',
 'the Races to Live Killers\nBy 1976'}

In [19]:
results.true_negatives

{'"Bohemian Rhapsody"',
 '"Love of My Life"',
 '"Queen at the Ballet"',
 '"The Show Must Go',
 '"The Show Must Go On',
 '"The Show Must Go On"',
 "A Knight's Tale",
 'A Night',
 'A Night at the',
 'A Night at the Opera Tour',
 'A Night at the Opera and The Game',
 'A Trip',
 'American Idol',
 'Another One Bites the Dust',
 'Battleship Potemkin',
 'Bicycle Race',
 'Birthday Tribute',
 'Bohemian Rhapsody',
 'Breakthru',
 'Candle in the Wind 1997',
 'Composers',
 'Crazy Little Thing Called Love',
 'Death on Two Legs',
 'Disco',
 "Do They Know It's Christmas",
 "Don't Stop Me Now",
 'Ericsson',
 'Essential Hard Rock and Heavy Metal',
 'Fat Bottomed Girls',
 'Friends Will Be Friends',
 'Fun in Space',
 'Game',
 'Gigwise',
 'God Save the Queen',
 'Grammy Award for Best Metal Performance',
 'Grammy Lifetime Achievement Award',
 'Grand Theft Auto IV',
 'Guitar Hero',
 'Guitar Hero 5',
 'Guitar Hero World Tour',
 'Guitar Hero:',
 'Guitar Hero: Warriors of Rock',
 'Guitar Songs of All Time',
 'H

In [20]:
metrics

Metrics(accuracy=0.7516556291390728, precision=0.6428571428571429, recall=0.21686746987951808, f1=0.32432432432432434)

Пробуємо заекстрактити інший рок-гурт.

In [21]:
aero_results, aero_metrics = extract_and_measure(page_text(wiki.page('Aerosmith')), query_albums('Aerosmith'))

In [22]:
aero_metrics

Metrics(accuracy=0.8956043956043956, precision=0.1875, recall=0.3333333333333333, f1=0.24000000000000005)

In [23]:
aero_results.albums_baseline

{'A Little South of Sanity',
 'Aerosmith',
 'Get Your Wings',
 'Pump',
 'You Gotta Move'}

In [24]:
aero_results.true_positives

{'A Little South of Sanity', 'Get Your Wings'}

А також дует який створює електронну музику.

In [25]:
chemical_results, chemical_metrics = extract_and_measure(page_text(wiki.page('The_Chemical_Brothers')),\
                                                         query_albums('The_Chemical_Brothers'))

In [26]:
chemical_metrics

Metrics(accuracy=0.9206349206349206, precision=0.1, recall=0.5, f1=0.16666666666666669)

In [27]:
chemical_results.albums_baseline

{'Exit Planet Dust', 'We Are the Night'}

In [28]:
chemical_results.true_positives

{'We Are the Night'}

Тестуємо алгоритм на більший вибірці.

In [29]:
all_results, all_metrics = extract_and_measure(page_text(wiki.page('Queen_(band)')) + '\n' + \
                                               page_text(wiki.page('Aerosmith')) + '\n' + \
                                               page_text(wiki.page('Pink_Floyd')) + '\n' + \
                                               page_text(wiki.page('Iron_Maiden')) + '\n' + \
                                               page_text(wiki.page('Scorpions_(band)')) + '\n' + \
                                               page_text(wiki.page('Metallica')) + '\n' + \
                                               page_text(wiki.page('The_Smashing_Pumpkins')) + '\n' + \
                                               page_text(wiki.page('Nirvana_(band)')) + '\n' + \
                                               page_text(wiki.page('The_Chemical_Brothers')) + '\n' + \
                                               page_text(wiki.page('The_Prodigy')),\
                                               query_albums('Queen_(band)').\
                                               union(query_albums('Aerosmith')).\
                                               union(query_albums('Pink_Floyd')).\
                                               union(query_albums('Iron_Maiden')).\
                                               union(query_albums('Scorpions_(band)')).\
                                               union(query_albums('Metallica')).\
                                               union(query_albums('The_Smashing_Pumpkins')).\
                                               union(query_albums('Nirvana_(band)')).\
                                               union(query_albums('The_Chemical_Brothers')).\
                                               union(query_albums('The_Prodigy')))

In [30]:
all_metrics

Metrics(accuracy=0.7674586033117351, precision=0.3625, recall=0.2078853046594982, f1=0.26423690205011385)

In [31]:
all_results.true_positives

{'A Kind of Magic',
 'A Little South of Sanity',
 'A Matter of Life and Death',
 'A Momentary Lapse of Reason',
 'A Night at the Opera',
 'Acoustica',
 'Beyond Magnetic',
 'Blackout',
 'Classic Queen',
 'Dance of Death',
 'Death Magnetic',
 'Get Your Wings',
 'Greatest Hits',
 'Greatest Hits III',
 'Iron Maiden',
 'Live After Death',
 'Lovedrive',
 'Master of Puppets',
 'Moment of Glory',
 'Music for the Jilted Generation',
 'News of the World',
 'Powerslave',
 'Queen',
 'Queen Forever',
 'Queen II',
 'Queen on Fire – Live at the Bowl',
 'Reload',
 'Rotten Apples',
 'Somewhere Back in Time',
 'Somewhere in Time',
 'St. Anger',
 'The Book of Souls',
 'The Fat of the Land',
 'The Videos 1989–2004',
 'The Works',
 'Unbreakable',
 'We Are the Night',
 'World Wide Live'}

Не дивлячись на те що алгоритм знаходить назви албомів, він має не високу точність завдяки великій кількости false negatives та false positives. З іншої сторони підхід достатньо простий. Думаюю якість існуючого підходу можно б було покращити додаванням більш складних правил які враховують не тільки існування залежностей а і тип залежностей, це думаю, наприклад, допомогло б вирішити проблему коли в одному реченні згадуються не тільки альбом а й пісні цього альбому. Краще б було також, якщо ми вийшли за контекст одного речення і покращили NER (натренували класифікатор який розпізнає назви сучасних музичних творів). 