# Recognize George_Bernard_Shaw books in Wikipedia article

## Read GT and the Wikipedia article

Read groundtruth from dbpedia, save as 'books.json' and extract to books_gt

PREFIX : <http://dbpedia.org/resource/>

SELECT ?book 

WHERE {

?book dbo:author :George_Bernard_Shaw .

}

In [1]:
import json

def get_books_gt(fname):
    with open(fname) as f:
        books_gt_json = json.load(f)

    books_gt = []
    for book_j in books_gt_json['results']['bindings']:
        book_url_name = book_j['book']['value'].split('/')[-1]
        book_words_gt = book_url_name.split('_')
        # remove 'film)'' and '(play)'
        book_words_gt = [word for word in book_words_gt if not any([ch in word for ch in ['(', ')']])]
        book_gt = ' '.join(book_words_gt)
        books_gt.append(book_gt)

    return sorted(books_gt)

books_gt = get_books_gt('04-books.json')

Read article

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import re
from pprint import pprint
import nltk


link = 'https://en.wikipedia.org/wiki/George_Bernard_Shaw'
page_xml = BeautifulSoup(requests.get(link).text)
article_xml = page_xml.find('div', {'class': 'mw-parser-output'})

# also removes books list <div class="refbegin columns references-column-width"
for tag in article_xml.find_all(['div', 'table', 'ul', 'blockquote']): 
    tag.decompose()

article_text = article_xml.text

## Try baseline solution without NLP 
Detect by 'i' tag and first upper letter

In [3]:
def has_similar(name, names):
    SIMILARITY_THRESHOLD = 3
    for another_name in names:
        if nltk.edit_distance(name, another_name) < SIMILARITY_THRESHOLD:
            return True
    return False

def remove_dublicates(books_names):
    filtered_books_names = []
    for book_name in books_names:
        if not has_similar(book_name, filtered_books_names):
            filtered_books_names.append(book_name)
    return filtered_books_names

def detect_books_by_formatting(article_xml):
    books_names = []
    for i_xml in article_xml.findAll('i'):
        books_names.append(i_xml.text)

    #One book may have few entries with minor changes
    books_names = remove_dublicates(books_names)

    # Simple rule
    books_names = [bn for bn in books_names if list(bn)[0].isupper()]

    return sorted(books_names)

books_detected_by_formatting = detect_books_by_formatting(article_xml)

Measure score

In [4]:
def score(detected_set, gt_set):
    intersected_samples = [sample for sample in gt_set if has_similar(sample, detected_set)]
    if not intersected_samples:
        return 0,0,0

    tp = len(intersected_samples)
    recall = float(tp) / len(gt_set)
    precission = float(tp) / len(detected_set)
    f1 = 2 * recall * precission / ( recall + precission )

    #print(tp, len(detected_set), len(gt_set))
    #print(recall, precission, f1)

    return round(recall, 2), round(precission, 2), round(f1, 2)

In [5]:
rec, prec, f1 = score(books_detected_by_formatting, books_gt)
print(f'No-NLP score: rec:{rec}, prec:{prec}, f1:{f1}')
print('Detect by <i> tag and first upper letter')

No-NLP score: rec:0.75, prec:0.39, f1:0.51
Detect by <i> tag and first upper letter


## NER-based approach

In [7]:
import spacy

nlp = spacy.load("en_core_web_lg")

# remove '[...]' and '\n' to avoid wrong tokenization like:
# 'the Board of Censors.[281'
article_text = re.sub(r'\[.*?\]', '', article_text)
article_text = re.sub(r'\n', '', article_text)

doc = nlp(article_text)

In [8]:
ner_texts = [ent.text for ent in doc.ents]
ner_texts = remove_dublicates(ner_texts)
rec, prec, f1 = score(ner_texts, books_gt)
print(f'NER without post-processing: rec:{rec}, prec:{prec}, f1:{f1}')

NER without post-processing: rec:0.34, prec:0.03, f1:0.06


### Apply rules to Named Entities
* filter by label
* rules for NE string: len > 2, fisrt upper letter

#### Select labels
This script shows that only labels: PERSON, ORG, WORK_OF_ART, GPE has recall > 0

In [9]:
labels_and_ents = {}
for ent in doc.ents:
    if ent.label_ not in labels_and_ents:
        labels_and_ents[ent.label_] = []
    labels_and_ents[ent.label_].append(ent.text)
for label in labels_and_ents:
    rec, prec, f1 = score(labels_and_ents[label], books_gt)
    print(f'{label}:  rec:{rec}, prec:{prec}, f1:{f1}')

#### Filter

In [23]:
print('All Named Entities(NE):', sum([len(remove_dublicates(labels_and_ents[label])) for label in labels_and_ents]))

person_ents = list(set(labels_and_ents['PERSON']))
org_ents = list(set(labels_and_ents['ORG']))
work_of_art_ents = list(set(labels_and_ents['WORK_OF_ART']))
gpe_ents = list(set(labels_and_ents['GPE']))
selected_ents = person_ents + org_ents + work_of_art_ents + gpe_ents
print('NE with 4 selected labels:', len(remove_dublicates(selected_ents)))

selected_ents = [ent for ent in selected_ents if len(ent) > 2]
selected_ents = [ent for ent in selected_ents if list(ent)[0].isupper()]
selected_ents = remove_dublicates(selected_ents)
print('NE with 4 selected labels and rules for NE string (len > 2, fisrt upper letter):', len(selected_ents))

All Named Entities(NE): 703
NE with 4 selected tags: 413
NE with selected tags and rules for NE string (len > 2, fisrt upper letter): 367


In [30]:
rec, prec, f1 = score(selected_ents, books_gt)
print(f'NER after pre-selection: rec:{rec}, prec:{prec}, f1:{f1}')

NER after pre-selection: rec:0.33, prec:0.05, f1:0.09


### Apply rules to sentences with Named Entities

Find words that can be in the same sentence with NE

In [49]:
def has_book_detected_by_formatting(sent):
    for book_str in books_detected_by_formatting:
        if book_str in sent.text:
            return True

    return False

In [52]:
import operator

def lemmas_hist(sent_list):
    def get_hist(str_list):
        str_hist = {}
        for string in str_list:
            if string in str_hist:
                str_hist[string] += 1
            else:
                str_hist[string] = 1

        return sorted(str_hist.items(), key=operator.itemgetter(1), reverse=True)
    
    words_list = []
    for sent in sent_list:
        words_list += [token.lemma_ for token in sent]
    return get_hist(words_list)
    
pos_sentences = [sent for sent in doc.sents if has_book_detected_by_formatting(sent)]
print(lemmas_hist(pos_sentences)[:50])

In [70]:
# Manually selected from printed list
keywords = ['shaw', '(', 'play', 'write', 'which', 'work', 'become', 'publish']

In [71]:
# Containes all rules from 'Apply rules to Named Entities'
def check_ent(ent):
    check_label = lambda ent : ent.label_ in ['PERSON', 'ORG', 'WORK_OF_ART', 'GPE']
    check_text = lambda ent : (len(ent.text) > 2) and (list(ent.text)[0].isupper())
    return check_label(ent) and check_text(ent)

def check_sent(sent, keywords):
    for token in sent:
        if token.lemma_ in keywords:
            return True
    return False

In [72]:
ner_recognized_books = remove_dublicates([ent.text for ent in doc.ents if (check_ent(ent) and check_sent(ent.sent, keywords))])

In [74]:
rec, prec, f1 = score(ner_recognized_books, books_gt)
print(f'NER after applying extra rules to it\'s sentence: rec:{rec}, prec:{prec}, f1:{f1}')

NER after applying extra rules to it's sentence: rec:0.31, prec:0.06, f1:0.1
