In [60]:
import nltk
import wikipedia
from collections import Counter

## Loading data

In [3]:
path = './bbc/entertainment/'
files = [str(x+1).zfill(3)+'.txt' for x in range(100)]

In [7]:
text = ''
for order in files:
    with open(path+order, 'r') as file:
        text += (file.read())

In [8]:
text



## Data preprocessing

In [23]:
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)

In [None]:
sentences = nltk.sent_tokenize(text)

In [11]:
len(sentences)

1427

In [24]:
tagged

[('Gallery', 'NNP'),
 ('unveils', 'JJ'),
 ('interactive', 'JJ'),
 ('tree', 'NN'),
 ('A', 'NNP'),
 ('Christmas', 'NNP'),
 ('tree', 'NN'),
 ('that', 'WDT'),
 ('can', 'MD'),
 ('receive', 'VB'),
 ('text', 'NN'),
 ('messages', 'NNS'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('unveiled', 'VBN'),
 ('at', 'IN'),
 ('London', 'NNP'),
 ("'s", 'POS'),
 ('Tate', 'NNP'),
 ('Britain', 'NNP'),
 ('art', 'JJ'),
 ('gallery', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('spruce', 'NN'),
 ('has', 'VBZ'),
 ('an', 'DT'),
 ('antenna', 'NN'),
 ('which', 'WDT'),
 ('can', 'MD'),
 ('receive', 'VB'),
 ('Bluetooth', 'NNP'),
 ('texts', 'FW'),
 ('sent', 'VBN'),
 ('by', 'IN'),
 ('visitors', 'NNS'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('Tate', 'NNP'),
 ('.', '.'),
 ('The', 'DT'),
 ('messages', 'NNS'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('``', '``'),
 ('unwrapped', 'JJ'),
 ("''", "''"),
 ('by', 'IN'),
 ('sculptor', 'NN'),
 ('Richard', 'NNP'),
 ('Wentworth', 'NNP'),
 (',', ','),
 ('who', 'WP'),
 ('is', 'VBZ'),
 ('responsible', 'JJ'),
 ('

## Using nltk.ne_chunk for NER

In [25]:
entities = nltk.ne_chunk(tagged, binary=True)

In [33]:
len(entities)

32143

In [56]:
entities_list = []
for t in entities.subtrees():
    if t.label() == 'NE':
        e_tuples = list(t)
        s = ''
        for tup in e_tuples:
            s += tup[0] + ' '
        entities_list.append(s[:-1])

In [57]:
entities_list

['Gallery',
 'London',
 'Tate Britain',
 'Tate',
 'Richard Wentworth',
 'Tate',
 'Tracey Emin',
 'Norway',
 'ArtWorks',
 'Wentworth',
 'Henry Moore',
 'Wentworth',
 'Jarre',
 'French',
 'Copenhagen',
 'Denmark',
 'Parken',
 'Danish',
 'Christian Andersen',
 'Jarre',
 'Andersen',
 'Emperor',
 'New Clothes',
 'Little Mermaid',
 'Denmark',
 'Crown Princess Mary',
 'New York',
 'Manhattan',
 'US',
 'Harold Bloom',
 'Andersen',
 'Frederik',
 'Bloom',
 'Christian Andersen Award',
 'Anderson',
 'Odense',
 'Christian Anderson School',
 'Queen Mary',
 'Ugly Duckling',
 'Danish',
 'Helena Christensen',
 'Harvey Keitel',
 'Sir Roger Moore',
 'Cathy Freeman',
 'Brazilian',
 'Capra',
 'Jerry Springer',
 'Frank Capra',
 'James Stewart',
 'Jon',
 'Steve Brown',
 'London',
 'Greatest Gift',
 'Philip',
 'Doren Stern',
 'Van Doren Stern',
 'Brown',
 'Spend Spend Spend',
 'Paramount',
 'Richard',
 'Richard',
 'Judy',
 'Richard',
 'Andrew Taylor',
 'American Boy',
 'Robbie Williams',
 'Alice Sebold',
 'Lo

In [64]:
entities_freq = (word for word in entities_list if word[:1].isupper())
counter_entities = Counter(entities_freq)
print (counter_entities.most_common(10))

[('US', 59), ('Aviator', 34), ('Oscars', 33), ('Hollywood', 29), ('British', 29), ('London', 27), ('Million Dollar Baby', 24), ('Sideways', 20), ('French', 17), ('Harry Potter', 16)]


## Using customized NER

In [30]:
grammar = "NP: {<DT>?<JJ>*<NN|NNS>}"
cp = nltk.RegexpParser(grammar)
entities_custom = cp.parse(tagged)

In [36]:
len(entities_custom)

30521

In [37]:
entities_custom[:3]

[('Gallery', 'NNP'),
 Tree('NP', [('unveils', 'JJ'), ('interactive', 'JJ'), ('tree', 'NN')]),
 ('A', 'NNP')]

In [81]:
entities_custom_list = []
for subtree in entities_custom.subtrees():
    if subtree.label() == 'NP':
      entities_custom_list.append(' '.join(word for word, tag in subtree.leaves()))

In [82]:
entities_custom_list

['unveils interactive tree',
 'tree',
 'text',
 'messages',
 'art gallery',
 'The spruce',
 'an antenna',
 'visitors',
 'The messages',
 'sculptor',
 'the tree',
 'broken plates',
 'light',
 'bulbs',
 'the 17th year',
 'the gallery',
 'an artist',
 'tree',
 'Artists',
 'tree',
 'previous years',
 'The plain',
 'spruce',
 'the gallery',
 'foyer',
 'light bulb',
 'adornments',
 'ordinary domestic ones',
 'string',
 'The plates',
 'the branches',
 'the children',
 'charity',
 'an assistant',
 'the late 1960s',
 'reputation',
 'a sculptor',
 'influential teachers',
 'decades',
 'photography',
 'mundane',
 'everyday subjects',
 'a cigarette',
 'packet',
 'the wonky leg',
 'a table',
 'fairytale celebration',
 'a concert',
 'the bicentennial',
 'the birth',
 'writer',
 'a three-day celebration',
 'the life',
 'the fairy-tale author',
 'a concert',
 'stadium',
 'Other stars',
 'the line-up',
 'the coming months',
 'family',
 'fairy',
 'tales',
 'any age',
 'the pure enjoyment',
 'the tale',
 

In [83]:
entities_cust_freq = (word for word in entities_custom_list if word[:1].isupper())
counter_entities_cust = Counter(entities_cust_freq)
print (counter_entities_cust.most_common(10))

[('The film', 24), ('TV', 21), ('This year', 12), ('The awards', 9), ('The actor', 6), ('The sequel', 6), ('The show', 5), ('Oscar', 5), ('The winner', 5), ('The book', 5)]


## Extracting information from Wikipedia

In [88]:
for entity in entities_list:
    try:
        summary = wikipedia.page(wikipedia.search(entity)[0]).summary
    except wikipedia.DisambiguationError:
        pass
    else:
        sent = nltk.sent_tokenize(summary)[0]
        index = sent.index(' is ') if ' is ' in sent else None
        if index is not None:
            cut = sent[index:]
            t_tokens = nltk.word_tokenize(cut)
            t_tagged = nltk.pos_tag(t_tokens)
            t_entities = nltk.ne_chunk(t_tagged, binary=True)
            t_entities_list = []
            for t in t_entities.subtrees():
                if t.label() == 'NE':
                    e_tuples = list(t)
                    s = ''
                    for tup in e_tuples:
                        s += tup[0] + ' '
                    t_entities_list.append(s[:-1])
            if len(t_entities_list) > 0:
                category = t_entities_list[0]
            else:
                category = 'Other'
        else:
                category = 'Other'
        print(entity, ': ', category)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


London :  England
Tate Britain :  Millbank
Tate :  United Kingdom
Tate :  United Kingdom
Tracey Emin :  Other
Norway :  Scandinavian Peninsula
ArtWorks :  RISC
Henry Moore :  Other
Copenhagen :  Denmark
Denmark :  Scandinavian
Parken :  Indre Østerbro
Andersen :  Greek
Emperor :  Other
New Clothes :  Danish
Little Mermaid :  Danish
Denmark :  Scandinavian
Crown Princess Mary :  Other
New York :  United States
Manhattan :  New York City
US :  Other
Harold Bloom :  American
Andersen :  Greek
Christian Andersen Award :  Danish
Odense :  Denmark
Christian Anderson School :  Anderson
Ugly Duckling :  Danish
Helena Christensen :  Danish
Harvey Keitel :  American
Sir Roger Moore :  Other
Cathy Freeman :  Australian
Jerry Springer :  American
Frank Capra :  Other


KeyboardInterrupt: 

In [84]:
for entity in entities_custom_list:
    try:
        summary = wikipedia.page(wikipedia.search(entity)[0]).summary
    except wikipedia.DisambiguationError:
        pass
    else:
        sent = nltk.sent_tokenize(summary)[0]
        index = sent.index(' is ')
        cut = sent[index:]
        t_tokens = nltk.word_tokenize(cut)
        t_tagged = nltk.pos_tag(t_tokens)
        t_entities = cp.parse(t_tagged)
        t_entities_list = []
        for subtree in t_entities.subtrees():
            if subtree.label() == 'NP':
              t_entities_list.append(' '.join(word for word, tag in subtree.leaves()))
        if len(t_entities_list) > 0:
            category = t_entities_list[0]
        else:
            category = 'Other'
        print(entity, ': ', category)

unveils interactive tree :  a metaphor
tree :  a perennial plant




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


messages :  a discrete unit
art gallery :  a building
The spruce :  a tree
an antenna :  an electrical device
The messages :  an interpretation
sculptor :  the branch
broken plates :  a Greek custom
light :  electromagnetic radiation
bulbs :  a short stem


ValueError: substring not found