In [1]:
import wikipedia
import nltk

In [2]:
page = wikipedia.page("Iceland")
text = page.content
text = text.replace('=','')

sentences = nltk.sent_tokenize(text)
print(f'Sentences: {len(sentences)}')

Sentences: 528


### POS tagging

In [3]:
tokens = [nltk.word_tokenize(sent) for sent in sentences]
tagged = [nltk.pos_tag(sent) for sent in tokens]

for item in tagged[1]:
    print(f'{item[0]} - {item[1]}')

The - DT
capital - NN
and - CC
largest - JJS
city - NN
is - VBZ
Reykjavík - NNP
, - ,
with - IN
Reykjavík - NNP
and - CC
the - DT
surrounding - VBG
areas - NNS
in - IN
the - DT
southwest - NN
of - IN
the - DT
country - NN
being - VBG
home - VBN
to - TO
over - IN
two-thirds - NNS
of - IN
the - DT
population - NN
. - .


### Named Entity Recognition (ne_chunk)

In [4]:
tokens_ner = nltk.word_tokenize(text)
tagged_ner = nltk.pos_tag(tokens_ner)
ne_chunked = nltk.ne_chunk(tagged_ner)

named_entities = {}
for entity in ne_chunked:
    if isinstance(entity, nltk.tree.Tree):
        tmp = " ".join([word for word, tag in entity.leaves()])
        ent = entity.label()
        named_entities[tmp] = ent
    else:
        continue


limit = 0
tmp = []
for key, value in named_entities.items():
    if key in tmp:
        continue
    else:
        tmp.append(key)
    print(f'{key} - {value}')
    if limit == 20:
        break
    limit +=1

Iceland - GPE
Ísland - PERSON
Nordic - GPE
North Atlantic - LOCATION
Europe - GPE
Reykjavík - GPE
Gulf Stream - ORGANIZATION
Arctic Circle - ORGANIZATION
Landnámabók - PERSON
Norwegian - GPE
Ingólfr Arnarson - PERSON
Norwegians - GPE
Gaelic - ORGANIZATION
Kalmar Union - ORGANIZATION
Norway - GPE
Denmark - PERSON
Sweden - GPE
Danish - GPE
Lutheranism - GPE
French - GPE
Napoleonic Wars - ORGANIZATION


### Custom NER

In [5]:
tokens_ner = nltk.word_tokenize(text)
tagged_ner = nltk.pos_tag(tokens_ner)
entities = []

for sentence in tagged:
    entity = []
    for tagged_entry in sentence:

        if tagged_entry[1].startswith("NNP") or (entity and tagged_entry[1].startswith("IN")):
            entity.append(tagged_entry)
        else:
            if entity and entity[-1][1].startswith("IN"):
                entity.pop()
            if(entity and " ".join(e[0] for e in entity)[0].isupper()):
                entities.append(" ".join(e[0] for e in entity))
            
            entity = []


my_set = set(entities)
for i in list(my_set)[:20]:
    print(i)

Upper Secondary School Act
Danish
Phoca vitulina
Index of Iceland-related
Guinness World Records
Hringvegur
Sport Sport
Icelandic Sign Language
English
Eurasian
European
Nesjavellir
Guardian
Fjallagrasa
UN
Iceland Plateau
Academy Award for Best Foreign Language Film
November
Garðar Svavarsson
Troll Peninsula in Northern Iceland


In [6]:
def get_wiki_classification(entity):
    try:
        page = wikipedia.page(entity)
    except wikipedia.exceptions.DisambiguationError as e:
        return 'something'
    sentences = nltk.sent_tokenize(page.summary)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentences[0]))
    is_word = False
    res = []
    for word in tagged_tokens:
        if word[1].startswith('VB'):
            is_word = True
        
        if is_word:
            if word[1].startswith('JJ') or word[1].startswith('NN'):
                res.append(word[0])
            else:
                if res:
                    return ' '.join(res)
                else:
                    continue

In [7]:
count = 0
tmp = []
for entity in entities:
    if entity in tmp:
        continue
    else:
        tmp.append(entity)
    if count <20:
        count+=1
    else:
        break
    res = get_wiki_classification(entity)
    print(f'{entity} = {res}')

Iceland = Nordic island country




  lis = BeautifulSoup(html).find_all('li')


Icelandic = something
North Atlantic = second largest
Europe = continent
Reykjavík = capital
Gulf Stream = warm
Arctic Circle = polar circles
Landnámabók = Landnáma
Ingólfr Arnarson = first permanent Norse settlers
Norwegians = North Germanic ethnic group native
Scandinavians = people
Gaelic = something
Althing = Althingi
Kalmar Union = personal union
Norway = Nordic country
Denmark = [ ˈdanmɑɡ ]
Sweden = Kingdom
Danish = something
Lutheranism = major branch
Revolution = fundamental
