In [51]:
import nltk
import numpy as np
import pandas as pd

In [52]:
def preprocess(doc):
    # Sentence segmenter
    sentences = nltk.sent_tokenize(doc)
    # Word tokenizer
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    # Parts of speech tagger
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    
    return sentences

In [74]:
def NPchunking(sentence):
    # Naive chunking 
    grammar = "NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    print(cp.parse(sentence))

In [75]:
def Regexchunking(sentence):
    # Naive chunking 
    grammar = r"""
        CHUNK:  {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
                {<NNP>+}                # chunk sequences of proper nouns
    """
    cp = nltk.RegexpParser(grammar)
    print(cp.parse(sentence))

In [76]:
df = pd.read_json('infodem_docs.json', encoding='utf-8')
for doc in df['content']:
    sentences = preprocess(doc)
    for sentence in sentences:
        Regexchunking(sentence)

(S
  (CHUNK ISABELA/NNP)
  ,/,
  (CHUNK Philippines/NNP)
  –/VBZ
  (CHUNK The/DT rainy/NN)
  (CHUNK season/NN)
  has/VBZ
  just/RB
  started/VBN
  ,/,
  but/CC
  (CHUNK the/DT municipality/NN)
  of/IN
  (CHUNK Luna/NNP)
  in/IN
  (CHUNK Isabela/NNP)
  (CHUNK province/NN)
  in/IN
  the/DT
  northern/JJ
  Philippines/NNPS
  already/RB
  declared/VBD
  (CHUNK state/NN)
  of/IN
  (CHUNK calamity/NN)
  after/IN
  their/PRP$
  (CHUNK health/NN)
  (CHUNK office/NN)
  recorded/VBD
  54/CD
  cases/NNS
  of/IN
  (CHUNK dengue/NN)
  in/IN
  just/RB
  (CHUNK the/DT month/NN)
  of/IN
  (CHUNK June/NNP)
  ./.)
(S
  (CHUNK Dengue/NNP)
  is/VBZ
  (CHUNK a/DT fever/NN)
  ,/,
  usually/RB
  fatal/JJ
  ,/,
  caused/VBN
  by/IN
  (CHUNK a/DT bite/NN)
  of/IN
  (CHUNK Aedes/NNP)
  (CHUNK mosquito/NN)
  ./.)
(S
  It/PRP
  targets/VBZ
  (CHUNK the/DT immune/JJ system/NN)
  of/IN
  (CHUNK the/DT body/NN)
  ./.)
(S
  (CHUNK Dr/NNP Claire/NNP Francisco/NNP)
  ,/,
  (CHUNK Luna/NNP ’/NNP)
  s/VBZ
  (CHUNK health

  ./.)
(S
  These/DT
  barangays/NNS
  include/VBP
  :/:
  Caloocan/JJ
  (CHUNK City/NNP)
  :/:
  (CHUNK Bagong/NNP Barrio/NNP)
  and/CC
  (CHUNK Dagat-dagatan/NNP Las/NNP Piñas/NNP City/NNP)
  :/:
  (CHUNK Pamplona/NNP Uno/NNP)
  ,/,
  (CHUNK Talon/NNP Dos/NNP)
  ,/,
  and/CC
  (CHUNK Talon/NNP Singko/NNP Malabon/NNP City/NNP)
  :/:
  (CHUNK Longos/NNP)
  and/CC
  (CHUNK Tonsuya/NNP Manila/NNP)
  :/:
  (CHUNK Binondo/NNP)
  ,/,
  (CHUNK Quiapo/NNP)
  ,/,
  (CHUNK Port/NNP Area/NNP)
  ,/,
  (CHUNK Sampaloc/NNP)
  ,/,
  (CHUNK Sta/NNP)
  ./.)
(S (CHUNK Cruz/NNP) ,/, (CHUNK Sta/NNP) ./.)
(S
  (CHUNK Mesa/NNP)
  ,/,
  and/CC
  (CHUNK Tondo/NNP Muntinlupa/NNP City/NNP)
  :/:
  (CHUNK Alabang/NNP)
  and/CC
  (CHUNK Putatan/NNP Navotas/NNP)
  :/:
  (CHUNK
    North/NNP
    Bay/NNP
    Boulevard/NNP
    South/NNP
    Parañaque/NNP
    City/NNP)
  :/:
  (CHUNK Don/NNP Bosco/NNP)
  and/CC
  (CHUNK Moonwalk/NNP Taguig/NNP City/NNP)
  :/:
  (CHUNK Bagong/NNP Tanyag/NNP Valenzuela/NNP City/NNP)
  

In [79]:
from nltk.corpus import conll2000
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [None]:
from nltk.corpus import conll2000
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents): [1]
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data) [2]

    def parse(self, sentence): [3]
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
    
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))