In [1]:
#import required libraries
from pprint import pprint

import nltk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [2]:
# Sample legal-drafting query used as the input sentence for the cells below.
text = "Can you draft a clause for a non-disclosure agreement under Indian law to protect a company's confidential information?"

In [3]:
def preprocess(sent):
    """Tokenize a raw sentence and attach a part-of-speech tag to each token.

    Args:
        sent: The sentence to process, as a plain string.

    Returns:
        The output of ``nltk.pos_tag`` for the word tokens of ``sent``,
        i.e. a list of ``(token, tag)`` pairs.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [4]:
# Tokenize and POS-tag the sample query; `sent` is reused by the chunk parser further down.
sent = preprocess(text)
# Bare expression so the notebook echoes the tagged tokens as the cell output.
sent

[('Can', 'MD'),
 ('you', 'PRP'),
 ('draft', 'VB'),
 ('a', 'DT'),
 ('clause', 'NN'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('non-disclosure', 'JJ'),
 ('agreement', 'NN'),
 ('under', 'IN'),
 ('Indian', 'JJ'),
 ('law', 'NN'),
 ('to', 'TO'),
 ('protect', 'VB'),
 ('a', 'DT'),
 ('company', 'NN'),
 ("'s", 'POS'),
 ('confidential', 'JJ'),
 ('information', 'NN'),
 ('?', '.')]

In [5]:
# Chunk grammar for noun phrases (NP): an optional determiner (DT), followed by
# any number of adjectives (JJ), followed by one token tagged NN.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [6]:
# Build a regular-expression chunker from the NP grammar and apply it to the
# POS-tagged sentence; tokens matching the grammar are grouped under NP nodes.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# print() emits the bracketed text form of the resulting tree.
print(cs)

(S
  Can/MD
  you/PRP
  draft/VB
  (NP a/DT clause/NN)
  for/IN
  (NP a/DT non-disclosure/JJ agreement/NN)
  under/IN
  (NP Indian/JJ law/NN)
  to/TO
  protect/VB
  (NP a/DT company/NN)
  's/POS
  (NP confidential/JJ information/NN)
  ?/.)


In [7]:
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples, one per
# token: B-NP opens a noun phrase, I-NP continues it, O is outside any chunk.
# tree2conlltags and pprint are imported in the notebook's top import cell.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Can', 'MD', 'O'),
 ('you', 'PRP', 'O'),
 ('draft', 'VB', 'O'),
 ('a', 'DT', 'B-NP'),
 ('clause', 'NN', 'I-NP'),
 ('for', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('non-disclosure', 'JJ', 'I-NP'),
 ('agreement', 'NN', 'I-NP'),
 ('under', 'IN', 'O'),
 ('Indian', 'JJ', 'B-NP'),
 ('law', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('protect', 'VB', 'O'),
 ('a', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ("'s", 'POS', 'O'),
 ('confidential', 'JJ', 'B-NP'),
 ('information', 'NN', 'I-NP'),
 ('?', '.', 'O')]


In [9]:
# Fetch the NLTK data packages for the named-entity chunking cell that follows
# (presumably the resources nltk.ne_chunk loads — confirm against the installed
# NLTK version).
# NOTE(review): on a fresh environment this cell has to run before the
# ne_chunk cell; keep it ahead of that cell when re-running the notebook.
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\sasuh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sasuh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [8]:
# Named-entity recognition: run NLTK's named-entity chunker over the POS-tagged
# tokens of the sample query. preprocess() already performs tokenization and
# tagging, so it is reused here instead of repeating that pipeline inline.
ne_tree = nltk.ne_chunk(preprocess(text))
# print() emits the bracketed text form of the tree; recognized entities appear
# as labelled subtrees.
print(ne_tree)

(S
  Can/MD
  you/PRP
  draft/VB
  a/DT
  clause/NN
  for/IN
  a/DT
  non-disclosure/JJ
  agreement/NN
  under/IN
  (GPE Indian/JJ)
  law/NN
  to/TO
  protect/VB
  a/DT
  company/NN
  's/POS
  confidential/JJ
  information/NN
  ?/.)
