In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

# Sentence : I left my phone on the left side of the room.

In [29]:
# Split the example sentence into word/punctuation tokens.
# Note that "left" occurs twice in this sentence, so it is a handy
# example for part-of-speech tagging in the next cell.
text = nltk.word_tokenize("I left my phone on the left side of the room.")

In [30]:
# Attach a part-of-speech tag to every token; the result is a list of
# (token, tag) tuples, which is the input format the chunkers below consume.
tagged_text = nltk.pos_tag(text)
print(tagged_text)

[('I', 'PRP'), ('left', 'VBD'), ('my', 'PRP$'), ('phone', 'NN'), ('on', 'IN'), ('the', 'DT'), ('left', 'JJ'), ('side', 'NN'), ('of', 'IN'), ('the', 'DT'), ('room', 'NN'), ('.', '.')]


In [31]:
# Chunk grammar for noun phrases (NP), written as a regex over POS tags:
#   <DT>?  an optional determiner
#   <JJ>*  zero or more adjectives
#   <NN>   a noun
# Text inside { } describes what to *include* in a chunk.
grammar = "NP: {<DT>?<JJ>*<NN>}"

In [32]:
# Sanity check: the grammar handed to RegexpParser is just a plain string.
type(grammar)

str

In [33]:
# Build a regex-based chunk parser from the NP grammar and apply it to the
# POS-tagged tokens; the result is printed as a tree in the next cell.
chunker = nltk.RegexpParser(grammar)
chunked_text = chunker.parse(tagged_text)

In [34]:
# Chunking => grouping tokens into phrases.
# Each resulting phrase is called a "chunk".
# NP : Noun Phrase

print(chunked_text)

(S
  I/PRP
  left/VBD
  my/PRP$
  (NP phone/NN)
  on/IN
  (NP the/DT left/JJ side/NN)
  of/IN
  (NP the/DT room/NN)
  ./.)


In [45]:
# Switch to a short three-word phrase for the chinking examples below.
# This rebinds `text`, replacing the tokens of the earlier sentence.
text = nltk.word_tokenize("the left side")

In [46]:
# Re-tag the short phrase; this rebinds `tagged_text`, which the three
# chinking cases below all read.
tagged_text = nltk.pos_tag(text)
print(tagged_text)

[('the', 'DT'), ('left', 'JJ'), ('side', 'NN')]


In [47]:
# Chinking Case 1 : when the chink pattern matches the entire chunk,
# the whole chunk is removed.
# {<.*>+}         : first chunk everything
# }<DT|JJ|NN>+{   : then chink (exclude) every DT/JJ/NN run -> nothing stays chunked

grammar1 = """NP: {<.*>+}
               }<DT|JJ|NN>+{"""

chunker = nltk.RegexpParser(grammar1)
chunked_text1 = chunker.parse(tagged_text)
print("Case 1 : ", chunked_text1)

Case 1 :  (S the/DT left/JJ side/NN)


In [48]:
# Chinking Case 2 : when the chink pattern matches tokens in the middle of a
# chunk, those tokens are taken out and the chunk is split into two chunks.
# {<.*>+} : chunk everything
# }<JJ>+{ : defines the part we remove (chink) from the chunk


grammar2 = """NP: {<.*>+} 
               }<JJ>+{"""

chunker = nltk.RegexpParser(grammar2)
chunked_text2 = chunker.parse(tagged_text)
print("Case 2 : ", chunked_text2)

Case 2 :  (S (NP the/DT) left/JJ (NP side/NN))


In [49]:
# Chinking Case 3 : when the chink pattern matches tokens at the edge of a
# chunk, those tokens are removed and a smaller chunk remains.
# Here the leading DT/JJ tokens are chinked, so only the final noun stays
# inside the NP chunk.

grammar3 = """NP: {<.*>+}
               }<DT|JJ>+{"""

chunker = nltk.RegexpParser(grammar3)
chunked_text3 = chunker.parse(tagged_text)
print("Case 3 : ", chunked_text3)

Case 3 :  (S the/DT left/JJ (NP side/NN))


In [51]:
# New example for named-entity recognition: the sentence uses "polish"
# (lower-case) and "Polish" (capitalised).
# Both `text` and `tagged_text` are rebound here.
text = nltk.word_tokenize("He's got to polish up his Polish for his job.")
tagged_text = nltk.pos_tag(text)
print(tagged_text)

[('He', 'PRP'), ("'s", 'VBZ'), ('got', 'VBD'), ('to', 'TO'), ('polish', 'VB'), ('up', 'RP'), ('his', 'PRP$'), ('Polish', 'NN'), ('for', 'IN'), ('his', 'PRP$'), ('job', 'NN'), ('.', '.')]


In [52]:
# Run NLTK's named-entity chunker over the POS-tagged tokens and print the
# resulting tree; recognised entities appear as labelled subtrees.
# NOTE(review): in the recorded output "Polish" is labelled GPE even though
# the sentence appears to use it for the language — treat the label with caution.
namedEntity = nltk.ne_chunk(tagged_text)
print(namedEntity)

(S
  He/PRP
  's/VBZ
  got/VBD
  to/TO
  polish/VB
  up/RP
  his/PRP$
  (GPE Polish/NN)
  for/IN
  his/PRP$
  job/NN
  ./.)
