# POS Tagging : 토큰에 품사를 부착 

In [2]:
import nltk
# Fetch the tokenizer and POS-tagger resources that the calls below rely on.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# NOTE(review): 'te' looks like a typo for 'the'; the recorded output tags it
# as 'JJ'. Confirm whether the misspelling is intentional before changing it.
simpleSentence = 'Seoul is te capital of Korea.'
# Split the sentence into word and punctuation tokens.
wordsinSentence = nltk.word_tokenize(simpleSentence)
print(wordsinSentence)
# Attach a part-of-speech tag to each token, producing (token, tag) pairs.
partsOfSpeechTags = nltk.pos_tag(wordsinSentence)
print(partsOfSpeechTags)

['Seoul', 'is', 'te', 'capital', 'of', 'Korea', '.']
[('Seoul', 'NNP'), ('is', 'VBZ'), ('te', 'JJ'), ('capital', 'NN'), ('of', 'IN'), ('Korea', 'NNP'), ('.', '.')]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Own Tagger

In [6]:
import nltk
def learnDefaultTagger(simpleSentence):
    """Print ``simpleSentence`` tokenized, with every token tagged 'NN'.

    Args:
        simpleSentence: Raw text to tokenize with ``nltk.word_tokenize``.

    Returns:
        None. The list of (token, tag) pairs is printed, not returned.
    """
    tokens = nltk.word_tokenize(simpleSentence)
    # DefaultTagger assigns the same tag to every token, regardless of the word.
    print(nltk.DefaultTagger('NN').tag(tokens))
def learnRETagger(simpleSentence):
    """Print ``simpleSentence`` tokenized and tagged by regular-expression rules.

    Each token is tagged by an ``nltk.RegexpTagger`` built from the
    (pattern, tag) pairs below; the final catch-all pattern yields ``None``
    for tokens no earlier rule recognises.

    Args:
        simpleSentence: Raw text to tokenize with ``nltk.word_tokenize``.

    Returns:
        None. The list of (token, tag) pairs is printed, not returned.
    """
    customPatterns = [
        (r'.*ing$', 'ADJECTIVE'),                # running
        (r'.*ly$', 'ADVERB'),                    # willingly
        (r'.*ion$', 'NOUN'),                     # intimation
        (r'(.*ate|.*en|is)$', 'VERB'),           # terminate, darken, lighten, is
        (r'^an$', 'INDEFINITE-ARTICLE'),         # an
        (r'^(with|on|at)$', 'PREPOSITION'),      # with, on, at
        # The fractional part is optional so that plain integers are tagged as
        # NUMBER as well as decimals.
        (r'^\-?[0-9]+(\.[0-9]+)?$', 'NUMBER'),   # -1.0, 12345.123, 42
        (r'.*$', None),                          # anything else stays untagged
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
def learnLookupTagger(simpleSentence):
    """Print ``simpleSentence`` tokenized and tagged from a fixed word->tag table.

    Args:
        simpleSentence: Raw text to tokenize with ``nltk.word_tokenize``.

    Returns:
        None. The list of (token, tag) pairs is printed, not returned.
    """
    # Hand-written lookup table handed to UnigramTagger as its model.
    wordToTag = {
        'Reykjavik': 'NNP',
        'is': 'VBZ',
        'an': 'DT',
        'amazing': 'JJ',
        'place': 'NN',
        'on': 'IN',
        'earth': 'NN',
        '.': '.',
    }
    tokens = nltk.word_tokenize(simpleSentence)
    print(nltk.UnigramTagger(model=wordToTag).tag(tokens))
# Driver: run the three tagger demos on one shared sentence, printing a blank
# line between their outputs.
if __name__ == '__main__':
    # The second sentence contains words ("I", "have", "visited") that are not
    # in the lookup tagger's mapping, so the demo also shows untagged tokens.
    testSentence = "Reykjavik is an amazing place on earth. I have visited Reykjavik"
    learnDefaultTagger(testSentence)
    print()
    learnRETagger(testSentence)
    print()
    learnLookupTagger(testSentence)

[('Reykjavik', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Reykjavik', 'NN')]

[('Reykjavik', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Reykjavik', None)]

[('Reykjavik', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Reykjavik', 'NNP')]


# Train3

In [13]:
import nltk
import pickle

def sampleData():
    """Return the training sentences used to build the lookup dictionary.

    Returns:
        A new list of four English sentences on every call.
    """
    sentences = (
        'Bangalore is the capital of Karnataka.',
        'Steve Jobs was the CEO of Apple.',
        'iPhone was Invented by Apple.',
        'Bookds can be purchased in Market.',
    )
    return list(sentences)
def buildDictionary():
    """Build a word -> POS-tag mapping from the sentences in ``sampleData()``.

    Every sentence is tokenized and tagged with ``nltk.pos_tag``. When a word
    occurs more than once, the tag from its last occurrence is kept.

    Returns:
        dict mapping each token to a POS tag string.
    """
    return {
        word: pos
        for sent in sampleData()
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sent))
    }

def saveMyTagger(tagger, fileName):
    """Pickle ``tagger`` into the file at ``fileName``.

    Args:
        tagger: Any picklable object (here, a trained tagger).
        fileName: Path of the file to create or overwrite, opened in binary mode.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If ``tagger`` cannot be pickled.
    """
    # The context manager closes the handle even when pickle.dump raises.
    with open(fileName, 'wb') as fileHandle:
        pickle.dump(tagger, fileHandle)
def saveMyTraining(fileName):
    """Build a lookup tagger from ``buildDictionary()`` and pickle it to ``fileName``.

    Args:
        fileName: Path handed on to ``saveMyTagger``.
    """
    saveMyTagger(nltk.UnigramTagger(model=buildDictionary()), fileName)
    
def loadMyTagger(fileName):
    """Load and return the object pickled in the file at ``fileName``.

    Args:
        fileName: Path of a pickle file, e.g. one written by ``saveMyTagger``.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # WARNING: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the handle instead of leaving it to the GC.
    with open(fileName, 'rb') as fileHandle:
        return pickle.load(fileHandle)
# Sentence to tag with the reloaded tagger. 'Iphone' is capitalised differently
# from the training word 'iPhone', and the recorded output leaves it tagged None.
sentence = 'Iphone is purchased by Steve Jobs in Bangalore Market'
# Pickle file that stores the trained tagger (relative to the working directory).
fileName = "myTagger.pickle"

# Train and persist the tagger, then read it back from disk and use it.
saveMyTraining(fileName)
myTagger = loadMyTagger(fileName)
print(myTagger.tag(nltk.word_tokenize(sentence)))

[('Iphone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NNP')]


# Pickle 예제 

In [10]:
import pickle
# Serialise a small list into 'list.txt' (binary pickle data despite the name).
ls = list('abc')
with open('list.txt', mode='wb') as f:
    pickle.dump(ls, f)

In [11]:
# Read the pickled list back from 'list.txt' and show it.
with open('list.txt', mode='rb') as f:
    data = pickle.load(f)
print(data)

['a', 'b', 'c']


# Grammar

In [24]:
import nltk
import string
from nltk.parse.generate import generate

# Build a small CFG as a list of production strings, then enumerate its words.
productions = [
    'ROOT -> WORD',
    # NOTE: the two adjacent literals are concatenated by Python into
    # 'WORD -> ', i.e. an empty production (see the printed grammar).
    'WORD -> ' '',
    'WORD -> NUMBER LETTER',
    'WORD -> LETTER NUMBER'
]
# One NUMBER production for each of the first four digits: '0'..'3'.
digits = list(string.digits)
for digit in digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w=digit))

# A single LETTER production with alternatives: 'a' | 'b' | 'c' | 'd'.
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append("LETTER -> '{w}'".format(w=letters))

# CFG.fromstring expects one production per line.
grammarString = '\n'.join(productions)

grammar = nltk.CFG.fromstring(grammarString)

print(grammar)

# Request at most 100 derivations with depth limit 5; the recorded output
# shows 33 generated words.
for sentence in generate(grammar,n=100,depth=5):
    # Despite the variable name, these words are not palindromes; each is the
    # generated terminals joined together with any spaces removed.
    palindrome = ''.join(sentence).replace(' ', '')
    print('생성된 단어: {},크기:{}'.format(palindrome,len(palindrome)))

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> 
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'
생성된 단어: ,크기:0
생성된 단어: 0a,크기:2
생성된 단어: 0b,크기:2
생성된 단어: 0c,크기:2
생성된 단어: 0d,크기:2
생성된 단어: 1a,크기:2
생성된 단어: 1b,크기:2
생성된 단어: 1c,크기:2
생성된 단어: 1d,크기:2
생성된 단어: 2a,크기:2
생성된 단어: 2b,크기:2
생성된 단어: 2c,크기:2
생성된 단어: 2d,크기:2
생성된 단어: 3a,크기:2
생성된 단어: 3b,크기:2
생성된 단어: 3c,크기:2
생성된 단어: 3d,크기:2
생성된 단어: a0,크기:2
생성된 단어: a1,크기:2
생성된 단어: a2,크기:2
생성된 단어: a3,크기:2
생성된 단어: b0,크기:2
생성된 단어: b1,크기:2
생성된 단어: b2,크기:2
생성된 단어: b3,크기:2
생성된 단어: c0,크기:2
생성된 단어: c1,크기:2
생성된 단어: c2,크기:2
생성된 단어: c3,크기:2
생성된 단어: d0,크기:2
생성된 단어: d1,크기:2
생성된 단어: d2,크기:2
생성된 단어: d3,크기:2


# PCFG

In [26]:
import nltk
from nltk.parse.generate import generate

# Probabilistic CFG: every production carries a probability in brackets, and
# the probabilities of productions sharing a left-hand side sum to 1.0.
productions = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 -> 'A' [1.0]",
    "P2 -> 'B' [0.5]",
    "P2 -> 'C' [0.5]",
    "P3 -> 'D' [0.3]",
    "P3 -> 'E' [0.3]",
    "P3 -> 'F' [0.4]",
    "P4 -> 'G' [0.9]",
    "P4 -> 'H' [0.1]",
]
# PCFG.fromstring expects one production per line.
grammarString = '\n'.join(productions)
grammar = nltk.PCFG.fromstring(grammarString)
print(grammar)

# Request the first 10 derivations with depth limit 5.
# NOTE(review): the recorded output lists strings in production order, which
# suggests generate() enumerates rather than samples by probability — confirm.
for sentence in generate(grammar,n=10,depth=5):
    # Despite the variable name, these strings are not palindromes.
    palindrome = ''.join(sentence).replace(' ','')
    print('문자열:{},크기:{}'.format(palindrome,len(palindrome)))

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]
문자열:A,크기:1
문자열:AB,크기:2
문자열:AC,크기:2
문자열:ABD,크기:3
문자열:ABE,크기:3
문자열:ABF,크기:3
문자열:ACD,크기:3
문자열:ACE,크기:3
문자열:ACF,크기:3
문자열:ABDG,크기:4


In [7]:
import nltk
import string
from nltk.parse.generate import generate

# Recursive CFG whose words are even-length palindromes over the digits 0-9.
productions = [
    'ROOT -> WORD',
    # NOTE: the two adjacent literals are concatenated by Python into
    # 'WORD -> ', i.e. an empty production that ends the recursion.
    'WORD -> ' ''
]
# Despite the name, the "alphabet" here is the ten decimal digits.
alphabets = list(string.digits)
for alphabet in alphabets:
    # Wrap an inner WORD in the same symbol on both sides: d WORD d.
    productions.append("WORD -> '{w}' WORD '{w}'".format(w=alphabet))

# CFG.fromstring expects one production per line.
grammarString = '\n'.join(productions)

grammar = nltk.CFG.fromstring(grammarString)

print(grammar)

# No `n` limit is given, so every derivation within depth 5 is generated; the
# recorded output shows palindromes of up to 6 characters.
for sentence in generate(grammar,depth =5):
    palindrome = ''.join(sentence).replace(' ','')
    print('Palindrome : {},size :{}'.format(palindrome,len(palindrome)))

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> 
    WORD -> '0' WORD '0'
    WORD -> '1' WORD '1'
    WORD -> '2' WORD '2'
    WORD -> '3' WORD '3'
    WORD -> '4' WORD '4'
    WORD -> '5' WORD '5'
    WORD -> '6' WORD '6'
    WORD -> '7' WORD '7'
    WORD -> '8' WORD '8'
    WORD -> '9' WORD '9'
Palindrome : ,size :0
Palindrome : 00,size :2
Palindrome : 0000,size :4
Palindrome : 000000,size :6
Palindrome : 001100,size :6
Palindrome : 002200,size :6
Palindrome : 003300,size :6
Palindrome : 004400,size :6
Palindrome : 005500,size :6
Palindrome : 006600,size :6
Palindrome : 007700,size :6
Palindrome : 008800,size :6
Palindrome : 009900,size :6
Palindrome : 0110,size :4
Palindrome : 010010,size :6
Palindrome : 011110,size :6
Palindrome : 012210,size :6
Palindrome : 013310,size :6
Palindrome : 014410,size :6
Palindrome : 015510,size :6
Palindrome : 016610,size :6
Palindrome : 017710,size :6
Palindrome : 018810,size :6
Palindrome : 019910,size :6
Palindrome : 

# 청킹

In [5]:
import nltk
# Fetch the resources downloaded for the named-entity chunker demo.
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = 'Namsan Botanical Garden is well known botanical garden in Seoul, Korea.'

# Split the text into sentences, then process each one separately.
sentences = nltk.sent_tokenize(text)

for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    # Group the tagged tokens into named-entity chunks; the recorded output
    # shows PERSON and GPE subtrees.
    chunks = nltk.ne_chunk(tags)
    print(chunks)

(S
  (PERSON Namsan/NNP)
  (PERSON Botanical/NNP Garden/NNP)
  is/VBZ
  well/RB
  known/VBN
  botanical/JJ
  garden/NN
  in/IN
  (GPE Seoul/NNP)
  ,/,
  (GPE Korea/NNP)
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
import nltk
# Fetch the tokenizer resource used by sent_tokenize / word_tokenize.
nltk.download('punkt')

text = 'Ravi is the CEO of a Company. He is very powerful public speaker also.'

# Chunk grammar: three noun-phrase (NP) rules, one per line.
grammar = '\n'.join(['NP: {<DT>*<NNP>}',  # zero or more determiners (DT), then a proper noun (NNP)
                    'NP: {<JJ>*<NN>}',   # zero or more adjectives (JJ), then a noun (NN)
                    'NP: {<NNP>+}'])     # one or more proper nouns (NNP)
# The parser depends only on the grammar, so build it once instead of
# rebuilding it for every sentence.
chunkparser = nltk.RegexpParser(grammar)
sentences = nltk.sent_tokenize(text)

for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    # Apply the NP rules to the tagged tokens and print the chunk tree.
    result = chunkparser.parse(tags)
    print(result)

(S
  (NP Ravi/NNP)
  is/VBZ
  (NP the/DT CEO/NNP)
  of/IN
  (NP a/DT Company/NNP)
  ./.)
(S
  He/PRP
  is/VBZ
  very/RB
  (NP powerful/JJ public/JJ speaker/NN)
  also/RB
  ./.)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
import nltk
# Fetch the corpora behind the conll2000 and treebank_chunk readers imported below.
nltk.download('treebank')
nltk.download('conll2000')
from nltk.corpus import conll2000
from nltk.corpus import treebank_chunk

def mySimpleChunker():
    """Return a RegexpParser that chunks runs of proper nouns (NNP) as NP."""
    return nltk.RegexpParser('NP: {<NNP>+}')
def test_nothing(data):
    """Print the evaluation score of a parser with an empty grammar on ``data``.

    Args:
        data: Gold-standard chunked sentences to evaluate against.
    """
    # The empty grammar serves as the baseline for the other chunker.
    print(nltk.RegexpParser('').evaluate(data))
def test_mysimplechunker(data):
    """Print the evaluation score of ``mySimpleChunker()`` on ``data``.

    Args:
        data: Gold-standard chunked sentences to evaluate against.
    """
    print(mySimpleChunker().evaluate(data))
    
# Two gold-standard chunked corpora to score the chunkers against.
datasets = [
    # Restrict the CoNLL-2000 test split to NP chunks.
    conll2000.chunked_sents('test.txt',chunk_types=['NP']),
    treebank_chunk.chunked_sents()
]
for dataset in datasets:
    # Score only the first 50 sentences of each corpus: first the empty
    # grammar, then the NNP-based chunker.
    test_nothing(dataset[:50])
    test_mysimplechunker(dataset[:50])


ChunkParse score:
    IOB Accuracy:  38.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  48.2%%
    Precision:     71.1%%
    Recall:        17.2%%
    F-Measure:     27.7%%
ChunkParse score:
    IOB Accuracy:  45.0%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  50.7%%
    Precision:     51.9%%
    Recall:         8.8%%
    F-Measure:     15.1%%


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
  return [tok for tok in self._regexp.split(text) if tok]


In [13]:
import nltk

def RDParserExample(grammar,textlist):
    """Parse each text with a recursive-descent parser; print and draw each tree.

    Args:
        grammar: Grammar passed to ``nltk.parse.RecursiveDescentParser``.
        textlist: Iterable of raw sentences; each is word-tokenized first.
    """
    rdParser = nltk.parse.RecursiveDescentParser(grammar)
    # map() is lazy, so each text is tokenized just before it is parsed.
    for tokens in map(nltk.word_tokenize, textlist):
        for parseTree in rdParser.parse(tokens):
            print(parseTree)
            parseTree.draw()
            
# Toy grammar in which the subject noun and 'is' form the NP, and the rest of
# the sentence forms the VP.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

# One sentence for each VP alternative in the grammar.
text = [
    "Tajmahal is in Agra",
    "Bangalore is the capital of Karnataka"
]

RDParserExample(grammar,text)

(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))
(S
  (NP (NNP Bangalore) (VBZ is))
  (VP (DT the) (NN capital) (IN of) (NNP Karnataka)))


In [14]:
import nltk

def SRParserExample(grammar, textlist):
    """Parse each text with a shift-reduce parser; print tokens and any trees.

    Args:
        grammar: Grammar passed to ``nltk.parse.ShiftReduceParser``.
        textlist: Iterable of raw sentences; each is word-tokenized first.
    """
    srParser = nltk.parse.ShiftReduceParser(grammar)
    for rawText in textlist:
        tokens = nltk.word_tokenize(rawText)
        # The tokens are printed even when no parse tree is produced.
        print(tokens)
        for parseTree in srParser.parse(tokens):
            print(parseTree)
            parseTree.draw()
            
# NOTE(review): in the recorded output only the second sentence yields a tree;
# the first prints its tokens but no parse. Presumably the shift-reduce parser
# commits to a reduction that cannot be completed — confirm.
text = [
    "Bangalore is the capital of Karnataka",
    "Tajmahal is in Agra",
]

# Same toy grammar as the recursive-descent example.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

SRParserExample(grammar,text)

['Bangalore', 'is', 'the', 'capital', 'of', 'Karnataka']
['Tajmahal', 'is', 'in', 'Agra']
(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))


### Parsing DG

In [16]:
import nltk
# Dependency grammar: each rule reads 'head' -> 'dependent'.
grammar = nltk.grammar.DependencyGrammar.fromstring("""
'savings' -> 'small'
'yield' -> 'savings'
'gains' -> 'large'
'yield' -> 'gains'
""")
sentence = 'small savings yield large gains'
dp = nltk.parse.ProjectiveDependencyParser(grammar)
# The sentence is split on whitespace (no NLTK tokenizer) before parsing.
for t in sorted(dp.parse(sentence.split())):
    print(t)
    # NOTE(review): the recorded run ends with KeyboardInterrupt after the tree
    # is printed; presumably draw() opened a window that blocked the kernel — confirm.
    t.draw()

(yield (savings small) (gains large))


KeyboardInterrupt: 

### Parsing Chart : 차트 구문 분석

In [19]:
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

# Grammar for the two example sentence shapes: T1 is "NNP is", and T4 is
# either "IN NNP" (T3) or "DT NN IN NNP" (T2 T3).
grammar = CFG.fromstring("""
S -> T1 T4
T1 -> NNP VBZ
T2 -> DT NN
T3 -> IN NNP
T4 -> T3 | T2 T3
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")

# trace=True makes the parser print each chart edge as it is added (the table
# in the recorded output).
cp = ChartParser(grammar,BU_LC_STRATEGY,trace=True)

sentence = "Bangalore is the capital of Karnataka"

# Plain whitespace split; this sentence has no punctuation to separate.
tokens = sentence.split()
chart = cp.chart_parse(tokens)
# Collect every complete parse rooted at the grammar's start symbol.
parses = list(chart.parses(grammar.start()))
print('Total Edges :',len(chart.edges()))
for tree in parses:
    print(tree)
    tree.draw()

|.Bangal.  is  . the  .capita.  of  .Karnat.|
|[------]      .      .      .      .      .| [0:1] 'Bangalore'
|.      [------]      .      .      .      .| [1:2] 'is'
|.      .      [------]      .      .      .| [2:3] 'the'
|.      .      .      [------]      .      .| [3:4] 'capital'
|.      .      .      .      [------]      .| [4:5] 'of'
|.      .      .      .      .      [------]| [5:6] 'Karnataka'
|[------]      .      .      .      .      .| [0:1] NNP -> 'Bangalore' *
|[------>      .      .      .      .      .| [0:1] T1 -> NNP * VBZ
|.      [------]      .      .      .      .| [1:2] VBZ -> 'is' *
|[-------------]      .      .      .      .| [0:2] T1 -> NNP VBZ *
|[------------->      .      .      .      .| [0:2] S  -> T1 * T4
|.      .      [------]      .      .      .| [2:3] DT -> 'the' *
|.      .      [------>      .      .      .| [2:3] T2 -> DT * NN
|.      .      .      [------]      .      .| [3:4] NN -> 'capital' *
|.      .      [-------------]      .      .| [2: