In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
simpleSentence="Seoul is the capital of Korea."
wordsInSentence=nltk.word_tokenize(simpleSentence)
print(wordsInSentence)
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)
print(partsOfSpeechTags)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['Seoul', 'is', 'the', 'capital', 'of', 'Korea', '.']
[('Seoul', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('Korea', 'NNP'), ('.', '.')]


In [9]:
def learnDefaultTagger(simpleSentence):
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger("NN")
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
    
def learnRETagger(simpleSentence):
    customPatterns = [
        (r'.&ing$', 'ADJECTIVE'),
        (r'.*ly$', 'ADVERB'),
        (r'.*ion$', 'NOUN'),
        (r'(.*|.*en|is)$', 'VERB'),
        (r'^an$', 'INDEFINITE-ARTICLE'),
        (r'^(with|on|at)$', 'PREPOSITION'),
        (r'^\-?[0-9]+(\.[0-9]+)$', 'NUMBER'),
        (r'.*$',None)        
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

def learnLookupTagger(simpleSentence):
    mapping = {
        '.':'.', 'place':'NN', 'on':'IN','earth':'NN', 
        'Reykjavik':'NNP', 'is':'VBZ','an':'DT', 'amazing':'JJ'
    }
    tagger = nltk.UnigramTagger(model=mapping)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

if __name__ == '__main__':
    testSentence = 'Reykjavik is an amazing place on eargh. I have visited Reykjavik'
    learnDefaultTagger(testSentence)
    learnRETagger(testSentence)
    learnLookupTagger(testSentence)

[('Reykjavik', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('eargh', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Reykjavik', 'NN')]
[('Reykjavik', 'VERB'), ('is', 'VERB'), ('an', 'VERB'), ('amazing', 'VERB'), ('place', 'VERB'), ('on', 'VERB'), ('eargh', 'VERB'), ('.', 'VERB'), ('I', 'VERB'), ('have', 'VERB'), ('visited', 'VERB'), ('Reykjavik', 'VERB')]
[('Reykjavik', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('eargh', None), ('.', ','), ('I', None), ('have', None), ('visited', None), ('Reykjavik', 'NNP')]


# pickle 예제
#### 자체 태거를 학습시켜 모델로 저장

In [12]:
import pickle

def sampleData():
    return [
        "Bangalore is the capital of Karnataka.",
        "Steve Jobs was the CEO of Apple.",
        "iPhone was Invented by Apple.",
        "Books can be purchased in Market.",
    ]

def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
            # 토큰 별로 품사가 부착된 딕셔너리가 반환
    return dictionary

def saveMyTagger(tagger, fileName):
    fileHandle = open(fileName,"wb")
    pickle.dump(tagger,fileHandle)
    fileHandle.close()
    
def saveMyTraining(fileName):
    tagger = nltk.UnigramTagger (model = buildDictionary())
    # buildDictionary() - 기존의 딕셔너리를 이용해 만든다는 뜻
    saveMyTagger(tagger, fileName)

def loadMyTagger(fileName):
    return pickle.load(open(fileName,'rb'))

sentence = 'Iphone is purchased by Steve Jobs in Bangalore Market'
fileName = 'myTagger.pickle'

saveMyTraining(fileName)

myTagger = loadMyTagger(fileName)

print(myTagger.tag(nltk.word_tokenize(sentence)))

[('Iphone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NNP')]


# Grammar CFG(Context-Free Grammar)
### 토큰들 사이의 의존도나 구문 분석에 사용

In [48]:
import string
from nltk.parse.generate import generate

productions = [
    "ROOT -> WORD",
    "WORD -> ' '",
    "WORD -> NUMBER LETTER",
    "WORD -> LETTER NUMBER",
]

digits = list(string.digits)
for digit in digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w=digit))
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append("LETTER -> '{w}'".format(w=letters))

grammarString = "\n".join(productions)

grammar = nltk.CFG.fromstring(grammarString)

print(grammar)

for sentence in generate(grammar, depth = 3):
    palindrome = "".join(sentence).replace(" ","")
    print("생성된 단어: {}, 크기: {}".format(palindrome, len(palindrome)))

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'
생성된 단어: , 크기: 0


In [53]:
productions = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 -> 'A' [1.0]",
    "P2 -> 'B' [0.5]",
    "P2 -> 'C' [0.5]",
    "P3 -> 'D' [0.3]",
    "P3 -> 'E' [0.3]",
    "P3 -> 'F' [0.4]",
    "P4 -> 'G' [0.9]",
    "P4 -> 'H' [0.1]",
]

grammarString = "\n".join(productions)
grammar = nltk.PCFG.fromstring(grammarString)
print(grammar)
for sentence in generate(grammar, depth=5):
    palindrome = "".join(sentence).replace(" ","")
    print("문자열 : {}, 크기 : {}".format(palindrome, len(palindrome)))

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]
문자열 : A, 크기 : 1
문자열 : AB, 크기 : 2
문자열 : AC, 크기 : 2
문자열 : ABD, 크기 : 3
문자열 : ABE, 크기 : 3
문자열 : ABF, 크기 : 3
문자열 : ACD, 크기 : 3
문자열 : ACE, 크기 : 3
문자열 : ACF, 크기 : 3
문자열 : ABDG, 크기 : 4
문자열 : ABDH, 크기 : 4
문자열 : ABEG, 크기 : 4
문자열 : ABEH, 크기 : 4
문자열 : ABFG, 크기 : 4
문자열 : ABFH, 크기 : 4
문자열 : ACDG, 크기 : 4
문자열 : ACDH, 크기 : 4
문자열 : ACEG, 크기 : 4
문자열 : ACEH, 크기 : 4
문자열 : ACFG, 크기 : 4
문자열 : ACFH, 크기 : 4


# Recursive CFG

In [56]:
productions = [
    "ROOT -> WORD",
    "WORD -> ' '"
]

alphabets = list(string.digits)
for alphabet in alphabets:
    productions.append("WORD -> '{w}' WORD '{w}'".format(w=alphabet))
    
grammarString = "\n".join(productions)
grammar = nltk.CFG.fromstring(grammarString)
                # fromstring : 문자열로부터 규칙을 뽑아낸다!!
print(grammar)
for sentence in generate(grammar, n=20,depth=5):
    palindrome = "".join(sentence).replace(" ", "")
    print("Palindrome : {}, Size : {}".format(palindrome, len(palindrome)))

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> '0' WORD '0'
    WORD -> '1' WORD '1'
    WORD -> '2' WORD '2'
    WORD -> '3' WORD '3'
    WORD -> '4' WORD '4'
    WORD -> '5' WORD '5'
    WORD -> '6' WORD '6'
    WORD -> '7' WORD '7'
    WORD -> '8' WORD '8'
    WORD -> '9' WORD '9'
Palindrome : , Size : 0
Palindrome : 00, Size : 2
Palindrome : 0000, Size : 4
Palindrome : 0110, Size : 4
Palindrome : 0220, Size : 4
Palindrome : 0330, Size : 4
Palindrome : 0440, Size : 4
Palindrome : 0550, Size : 4
Palindrome : 0660, Size : 4
Palindrome : 0770, Size : 4
Palindrome : 0880, Size : 4
Palindrome : 0990, Size : 4
Palindrome : 11, Size : 2
Palindrome : 1001, Size : 4
Palindrome : 1111, Size : 4
Palindrome : 1221, Size : 4
Palindrome : 1331, Size : 4
Palindrome : 1441, Size : 4
Palindrome : 1551, Size : 4
Palindrome : 1661, Size : 4
