Nama  : San Antonio Limbong

NIM   : 12S19033

### 1 Persiapan

In [1]:
import nltk, math, sys
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from sklearn.preprocessing import normalize
from collections import defaultdict
import sklearn_crfsuite

### 2 Part of Speech Tagging
#### 2.1 Hidden Markov Model

Hidden Markov Model (HMM) adalah algoritma pemodelan urutan generatif (sekuens urutan
Naive Bayes). HMM mengharuskan kita untuk mempelajari parameter model, probabilitas
transisi dari satu POS tag ke POS tag lainnya dan probabilitas emisi setiap fitur kata yang diberi POS tag, hanya dari kalimat yang diamati. POS tag untuk kata-kata diasumsikan disembunyikan
(mis. tidak diberikan). 

In [2]:
def features(sentence, index):
    
    currWord = sentence[index][0]
    
    if (index > 0):
        prevWord = sentence[index - 1][0]
    else:
        prevWord = '<START>'
        
    if (index < len(sentence)-1):
        nextWord = sentence[index + 1][0]
    else:
        nextWord = '<END>'
    
    return {
        'word' : currWord,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'curr_is_title': currWord.istitle(),
        'prev_is_title': prevWord.istitle(),
        'next_is_title': nextWord.istitle(),
        'curr_is_lower': currWord.islower(),
        'prev_is_lower': prevWord.islower(),
        'next_is_lower': nextWord.islower(),
        'curr_is_upper': currWord.isupper(),
        'prev_is_upper': prevWord.isupper(),
        'next_is_upper': nextWord.isupper(),
        'curr_is_digit': currWord.isdigit(),
        'prev_is_digit': prevWord.isdigit(),
        'next_is_digit': nextWord.isdigit(),
        'curr_prefix-1': currWord[0],
        'curr_prefix-2': currWord[:2],
        'curr_prefix-3': currWord[:3],
        'curr_suffix-1': currWord[-1],
        'curr_suffix-2': currWord[-2:],
        'curr_suffix-3': currWord[-3:],
        
        'prev_prefix-1': prevWord[0],
        'prev_prefix-2': prevWord[:2],
        'prev_prefix-3': prevWord[:3],
        'prev_suffix-1': prevWord[-1],
        'prev_suffix-2': prevWord[-2:],
        'prev_suffix-3': prevWord[-3:],
        
        'next_prefix-1': nextWord[0],
        'next_prefix-2': nextWord[:2],
        'next_prefix-3': nextWord[:3],
        'next_suffix-1': nextWord[-1],
        'next_suffix-2': nextWord[-2:],
        'next_suffix-3': nextWord[-3:],
        
        'prev_word': prevWord,
        'next_word': nextWord,
    }

Blok kode diatas merupakan fungsi Python untuk menghasilkan fitur pada setiap kata (posisi "indeks") dalam "kalimat".

In [3]:
def computeTagProbs(trainLabels, tagsDict):
    numTags = len(tagsDict)
    tagProbs = np.zeros(numTags)
    for sentenceLabels in trainLabels:
        for tag in sentenceLabels:
            tagProbs[tagsDict[tag]] += 1
    tagProbs += 1
    return tagProbs / np.sum(tagProbs)

def computeStartProbs(trainLabels, tagsDict):
    numTags = len(tagsDict)
    startProbs = np.zeros(numTags)
    for sentenceLabels in trainLabels:
        startTag = sentenceLabels[0]
        startProbs[tagsDict[startTag]] += 1
    startProbs += 1
    return startProbs/np.sum(startProbs)

def computeTransitionProbabilities(trainLabels, tagsDict):
    numTags = len(tagsDict)
    transMat = np.zeros(shape=(numTags, numTags))
    for sentenceLabels in trainLabels:
        for i in range(len(sentenceLabels)-1):
            tag1 = tagsDict[sentenceLabels[i]]
            tag2 = tagsDict[sentenceLabels[i+1]]
            transMat[tag1, tag2] += 1
    normalized_transmat = normalize(transMat+1, axis=1, norm='l1')
    return normalized_transmat

Blok kode diatas adalah fungsi untuk menghitung probabilitas setiap POS tag dalam kalimat
pelatihan, probabilitas setiap POS tag menjadi tag awal dalam sebuah kalimat dan probabilitas transisi dari satu POS tag ke POS tag lain untuk semua kalimat pelatihan

In [4]:
def computeEmissionProbabilities(trainFeatures, trainLabels, tagsDict):
    numTags =  len(tagsDict)
    emissionDict = defaultdict(lambda: defaultdict(int))
    uniqueKeys = set()
    for i in range(len(trainLabels)):
        sentenceFeatures = trainFeatures[i]
        sentenceLabels = trainLabels[i]
        for j in range(len(sentenceLabels)):
            tag = sentenceLabels[j]
            for key, val in sentenceFeatures[j].items():
                transformedKey = str(key) + "__" + str(val)
                uniqueKeys.add(transformedKey)
                emissionDict[tag][transformedKey] += 1
    emissionMat = np.zeros(shape=(numTags, len(uniqueKeys)))
    featuresDict = {}
    for index, key in enumerate(uniqueKeys):
        featuresDict[key] = index
    for tag in tagsDict.keys():
        i = tagsDict[tag]
        j = featuresDict[key]
        emissionMat[i, j] = emissionDict[tag][key]
    normalized_emissionMat = normalize(emissionMat+1, axis=1, norm='l1')
    return normalized_emissionMat, featuresDict
    

Blok kode diatas merupakan fungsi untuk menghitung probabilitas emisi, yaitu diberikan
POS tag tertentu, tentukan probabilitas kondisional untuk mengamati fitur kata pada tag
tersebut.

In [5]:
def predictTags(testFeatures, tagProbs, startProbs, transMat, emissionMat, tagsDict, featuresDict):
    numTags = len(tagsDict)
    bestTags = []
    for sentenceFeatures in testFeatures:
        bestTagsSentence = []
        lenSentence = len(sentenceFeatures)
        probMatrix, tagMatrix = np.zeros(shape = (lenSentence, numTags)),np.zeros(shape=(lenSentence, numTags))
        for index in range(lenSentence):
            feat = sentenceFeatures[index]
            for curr in range(numTags):
                emissionProb = 0
                for key, val in feat.items():
                    transformedKey = str(key) + "__" + str(val)
                    if transformedKey in featuresDict:
                        emissionProb += \
                        math.log(emissionMat[curr, featuresDict[transformedKey]])
                    else:
                        emissionProb -= math.log(len(featuresDict))
                emissionProb += math.log(tagProbs[curr])
                maxProb = -sys.float_info.max
                maxProbTag = -1
                if index == 0:
                    probMatrix[index][curr] = \
                    math.log(startProbs[curr]) + emissionProb
                    tagMatrix[index][curr] = -1
                else:
                    for prev in range(numTags):
                        tagProb = \
                        math.log(transMat[prev, curr]) + \
                        math.log(probMatrix[index - 1][prev])
                        if (tagProb > maxProb):
                            maxProb = tagProb
                            maxProbTag = prev
                    maxProb += emissionProb
                    probMatrix[index][curr] = maxProb
                    tagMatrix[index][curr] = maxProbTag
            const = -np.mean(probMatrix[index])
            func = np.vectorize(lambda t: math.exp(t+const))
            probMatrix[index] = func(probMatrix[index])
            probMatrix = normalize(probMatrix, axis=1, norm='l1')
        prevBestTag = None
        for index in reversed(range(lenSentence+1)):
            if index == lenSentence:
                bestTag = probMatrix[index-1].argmax()
            else:
                bestTag = tagMatrix[index][prevBestTag]
            prevBestTag = int(bestTag)
            bestTagsSentence.append(prevBestTag)
        bestTags.append(list(reversed(bestTagsSentence))[1:])
    return bestTags

- 'tagsDict' adalah inverted index untuk POS tag, 
- 'featuresDict' adalah inverted index untuk fitur kata.
- Modul 'predictTags' berfungsi untuk menghitung penugasan POS tag yang paling mungkin untuk urutan kata dalam kalimat menggunakan algoritma Viterbi.

In [6]:
def transformDatasetSequence(sentences):
    wordFeatures, wordLabels = [], []
    for sent in sentences:
        feats, labels = [], []
        for index in range(len(sent)):
            feats.append(features(sent, index))
            labels.append(sent[index][1])
        wordFeatures.append(feats)
        wordLabels.append(labels)
    return wordFeatures, wordLabels

def trainHMM(trainFeatures, trainLabels, tagsDict):
    tagProbs = computeTagProbs(trainLabels, tagsDict)
    startProbs = computeStartProbs(trainLabels, tagsDict)
    transMat = computeTransitionProbabilities(trainLabels, tagsDict)
    emissionMat, featuresDict = computeEmissionProbabilities(trainFeatures, trainLabels, tagsDict)
    return tagProbs, startProbs, transMat, emissionMat, featuresDict

In [7]:
def computeSeqAccuracy(perdictedTags, actualTags):
    total, correct = 0, 0
    
    for i in range(len(predictedTags)):
        for j in range(len(predictedTags[i])):
            total += 1
            if predictedTags[i][j] == actualTags[i][j]:
                correct += 1
                
    return float(correct)/total

Blok kode diatas digunakan untuk menghitung akurasi

In [8]:
from nltk.corpus import brown
import nltk
nltk.download('brown')

brown_tagged_sents = brown.tagged_sents(categories='news')

size = int(len(brown_tagged_sents) * 0.7)

tags = [tag for (word, tag) in brown.tagged_words()]
defaultTag = nltk.FreqDist(tags).max()

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


In [9]:
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

tagsDict = {}
for index, tag in enumerate(set(tags)):
    tagsDict[tag] = index
    
trainSeqFeatures, trainSeqLabels = transformDatasetSequence(train_sents)
testSeqFeatures, testSeqLabels = transformDatasetSequence(test_sents)

tagProbs, startProbs, transMat, emissionMat, featuresDict = \
trainHMM(trainSeqFeatures[:30000], trainSeqLabels[:30000], tagsDict)
predictedTags = predictTags(testSeqFeatures[:100], tagProbs,
                           startProbs, transMat,
                           emissionMat, tagsDict, featuresDict)
print(computeSeqAccuracy(predictedTags, \
                        [[tagsDict[tag] for tag in tags]\
                        for tags in testSeqLabels]))

0.09594594594594595


#### 2.2 Conditional Random Field

In [10]:
def trainCRF(trainFeatures, trainLabels):
    crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    )
    crf.fit(trainFeatures, trainLabels)
    return crf

In [11]:
crf_model = trainCRF(trainSeqFeatures[:5], trainSeqLabels[:5])
pred_labels = crf_model.predict(testSeqFeatures)
print(computeSeqAccuracy(pred_labels, testSeqLabels))

0.0


Definisi parameter dari algoritma ini adalah sebagai berikut:

-  'algorithm' mengacu pada teknik optimasi yang digunakan untuk meminimalkan log-linear loss function dan menghitung bobot fitur. 'L-BFGS' sering menjadi pilihan utama untuk sebagian besar model log-linear (dengan jumlah contoh pelatihan yang relatif kecil).
-  'c1' mengacu pada istilah konstan untuk istilah regularisasi L1.
- 'c2' mengacu pada istilah konstan untuk istilah regularisasi L2.
- 'max_iterations' mengacu pada jumlah iterasi dalam algoritma pengoptimalan. Jika algoritma tidak konvergen sebelum 'max_iterations', maka program berhenti mengoptimalkan lebih lanjut.
- 'all_possible_transitions' set to 'False' menyiratkan bahwa kita hanya mempelajari bobot untuk transisi yang ada dalam kalimat pelatihan, sedangkan jika diatur ke 'True' akan menghitung bobot untuk semua transisi yang mungkin serta untuk semua transisi yang tidak valid (transisi yang tidak diamati dalam data pelatihan), bobot akan negatif, tetapi bobot dari transisi yang valid akan menyesuaikan.