In [1]:
import pandas as pd
import csv
from math import log, inf

## Baca file corpus dan memisahkan data train dan data test

In [2]:
df = pd.read_csv('Indonesian_Manually_Tagged_Corpus_ID.tsv', sep='\t', header=None, names=['word','tag'], converters={'word': lambda x: x.lower()}, quoting=csv.QUOTE_NONE)

In [3]:
split_index1 = int(df[df['word'].str.contains('id=1001')].index[0])
split_index2 = int(df[df['word'].str.contains('id=1021')].index[0])

In [4]:
df['word'] = df['word'].str.replace(r'<kalimat id=.*>', '<s>')
df['word'] = df['word'].str.replace('</kalimat>', '</s>', regex=False)
df['word'] = df['word'].str.replace(r'(-?\d+.*)', 'NUM')

df.loc[df.word == '<s>', 'tag'] = '<s>'
df.loc[df.word == '</s>', 'tag'] = '</s>'

In [5]:
df_train, df_test = df[:split_index1].reset_index(drop=True), df[split_index1:split_index2].reset_index(drop=True)

## Buat bigram dari urutan tag pada setiap kalimat

In [6]:
def build_bigram(tokens):
    # Get all the unique tokens
    unique_tokens = set(tokens)

    # Initialize unigram and bigram dictionaries
    unigram = {}
    bigram = {}
    for word_row in unique_tokens:
        unigram[word_row] = 0
        bigram[word_row] = {}
        for word_column in unique_tokens:
            bigram[word_row][word_column] = 0

    # Build the bigram and unigram from word sequences in corpus tokens
    unigram[ tokens[0] ] += 1
    for i in range(1, len(tokens)):
        unigram[ tokens[i] ] += 1
        bigram[ tokens[i-1] ][ tokens[i] ] += 1

    # Laplace (add-one) smoothing
#     for word_row in bigram:
#         # Exclude sentence-end flag because it's always followed by sentence-start flag
#         if word_row == "</s>":
#             continue
#         # row <s> shouldn't has column <s> and </s> so it's decremented by 2
#         unigram[word_row] += len(unigram) if word_row != "<s>" else len(unigram) - 2
#         for word_column in bigram[word_row]:
#             bigram[word_row][word_column] += 1

    # Normalize the bigram with unigram
    for word_row in bigram:
        for word_column in bigram[word_row]:
            bigram[word_row][word_column] /= unigram[word_row]
    
    # Special case handling for flags
    bigram["<s>"]["</s>"] = 0.0
    bigram["<s>"]["<s>"] = 0.0
    bigram["</s>"]["<s>"] = 1.0
    
    return bigram

In [22]:
tokens = list(df_train['tag'])

In [8]:
tag_bigram = pd.DataFrame.from_dict(build_bigram(tokens), orient='index')
tag_bigram

Unnamed: 0,NND,UH,RP,<s>,CD,NNP,Z,DT,OD,X,...,SYM,SC,PRP,VB,RB,FW,PR,</s>,JJ,WH
</s>,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
<s>,0.004854,0.002913,0.0,0.0,0.025243,0.164078,0.025243,0.003883,0.0,0.0,...,0.0,0.062136,0.069903,0.029126,0.012621,0.002913,0.016505,0.0,0.003883,0.000971
CC,0.001233,0.0,0.0,0.0,0.065351,0.106042,0.046856,0.001233,0.001233,0.0,...,0.011097,0.019729,0.01233,0.106042,0.045623,0.023428,0.008631,0.0,0.043157,0.001233
CD,0.031619,0.0,0.0,0.0,0.303963,0.031619,0.182968,0.0,0.000422,0.0,...,0.006324,0.019393,0.002108,0.06914,0.006324,0.004216,0.005059,0.001265,0.014334,0.0
DT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.0,0.0,0.0,0.15,0.0,0.05,0.0,0.0,0.0,0.0
FW,0.0,0.0,0.0,0.0,0.011876,0.061758,0.137767,0.0,0.0,0.0,...,0.011876,0.038005,0.009501,0.052257,0.007126,0.372922,0.023753,0.0,0.035629,0.0
IN,0.017234,0.0,0.0,0.0,0.105612,0.194432,0.006628,0.001326,0.001768,0.001326,...,0.015024,0.003977,0.018118,0.003535,0.006628,0.011931,0.016792,0.0,0.019443,0.0
JJ,0.0,0.0,0.000929,0.0,0.105948,0.037175,0.189591,0.0,0.002788,0.0,...,0.049257,0.083643,0.054833,0.068773,0.019517,0.011152,0.020446,0.003717,0.026952,0.000929
MD,0.0,0.0,0.0,0.0,0.003745,0.0,0.005618,0.0,0.0,0.0,...,0.0,0.0,0.005618,0.764045,0.086142,0.0,0.0,0.0,0.052434,0.0
NEG,0.0,0.0,0.031496,0.0,0.023622,0.0,0.023622,0.0,0.0,0.0,...,0.0,0.0,0.0,0.370079,0.125984,0.0,0.0,0.0,0.149606,0.0


## Tabel emisi tag-kata

In [9]:
tabel = df_train.groupby(['word', 'tag']).size().unstack(fill_value=0)
tabel

tag,</s>,<s>,CC,CD,DT,FW,IN,JJ,MD,NEG,...,PRP,RB,RP,SC,SYM,UH,VB,WH,X,Z
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,70
$,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
%,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,11,0,0,0,0,0
&,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,77
(,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
+,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,0
",",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1425
-,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,104


In [10]:
tabel1 = tabel.astype('float')
for i, word in tabel1.iterrows():
    for tag in word.index:
        if (word[tag] > 0):
            word[tag] = float(word[tag]) / tabel1.loc[:, tag].sum()
tabel1

tag,</s>,<s>,CC,CD,DT,FW,IN,JJ,MD,NEG,...,PRP,RB,RP,SC,SYM,UH,VB,WH,X,Z
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""",0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.025427
$,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.009063,0.000000,0.0,0.0,0.000000,0.000000
%,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.033536,0.000000,0.0,0.0,0.000000,0.000000
&,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.001118
',0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.028731
(,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.001921
),0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.001925
+,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.009462,0.000000,0.0,0.0,0.000000,0.000000
",",0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.549544
-,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.088995


## Baseline model

In [11]:
df_test

Unnamed: 0,word,tag
0,<s>,<s>
1,perusahaan,NN
2,ritel,NN
3,pt,NNP
4,matahari,NNP
5,putra,NNP
6,prima,NNP
7,tbk,NNP
8,mppa,NNP
9,dalam,IN


In [12]:
bener = 0
salah = 0
not_exist = 0
all = 0
for i, row in df_test.iterrows():
#     print(row['word'])
    if (row['word'] in tabel.index):
        if (row['tag'] == tabel.loc[row['word']].idxmax()):
            bener += 1
        else :
            salah += 1
    else :
        not_exist += 1
    all += 1
print('bener :',bener,', salah :',salah,', ga ada :', not_exist,', semua :', all)

bener : 497 , salah : 16 , ga ada : 26 , semua : 539


In [13]:
baseline_acc_loss = bener / (all) * 100
print(baseline_acc_loss, '%')
baseline_acc = bener / (all - not_exist) * 100
print(baseline_acc, '%')

92.20779220779221 %
96.88109161793372 %


In [23]:
#for i, row in df_test.iterrows():
    #print(row['word'] in tabel.index, row['word'], row['tag'])

<h2>HMM Model</h2>

In [15]:
def viterbi(words, tags, trans_p, emit_p):
    V = [{}]
    # Buat kata/tag pertama di awal kalimat
    for tag in tags:
        word_p = emit_p.loc[words[0], tag]
        # Simpan hanya yang probabilitasnya lebih dari 0 untuk kata pertama ber-tag 'tag'
        if word_p > 0:
            tr = trans_p.loc['<s>', tag]
            V[0][tag] = {"prob": (log(tr) if tr > 0 else -inf) + log(word_p), 'prev': None}
    # Kata/tag selanjutnya
    for i in range(1, len(words)):
        V.append({})
        word_known = words[i] in tabel1.index
        valid_tags = list(filter(lambda x: emit_p.loc[words[i], x] > 0, tags)) if word_known else ['X']
        for tag in valid_tags:
            max_tr_prob = -inf
            prev_tag_selected = None
            for prev_tag in V[i-1]:
                tr = trans_p.loc[prev_tag, tag]
                tr_prob = V[i-1][prev_tag]['prob'] + (log(tr) if tr > 0 else -inf)
                if (prev_tag_selected == None or tr_prob > max_tr_prob):
                    max_tr_prob = tr_prob
                    prev_tag_selected = prev_tag
            max_prob = max_tr_prob + log(emit_p.loc[words[i], tag]) if word_known else max_tr_prob
            #if (max_prob == -inf):
             #   prev_tag_selected = tabel.loc[words[i-1]].idxmax() if words[i-1] in tabel1.index else 'X'
            V[i][tag] = {'prob': max_prob, 'prev': prev_tag_selected}
    best_tags = []
    
    # Cari end-point dengan probabilitas maksimal
    max_end_prob = max(x['prob'] for x in V[-1].values())
    #print('end', max_end_prob)
    prev_tag = None
    
    for tag, data in V[-1].items():
        if data['prob'] == max_end_prob:
            best_tags.append(tag)
            prev_tag = tag
            #print(tag)
            break
            
    # Backtrack dari end-point ke awal
    for i in range(len(V)-2, -1, -1):
        #print(V[i+1])
        best_tags.insert(0, V[i+1][prev_tag]['prev'])
        prev_tag = V[i+1][prev_tag]['prev']
    return best_tags

In [16]:
words = []
original_tag = []
hmm_tag = []
knowns = []
for i, row in df_test.iterrows():
    words.append(row['word'])
    knowns.append(row['word'] in tabel.index)
    original_tag.append(row['tag'])
    if (row['tag'] == '</s>'):
        hmm_tag.extend(viterbi(words, tokens, tag_bigram, tabel1))
        words = []
hmm_result = pd.DataFrame(list(zip(list(df_test['word']), knowns, original_tag, hmm_tag)), columns=['word', 'known_word', 'original_tag', 'hmm_tag'])
hmm_result

Unnamed: 0,word,known_word,original_tag,hmm_tag
0,<s>,True,<s>,<s>
1,perusahaan,True,NN,NN
2,ritel,True,NN,NN
3,pt,True,NNP,NNP
4,matahari,True,NNP,NNP
5,putra,True,NNP,NN
6,prima,True,NNP,NNP
7,tbk,True,NNP,NNP
8,mppa,False,NNP,X
9,dalam,True,IN,NN


In [18]:
total = len(hmm_result.index)
correct = 0
incorrect = 0
known = 0
for i, tag in enumerate(hmm_tag):
    if knowns[i]:
        known += 1
        if hmm_tag[i] == original_tag[i]:
            correct += 1
        else:
            incorrect += 1
hmm_acc_loss = correct / total * 100
hmm_acc = correct / known * 100

print('bener :',correct,', salah :',incorrect,', ga ada :', total-known,', semua :', total)
print(hmm_acc_loss, '%')
print(hmm_acc, '%')

bener : 436 , salah : 77 , ga ada : 26 , semua : 539
80.89053803339517 %
84.99025341130604 %


## Perbandingan Akurasi

### Baseline

In [20]:
print('Include not found: ', baseline_acc_loss, '%')
print('Found only: ', baseline_acc, '%')

Include not found:  92.20779220779221 %
Found only:  96.88109161793372 %


### HMM

In [21]:
print('Include not found: ', hmm_acc_loss, '%')
print('Found only: ', hmm_acc, '%')

Include not found:  80.89053803339517 %
Found only:  84.99025341130604 %
