# Word Sense Disambiguation
Untuk tugas akhir mata kuliah NLP Semester Genap 2018/2019

## Model-3

Spesifikasi model yang dibangun pada _notebook_ ini adalah sebagai berikut :

- Data _trainset_ yang digunakan adalah `double_annotator_agree.csv` dan `triple_annotator_agree.csv`.
- _Preprocessing_ yang dilakukan pada kalimat adalah ...
- _Feature_ yang digunakan adalah ...
- Algoritma model yang digunakan adalah ...

In [1]:
import pandas as pd
import numpy as np
import nltk
import Sastrawi
import re

import warnings
warnings.filterwarnings("ignore")

## Ensuring reproducibility

In [2]:
# Fixed seed for NumPy's global random number generator so that repeated
# runs of the notebook draw the same random numbers.
CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)

## Read Dataset

### • Read csv file

In [9]:
import os

# Directory that holds the training CSV files.
path = '../dataset/training_set/'
# os.listdir() returns entries in arbitrary order. Later cells pick files by
# their position in this list, so sort it to make those positions stable.
filenames = sorted(os.listdir(path))
filenames

['double_annotator_agree.csv',
 'double_annotator_disagree.csv',
 'single_annotator.csv',
 'triple_annotator_agree.csv',
 'triple_annotator_disagree.csv']

In [10]:
# One DataFrame per training CSV, kept in the same order as `filenames`.
training_set = [pd.read_csv(path + filename) for filename in filenames]
training_set[3].head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,1000034,mengandung,2301,"Di Jepang, manga biasanya serial di majalah ma..."
1,1000129,mengandung,2301,"Surah ini dinamai Maryam, karena surat ini men..."
2,1000476,mengandung,2301,"Rebusan seperti Waterzooi atau Hachee, rebusan..."
3,1000486,mengandung,2301,Permukaan daun mengandung lapisan lilin sehing...
4,1000511,mengandung,2301,"Pemicu yang paling umum antara lain alergen, r..."


In [11]:
# Directory that holds the testing CSV files.
path = '../dataset/testing_set/'
# os.listdir() returns entries in arbitrary order; sort so the list (and any
# positional indexing into it) is the same on every machine.
filenames = sorted(os.listdir(path))
filenames

['testing_data.csv']

In [12]:
# One DataFrame per testing CSV, kept in the same order as `filenames`.
testing_set = [pd.read_csv(path + filename) for filename in filenames]
testing_set[0].head()

Unnamed: 0,id,word,kalimat
0,13,asing,"Para pecinta film indonesia atau tv, pasti tak..."
1,19,asing,Pasti telinga kita merasa asing dan aneh mende...
2,41,asing,Warga negara asing atau warga negara Persemakm...
3,44,asing,"Selama lima belas tahun memerintah, Sultan Mah..."
4,121,asing,Yang kemudian diikuti dengan donat-donat waral...


### • Choose training set

In [13]:
# Select the dataframes at positions 0 and 3 of `training_set`. With the
# training files listed alphabetically these are double_annotator_agree.csv
# and triple_annotator_agree.csv.
# NOTE(review): the notebook's model description mentions single_annotator.csv
# (position 2) instead - confirm which pair is intended.
selected_positions = (0, 3)

# A single concat avoids growing the result from an empty DataFrame.
annotator_agree = pd.concat(
    [training_set[i] for i in selected_positions],
    ignore_index=True,
)

annotator_agree.shape

(1439, 4)

In [14]:
# Number of distinct target words in the selected training data.
annotator_agree['kata'].unique().shape

(54,)

In [23]:
# Keep only the rows whose target word is 'bisa'.
df = annotator_agree.loc[annotator_agree['kata'] == 'bisa']
df.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
227,236872,bisa,101,Penelitian pada tahun 2003 melaporkan bahwa mo...
228,237114,bisa,101,Pramana ingin menceraikan Anita untuk bisa mer...
229,240206,bisa,101,"Ia menemukan bahwa dalam unsur tertentu, nukle..."
230,242752,bisa,101,Maleficent sudah berusaha mencabut kutukannya ...
231,243816,bisa,101,"Walaupun berasal dari keluarga sederhana, Inda..."


In [26]:
# Distinct sense labels present in the filtered rows.
df['sense'].unique()

array(['0101'], dtype=object)

## Datasets preprocessing for supervised learning

### • Remove Stopwords

In [16]:
def remove_stopwords(text, stopwords=None):
    """ Remove stopwords from a text (bahasa).
        :param text: the text 
        :type text: string
        :param stopwords: optional iterable of stopword strings; when omitted
            the words are loaded from ``../dataset/stopwords.csv``
        :type stopwords: iterable of string or None
        :return: a stopwords removed text
        :rtype: string
    """
    if stopwords is None:
        # NOTE(review): assumes the CSV has no header row and holds one
        # stopword per cell - confirm against the actual file. If it does have
        # a header, the header text is simply treated as one more stopword.
        stopwords_df = pd.read_csv('../dataset/stopwords.csv', header=None, dtype=str)
        stopwords = stopwords_df.values.ravel()

    # Membership must be tested against the words themselves: `x in DataFrame`
    # only checks the column labels, so no stopword would ever be removed.
    stopword_set = set(stopwords)
    # Extra tokens to drop in addition to the stopwords (currently none).
    special_list = []

    tokens = nltk.word_tokenize(text)
    kept_tokens = [
        tok for tok in tokens
        if tok not in stopword_set and tok not in special_list
    ]
    return ' '.join(kept_tokens)

### • Normalize

In [17]:
def normalize(text):
    """ Normalize a text: lowercase it, remove punctuation, squeeze runs of a
        repeated character down to two and collapse whitespace.
        :param text: the text
        :type text: string
        :return: the normalized text, without leading/trailing whitespace
        :rtype: string
    """
    normal_txt = text.lower()
    # Drop every character that is neither a word character nor whitespace.
    normal_txt = re.sub(r'[^\w\s]', '', normal_txt)

    # Squeeze any run of the same character to exactly two occurrences,
    # e.g. "baaaagus" -> "baagus".
    normal_regex = re.compile(r'(.)\1{1,}', re.IGNORECASE)
    normal_txt = normal_regex.sub(r'\1\1', normal_txt)

    # Collapse and trim whitespace last: removing punctuation can leave double
    # or trailing spaces behind (e.g. "halo , dunia !").
    normal_txt = re.sub(r'\s+', ' ', normal_txt)
    return normal_txt.strip()

### • Preprocessing

In [57]:
def preprocessing(texts):
    """ Normalize every text of a collection.
        :param texts: the raw texts
        :type texts: iterable of string
        :return: the normalized texts, in the same order as the input
        :rtype: list
    """
    # Stopword removal (remove_stopwords) is not applied at this stage.
    return [normalize(raw) for raw in texts]

In [58]:
# Get clean texts
# `raw_text` is the 'kalimat' column (the sentences) and `label` is the
# 'sense' column converted to a plain Python list.
raw_text = annotator_agree['kalimat']
label = annotator_agree['sense'].tolist()

# Run every sentence through preprocessing() and preview the first three.
clean_text = preprocessing(raw_text)
clean_text[:3]

['secara anatomi hidung adalah penonjolan pada vertebrata yang mengandung nostril yang menyaring udara untuk pernapasan',
 'mandi air yang mengandung belerang untuk pengobatan penyakit kulit',
 'dengan terikatnya klathrin membrane sel membentuk vesikel yang mengandung molekul ligan']

In [59]:
# Store the preprocessed sentences next to the raw ones as a new column.
annotator_agree['clean_text'] = clean_text
annotator_agree.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,clean_text
0,1000527,mengandung,2301,"Secara anatomi, hidung adalah penonjolan pada ...",secara anatomi hidung adalah penonjolan pada v...
1,1000651,mengandung,2301,"Mandi air yang mengandung belerang, untuk peng...",mandi air yang mengandung belerang untuk pengo...
2,1000770,mengandung,2301,"Dengan terikatnya klathrin, membrane sel membe...",dengan terikatnya klathrin membrane sel memben...
3,1001199,mengejar,2802,"Dalam menapaki karier di gedung parlemen, Toto...",dalam menapaki karier di gedung parlemen totok...
4,1001330,mengejar,2802,Ini juga memberi EMC pengalaman dan wawasan ma...,ini juga memberi emc pengalaman dan wawasan ma...


In [60]:
# Columns used as model input: the target word and its preprocessed sentence.
feature_cols = ['kata', 'clean_text']
# Preview the first row restricted to those columns.
annotator_agree[feature_cols].iloc[0]

kata                                                 mengandung
clean_text    secara anatomi hidung adalah penonjolan pada v...
Name: 0, dtype: object

## Feature engineering

In [20]:
def add_basic_features(sentence_terms, index):
    """ Compute some very basic word features.
        :param sentence_terms: [w1, w2, ...] 
        :type sentence_terms: list
        :param index: the index of the word (expected to be non-negative)
        :type index: int
        :return: dict containing features of the word at ``index``: the word
            itself, its neighbours within a window of two, position flags,
            casing flags and prefixes/suffixes of up to three characters
        :rtype: dict
    """
    term = sentence_terms[index]
    sentence_len = len(sentence_terms)
    last_index = sentence_len - 1

    def term_at(position):
        """ Return the term at ``position`` or '' when outside the sentence. """
        if 0 <= position < sentence_len:
            return sentence_terms[position]
        return ''

    # Slices instead of term[0] / term[-1] so an empty term does not raise.
    first_char = term[:1]

    return {
        'term': term,
        'word-2': term_at(index - 2),
        'word-1': term_at(index - 1),
        'word+1': term_at(index + 1),
        'word+2': term_at(index + 2),
        'is_first': index == 0,
        'is_last': index == last_index,
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': term.upper() == term,
        'is_all_lower': term.lower() == term,
        'prefix-1': first_char,
        'prefix-2': term[:2],
        'prefix-3': term[:3],
        'suffix-1': term[-1:],
        'suffix-2': term[-2:],
        'suffix-3': term[-3:],
        # Same values as 'word-1' / 'word+1'; both key names are kept so the
        # returned feature set stays unchanged.
        'prev_word': term_at(index - 1),
        'next_word': term_at(index + 1),
    }

In [21]:
def transform_to_dataset(tagged_sentences):
    """
    Split tagged sentences to X and y datasets and append some basic features.
    :param tagged_sentences: a list of tagged sentences
    :type tagged_sentences: list of list of tuples (term_i, tag_i)
    :return: a tuple (X, y) where X is a list with one feature dict per term
        and y is the list of the corresponding tags, in the same order
    :rtype: tuple
    """
    X, y = [], []
    for pos_tags in tagged_sentences:
        # Strip the tags once per sentence rather than once per term. This is
        # what nltk's untag() does, written inline because no import of
        # `untag` is visible in this notebook.
        sentence_terms = [term for term, _ in pos_tags]
        for index, (_, class_) in enumerate(pos_tags):
            # Add basic NLP features for each sentence term
            X.append(add_basic_features(sentence_terms, index))
            y.append(class_)
    return X, y