# Word Sense Disambiguation
Untuk tugas akhir mata kuliah NLP Semester Genap 2018/2019

## Model-1

Spesifikasi model yang dibangun pada _notebook_ ini adalah sebagai berikut :

- Data _trainset_ yang digunakan adalah `triple_annotator_agree.csv` dan `single_annotator.csv`.
- _Preprocessing_ yang dilakukan pada kalimat adalah normalisasi kalimat dan _remove stopwords_.
- _Feature_ yang digunakan adalah _Bag-Of-Words_.
- Algoritma model yang digunakan adalah _Multinomial Naive Bayes_.

In [1]:
import pandas as pd
import numpy as np
import nltk
import Sastrawi
import re

import warnings
warnings.filterwarnings("ignore")

## Ensuring reproducibility

In [2]:
CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)

## Read Dataset

### • Read csv file

In [19]:
import os
path = '../dataset/training_set/'
filenames = os.listdir(path)
filenames

['double_annotator_agree.csv',
 'double_annotator_disagree.csv',
 'single_annotator.csv',
 'triple_annotator_agree.csv',
 'triple_annotator_disagree.csv']

In [20]:
dfs = []

for filename in filenames:
    df = pd.read_csv(path + filename)
    dfs.append(df)
dfs[3].head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,1000034,mengandung,2301,"Di Jepang, manga biasanya serial di majalah ma..."
1,1000129,mengandung,2301,"Surah ini dinamai Maryam, karena surat ini men..."
2,1000476,mengandung,2301,"Rebusan seperti Waterzooi atau Hachee, rebusan..."
3,1000486,mengandung,2301,Permukaan daun mengandung lapisan lilin sehing...
4,1000511,mengandung,2301,"Pemicu yang paling umum antara lain alergen, r..."


### • Choose DataFrame

In [21]:
dfs[3].shape

(783, 4)

In [22]:
# Select dataframe from triple_annotator_agree.csv and single_annotator.csv
annotator_agree = pd.DataFrame()

for i,df in enumerate(dfs):
    if i == 0 or i == 3:
        annotator_agree = pd.concat([annotator_agree, df], ignore_index=True)

annotator_agree.shape

(1439, 4)

In [23]:
annotator_agree.kata.unique().shape

(54,)

In [24]:
df = annotator_agree[annotator_agree.kata == 'mengandung']
df.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,1000527,mengandung,2301,"Secara anatomi, hidung adalah penonjolan pada ..."
1,1000651,mengandung,2301,"Mandi air yang mengandung belerang, untuk peng..."
2,1000770,mengandung,2301,"Dengan terikatnya klathrin, membrane sel membe..."
646,994570,mengandung,2301,Karena mengandung PABA (Para Aminobenzoic Acid...
647,994574,mengandung,2301,Kopi robusta dapat dikatakan sebagai kopi kela...


## Datasets preprocessing for supervised learning

### • Remove Stopwords

In [25]:
def remove_stopwords(text):
    stopwords = pd.read_csv('../dataset/stopwords.csv')
    special_list = []
    token = nltk.word_tokenize(text)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    
    str_clean = ' '.join(token_afterremoval)
    return str_clean

### • Normalize

In [26]:
def normalize(text):
    normal_txt = text.lower()
    normal_txt = re.sub('\s+', ' ', normal_txt)
    normal_txt = normal_txt.strip()
    normal_txt = re.sub(r'[^\w\s]', '', normal_txt)
    
    normal_regex = re.compile(r'(.)\1{1,}', re.IGNORECASE)
    normal_txt = normal_regex.sub(r'\1\1', normal_txt)
    return normal_txt

### • Preprocessing

In [27]:
def preprocessing(texts):
    text_clean = []
    for txt in texts:
        normal_txt = normalize(txt)
        nosw_txt = remove_stopwords(normal_txt)
        text_clean.append(nosw_txt)
    return text_clean

In [28]:
# Get clean texts
raw_text = annotator_agree['kalimat']
label = annotator_agree['sense'].tolist()

clean_text = preprocessing(raw_text)
clean_text[:3]

['secara anatomi hidung adalah penonjolan pada vertebrata yang mengandung nostril yang menyaring udara untuk pernapasan',
 'mandi air yang mengandung belerang untuk pengobatan penyakit kulit',
 'dengan terikatnya klathrin membrane sel membentuk vesikel yang mengandung molekul ligan']

## Feature Selection

In [29]:
# Import library
from sklearn.base import BaseEstimator, TransformerMixin

In [31]:
class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]

In [None]:
class WordSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self)

## Feature Extraction

In [39]:
# Import library
from sklearn.feature_extraction.text import CountVectorizer

### 1. Word ID

In [36]:
def extract_word_id(sense_id):
    assert len(sense_id) == 4
    return sense_id[:2]

In [37]:
# Get wid
sense_id = annotator_agree.sense
wid = np.array(list(map(extract_word_id, sense_id)))
print(wid[:10])

['23' '23' '23' '28' '28' '28' '28' '28' '28' '28']


### 2. Bag-Of-Words

In [40]:
def extract_bag_of_words(text):
    unigram = CountVectorizer(ngram_range=(1,1))
    unigram_matrix = unigram.fit_transform(np.array(text)).todense()
    feat_name = unigram.get_feature_names()
    return unigram_matrix, feat_name

unigram_feat, feat_name = extract_bag_of_words(clean_text)
print(unigram_feat[:3])
print(feat_name[:10])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['006', '010', '04', '075', '0800', '08kepyappti1986', '10', '100', '100500', '1020']


## Supervised Learning (Classification)

In [41]:
# Import library
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [43]:
classifier = Pipeline([
    ('features', FeatureUnion([
        ('words', Pipeline([
            
        ])),
        ('bow', Pipeline([
            
        ]))
    ])),
    ('clf', GaussianNB()),
    ('clf', MultinomialNB()),
])

ValueError: not enough values to unpack (expected 2, got 0)

## Save the model