# Word Sense Disambiguation
Untuk tugas akhir mata kuliah NLP Semester Genap 2018/2019

## Model-1

Spesifikasi model yang dibangun pada _notebook_ ini adalah sebagai berikut :

- Data _trainset_ yang digunakan adalah `triple_annotator_agree.csv` dan `single_annotator.csv`.
- _Preprocessing_ yang dilakukan pada kalimat adalah normalisasi kalimat dan _remove stopwords_.
- _Feature_ yang digunakan adalah _Bag-Of-Words_.
- Algoritma model yang digunakan adalah ...

In [1]:
import pandas as pd
import numpy as np
import nltk
import Sastrawi
import re

from nltk.tag import CRFTagger
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

## Ensuring reproducibility

In [2]:
CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)

## Read Dataset

### • Read csv file

In [3]:
import os
path = '../dataset/training_set/'
filenames = os.listdir(path)
filenames

['double_annotator_agree.csv',
 'double_annotator_disagree.csv',
 'single_annotator.csv',
 'triple_annotator_agree.csv',
 'triple_annotator_disagree.csv']

In [19]:
dfs = []

for filename in filenames:
    df = pd.read_csv(path + filename)
    dfs.append(df)
dfs[3].head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,1000034,mengandung,2301,"Di Jepang, manga biasanya serial di majalah ma..."
1,1000129,mengandung,2301,"Surah ini dinamai Maryam, karena surat ini men..."
2,1000476,mengandung,2301,"Rebusan seperti Waterzooi atau Hachee, rebusan..."
3,1000486,mengandung,2301,Permukaan daun mengandung lapisan lilin sehing...
4,1000511,mengandung,2301,"Pemicu yang paling umum antara lain alergen, r..."


### • Choose DataFrame

In [20]:
dfs[3].shape

(783, 4)

In [26]:
# 
annotator_agree = pd.DataFrame()

for i,df in enumerate(dfs):
    if i == 0 or i == 3:
        annotator_agree = pd.concat([annotator_agree, df], ignore_index=True)
#     if df.shape[1] == 4:
#         annotator_agree = pd.concat([annotator_agree, df], ignore_index=True)

annotator_agree.shape

(1439, 4)

In [28]:
annotator_agree.kata.unique()

array(['mengandung', 'mengejar', 'mengeluarkan', 'mengikat', 'mengisi',
       'menjaga', 'menurunkan', 'menyusun', 'nilai', 'atas', 'baru',
       'panas', 'pembagian', 'rapat', 'sarung', 'tengah', 'tinggi',
       'berat', 'besar', 'asing', 'bidang', 'bintang', 'bisa', 'buah',
       'bulan', 'bunga', 'cabang', 'cerah', 'coklat', 'dalam', 'badan',
       'dasar', 'dunia', 'halaman', 'harapan', 'jalan', 'jam', 'jaringan',
       'kabur', 'kaki', 'kali', 'kepala', 'ketat', 'kulit', 'kunci',
       'layar', 'lebat', 'lingkungan', 'mata', 'membawa', 'memecahkan',
       'menangkap', 'mendorong', 'menerima'], dtype=object)

In [8]:
df = annotator_agree[annotator_agree.kata == 'mengandung']
df.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,1000527,mengandung,2301,"Secara anatomi, hidung adalah penonjolan pada ..."
1,1000651,mengandung,2301,"Mandi air yang mengandung belerang, untuk peng..."
2,1000770,mengandung,2301,"Dengan terikatnya klathrin, membrane sel membe..."
646,994570,mengandung,2301,Karena mengandung PABA (Para Aminobenzoic Acid...
647,994574,mengandung,2301,Kopi robusta dapat dikatakan sebagai kopi kela...


## Datasets preprocessing for supervised learning

### • Remove Stopwords

In [9]:
def remove_stopwords(text):
    stopwords = pd.read_csv('../dataset/stopwords.csv')
    special_list = []
    token = nltk.word_tokenize(text)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    
    str_clean = ' '.join(token_afterremoval)
    return str_clean

### • Normalize

In [11]:
def normalize(text):
    normal_txt = text.lower()
    normal_txt = re.sub('\s+', ' ', normal_txt)
    normal_txt = normal_txt.strip()
    normal_txt = re.sub(r'[^\w\s]', '', normal_txt)
    
    normal_regex = re.compile(r'(.)\1{1,}', re.IGNORECASE)
    normal_txt = normal_regex.sub(r'\1\1', normal_txt)
    return normal_txt

### • Preprocessing

In [12]:
def preprocessing(texts):
    text_clean = []
    for txt in texts:
        normal_txt = normalize(txt)
        nosw_txt = remove_stopwords(normal_txt)
        text_clean.append(nosw_txt)
    return text_clean

raw_text = annotator_agree['kalimat']
label = annotator_agree['sense'].tolist()

clean_text = preprocessing(raw_text)
clean_text[:3]

['secara anatomi hidung adalah penonjolan pada vertebrata yang mengandung nostril yang menyaring udara untuk pernapasan',
 'mandi air yang mengandung belerang untuk pengobatan penyakit kulit',
 'dengan terikatnya klathrin membrane sel membentuk vesikel yang mengandung molekul ligan']

## Feature Extraction

In [13]:
# Import library
from sklearn.feature_extraction.text import CountVectorizer

### 1. Bag-Of-Words

In [14]:
def extract_bag_of_words(text):
    unigram = CountVectorizer(ngram_range=(1,1))
    unigram_matrix = unigram.fit_transform(np.array(text)).todense()
    feat_name = unigram.get_feature_names()
    return unigram_matrix, feat_name

unigram_feat, feat_name = extract_bag_of_words(clean_text)
print(unigram_feat[:3])
print(feat_name[:10])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['00', '001', '0010', '0036', '00432', '0045', '005', '006', '00695', '01']


## Supervised Learning (Classification)

In [15]:
# Import library
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [16]:
# classifiers and hyperparameters
classifiers = [
    LogisticRegression(solver='liblinear'),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
]

hyperparameters = [
    {'penalty': ['l1', 'l2'], 'C': np.logspace(-3,3,7)},
    {'max_features': ['auto', 'sqrt', 'log2'], 'criterion' : ['gini', 'entropy']},
    {'n_estimators': [2, 4], 'max_features': ['auto', 'sqrt', 'log2'], 'criterion' : ['gini', 'entropy']},
    {'n_estimators': [2, 4], 'base_estimator': [LogisticRegressionCV(cv=5), DecisionTreeClassifier(), GaussianNB()]},
    {}
]

In [17]:
#feat_list = [unigram_feat, sentlex_feat, postag_feat, orto_feat]
#feat_name = ["Unigram", "Sentimen", "POS", "Ortografi"]

X_train = unigram_feat
y_train = label

In [18]:
classf = SVC(kernel='rbf')
scoring = ['accuracy', 'f1_macro']
scores = cross_validate(classf, X_train, y_train, cv=5, scoring=scoring, verbose=1)
acc = np.mean(scores['test_accuracy'])
f1 = np.mean(scores['test_f1_macro'])
print("Akurasi :", acc)
print("F1-Score :", f1)
print("---------------")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [45]:
for i,classf in enumerate(classifiers):
    print("# Tuning hyperparam for " + classf.__class__.__name__)
    cv = GridSearchCV(classf, hyperparameters[i], cv=5, verbose=1)
    cv.fit(X_train, y_train)
    
    print("# View best hyperparam")
    print(cv.best_params_)
    print("")
    print("# View Score")
    y_pred = cv.best_estimator_.predict(X_test)
    print("score : ", accuracy_score(y_test, y_pred))
    print("")
    print("classification report : ")
    print(classification_report(y_test, y_pred))
    print("")
    print("confusion matrix : ")
    print(pd.crosstab(y_test, y_pred, rownames=["Actual Label"], colnames=["Predicted Label"]))
    print("")

# Tuning hyperparam for LogisticRegression
Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 