In [1]:
import pandas as pd
import numpy as np
import nltk
import Sastrawi
import re

from nltk.tag import CRFTagger
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [2]:
CUSTOM_SEED = 42
np.random.seed(CUSTOM_SEED)

In [3]:
import os
path = '../dataset/training_set/'
filenames = os.listdir(path)
filenames

['double_annotator_agree.csv',
 'double_annotator_disagree.csv',
 'single_annotator.csv',
 'triple_annotator_agree.csv',
 'triple_annotator_disagree.csv']

In [4]:
dfs = []

for filename in filenames:
    df = pd.read_csv(path + filename)
    dfs.append(df)
dfs[3].head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,1000034,mengandung,2301,"Di Jepang, manga biasanya serial di majalah ma..."
1,1000129,mengandung,2301,"Surah ini dinamai Maryam, karena surat ini men..."
2,1000476,mengandung,2301,"Rebusan seperti Waterzooi atau Hachee, rebusan..."
3,1000486,mengandung,2301,Permukaan daun mengandung lapisan lilin sehing...
4,1000511,mengandung,2301,"Pemicu yang paling umum antara lain alergen, r..."


In [5]:
dfs[3].shape

(783, 4)

In [6]:
# 
annotator_agree = pd.DataFrame()

for i,df in enumerate(dfs):
    if i == 0 or i == 3:
        annotator_agree = pd.concat([annotator_agree, df], ignore_index=True)
#     if df.shape[1] == 4:
#         annotator_agree = pd.concat([annotator_agree, df], ignore_index=True)

annotator_agree.shape

(1439, 4)

In [7]:
annotator_agree.kata.unique()

array(['mengandung', 'mengejar', 'mengeluarkan', 'mengikat', 'mengisi',
       'menjaga', 'menurunkan', 'menyusun', 'nilai', 'atas', 'baru',
       'panas', 'pembagian', 'rapat', 'sarung', 'tengah', 'tinggi',
       'berat', 'besar', 'asing', 'bidang', 'bintang', 'bisa', 'buah',
       'bulan', 'bunga', 'cabang', 'cerah', 'coklat', 'dalam', 'badan',
       'dasar', 'dunia', 'halaman', 'harapan', 'jalan', 'jam', 'jaringan',
       'kabur', 'kaki', 'kali', 'kepala', 'ketat', 'kulit', 'kunci',
       'layar', 'lebat', 'lingkungan', 'mata', 'membawa', 'memecahkan',
       'menangkap', 'mendorong', 'menerima'], dtype=object)

In [8]:
df = annotator_agree[annotator_agree.kata == 'mengandung']
df.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,1000527,mengandung,2301,"Secara anatomi, hidung adalah penonjolan pada ..."
1,1000651,mengandung,2301,"Mandi air yang mengandung belerang, untuk peng..."
2,1000770,mengandung,2301,"Dengan terikatnya klathrin, membrane sel membe..."
646,994570,mengandung,2301,Karena mengandung PABA (Para Aminobenzoic Acid...
647,994574,mengandung,2301,Kopi robusta dapat dikatakan sebagai kopi kela...


In [9]:
def remove_stopwords(text):
    stopwords = pd.read_csv('../dataset/stopwords.csv')
    special_list = []
    token = nltk.word_tokenize(text)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    
    str_clean = ' '.join(token_afterremoval)
    return str_clean

In [10]:
def normalize(text):
    normal_txt = text.lower()
    normal_txt = re.sub('\s+', ' ', normal_txt)
    normal_txt = normal_txt.strip()
    normal_txt = re.sub(r'[^\w\s]', '', normal_txt)
    
    normal_regex = re.compile(r'(.)\1{1,}', re.IGNORECASE)
    normal_txt = normal_regex.sub(r'\1\1', normal_txt)
    return normal_txt

In [11]:
def preprocessing(texts):
    text_clean = []
    for txt in texts:
        normal_txt = normalize(txt)
        nosw_txt = remove_stopwords(normal_txt)
        text_clean.append(nosw_txt)
    return text_clean

raw_text = annotator_agree['kalimat']
label = annotator_agree['sense'].tolist()

clean_text = preprocessing(raw_text)
clean_text[:3]

['secara anatomi hidung adalah penonjolan pada vertebrata yang mengandung nostril yang menyaring udara untuk pernapasan',
 'mandi air yang mengandung belerang untuk pengobatan penyakit kulit',
 'dengan terikatnya klathrin membrane sel membentuk vesikel yang mengandung molekul ligan']

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Import library
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# def extract_bag_of_words(text):
#     unigram = CountVectorizer(ngram_range=(1,1))
#     unigram_matrix = unigram.fit_transform(np.array(text)).todense()
#     feat_name = unigram.get_feature_names()
#     return unigram_matrix, feat_name

# unigram_feat, feat_name = extract_bag_of_words(clean_text)
# print(unigram_feat[:3])
# print(feat_name[:10])
def EkstraksiPOS(list_tweet):
    ct = CRFTagger()
    ct.set_model_file("all_indo_man_tag_corpus_model.crf.tagger")
    pos_feat_list = []
    count_tag = []
    for tweet in list_tweet:
        token = nltk.word_tokenize(tweet)
        tag = ct.tag_sents([token])
        flat_tag = [item for sublist in tag for item in sublist]
        pos_count = Counter([j for i,j in flat_tag])
        #CC,CD, DT,EX,IN,JJ,JJR,JJS,LS,MD,NEG,NN,NNP,NNS,PDT,POS,PRP,RB,RBR,RBS,RP,TO,UH,VB,VBD,VBG,VBN,VBP,VBZ,WDT,WP,WRB
        pos_feat = (pos_count['CC'], pos_count['CD'],pos_count['DT'],pos_count['EX'],pos_count['IN'],pos_count['JJ'],
                    pos_count['JJR'],pos_count['JJS'],pos_count['LS'],pos_count['MD'],pos_count['NEG'],pos_count['NN'],
                    pos_count['NNP'],pos_count['NNS'],pos_count['PDT'],pos_count['POS'],pos_count['PRP'],pos_count['RB'],
                    pos_count['RBR'],pos_count['RBS'],pos_count['RP'],pos_count['TO'],pos_count['UH'],pos_count['VB'],
                    pos_count['VBD'],pos_count['VBG'],pos_count['VBN'],pos_count['VBP'],pos_count['VBZ'],pos_count['WDT'],
                    pos_count['WP'],pos_count['WRB'])
        pos_feat_list.append(pos_feat)
    return pos_feat_list

In [25]:
postag_feat = EkstraksiPOS(clean_text)
print(postag_feat[:3])

[(0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0)]


In [20]:
nltk.download('tagsets')
nltk.help.upenn_tagset()

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data]   Unzipping help\tagsets.zip.


In [16]:
# Import library
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [17]:
# classifiers and hyperparameters
classifiers = [
    LogisticRegression(solver='liblinear'),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
]

hyperparameters = [
    {'penalty': ['l1', 'l2'], 'C': np.logspace(-3,3,7)},
    {'max_features': ['auto', 'sqrt', 'log2'], 'criterion' : ['gini', 'entropy']},
    {'n_estimators': [2, 4], 'max_features': ['auto', 'sqrt', 'log2'], 'criterion' : ['gini', 'entropy']},
    {'n_estimators': [2, 4], 'base_estimator': [LogisticRegressionCV(cv=5), DecisionTreeClassifier(), GaussianNB()]},
    {}
]

In [18]:
#feat_list = [unigram_feat, sentlex_feat, postag_feat, orto_feat]
#feat_name = ["Unigram", "Sentimen", "POS", "Ortografi"]

X_train = unigram_feat
y_train = label

NameError: name 'unigram_feat' is not defined

In [None]:
classf = SVC(kernel='rbf')
scoring = ['accuracy', 'f1_macro']
scores = cross_validate(classf, X_train, y_train, cv=5, scoring=scoring, verbose=1)
acc = np.mean(scores['test_accuracy'])
f1 = np.mean(scores['test_f1_macro'])
print("Akurasi :", acc)
print("F1-Score :", f1)
print("---------------")

In [None]:
for i,classf in enumerate(classifiers):
    print("# Tuning hyperparam for " + classf.__class__.__name__)
    cv = GridSearchCV(classf, hyperparameters[i], cv=5, verbose=1)
    cv.fit(X_train, y_train)
    
    print("# View best hyperparam")
    print(cv.best_params_)
    print("")
    print("# View Score")
    y_pred = cv.best_estimator_.predict(X_test)
    print("score : ", accuracy_score(y_test, y_pred))
    print("")
    print("classification report : ")
    print(classification_report(y_test, y_pred))
    print("")
    print("confusion matrix : ")
    print(pd.crosstab(y_test, y_pred, rownames=["Actual Label"], colnames=["Predicted Label"]))
    print("")