## Data Preprocessing

In [1]:
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import nltk

df = pd.read_csv('data\data_test.csv', sep=';')

In [2]:
# Casefolding from column 'question'
df.question = df.question.str.lower()

# Remove punctuation from column 'question'
df['punc_remove'] = df.question.str.replace('[^\w\s]', ' ')

# Stemming
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()
df['stemmed'] = df.apply(lambda row: stemmer.stem(row['punc_remove']), axis = 1)

# Remove stopwords
stopword_factory = StopWordRemoverFactory()
stopword = stopword_factory.create_stop_word_remover()
df['sw_remove'] = df.apply(lambda row: stopword.remove(row['stemmed']), axis = 1)

# Tokenization
# df['tokenized'] = df.apply(lambda row: nltk.word_tokenize(row['sw_remove']), axis = 1)

## Feature Extraction

In [3]:
def get_bow(wordlist, df):
    # check if word exist in bow
    bow = {}
    for index, row in df.iterrows():
        txt = row['stemmed'].split()
        for i in range(len(txt)):
            if txt[i] in bow:
                bow[txt[i]] += 1
            else:
                bow[txt[i]] = 1
            
    # sort bow by value
    sorted_bow = {}
    for i in sorted(bow.items(), key=lambda x: x[1], reverse=True):
        sorted_bow[i[0]] = i[1]
        
    return sorted_bow

In [4]:
def flatten(term_doc_matrix):
    # Convert sparse matrix to list
    temp = []
    for i in range(term_doc_matrix.shape[0]):
        temp.append(term_doc_matrix[i].toarray().tolist())

    # Flatten list
    res = list(chain.from_iterable(temp))
    
    return res

In [5]:
def postag_weighting(tfidf, term_dict):
    # Fungsi untuk weighting tfidf based on POS Tag. Weight optimal ada di paper
    
    # cek di list TF-IDF, mana yg ga 0
    # indeks di list TF-IDF dicocokin sama indeks di term_dictionary
    # ambil nilai weight dari dict
    result = []
    
    for i in range(len(tfidf)):
        elem = []
        for j in range(len(tfidf[i])):
            if tfidf[i][j] > 0:
                temp = list(term_dict.values())[j]
                weight = 1
#                 Uncomment below if POS tagger is available
#                 if temp == 'N':
#                     weight = 3
#                 elif temp == 'ADJ':
#                     weight = 2
#                 else:
#                     weight = 1
            elem.append(tfidf[i][j] * weight)
        result.append(elem)
    return result

In [6]:
def get_score(test, pred):
    
    accuracy = metrics.accuracy_score(test, pred)
    precision = metrics.precision_score(test, pred, average='weighted')
    recall = metrics.recall_score(test, pred, average='weighted')
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return accuracy, precision, recall, f1_score

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import chain
import numpy as np

# Buat ngitung nilai TF-IDF dari tiap term
vectorizer = TfidfVectorizer()
term_doc_matrix = vectorizer.fit_transform(df['stemmed'])

# Daftar kata yang dipake di corpus
# TODO: Bikin list isinya POSTag, urutan sama kaya vocabulary
#       Gabungin 2 list itu jadi 1 dict
vocabulary = vectorizer.get_feature_names()

# Convert hasil fit_transform dari sparse matrix ke list
tfidf = flatten(term_doc_matrix)

# Apply weighting untuk TF-IDF
# tfpos_idf = postag_weighting(tfidf, term_dict)

# Convert TFPOS-IDF ke dataframe
# Ini yang dipake buat model
df_dataset = pd.DataFrame(tfidf)
df_dataset.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,543,544,545,546,547,548,549,550,551,552
166,0.0,0.0,0.0,0.0,0.0,0.0,0.183273,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
167,0.0,0.0,0.0,0.0,0.0,0.0,0.19253,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
168,0.0,0.0,0.0,0.0,0.0,0.0,0.209466,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.186082,0.0,0.0,0.0
169,0.0,0.0,0.0,0.0,0.0,0.0,0.209522,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170,0.0,0.0,0.0,0.0,0.0,0.0,0.204318,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data Split

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(df_dataset, df['Label'], test_size=0.2, random_state=23)

## SVM    

In [9]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X_train, y_train)
pred_svm = clf_svm.predict(X_test)

accuracy, precision, recall, f1_score = get_score(y_test, pred_svm)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1_score)

Accuracy:  0.45714285714285713
Precision:  0.5932234432234432
Recall:  0.45714285714285713
F1 Score:  0.5163681654004234


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

svm_grid_param = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]

# make a GridSearchCV object
gs_svm = GridSearchCV(clf_svm, svm_grid_param, cv = 5, verbose = 3)

gs_svm.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.536, total=   0.0s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.519, total=   0.0s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.444, total=   0.0s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.593, total=   0.0s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.481, total=   0.0s
[CV] C=10, kernel=linear .............................................
[CV] ................. C=10, kernel=linear, score=0.536, total=   0.0s
[CV] C=10, kernel=linear .............................................
[CV] ...........

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    1.6s finished


GridSearchCV(cv=5, estimator=SVC(kernel='linear'),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}],
             verbose=3)

In [12]:
print(gs_svm.best_estimator_)

SVC(C=10, kernel='linear')


In [26]:
clf_svm = svm.SVC(C=10, kernel='linear')
clf_svm.fit(X_train, y_train)
pred_svm = clf_svm.predict(X_test)

accuracy, precision, recall, f1_score = get_score(y_test, pred_svm)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1_score)

Accuracy:  0.4857142857142857
Precision:  0.6845021645021645
Recall:  0.4857142857142857
F1 Score:  0.5682239039445314


## Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB

clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)
pred_nb = clf_nb.predict(X_test)

accuracy, precision, recall, f1_score = get_score(y_test, pred_nb)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1_score)

Accuracy:  0.42857142857142855
Precision:  0.6470418470418471
Recall:  0.42857142857142855
F1 Score:  0.5156196097972325


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
nb_grid_param = [ {'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]} ]

# make a GridSearchCV object
gs_nb = GridSearchCV(clf_nb, nb_grid_param, cv = 5, verbose = 3)

gs_nb.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.500, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.556, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.407, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.519, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.556, total=   0.0s
[CV] alpha=0.2 .......................................................
[CV] ........................... alpha=0.2, score=0.536, total=   0.0s
[CV] alpha=0.2 .......................................................
[CV] ...........

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.5s finished


GridSearchCV(cv=5, estimator=MultinomialNB(),
             param_grid=[{'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                    1.0]}],
             verbose=3)

In [24]:
print(gs_nb.best_estimator_)

MultinomialNB(alpha=0.2)


In [27]:
clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)
pred_nb = clf_nb.predict(X_test)

accuracy, precision, recall, f1_score = get_score(y_test, pred_nb)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1_score)

Accuracy:  0.42857142857142855
Precision:  0.6470418470418471
Recall:  0.42857142857142855
F1 Score:  0.5156196097972325


  _warn_prf(average, modifier, msg_start, len(result))


## Playground

In [16]:
swlist = ['a', 'ada', 'adanya', 'adapun', 'agak', 'agaknya', 'agar', 'akan', 
           'akankah', 'akhir', 'akhiri', 'akhirnya', 'aku', 'akulah', 'amat', 'amatlah', 
           'anda', 'andalah', 'antar', 'antara', 'antaranya',  'apaan', 'apabila', 
           'apalagi', 'apatah',  'asal', 'asalkan', 'atas', 'atau', 'ataukah', 'ataupun', 'awal',
           'awalnya', 'b', 'bagai', 'bagaikan', 'bagaimanapun', 'bagi', 'bagian', 'bahkan','bahwa',
           'bahwasannya', 'bahwasanya', 'baik', 'baiklah', 'bakal', 'bakalan', 'balik', 'banyak', 'bapak', 'baru', 'bawah',
           'beberapa', 'begini', 'beginian', 'beginikah', 'beginilah', 'begitu', 'begitukah',
           'begitulah', 'begitupun', 'bekerja', 'belakang', 'belakangan', 'belum', 'belumlah',
           'benar', 'benarlah', 'berada', 'berakhir', 'berakhirlah', 'berakhirnya',
           'berapalah', 'berapapun', 'berarti', 'berawal', 'berbagai', 'berdatangan', 'berikut',
           'berikutnya', 'berkali-kali', 'berkata', 'berkehendak', 'berkeinginan', 'berkenaan',
           'berlainan', 'berlalu', 'berlangsung', 'berlebihan', 'bermacam', 'bermacam-macam',
           'bermaksud', 'bermula', 'bersama', 'bersama-sama', 'bersiap', 'bersiap-siap',
           'bertanya', 'bertanya-tanya', 'berturut', 'berturut-turut', 'bertutur', 'berujar',
           'berupa', 'besar', 'betul', 'biasa', 'biasanya', 'bila', 'bilakah', 'bisa', 'boleh', 'bolehkah', 'bolehlah', 'buat',
           'bukan', 'bukankah', 'bukanlah', 'bukannya', 'bulan', 'bung', 'c', 'cara', 'caranya', 'cukup', 'cukupkah',
           'cukuplah', 'cuma', 'd', 'dahulu', 'dalam', 'dan', 'dapat', 'dari', 'daripada', 'datang', 'dekat', 'demi',
           'demikian', 'demikianlah', 'dengan', 'depan', 'di', 'dia', 'diakhiri', 'diakhirinya',
           'dialah', 'diantara', 'diberi', 'diberikan', 'diberikannya', 'dibuat', 'dibuatnya', 'didapat',
           'didatangkan', 'digunakan', 'diibaratkan', 'diibaratkannya', 'diingat', 'diingatkan', 'diinginkan',
           'dijawab', 'dijelaskan', 'dijelaskannya', 'dikarenakan', 'dikatakan', 'dikatakannya', 'dikerjakan',
           'diketahui', 'diketahuinya', 'dikira', 'dilakukan', 'dilalui', 'dilihat', 'dimaksud', 'dimaksudkan',
           'dimaksudkannya', 'dimaksudnya', 'diminta', 'dimintai', 'dimisalkan', 'dimulai', 'dimulailah',
           'dimulainya', 'dimungkinkan', 'dini', 'dipastikan', 'diperbuat', 'diperbuatnya', 'dipergunakan',
           'diperkirakan', 'diperlihatkan', 'diperlukan', 'diperlukannya', 'dipersoalkan', 'dipertanyakan',
           'dipunyai', 'diri', 'dirinya', 'disampaikan', 'disebutkan', 'disebutkannya', 'disini', 'disinilah',
           'ditambahkan', 'ditandaskan', 'ditanya', 'ditanyai', 'ditanyakan', 'ditegaskan', 'ditujukan', 'ditunjuk',
           'ditunjuki', 'ditunjukkan', 'ditunjukkannya', 'ditunjuknya', 'dituturkan', 'dituturkannya', 'diucapkan',
           'diucapkannya', 'diungkapkan', 'dong', 'dua', 'dulu', 'e', 'empat', 'enak', 'enggak', 'enggaknya',
           'entah', 'entahlah', 'f', 'g', 'guna', 'gunakan', 'h', 'hadap', 'hai', 'hal', 'halo', 'hallo',
           'hampir', 'hanya', 'hanyalah', 'hari', 'harus', 'haruslah', 'harusnya', 'helo', 'hello', 'hendak',
           'hendaklah', 'hendaknya', 'hingga', 'i', 'ia', 'ialah', 'ibarat', 'ibaratkan', 'ibaratnya', 'ibu', 'ikut',
           'ingat', 'ingat-ingat', 'ingin', 'inginkah', 'inginkan', 'ini', 'inikah', 'inilah', 'itu', 'itukah', 'itulah',
           'j', 'jadi', 'jadilah', 'jadinya', 'jangan', 'jangankan', 'janganlah', 'jauh', 'jawab', 'jawaban',
           'jawabnya', 'jelas', 'jelaslah', 'jelasnya', 'jika', 'jikalau', 'juga', 'jumlah', 'jumlahnya', 'justru',
           'k', 'kadar', 'kala', 'kalau', 'kalaulah', 'kalaupun', 'kali', 'kalian', 'kami', 'kamilah', 'kamu',
           'kamulah', 'kan', 'kapankah', 'kapanpun',  'karenanya', 'kasus', 'kata', 'katakan', 'katakanlah', 'katanya',
           'ke', 'keadaan', 'kebetulan', 'kecil', 'kedua', 'keduanya', 'keinginan', 'kelamaan', 'kelihatan',
           'kelihatannya', 'kelima', 'keluar', 'kembali', 'kemudian', 'kemungkinan', 'kemungkinannya', 'kena',
           'kepada', 'kepadanya', 'kerja', 'kesampaian', 'keseluruhan', 'keseluruhannya', 'keterlaluan', 'ketika',
           'khusus', 'khususnya', 'kini', 'kinilah', 'kira', 'kira-kira', 'kiranya', 'kita', 'kitalah', 'kok', 'kurang',
           'l', 'lagi', 'lagian', 'lah', 'lain', 'lainnya', 'laku', 'lalu', 'lama', 'lamanya', 'langsung', 'lanjut',
           'lanjutnya', 'lebih', 'lewat', 'lihat', 'lima', 'luar', 'm', 'macam', 'maka', 'makanya', 'makin', 'maksud',
           'malah', 'malahan', 'mampu', 'mampukah', 'mana', 'manakala', 'manalagi', 'masa', 'masalah', 'masalahnya',
           'masih', 'masihkah', 'masing', 'masing-masing', 'masuk', 'mata', 'mau', 'maupun', 'melainkan', 'melakukan',
           'melalui', 'melihat', 'melihatnya', 'memang', 'memastikan', 'memberi', 'memberikan', 'membuat',
           'memerlukan', 'memihak', 'meminta', 'memintakan', 'memisalkan', 'memperbuat', 'mempergunakan',
           'memperkirakan', 'memperlihatkan', 'mempersiapkan', 'mempersoalkan', 'mempertanyakan', 'mempunyai',
           'memulai', 'memungkinkan', 'menaiki', 'menambahkan', 'menandaskan', 'menanti', 'menanti-nanti',
           'menantikan', 'menanya', 'menanyai', 'menanyakan', 'mendapat', 'mendapatkan', 'mendatang', 'mendatangi',
           'mendatangkan', 'menegaskan', 'mengakhiri', 'mengatakan', 'mengatakannya', 'mengenai', 'mengerjakan',
           'mengetahui', 'menggunakan', 'menghendaki', 'mengibaratkan', 'mengibaratkannya', 'mengingat', 'mengingatkan',
           'menginginkan', 'mengira', 'mengucapkan', 'mengucapkannya', 'mengungkapkan', 'menjadi', 'menjawab',
           'menjelaskan', 'menuju', 'menunjuk', 'menunjuki', 'menunjuknya', 'menurut', 'menuturkan', 'menyampaikan',
           'menyangkut', 'menyatakan', 'menyebutkan', 'menyeluruh', 'menyiapkan', 'merasa', 'mereka', 'merekalah',
           'meski', 'meskipun', 'meyakini', 'meyakinkan', 'minta', 'mirip', 'misal', 'misalkan', 'misalnya',
           'mohon', 'mula', 'mulai', 'mulailah', 'mulanya', 'mungkin', 'mungkinkah', 'n', 'nah', 'naik', 'namun',
           'nanti', 'nantinya', 'nya', 'nyaris', 'nyata', 'nyatanya', 'o', 'oleh', 'olehnya', 'orang', 'p', 'pada',
           'padahal', 'padanya', 'pak', 'paling', 'panjang', 'pantas', 'para', 'pasti', 'pastilah', 'penting',
           'pentingnya', 'per', 'percuma', 'perlu', 'perlukah', 'perlunya', 'pernah', 'persoalan', 'pertama',
           'pertama-tama', 'pertanyaan', 'pertanyakan', 'pihak', 'pihaknya', 'pukul', 'pula', 'pun', 'punya', 'q', 'r',
           'rasa', 'rasanya', 'rupanya', 's', 'saat', 'saatnya', 'saja', 'sajalah', 'salam', 'saling', 'sama',
           'sama-sama', 'sambil', 'sampai', 'sampai-sampai', 'sampaikan', 'sana', 'sangat', 'sangatlah', 'sangkut',
           'satu', 'saya', 'sayalah', 'se', 'sebab', 'sebabnya', 'sebagai', 'sebagaimana', 'sebagainya', 'sebagian',
           'sebaik', 'sebaik-baiknya', 'sebaiknya', 'sebaliknya', 'sebanyak', 'sebegini', 'sebegitu', 'sebelum',
           'sebelumnya', 'sebenarnya', 'seberapa', 'sebesar', 'sebetulnya', 'sebisanya', 'sebuah', 
           'sebutnya', 'secara', 'secukupnya', 'sedang', 'sedangkan', 'sedemikian', 'sedikit', 'sedikitnya', 'seenaknya',
           'segala', 'segalanya', 'segera', 'seharusnya', 'sehingga', 'seingat', 'sejak', 'sejauh', 'sejenak', 'sejumlah',
           'sekadar', 'sekadarnya', 'sekali', 'sekali-kali', 'sekalian', 'sekaligus', 'sekalipun', 'sekarang', 'sekaranglah',
           'sekecil', 'seketika', 'sekiranya', 'sekitar', 'sekitarnya', 'sekurang-kurangnya', 'sekurangnya', 'sela',
           'selain', 'selaku', 'selalu', 'selama', 'selama-lamanya', 'selamanya', 'selanjutnya', 'seluruh', 'seluruhnya',
           'semacam', 'semakin', 'semampu', 'semampunya', 'semasa', 'semasih', 'semata', 'semata-mata', 'semaunya',
           'sementara', 'semisal', 'semisalnya', 'sempat', 'semua', 'semuanya', 'semula', 'sendiri', 'sendirian',
           'sendirinya', 'seolah', 'seolah-olah', 'seorang', 'sepanjang', 'sepantasnya', 'sepantasnyalah', 'seperlunya',
           'seperti', 'sepertinya', 'sepihak', 'sering', 'seringnya', 'serta', 'serupa', 'sesaat', 'sesama', 'sesampai',
           'sesegera', 'sesekali', 'seseorang', 'sesuatu', 'sesuatunya', 'sesudah', 'sesudahnya', 'setelah', 'setempat',
           'setengah', 'seterusnya', 'setiap', 'setiba', 'setibanya', 'setidak-tidaknya', 'setidaknya', 'setinggi', 'seusai',
           'sewaktu', 'siap', 'siapa', 'siapakah', 'siapapun', 'sini', 'sinilah', 'soal', 'soalnya', 'suatu', 'sudah', 
           'sudahkah', 'sudahlah', 'supaya', 't', 'tadi', 'tadinya', 'tahu', 'tak', 'tambah', 'tambahnya', 'tampak',
           'tampaknya', 'tandas', 'tandasnya', 'tanpa', 'tanya', 'tanyakan', 'tanyanya', 'tapi', 'tegas', 'tegasnya',
           'telah', 'tempat', 'tentang', 'tentu', 'tentulah', 'tentunya', 'tepat', 'terakhir', 'terasa', 'terbanyak',
           'terdahulu', 'terdapat', 'terdiri', 'terhadap', 'terhadapnya', 'teringat', 'teringat-ingat', 'terjadi',
           'terjadilah', 'terjadinya', 'terkira', 'terlalu', 'terlebih', 'terlihat', 'termasuk', 'ternyata', 'tersampaikan',
           'tersebut', 'tersebutlah', 'tertentu', 'tertuju', 'terus', 'terutama', 'tetap', 'tetapi', 'tiap', 'tiba',
           'tiba-tiba', 'tidak', 'tidakkah', 'tidaklah', 'tiga', 'toh', 'tuju', 'tunjuk', 'turut', 'tutur', 'tuturnya',
           'u', 'ucap', 'ucapnya', 'ujar', 'ujarnya', 'umumnya', 'ungkap', 'ungkapnya', 'untuk', 'usah', 'usai', 'v', 'w',
           'waduh', 'wah', 'wahai', 'waktunya', 'walau', 'walaupun', 'wong', 'x', 'y', 'ya', 'yaitu', 'yakin', 'yakni',
           'yang', 'z', '']

In [17]:
def remove_stopwords(text):
    res = []
    words = text.split(" ")
    for word in words:
        if word not in swlist:
            res.append(word)
    return ' '.join(res)

In [18]:
sw_keep = ['adalah', 'apa', 'apakah', 'arti', 'artinya', 'bagaimana', 'bagaimanakah', 'berapa', 'berapakah', 'benarkah',
           'beri', 'berikan', 'berjumlah', 'betulkah', 'bisakah', 'diantaranya', 'disebut', 'jelaskan', 'kapan',
           'karena', 'kenapa', 'mengapa', 'menunjukkan', 'merupakan', 'rupa', 'sebut', 'sebutlah']
    
df = pd.read_csv('data\data_test.csv', sep=';')

In [19]:
# Casefolding from column 'question'
df.question = df.question.str.lower()

# Remove punctuation from column 'question'
df['punc_remove'] = df.question.str.replace('[^\w\s]', ' ')

# Remove stopwords
df['sw_remove'] = df.apply(lambda row: remove_stopwords(row['punc_remove']), axis = 1) 

# Stemming
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()
df['stemmed'] = df.apply(lambda row: stemmer.stem(row['sw_remove']), axis = 1)

In [20]:
# Buat ngitung nilai TF-IDF dari tiap term
vectorizer = TfidfVectorizer()
term_doc_matrix = vectorizer.fit_transform(df['stemmed'])

# Daftar kata yang dipake di corpus
# TODO: Bikin list isinya POSTag, urutan sama kaya vocabulary
#       Gabungin 2 list itu jadi 1 dict
vocabulary = vectorizer.get_feature_names()

# Convert hasil fit_transform dari sparse matrix ke list
tfidf = flatten(term_doc_matrix)

# Apply weighting untuk TF-IDF
# tfpos_idf = postag_weighting(tfidf, term_dict)

# Convert TFPOS-IDF ke dataframe
# Ini yang dipake buat model
df_dataset = pd.DataFrame(tfidf)
df_dataset.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,414,415,416,417,418,419,420,421,422,423
166,0.0,0.0,0.0,0.0,0.0,0.23696,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
167,0.0,0.0,0.0,0.0,0.0,0.201708,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
168,0.0,0.0,0.0,0.0,0.0,0.27692,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
169,0.0,0.0,0.0,0.0,0.0,0.242774,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
170,0.0,0.0,0.0,0.0,0.0,0.201708,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_dataset, df['Label'], test_size=0.2, random_state=23)

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X_train, y_train)
y_pred = clf_svm.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1_score = 2 * (precision * recall) / (precision + recall)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1_score)

Accuracy:  0.4
Precision:  0.6306766917293233
Recall:  0.4
F1 Score:  0.4895243653341114


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(df_dataset, df['Label'], test_size=0.2, random_state=23)
clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)
y_pred = clf_nb.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1_score = 2 * (precision * recall) / (precision + recall)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1_score)

Accuracy:  0.42857142857142855
Precision:  0.6470418470418471
Recall:  0.42857142857142855
F1 Score:  0.5156196097972325


  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
swlist = ['a', 'ada', 'adanya', 'adapun', 'agak', 'agaknya', 'agar', 'akan', 
           'akankah', 'akhir', 'akhiri', 'akhirnya', 'aku', 'akulah', 'amat', 'amatlah', 
           'anda', 'andalah', 'antar', 'antara', 'antaranya',  'apaan', 'apabila', 
           'apalagi', 'apatah',  'asal', 'asalkan', 'atas', 'atau', 'ataukah', 'ataupun', 'awal',
           'awalnya', 'b', 'bagai', 'bagaikan', 'bagaimanapun', 'bagi', 'bagian', 'bahkan','bahwa',
           'bahwasannya', 'bahwasanya', 'baik', 'baiklah', 'bakal', 'bakalan', 'balik', 'banyak', 'bapak', 'baru', 'bawah',
           'beberapa', 'begini', 'beginian', 'beginikah', 'beginilah', 'begitu', 'begitukah',
           'begitulah', 'begitupun', 'bekerja', 'belakang', 'belakangan', 'belum', 'belumlah',
           'benar', 'benarlah', 'berada', 'berakhir', 'berakhirlah', 'berakhirnya',
           'berapalah', 'berapapun', 'berarti', 'berawal', 'berbagai', 'berdatangan', 'berikut',
           'berikutnya', 'berkali-kali', 'berkata', 'berkehendak', 'berkeinginan', 'berkenaan',
           'berlainan', 'berlalu', 'berlangsung', 'berlebihan', 'bermacam', 'bermacam-macam',
           'bermaksud', 'bermula', 'bersama', 'bersama-sama', 'bersiap', 'bersiap-siap',
           'bertanya', 'bertanya-tanya', 'berturut', 'berturut-turut', 'bertutur', 'berujar',
           'berupa', 'besar', 'betul', 'biasa', 'biasanya', 'bila', 'bilakah', 'bisa', 'boleh', 'bolehkah', 'bolehlah', 'buat',
           'bukan', 'bukankah', 'bukanlah', 'bukannya', 'bulan', 'bung', 'c', 'cara', 'caranya', 'cukup', 'cukupkah',
           'cukuplah', 'cuma', 'd', 'dahulu', 'dalam', 'dan', 'dapat', 'dari', 'daripada', 'datang', 'dekat', 'demi',
           'demikian', 'demikianlah', 'dengan', 'depan', 'di', 'dia', 'diakhiri', 'diakhirinya',
           'dialah', 'diantara', 'diberi', 'diberikan', 'diberikannya', 'dibuat', 'dibuatnya', 'didapat',
           'didatangkan', 'digunakan', 'diibaratkan', 'diibaratkannya', 'diingat', 'diingatkan', 'diinginkan',
           'dijawab', 'dijelaskan', 'dijelaskannya', 'dikarenakan', 'dikatakan', 'dikatakannya', 'dikerjakan',
           'diketahui', 'diketahuinya', 'dikira', 'dilakukan', 'dilalui', 'dilihat', 'dimaksud', 'dimaksudkan',
           'dimaksudkannya', 'dimaksudnya', 'diminta', 'dimintai', 'dimisalkan', 'dimulai', 'dimulailah',
           'dimulainya', 'dimungkinkan', 'dini', 'dipastikan', 'diperbuat', 'diperbuatnya', 'dipergunakan',
           'diperkirakan', 'diperlihatkan', 'diperlukan', 'diperlukannya', 'dipersoalkan', 'dipertanyakan',
           'dipunyai', 'diri', 'dirinya', 'disampaikan', 'disebutkan', 'disebutkannya', 'disini', 'disinilah',
           'ditambahkan', 'ditandaskan', 'ditanya', 'ditanyai', 'ditanyakan', 'ditegaskan', 'ditujukan', 'ditunjuk',
           'ditunjuki', 'ditunjukkan', 'ditunjukkannya', 'ditunjuknya', 'dituturkan', 'dituturkannya', 'diucapkan',
           'diucapkannya', 'diungkapkan', 'dong', 'dua', 'dulu', 'e', 'empat', 'enak', 'enggak', 'enggaknya',
           'entah', 'entahlah', 'f', 'g', 'guna', 'gunakan', 'h', 'hadap', 'hai', 'hal', 'halo', 'hallo',
           'hampir', 'hanya', 'hanyalah', 'hari', 'harus', 'haruslah', 'harusnya', 'helo', 'hello', 'hendak',
           'hendaklah', 'hendaknya', 'hingga', 'i', 'ia', 'ialah', 'ibarat', 'ibaratkan', 'ibaratnya', 'ibu', 'ikut',
           'ingat', 'ingat-ingat', 'ingin', 'inginkah', 'inginkan', 'ini', 'inikah', 'inilah', 'itu', 'itukah', 'itulah',
           'j', 'jadi', 'jadilah', 'jadinya', 'jangan', 'jangankan', 'janganlah', 'jauh', 'jawab', 'jawaban',
           'jawabnya', 'jelas', 'jelaslah', 'jelasnya', 'jika', 'jikalau', 'juga', 'jumlah', 'jumlahnya', 'justru',
           'k', 'kadar', 'kala', 'kalau', 'kalaulah', 'kalaupun', 'kali', 'kalian', 'kami', 'kamilah', 'kamu',
           'kamulah', 'kan', 'kapankah', 'kapanpun',  'karenanya', 'kasus', 'kata', 'katakan', 'katakanlah', 'katanya',
           'ke', 'keadaan', 'kebetulan', 'kecil', 'kedua', 'keduanya', 'keinginan', 'kelamaan', 'kelihatan',
           'kelihatannya', 'kelima', 'keluar', 'kembali', 'kemudian', 'kemungkinan', 'kemungkinannya', 'kena',
           'kepada', 'kepadanya', 'kerja', 'kesampaian', 'keseluruhan', 'keseluruhannya', 'keterlaluan', 'ketika',
           'khusus', 'khususnya', 'kini', 'kinilah', 'kira', 'kira-kira', 'kiranya', 'kita', 'kitalah', 'kok', 'kurang',
           'l', 'lagi', 'lagian', 'lah', 'lain', 'lainnya', 'laku', 'lalu', 'lama', 'lamanya', 'langsung', 'lanjut',
           'lanjutnya', 'lebih', 'lewat', 'lihat', 'lima', 'luar', 'm', 'macam', 'maka', 'makanya', 'makin', 'maksud',
           'malah', 'malahan', 'mampu', 'mampukah', 'mana', 'manakala', 'manalagi', 'masa', 'masalah', 'masalahnya',
           'masih', 'masihkah', 'masing', 'masing-masing', 'masuk', 'mata', 'mau', 'maupun', 'melainkan', 'melakukan',
           'melalui', 'melihat', 'melihatnya', 'memang', 'memastikan', 'memberi', 'memberikan', 'membuat',
           'memerlukan', 'memihak', 'meminta', 'memintakan', 'memisalkan', 'memperbuat', 'mempergunakan',
           'memperkirakan', 'memperlihatkan', 'mempersiapkan', 'mempersoalkan', 'mempertanyakan', 'mempunyai',
           'memulai', 'memungkinkan', 'menaiki', 'menambahkan', 'menandaskan', 'menanti', 'menanti-nanti',
           'menantikan', 'menanya', 'menanyai', 'menanyakan', 'mendapat', 'mendapatkan', 'mendatang', 'mendatangi',
           'mendatangkan', 'menegaskan', 'mengakhiri', 'mengatakan', 'mengatakannya', 'mengenai', 'mengerjakan',
           'mengetahui', 'menggunakan', 'menghendaki', 'mengibaratkan', 'mengibaratkannya', 'mengingat', 'mengingatkan',
           'menginginkan', 'mengira', 'mengucapkan', 'mengucapkannya', 'mengungkapkan', 'menjadi', 'menjawab',
           'menjelaskan', 'menuju', 'menunjuk', 'menunjuki', 'menunjuknya', 'menurut', 'menuturkan', 'menyampaikan',
           'menyangkut', 'menyatakan', 'menyebutkan', 'menyeluruh', 'menyiapkan', 'merasa', 'mereka', 'merekalah',
           'meski', 'meskipun', 'meyakini', 'meyakinkan', 'minta', 'mirip', 'misal', 'misalkan', 'misalnya',
           'mohon', 'mula', 'mulai', 'mulailah', 'mulanya', 'mungkin', 'mungkinkah', 'n', 'nah', 'naik', 'namun',
           'nanti', 'nantinya', 'nya', 'nyaris', 'nyata', 'nyatanya', 'o', 'oleh', 'olehnya', 'orang', 'p', 'pada',
           'padahal', 'padanya', 'pak', 'paling', 'panjang', 'pantas', 'para', 'pasti', 'pastilah', 'penting',
           'pentingnya', 'per', 'percuma', 'perlu', 'perlukah', 'perlunya', 'pernah', 'persoalan', 'pertama',
           'pertama-tama', 'pertanyaan', 'pertanyakan', 'pihak', 'pihaknya', 'pukul', 'pula', 'pun', 'punya', 'q', 'r',
           'rasa', 'rasanya', 'rupanya', 's', 'saat', 'saatnya', 'saja', 'sajalah', 'salam', 'saling', 'sama',
           'sama-sama', 'sambil', 'sampai', 'sampai-sampai', 'sampaikan', 'sana', 'sangat', 'sangatlah', 'sangkut',
           'satu', 'saya', 'sayalah', 'se', 'sebab', 'sebabnya', 'sebagai', 'sebagaimana', 'sebagainya', 'sebagian',
           'sebaik', 'sebaik-baiknya', 'sebaiknya', 'sebaliknya', 'sebanyak', 'sebegini', 'sebegitu', 'sebelum',
           'sebelumnya', 'sebenarnya', 'seberapa', 'sebesar', 'sebetulnya', 'sebisanya', 'sebuah', 
           'sebutnya', 'secara', 'secukupnya', 'sedang', 'sedangkan', 'sedemikian', 'sedikit', 'sedikitnya', 'seenaknya',
           'segala', 'segalanya', 'segera', 'seharusnya', 'sehingga', 'seingat', 'sejak', 'sejauh', 'sejenak', 'sejumlah',
           'sekadar', 'sekadarnya', 'sekali', 'sekali-kali', 'sekalian', 'sekaligus', 'sekalipun', 'sekarang', 'sekaranglah',
           'sekecil', 'seketika', 'sekiranya', 'sekitar', 'sekitarnya', 'sekurang-kurangnya', 'sekurangnya', 'sela',
           'selain', 'selaku', 'selalu', 'selama', 'selama-lamanya', 'selamanya', 'selanjutnya', 'seluruh', 'seluruhnya',
           'semacam', 'semakin', 'semampu', 'semampunya', 'semasa', 'semasih', 'semata', 'semata-mata', 'semaunya',
           'sementara', 'semisal', 'semisalnya', 'sempat', 'semua', 'semuanya', 'semula', 'sendiri', 'sendirian',
           'sendirinya', 'seolah', 'seolah-olah', 'seorang', 'sepanjang', 'sepantasnya', 'sepantasnyalah', 'seperlunya',
           'seperti', 'sepertinya', 'sepihak', 'sering', 'seringnya', 'serta', 'serupa', 'sesaat', 'sesama', 'sesampai',
           'sesegera', 'sesekali', 'seseorang', 'sesuatu', 'sesuatunya', 'sesudah', 'sesudahnya', 'setelah', 'setempat',
           'setengah', 'seterusnya', 'setiap', 'setiba', 'setibanya', 'setidak-tidaknya', 'setidaknya', 'setinggi', 'seusai',
           'sewaktu', 'siap', 'siapa', 'siapakah', 'siapapun', 'sini', 'sinilah', 'soal', 'soalnya', 'suatu', 'sudah', 
           'sudahkah', 'sudahlah', 'supaya', 't', 'tadi', 'tadinya', 'tahu', 'tak', 'tambah', 'tambahnya', 'tampak',
           'tampaknya', 'tandas', 'tandasnya', 'tanpa', 'tanya', 'tanyakan', 'tanyanya', 'tapi', 'tegas', 'tegasnya',
           'telah', 'tempat', 'tentang', 'tentu', 'tentulah', 'tentunya', 'tepat', 'terakhir', 'terasa', 'terbanyak',
           'terdahulu', 'terdapat', 'terdiri', 'terhadap', 'terhadapnya', 'teringat', 'teringat-ingat', 'terjadi',
           'terjadilah', 'terjadinya', 'terkira', 'terlalu', 'terlebih', 'terlihat', 'termasuk', 'ternyata', 'tersampaikan',
           'tersebut', 'tersebutlah', 'tertentu', 'tertuju', 'terus', 'terutama', 'tetap', 'tetapi', 'tiap', 'tiba',
           'tiba-tiba', 'tidak', 'tidakkah', 'tidaklah', 'tiga', 'toh', 'tuju', 'tunjuk', 'turut', 'tutur', 'tuturnya',
           'u', 'ucap', 'ucapnya', 'ujar', 'ujarnya', 'umumnya', 'ungkap', 'ungkapnya', 'untuk', 'usah', 'usai', 'v', 'w',
           'waduh', 'wah', 'wahai', 'waktunya', 'walau', 'walaupun', 'wong', 'x', 'y', 'ya', 'yaitu', 'yakin', 'yakni',
           'yang', 'z']


dihapus = ['adalah', 'apa', 'apakah', 'arti', 'artinya', 'bagaimana', 'bagaimanakah', 'berapa', 'berapakah', 'benarkah',
           'beri', 'berikan', 'berjumlah', 'betulkah', 'bisakah', 'bolehkah', 'diantaranya', 'disebut', 'jelaskan', 'kapan',
           'karena', 'kenapa', 'menunjukkan', 'merupakan', 'rupa', 'sebut', 'sebutlah']