Testing of various classification algorithms for the word madu.
1.  load dataset and create train/test split
2. find baseline by training a supervised model
3. remove labels and test various unsupervised algorithms

In [0]:
import pandas as pd
import re
import numpy as np

Read input list uploaded into colab fs from local drive. Note: file need to be uploaded everytime colab notebook is started since colab doesn't store the file in disk once session ends.

In [0]:
df = pd.read_excel('madu-sense1-sense2-context-96sentences.xlsx')

get plain bag of words from all sentences

In [0]:
words = []
for line in df['SENTENCE']:
  for w in re.split('\s+',line):
    words.append(w)
    
print(len(words))

1688


Visualize top words used in list of sentences

In [0]:
def wordcount(inlist):
    #count unique words in list
    #ignore case
    list = [item.lower() for item in inlist]
    wc = {}
    for item in set(list):
        wc[item] = list.count(item)
    #wc_sorted = sorted(wc.items(), key=lambda x: x[1], reverse=True)
    return wc

wordcount(words)

{'': 18,
 '"bagi': 2,
 '"bahan': 1,
 '"dari': 1,
 '"jadi,': 1,
 '"kami': 1,
 '"kita': 1,
 '"lebah': 1,
 '"mahkota': 1,
 '"neetar"': 1,
 '"saya': 1,
 '"sekiranya': 1,
 '"selain': 1,
 '"sudah': 1,
 "'lamaran'": 1,
 '(1989),': 1,
 '(akhbar': 1,
 '(hadis': 1,
 '(kiri)': 1,
 '(madu)': 1,
 '(prk)': 1,
 '(ukm)': 1,
 ',': 5,
 '-': 1,
 '.': 1,
 '..': 1,
 '1,001': 2,
 '11': 1,
 '20': 1,
 '2018': 1,
 '2019.': 1,
 '25': 1,
 '29,': 1,
 '30': 1,
 '32,': 2,
 '33,': 1,
 '36': 1,
 '50': 1,
 '8': 1,
 '95': 1,
 '97': 1,
 ':': 1,
 '?': 1,
 'a,': 1,
 'abdul': 2,
 'abu…': 1,
 'ada': 7,
 'adakah': 1,
 'adalah': 3,
 'adam': 1,
 'adik': 1,
 'adira': 2,
 'aduhai': 1,
 'afifah,': 1,
 'ahli': 3,
 'ahmad': 1,
 'aida': 2,
 'air': 2,
 'ajak': 1,
 'akan': 7,
 'akel': 1,
 'aktres': 1,
 'aku': 3,
 'al-quran': 1,
 'alahai': 1,
 'alami': 1,
 'alasan': 1,
 'alimentarius': 1,
 'alkohol': 1,
 'allah': 1,
 'alternatif': 1,
 'amalan': 1,
 'an-nahl:69': 1,
 'anak': 2,
 'anak..': 1,
 'angkara': 1,
 'annapurna': 1,
 'antara': 1,

Some symbols and digits found in the words list that should be removed. Also, there are words that share the same root and can be merged. Will attempt naive bayes with and without stemming.

In [0]:
words_cleaned = []
#symbols_to_remove = re.compile('[/(){}\[\]\.\-\"|@,; ]')
symbols_to_remove = re.compile('[^\w]')
for line in df['SENTENCE']:
  for w in re.split('\s+',line):
    #won't try to correct non-words that have digits
    if not re.match('.*[0-9].*',w):
      w = re.sub(symbols_to_remove,'',w)
      if len(w)>0:
        words_cleaned.append(w)
    
words_cleaned_dict = wordcount(words_cleaned)

Use malaya library to stem the malay words and create a new words list

In [0]:
!pip install malaya
import malaya

Collecting malaya
[?25l  Downloading https://files.pythonhosted.org/packages/df/ed/ae9f6889296e7a874fb768d20df18b0c31bb908c4ee64fb21eb092dc65ac/malaya-1.9.1.2-py3-none-any.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 15.0MB/s 
Collecting PySastrawi (from malaya)
[?25l  Downloading https://files.pythonhosted.org/packages/61/84/b0a5454a040f81e81e6a95a5d5635f20ad43cc0c288f8b4966b339084962/PySastrawi-1.2.0-py2.py3-none-any.whl (210kB)
[K    100% |████████████████████████████████| 215kB 30.2MB/s 
Collecting fuzzywuzzy (from malaya)
  Downloading https://files.pythonhosted.org/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl
Collecting python-levenshtein (from malaya)
[?25l  Downloading https://files.pythonhosted.org/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)
[K    100% |████████████████████████████████| 51kB 10.5MB/s 
Collecting 

In [0]:
def clean_and_stem(sentence_list):
  #function to clean and stem all words
  #returns new list of sentences and words_dict{word:count}
  symbols_to_remove = re.compile('[^\w]')
  words_cleaned_stemmed = []
  words_dict = {}
  new_sentences =[]
  #stemmer = malaya.stem.deep_model(model)
  
  for sentence in sentence_list:
    nstr = ''
    sentence = malaya.stem.sastrawi(sentence)
    for w in re.split('\s+', sentence.lower()):
      #only include words that don't have digits and remove symbols
      if not re.match('.*[0-9].*',w):
        w = re.sub(symbols_to_remove,'',w)
        if len(w)>0:
          words_cleaned_stemmed.append(w)
          nstr = nstr+' '+w
    new_sentences.append(nstr)
  
  words_dict = wordcount(words_cleaned_stemmed)
  return new_sentences, words_dict
  

In [0]:
ilist=['MADU ialah makanan yang sangat bernilai yang digunakan sejak zaman purba sebagai sumber makanan tenaga yang penting.','Menurut Suruhanjaya Alimentarius Kodeks (1989), madu didefinisikan sebagai pemanis semula jadi dihasilkan lebah madu dari sari bunga atau nektar atau dari rembesan bahagian hidup tumbuhan, atau perkumuhan tumbuhan, dikumpulkan oleh lebah madu dan diubah, serta digabungkan dengan bahan spesifik dalam lebah madu sehingga matang.']

x,y = clean_and_stem(ilist)
x



[' madu ialah makan yang sangat nila yang guna sejak zaman purba bagai sumber makan tenaga yang penting',
 ' turut suruhanjaya alimentarius kodeks madu definisi bagai man mula jadi hasil lebah madu dari sari bunga atau nektar atau dari rembes bahagian hidup tumbuh atau kumuh tumbuh kumpul oleh lebah madu dan ubah serta gabung dengan bahan spesifik dalam lebah madu sehingga matang']

The cleaned list of words had 805 instances. Will use a CBOW representation for first trial.

In [0]:
def BOW(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    symbols_to_remove = re.compile('[^\w]')
    result_vector = np.zeros(dict_size)

    l=re.split('\s+',text)
    for w in l:
      w = re.sub(symbols_to_remove,'',w)
      if w in words_to_index.keys():
        result_vector[words_to_index[w]]=1
    return result_vector

Convert all sentences to BOW

Encode label 'manisan'=0 and 'manusia'=1

In [0]:
y_encoded = []
for line in df['CATEGORY & POS']:
  if re.match('manisan',line):
    y_encoded.append('0')
  else:
    y_encoded.append('1')

Create train-test split

In [0]:
from sklearn.model_selection import train_test_split

plot ROC curve

In [0]:
!pip install scikit-plot

Collecting scikit-plot
  Downloading https://files.pythonhosted.org/packages/7c/47/32520e259340c140a4ad27c1b97050dd3254fdc517b1d59974d47037510e/scikit_plot-0.3.7-py3-none-any.whl
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [0]:
import matplotlib.pyplot as plt
import scikitplot as skplt
probs=classifier.predict_proba(X_test)
preds = probs[:,1]
skplt.metrics.plot_roc_curve(y_test, probs)
plt.show()

NameError: ignored

Get F1 score

In [0]:
from sklearn.metrics import f1_score

check for F1 score trend with different vocab size

In [0]:
#check F1 score for different vocab sizes using different classifiers
vocabsize=[10,20,50,100]
nbresult=[]
lrresult=[]
svmresult=[]

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

for i in vocabsize:
  DICT_SIZE = i
  WORDS_TO_INDEX = {}
  top_words=sorted(words_cleaned_dict.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
  for x in range(DICT_SIZE): WORDS_TO_INDEX.update({top_words[x][0]:x})
  #INDEX_TO_WORDS = {v:k for k,v in WORDS_TO_INDEX.items()}
  #ALL_WORDS = WORDS_TO_INDEX.keys()

  X_BOW = [BOW(sentence,WORDS_TO_INDEX, i) for sentence in df['SENTENCE']]
  X_train, X_test, y_train, y_test  = train_test_split(X_BOW, y_encoded, test_size=0.2, random_state=10)
  y_train=np.asarray(y_train, dtype=int)
  y_test=np.asarray(y_test, dtype=int)
  #naive bayes
  bnb = BernoulliNB()
  nbclassifier = bnb.fit(X_train, y_train)
  y_pred_nb = nbclassifier.predict(X_test)
  nbscore=f1_score(y_test, y_pred_nb)
  nbresult.append("{:.2f}".format(nbscore))
  
  result=[]
  #LogisticRegression
  lr = LogisticRegression()
  lrclassifier = lr.fit(X_train, y_train)
  y_pred_lr = lrclassifier.predict(X_test)
  lrscore=f1_score(y_test, y_pred_lr)
  lrresult.append("{:.2f}".format(lrscore))
  #SVM
  svmclassifier=SVC().fit(X_train, y_train)
  y_pred_svm = svmclassifier.predict(X_test)
  svmscore = f1_score(y_test, y_pred_svm)
  svmresult.append("{:.2f}".format(svmscore))
  
print(top_words[:10])
print("naive bayes")
print(vocabsize)
print(nbresult)
print("logistic regression")
print(vocabsize)
print(lrresult)
print("SVM")
print(vocabsize)
print(svmresult)

[('madu', 79), ('yang', 62), ('bermadu', 29), ('dan', 28), ('di', 24), ('lebah', 20), ('itu', 16), ('saya', 16), ('tidak', 15), ('dengan', 15)]
naive bayes
[10, 20, 50, 100]
['0.93', '0.93', '1.00', '1.00']
logistic regression
[10, 20, 50, 100]
['0.93', '0.93', '1.00', '1.00']
SVM
[10, 20, 50, 100]
['0.93', '0.93', '1.00', '0.60']


Checking to see with 100 words vocab how many of the sentence data had all '0' array

In [0]:
X_BOW = [BOW(sentence,WORDS_TO_INDEX, 200) for sentence in df['SENTENCE']]

In [0]:
#count how many entries in X_BOW equal 0
counter=0
for i in range(0,len(X_BOW)):
  if sum(X_BOW[i])==0:
    print(df['SENTENCE'][i])
    counter+=1
print(counter)

test models with stences that are stemmed

In [0]:
new_sentences, words_cleaned_stemmed = clean_and_stem(df['SENTENCE'])

#check F1 score for different vocab sizes using different classifiers
vocabsize=[10,20,50,100]
nbresult=[]
lrresult=[]
svmresult=[]

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

for i in vocabsize:
  DICT_SIZE = i
  WORDS_TO_INDEX = {}
  top_words=sorted(words_cleaned_stemmed.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
  for x in range(DICT_SIZE): WORDS_TO_INDEX.update({top_words[x][0]:x})
  #INDEX_TO_WORDS = {v:k for k,v in WORDS_TO_INDEX.items()}
  #ALL_WORDS = WORDS_TO_INDEX.keys()
  

  X_BOW = [BOW(sentence,WORDS_TO_INDEX, i) for sentence in new_sentences]
  X_train, X_test, y_train, y_test  = train_test_split(X_BOW, y_encoded, test_size=0.2, random_state=10)
  y_train=np.asarray(y_train, dtype=int)
  y_test=np.asarray(y_test, dtype=int)
  #naive bayes
  bnb = BernoulliNB()
  nbclassifier = bnb.fit(X_train, y_train)
  y_pred_nb = nbclassifier.predict(X_test)
  nbscore=f1_score(y_test, y_pred_nb)
  nbresult.append("{:.2f}".format(nbscore))
  
  result=[]
  #LogisticRegression
  lr = LogisticRegression()
  lrclassifier = lr.fit(X_train, y_train)
  y_pred_lr = lrclassifier.predict(X_test)
  lrscore=f1_score(y_test, y_pred_lr)
  lrresult.append("{:.2f}".format(lrscore))
  #SVM
  svmclassifier=SVC().fit(X_train, y_train)
  y_pred_svm = svmclassifier.predict(X_test)
  svmscore = f1_score(y_test, y_pred_svm)
  svmresult.append("{:.2f}".format(svmscore))
  
print(top_words[:10])
print("naive bayes")
print(vocabsize)
print(nbresult)
print("logistic regression")
print(vocabsize)
print(lrresult)
print("SVM")
print(vocabsize)
print(svmresult)

[('madu', 118), ('yang', 62), ('dan', 28), ('di', 24), ('jadi', 20), ('lebah', 20), ('dalam', 18), ('itu', 16), ('saya', 16), ('tidak', 15)]
naive bayes
[10, 20, 50, 100]
['0.78', '0.88', '0.88', '0.82']
logistic regression
[10, 20, 50, 100]
['0.63', '0.82', '0.88', '0.82']
SVM
[10, 20, 50, 100]
['0.52', '0.92', '0.44', '0.00']


  'precision', 'predicted', average, warn_for)
