# Cebuano Stemmer Demo

`Accepts a Cebuano word and returns the root word and its affixes`

In [3]:
from stemmer import stem_word

# Prompt for a single Cebuano word on standard input.
ceb_word = raw_input('Input a Cebuano word: ')

# Stem the word, then print the formatted summary string that
# print_stem_results() returns.
word = stem_word(word=ceb_word)
stem_summary = word.print_stem_results()
print(stem_summary)

Input a Cebuano word: gipakaon
gipakaon:[(paka), {gi,-,on}]


# Cebuano POS Tagger Demo

`Accepts a Cebuano sentence and returns the POS-tagged sentence`

In [2]:
from tagger import tag_sentence

# Prompt for a Cebuano sentence and run it through the tagger.
ceb_sentence = raw_input('Input a Cebuano sentence: ')
words = tag_sentence(text=ceb_sentence)

# Render every tagged word followed by a single space (so the line keeps its
# trailing space) and build the output string in one pass.
sentence = ''.join(str(word) + ' ' for word in words)

print(sentence)

Input a Cebuano sentence: Gipakaon ko niya og kanon.
gipakaon/['NOUN'] ko/['PRON'] niya/['PRON'] og/['CONJ'] kanon/['ADV'] ./['SYM'] 


# Stemmer Evaluation


`Evaluates the stemmer by getting the number of predicted root words vs. actual root words`

* The input words are already tokenized and stemmed

In [1]:
import stemmer_evaluator as se
import pandas as pd

# Build the evaluation DataFrame from the records returned by stemmer_evaluator.
result = se.to_panda_data()
df = pd.DataFrame(result['data'], index=result['index'])

# Row selectors shared by every count below, built once instead of once per
# print. The explicit == True / == False comparisons are kept as they were so
# the selection does not depend on the dtype of the columns.
valid_rows = df.is_valid == True
invalid_rows = df.is_valid == False
entry_rows = df.is_entry == True
non_entry_rows = df.is_entry == False

df_valid = df[valid_rows]
valid = df_valid.count()['is_valid']
total = df.count()['is_valid']

print('Total tokens: ' + str(total))
print('Correct root words: ' + str(valid))
print('Incorrect root words: ' + str(df[invalid_rows].count()['is_valid']))
# valid / total is the share of correct roots among all tokens, not a
# correct-to-incorrect ratio, so the label states exactly that.
print('Correct root percentage: ' + str((valid / float(total)) * 100))
print('\n')
print('Dictionary entry tokens: ' + str(df[entry_rows].count()['is_entry']))
print('Correct dictionary entry tokens: ' + str(df[entry_rows & valid_rows].count()['is_entry']))
print('Incorrect dictionary entry tokens: ' + str(df[entry_rows & invalid_rows].count()['is_entry']))
print('\n')
print('Non-dictionary entry tokens: ' + str(df[non_entry_rows].count()['is_entry']))
print('Correct non-dictionary entry tokens: ' + str(df[non_entry_rows & valid_rows].count()['is_entry']))
print('Incorrect non-dictionary entry tokens: ' + str(df[non_entry_rows & invalid_rows].count()['is_entry']))
print('\n')

# Last expression of the cell: displays the final ten rows of the frame.
df.tail(n=10)


Total tokens: 1298
Correct root words: 955
Incorrect root words: 343
Correct / incorrect root percentage: 73.57473035439138


Dictionary entry tokens: 974
Correct dictionary entry tokens: 878
Incorrect dictionary entry tokens: 96


Non-dictionary entry tokens: 324
Correct non-dictionary entry tokens: 77
Incorrect non-dictionary entry tokens: 247




Unnamed: 0,infix,is_entry,is_root,is_valid,prefix,root,suffix
nipahibawo,,False,False,False,ni,pahibaw,o
naghuwat,,True,False,True,nag,huwat,
unsaon,,True,False,True,,unsa,on
mamahimong,,True,False,True,ma,himo,ng
syudad,,True,True,True,,syudad,
sirhan,,True,False,False,,sir,han
kalapasan,,True,False,True,ka,lapas,an
bawian,,False,False,False,ba,wi,an
nakalapas,,True,False,True,naka,lapas,
nagkadaiyang,,True,False,False,nag,kadaiya,ng


# Tagger Evaluation


`Evaluates the tagger by comparing the predicted POS tags against the actual POS tags (F-score)`

In [1]:
from pandas_ml import ConfusionMatrix
from sklearn.metrics import f1_score
import tagger_evaluator as te

# Collect the actual and predicted tag sequences through tagger_evaluator and
# cross-tabulate them.
words = te.tag_test_sentences()
y_true = te.extract_actual_pos_tags()
y_pred = te.extract_predicted_pos_tags(words=words)
confusion_matrix = ConfusionMatrix(y_true, y_pred)
# confusion_matrix.print_stats()
print(confusion_matrix)

print('\nOverall F-Score\n')
f1score = f1_score(y_true, y_pred, average='micro')
print(f1score)
print('\n')

# Tags in the order their scores are reported.
POS_TAGS = ['ADJ', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'OTH', 'PART', 'PRON', 'SYM', 'VERB']

# Passing labels= pins every score to its tag. Without it the array follows
# the sorted set of tags that happen to occur in the data, so a tag missing
# from a test run would shift the positions and mislabel the scores.
# NOTE(review): assumes y_true / y_pred hold these tag strings - confirm.
df = f1_score(y_true, y_pred, labels=POS_TAGS, average=None)
print('Individual F-Score' + '\n')
for tag, score in zip(POS_TAGS, df):
    # Pad the tag to five characters so the colons line up ('ADJ  : ', 'CONJ : ').
    print(tag.ljust(5) + ': ' + str(score))

No handlers could be found for logger "polyglot.detect.base"


Number of tokens: 1220

Disambiguated: 100.0% (1220) 

Predicted  ADJ  ADV  CONJ  DET  NOUN  NUM  OTH  PART  PRON  SYM  VERB  __all__
Actual                                                                        
ADJ         34    4     0    0     7    0    3     1     0    0     4       53
ADV          0   82     0    0     5    0    0     0     1    0     4       92
CONJ         0    0    48    0     0    0    0     4     0    0     0       52
DET          0    0    11   70     0    0    2     0     0    0     0       83
NOUN         4    0     1    1   234    0   45     1     0    0    28      314
NUM          0    0     0    0     9   28    1     0     0    0     0       38
OTH          0    0     0    0     0    0    3     0     0    0     0        3
PART         0    1     1    1     0    0    1   285     9    0     5      303
PRON         0    0     0    0     0    0    1     0    51    0     0       52
SYM          0    0     0    0     0    0    0     0     0   87     0       