# Cebuano Stemmer Demo

`Accepts a Cebuano word and returns the root word and its affixes`

In [2]:
from stemmer import stem_word

ceb_word = raw_input('Input a Cebuano word: ')
stems = stem_word(word=ceb_word)

print(stems)

Input a Cebuano word: nagbuangbuang
['buang', 'nag', None, None]


# Cebuano POS Tagger Demo

`Accepts a Cebuano sentence and return POS tagged sentence`

In [2]:
from tagger import tag_sentence

ceb_sentence = raw_input('Input a Cebuano sentence: ')
tagged_sentence = tag_sentence(text=ceb_sentence)

print(tagged_sentence)
# sentence = ''
# for word in words:
#     sentence += str(word) + ' '
    
# print(sentence)

Input a Cebuano sentence: But-an ang amigo nga manglibre.
[('But-an', 'ADJ'), ('ang', 'DET'), ('amigo', 'NOUN'), ('nga', 'PART'), ('manglibre', 'VERB'), ('.', 'SYM')]


# Stemmer Evaluation


`Evaluates the stemmer by getting the number of predicted root words vs. actual root words`

* The input words are already tokenized and stemmed

In [1]:
import stemmer_evaluator as se
import pandas as pd

result = se.to_panda_data()

df = pd.DataFrame(result['data'], index=result['index'])
df_valid = df[(df.is_valid == True)]
valid = df_valid.count()['is_valid']
print('Total tokens: ' + str(df.count()['is_valid']))
print('Correct root words: ' + str(df[(df.is_valid == True)].count()['is_valid']))
print('Incorrect root words: ' + str(df[(df.is_valid == False)].count()['is_valid']))
print("Correct / incorrect root percentage: " + str((valid / float(df.count()['is_valid'])) * 100))
print('\n')
print('Dictionary entry tokens: ' + str(df[(df.is_entry == True)].count()['is_entry']))
print('Correct dictionary entry tokens: ' + str(df[(df.is_entry == True) & (df.is_valid == True)].count()['is_entry']))
print('Incorrect dictionary entry tokens: ' + str(df[(df.is_entry == True) & (df.is_valid == False)].count()['is_entry']))
print('\n')
print('Non-dictionary entry tokens: ' + str(df[(df.is_entry == False)].count()['is_entry']))
print('Correct non-dictionary entry tokens: ' + str(df[(df.is_entry == False) & (df.is_valid == True)].count()['is_entry']))
print('Incorrect non-dictionary entry tokens: ' + str(df[(df.is_entry == False) & (df.is_valid == False)].count()['is_entry']))
print('\n')

df.tail(n=10)


Total tokens: 1298
Correct root words: 955
Incorrect root words: 343
Correct / incorrect root percentage: 73.57473035439138


Dictionary entry tokens: 974
Correct dictionary entry tokens: 878
Incorrect dictionary entry tokens: 96


Non-dictionary entry tokens: 324
Correct non-dictionary entry tokens: 77
Incorrect non-dictionary entry tokens: 247




Unnamed: 0,infix,is_entry,is_root,is_valid,prefix,root,suffix
nipahibawo,,False,False,False,ni,pahibaw,o
naghuwat,,True,False,True,nag,huwat,
unsaon,,True,False,True,,unsa,on
mamahimong,,True,False,True,ma,himo,ng
syudad,,True,True,True,,syudad,
sirhan,,True,False,False,,sir,han
kalapasan,,True,False,True,ka,lapas,an
bawian,,False,False,False,ba,wi,an
nakalapas,,True,False,True,naka,lapas,
nagkadaiyang,,True,False,False,nag,kadaiya,ng


# Tagger Evaluation


`Evaluates the tagger by getting the number of predicted pos tags vs. actual pos tags (F-score)`

In [1]:
import tagger_evaluator as te
from pandas_ml import ConfusionMatrix
from sklearn.metrics import f1_score, confusion_matrix
import pandas as pd

words = te.tag_test_sentences()
y_true = te.extract_actual_pos_tags()
y_pred = te.extract_predicted_pos_tags(words=words)
confusion_matrix = ConfusionMatrix(y_true, y_pred)
# confusion_matrix.print_stats()
# print(confusion_matrix)
# print(confusion_matrix)

# print('\nOverall F-Score\n')
# f1score = f1_score(y_true, y_pred, average='micro')
# print(f1score)
# print('\n')

# df = f1_score(y_true, y_pred, average=None)
# print('Individual F-Score' + '\n')
# print('ADJ  : ' + str(df[0]))
# print('ADV  : ' + str(df[1]))
# print('CONJ : ' + str(df[2]))
# print('DET  : ' + str(df[3]))
# print('NOUN : ' + str(df[4]))
# print('NUM  : ' + str(df[5]))
# print('OTH  : ' + str(df[6]))
# print('PART : ' + str(df[7]))
# print('PRON : ' + str(df[8]))
# print('SYM  : ' + str(df[9]))
# print('VERB : ' + str(df[10]))

# f_score = 0
# for i in range(0, 11):
#     if i != 6:
#         f_score += df[i]

# print(f_score / 10.0)

No handlers could be found for logger "polyglot.detect.base"
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  df = df.loc[idx, idx.copy()].fillna(0)  # if some columns or rows are missing


In [2]:
# For Matrix
# 0 ADJ
# 1 ADV
# 2 CONJ
# 3 DET
# 4 NOUN
# 5 NUM
# 6 OTH
# 7 PART
# 8 PRON
# 9 SYM
# 10 VERB

# For Values
# TP FP FN TN

# Performance
# Accuracy Precision Recall F1Score

tag_columns = ['ADJ', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'OTH', 'PART', 'PRON', 'SYM', 'VERB']
matrix = te.confusion_matrix(actual=y_true, pred=y_pred)
cm_df = pd.DataFrame(matrix,columns=tag_columns, index=tag_columns)
cm_df

Unnamed: 0,ADJ,ADV,CONJ,DET,NOUN,NUM,OTH,PART,PRON,SYM,VERB
ADJ,4,1,0,0,0,0,1,0,0,0,0
ADV,0,7,2,0,2,1,2,1,0,0,0
CONJ,0,0,7,0,0,0,0,0,0,0,0
DET,0,0,9,32,0,0,0,1,0,0,0
NOUN,2,0,0,0,52,0,6,0,2,0,5
NUM,0,0,0,0,0,3,0,0,0,0,0
OTH,0,0,0,0,0,0,0,0,0,0,0
PART,0,5,2,0,0,0,0,25,3,0,0
PRON,0,0,0,0,1,0,0,0,37,0,0
SYM,0,0,0,0,0,0,0,0,0,45,0


In [3]:
values_col = ['TP', 'FP', 'FN', 'TN']
values = te.cm_values(matrix=matrix)
values_df = pd.DataFrame(values,columns=values_col, index=tag_columns)
values_df

Unnamed: 0,TP,FP,FN,TN
ADJ,4,3,2,291
ADV,7,7,8,278
CONJ,7,13,0,280
DET,32,0,10,258
NOUN,52,9,15,224
NUM,3,1,0,296
OTH,0,10,0,290
PART,25,2,10,263
PRON,37,5,1,257
SYM,45,0,0,255


In [5]:
perf_col = ['Accuracy', 'Precision', 'Recall', 'F-Score']
perf = te.performance(values=values)
perf_df = pd.DataFrame(perf,columns=perf_col, index=tag_columns, dtype=float)
perf_df

Unnamed: 0,Accuracy,Precision,Recall,F-Score
ADJ,4.0,3.0,2.0,291.0
ADV,7.0,7.0,8.0,278.0
CONJ,7.0,13.0,0.0,280.0
DET,32.0,0.0,10.0,258.0
NOUN,52.0,9.0,15.0,224.0
NUM,3.0,1.0,0.0,296.0
OTH,0.0,10.0,0.0,290.0
PART,25.0,2.0,10.0,263.0
PRON,37.0,5.0,1.0,257.0
SYM,45.0,0.0,0.0,255.0
