# Cebuano Stemmer Demo

`Accepts a Cebuano word and returns the root word and its affixes`

In [None]:
from stemmer import stem_word

# `raw_input` only exists on Python 2. Fall back to `input` on Python 3 so the
# prompt reads the word as plain text on either interpreter instead of raising
# NameError.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Prompt for one Cebuano word and run it through the stemmer.
ceb_word = read_line('Input a Cebuano word: ')
stems = stem_word(word=ceb_word)

print(stems)

# Cebuano POS Tagger Demo

`Accepts a Cebuano sentence and returns the POS-tagged sentence`

In [1]:
from tagger import tag_sentence

# `raw_input` only exists on Python 2. Fall back to `input` on Python 3 so the
# prompt reads the sentence as plain text on either interpreter instead of
# raising NameError.
try:
    read_line = raw_input
except NameError:
    read_line = input

# Prompt for one Cebuano sentence and tag it; the recorded output is a list of
# (token, tag) pairs.
ceb_sentence = read_line('Input a Cebuano sentence: ')
tagged_sentence = tag_sentence(text=ceb_sentence)

print(tagged_sentence)

Input a Cebuano sentence: Gwapa kaayo ka.
[('Gwapa', 'ADJ'), ('kaayo', 'NOUN'), ('ka', 'PRON'), ('.', 'SYM')]


No handlers could be found for logger "polyglot.detect.base"


# Stemmer Evaluation


`Evaluates the stemmer by counting how many predicted root words match the actual root words`

* The input words are already tokenized and stemmed

In [7]:
import stemmer_evaluator as se
import pandas as pd

# Load the stemmer evaluation records; the returned mapping carries the row
# payload under 'data' and the row labels under 'index'.
result = se.to_panda_data()
df = pd.DataFrame(data=result['data'], index=result['index'])

# Preview only the last ten rows instead of dumping the whole frame.
df.tail(10)

Unnamed: 0,infix,is_entry,is_root,is_valid,prefix,root,suffix
nipahibawo,,False,False,False,ni,pahibaw,o
naghuwat,,True,False,True,nag,huwat,
unsaon,,True,False,True,,unsa,on
mamahimong,,True,False,True,ma,himo,ng
syudad,,True,True,True,,syudad,
sirhan,,True,False,False,,sir,han
kalapasan,,True,False,True,ka,lapas,an
bawian,,False,False,False,ba,wi,an
nakalapas,,True,False,True,naka,lapas,
nagkadaiyang,,True,False,False,nag,kadaiya,ng


In [10]:
# Summarise the evaluation frame; the summary exposes its numbers under
# 'values' and the matching row labels under 'index'.
stats = se.statistics(df=df)
df_2 = pd.DataFrame(
    data=stats['values'],
    index=stats['index'],
    columns=['Stemmer Evaluation'],
)
df_2

Unnamed: 0,Stemmer Evaluation
Tokens,1298.0
Correct Root,955.0
Incorrect Root,343.0
Correct Root %,73.57473
Found Tokens,974.0
Correct Root (Found),878.0
Incorrect Root (Found),96.0
Unknown Tokens,324.0
Correct Root (Unknown),77.0
Incorrect Root (Unknown),247.0


# Tagger Evaluation


`Evaluates the tagger by comparing predicted POS tags against actual POS tags (precision, recall and F-score)`

In [1]:
import tagger_evaluator as te
from sklearn.metrics import f1_score, recall_score, precision_score
import pandas as pd

## News sentences (1220 tokens)

In [23]:
# Keyword arguments shared by the two file-based calls below; test_all=False
# together with `specific` presumably limits the run to this one test file.
news_subset = {'test_all': False, 'specific': 'news-sentences.txt'}

# Tag the sentences first, then collect the actual and the predicted tags.
words = te.tag_test_sentences(**news_subset)
y_true_1 = te.extract_actual_pos_tags(**news_subset)
y_pred_1 = te.extract_predicted_pos_tags(words=words)

In [24]:
# Tag inventory used to label both axes of the confusion matrix.
tag_columns = [
    'ADJ', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
    'OTH', 'PART', 'PRON', 'SYM', 'VERB',
]
matrix = te.confusion_matrix(actual=y_true_1, pred=y_pred_1)

# NOTE(review): rows look like the actual tags and columns the predicted ones
# (row sums match TP + FN in the per-tag counts) -- confirm against
# te.confusion_matrix.
cm_df = pd.DataFrame(data=matrix, index=tag_columns, columns=tag_columns)
cm_df

Unnamed: 0,ADJ,ADV,CONJ,DET,NOUN,NUM,OTH,PART,PRON,SYM,VERB
ADJ,32,9,0,0,3,0,3,1,0,0,5
ADV,0,91,0,0,0,0,0,0,0,0,2
CONJ,0,0,51,0,0,0,0,0,0,0,1
DET,0,0,11,70,0,0,2,0,0,0,0
NOUN,13,1,1,0,221,0,47,1,0,0,32
NUM,0,0,0,0,1,35,1,0,0,0,1
OTH,0,0,0,0,0,0,0,0,0,0,0
PART,0,1,2,1,0,0,1,275,15,0,8
PRON,0,1,0,0,0,0,1,0,50,0,0
SYM,0,0,0,0,0,0,0,0,0,87,0


In [25]:
# Per-tag counts derived from the confusion matrix, one row per tag.
values = te.cm_values(matrix=matrix)
values_col = ['TP', 'FP', 'FN', 'TN']
values_df = pd.DataFrame(data=values, index=tag_columns, columns=values_col)
values_df

Unnamed: 0,TP,FP,FN,TN
ADJ,32,17,21,1150
ADV,91,13,2,1114
CONJ,51,14,1,1154
DET,70,1,13,1136
NOUN,221,28,95,876
NUM,35,0,3,1182
OTH,0,79,0,1141
PART,275,2,28,915
PRON,50,17,2,1151
SYM,87,0,0,1133


In [26]:
# With average=None scikit-learn returns one score per label. Without `labels`
# the order is the sorted set of tags that happen to occur in y_true/y_pred,
# which only lines up with tag_columns when every tag is present. Passing
# labels=tag_columns pins the order, so each score stays aligned with the index
# used below even if a tag is missing from this test set.
recalls = recall_score(y_true=y_true_1, y_pred=y_pred_1, labels=tag_columns, average=None)
precisions = precision_score(y_true=y_true_1, y_pred=y_pred_1, labels=tag_columns, average=None)
fscores = f1_score(y_true=y_true_1, y_pred=y_pred_1, labels=tag_columns, average=None)

# One named Series per metric, all indexed by tag.
recall_series = pd.Series(recalls, name='Recall', index=tag_columns, dtype=float)
precision_series = pd.Series(precisions, name='Precision', index=tag_columns, dtype=float)
fscore_series = pd.Series(fscores, name='F1 Score', index=tag_columns, dtype=float)

# Side-by-side table: one row per tag, one column per metric.
pd.concat([recall_series, precision_series, fscore_series], axis=1)

Unnamed: 0,Recall,Precision,F1 Score
ADJ,0.603774,0.653061,0.627451
ADV,0.978495,0.875,0.923858
CONJ,0.980769,0.784615,0.871795
DET,0.843373,0.985915,0.909091
NOUN,0.699367,0.88755,0.782301
NUM,0.921053,1.0,0.958904
OTH,0.0,0.0,0.0
PART,0.907591,0.99278,0.948276
PRON,0.961538,0.746269,0.840336
SYM,1.0,1.0,1.0


In [27]:
# Micro-averaged scores over all news tokens, computed in the order
# recall, precision, F1.
recall, precision, fscore = (
    scorer(y_true=y_true_1, y_pred=y_pred_1, average='micro')
    for scorer in (recall_score, precision_score, f1_score)
)

pd.DataFrame(
    data=[recall, precision, fscore],
    index=['Recall', 'Precision', 'F1 Score'],
    columns=['Overall'],
)

Unnamed: 0,Overall
Recall,0.819672
Precision,0.819672
F1 Score,0.819672


## Blog sentences (1045 tokens)

In [28]:
# Keyword arguments shared by the two file-based calls below; test_all=False
# together with `specific` presumably limits the run to this one test file.
blog_subset = {'test_all': False, 'specific': 'blog-sentences.txt'}

# Tag the sentences first, then collect the actual and the predicted tags.
words = te.tag_test_sentences(**blog_subset)
y_true_2 = te.extract_actual_pos_tags(**blog_subset)
y_pred_2 = te.extract_predicted_pos_tags(words=words)

In [29]:
# Tag inventory used to label both axes of the confusion matrix.
tag_columns = [
    'ADJ', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
    'OTH', 'PART', 'PRON', 'SYM', 'VERB',
]
matrix = te.confusion_matrix(actual=y_true_2, pred=y_pred_2)

# NOTE(review): rows look like the actual tags and columns the predicted ones
# (row sums match TP + FN in the per-tag counts) -- confirm against
# te.confusion_matrix.
cm_df = pd.DataFrame(data=matrix, index=tag_columns, columns=tag_columns)
cm_df

Unnamed: 0,ADJ,ADV,CONJ,DET,NOUN,NUM,OTH,PART,PRON,SYM,VERB
ADJ,55,9,0,0,7,0,6,0,0,0,3
ADV,4,68,0,1,1,0,7,1,0,0,4
CONJ,0,0,54,0,0,0,0,0,0,0,0
DET,0,0,17,58,0,0,0,0,0,0,0
NOUN,14,8,0,0,109,0,43,0,0,0,36
NUM,0,0,0,0,0,16,0,0,0,0,0
OTH,0,0,0,0,0,0,0,0,0,0,0
PART,1,1,0,0,0,0,2,180,14,0,3
PRON,1,0,1,1,1,0,0,0,90,0,2
SYM,0,0,0,0,0,0,0,0,0,97,0


In [30]:
# Per-tag counts derived from the confusion matrix, one row per tag.
values = te.cm_values(matrix=matrix)
values_col = ['TP', 'FP', 'FN', 'TN']
values_df = pd.DataFrame(data=values, index=tag_columns, columns=values_col)
values_df

Unnamed: 0,TP,FP,FN,TN
ADJ,55,31,25,934
ADV,68,20,18,939
CONJ,54,18,0,973
DET,58,2,17,968
NOUN,109,27,101,808
NUM,16,0,0,1029
OTH,0,70,0,975
PART,180,2,21,842
PRON,90,15,6,934
SYM,97,0,0,948


In [31]:
# With average=None scikit-learn returns one score per label. Without `labels`
# the order is the sorted set of tags that happen to occur in y_true/y_pred,
# which only lines up with tag_columns when every tag is present. Passing
# labels=tag_columns pins the order, so each score stays aligned with the index
# used below even if a tag is missing from this test set.
recalls = recall_score(y_true=y_true_2, y_pred=y_pred_2, labels=tag_columns, average=None)
precisions = precision_score(y_true=y_true_2, y_pred=y_pred_2, labels=tag_columns, average=None)
fscores = f1_score(y_true=y_true_2, y_pred=y_pred_2, labels=tag_columns, average=None)

# One named Series per metric, all indexed by tag.
recall_series = pd.Series(recalls, name='Recall', index=tag_columns, dtype=float)
precision_series = pd.Series(precisions, name='Precision', index=tag_columns, dtype=float)
fscore_series = pd.Series(fscores, name='F1 Score', index=tag_columns, dtype=float)

# Side-by-side table: one row per tag, one column per metric.
pd.concat([recall_series, precision_series, fscore_series], axis=1)

Unnamed: 0,Recall,Precision,F1 Score
ADJ,0.6875,0.639535,0.662651
ADV,0.790698,0.772727,0.781609
CONJ,1.0,0.75,0.857143
DET,0.773333,0.966667,0.859259
NOUN,0.519048,0.801471,0.630058
NUM,1.0,1.0,1.0
OTH,0.0,0.0,0.0
PART,0.895522,0.989011,0.939948
PRON,0.9375,0.857143,0.895522
SYM,1.0,1.0,1.0


In [32]:
# Micro-averaged scores over all blog tokens, computed in the order
# recall, precision, F1.
recall, precision, fscore = (
    scorer(y_true=y_true_2, y_pred=y_pred_2, average='micro')
    for scorer in (recall_score, precision_score, f1_score)
)

pd.DataFrame(
    data=[recall, precision, fscore],
    index=['Recall', 'Precision', 'F1 Score'],
    columns=['Overall'],
)

Unnamed: 0,Overall
Recall,0.777033
Precision,0.777033
F1 Score,0.777033


## Example sentences (300 tokens)

In [33]:
# Keyword arguments shared by the two file-based calls below; test_all=False
# together with `specific` presumably limits the run to this one test file.
example_subset = {'test_all': False, 'specific': 'example-sentences.txt'}

# Tag the sentences first, then collect the actual and the predicted tags.
words = te.tag_test_sentences(**example_subset)
y_true_3 = te.extract_actual_pos_tags(**example_subset)
y_pred_3 = te.extract_predicted_pos_tags(words=words)

In [34]:
# Tag inventory used to label both axes of the confusion matrix.
tag_columns = [
    'ADJ', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
    'OTH', 'PART', 'PRON', 'SYM', 'VERB',
]
matrix = te.confusion_matrix(actual=y_true_3, pred=y_pred_3)

# NOTE(review): rows look like the actual tags and columns the predicted ones
# (row sums match TP + FN in the per-tag counts) -- confirm against
# te.confusion_matrix.
cm_df = pd.DataFrame(data=matrix, index=tag_columns, columns=tag_columns)
cm_df

Unnamed: 0,ADJ,ADV,CONJ,DET,NOUN,NUM,OTH,PART,PRON,SYM,VERB
ADJ,4,1,0,0,0,0,1,0,0,0,0
ADV,0,7,2,0,2,1,2,1,0,0,0
CONJ,0,0,7,0,0,0,0,0,0,0,0
DET,0,0,9,32,0,0,0,1,0,0,0
NOUN,2,0,0,0,52,0,6,0,2,0,5
NUM,0,0,0,0,0,3,0,0,0,0,0
OTH,0,0,0,0,0,0,0,0,0,0,0
PART,0,5,2,0,0,0,0,25,3,0,0
PRON,0,0,0,0,1,0,0,0,37,0,0
SYM,0,0,0,0,0,0,0,0,0,45,0


In [35]:
# Per-tag counts derived from the confusion matrix, one row per tag.
values = te.cm_values(matrix=matrix)
values_col = ['TP', 'FP', 'FN', 'TN']
values_df = pd.DataFrame(data=values, index=tag_columns, columns=values_col)
values_df

Unnamed: 0,TP,FP,FN,TN
ADJ,4,3,2,291
ADV,7,7,8,278
CONJ,7,13,0,280
DET,32,0,10,258
NOUN,52,9,15,224
NUM,3,1,0,296
OTH,0,10,0,290
PART,25,2,10,263
PRON,37,5,1,257
SYM,45,0,0,255


In [36]:
# With average=None scikit-learn returns one score per label. Without `labels`
# the order is the sorted set of tags that happen to occur in y_true/y_pred,
# which only lines up with tag_columns when every tag is present. Passing
# labels=tag_columns pins the order, so each score stays aligned with the index
# used below even if a tag is missing from this small test set.
recalls = recall_score(y_true=y_true_3, y_pred=y_pred_3, labels=tag_columns, average=None)
precisions = precision_score(y_true=y_true_3, y_pred=y_pred_3, labels=tag_columns, average=None)
fscores = f1_score(y_true=y_true_3, y_pred=y_pred_3, labels=tag_columns, average=None)

# One named Series per metric, all indexed by tag.
recall_series = pd.Series(recalls, name='Recall', index=tag_columns, dtype=float)
precision_series = pd.Series(precisions, name='Precision', index=tag_columns, dtype=float)
fscore_series = pd.Series(fscores, name='F1 Score', index=tag_columns, dtype=float)

# Side-by-side table: one row per tag, one column per metric.
pd.concat([recall_series, precision_series, fscore_series], axis=1)

Unnamed: 0,Recall,Precision,F1 Score
ADJ,0.666667,0.571429,0.615385
ADV,0.466667,0.5,0.482759
CONJ,1.0,0.35,0.518519
DET,0.761905,1.0,0.864865
NOUN,0.776119,0.852459,0.8125
NUM,1.0,0.75,0.857143
OTH,0.0,0.0,0.0
PART,0.714286,0.925926,0.806452
PRON,0.973684,0.880952,0.925
SYM,1.0,1.0,1.0


In [37]:
# Micro-averaged scores over all example-sentence tokens, computed in the
# order recall, precision, F1.
recall, precision, fscore = (
    scorer(y_true=y_true_3, y_pred=y_pred_3, average='micro')
    for scorer in (recall_score, precision_score, f1_score)
)

pd.DataFrame(
    data=[recall, precision, fscore],
    index=['Recall', 'Precision', 'F1 Score'],
    columns=['Overall'],
)

Unnamed: 0,Overall
Recall,0.816667
Precision,0.816667
F1 Score,0.816667


## All Sentences (2565 tokens)

* News sentences
* Blog sentences
* Example sentences

In [38]:
# Run the tagger with the evaluator's default arguments; per the section
# heading this covers the news, blog and example sentences together
# (presumably the defaults mean "all test files" -- confirm in tagger_evaluator).
words = te.tag_test_sentences()
# Actual tags for the same default selection of test sentences.
y_true = te.extract_actual_pos_tags()
# Predicted tags pulled out of the tagger output produced above.
y_pred = te.extract_predicted_pos_tags(words=words)

In [39]:
# Tag inventory used to label both axes of the confusion matrix.
tag_columns = [
    'ADJ', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
    'OTH', 'PART', 'PRON', 'SYM', 'VERB',
]
matrix = te.confusion_matrix(actual=y_true, pred=y_pred)

# NOTE(review): rows look like the actual tags and columns the predicted ones
# (row sums match TP + FN in the per-tag counts) -- confirm against
# te.confusion_matrix.
cm_df = pd.DataFrame(data=matrix, index=tag_columns, columns=tag_columns)
cm_df

Unnamed: 0,ADJ,ADV,CONJ,DET,NOUN,NUM,OTH,PART,PRON,SYM,VERB
ADJ,91,19,0,0,10,0,10,1,0,0,8
ADV,4,166,2,1,3,1,9,2,0,0,6
CONJ,0,0,112,0,0,0,0,0,0,0,1
DET,0,0,37,160,0,0,2,1,0,0,0
NOUN,29,9,1,0,382,0,96,1,2,0,73
NUM,0,0,0,0,1,54,1,0,0,0,1
OTH,0,0,0,0,0,0,0,0,0,0,0
PART,1,7,4,1,0,0,3,480,32,0,11
PRON,1,1,1,1,2,0,1,0,177,0,2
SYM,0,0,0,0,0,0,0,0,0,229,0


In [40]:
# Per-tag counts derived from the confusion matrix, one row per tag.
values = te.cm_values(matrix=matrix)
values_col = ['TP', 'FP', 'FN', 'TN']
values_df = pd.DataFrame(data=values, index=tag_columns, columns=values_col)
values_df

Unnamed: 0,TP,FP,FN,TN
ADJ,91,51,48,2375
ADV,166,40,28,2331
CONJ,112,45,1,2407
DET,160,3,40,2362
NOUN,382,64,211,1908
NUM,54,1,3,2507
OTH,0,159,0,2406
PART,480,6,59,2020
PRON,177,37,9,2342
SYM,229,0,0,2336


In [41]:
# With average=None scikit-learn returns one score per label. Without `labels`
# the order is the sorted set of tags that happen to occur in y_true/y_pred,
# which only lines up with tag_columns when every tag is present. Passing
# labels=tag_columns pins the order, so each score stays aligned with the index
# used below regardless of which tags occur in the data.
recalls = recall_score(y_true=y_true, y_pred=y_pred, labels=tag_columns, average=None)
precisions = precision_score(y_true=y_true, y_pred=y_pred, labels=tag_columns, average=None)
fscores = f1_score(y_true=y_true, y_pred=y_pred, labels=tag_columns, average=None)

# One named Series per metric, all indexed by tag.
recall_series = pd.Series(recalls, name='Recall', index=tag_columns, dtype=float)
precision_series = pd.Series(precisions, name='Precision', index=tag_columns, dtype=float)
fscore_series = pd.Series(fscores, name='F1 Score', index=tag_columns, dtype=float)

# Side-by-side table: one row per tag, one column per metric.
pd.concat([recall_series, precision_series, fscore_series], axis=1)

Unnamed: 0,Recall,Precision,F1 Score
ADJ,0.654676,0.640845,0.647687
ADV,0.85567,0.805825,0.83
CONJ,0.99115,0.713376,0.82963
DET,0.8,0.981595,0.881543
NOUN,0.644182,0.856502,0.735322
NUM,0.947368,0.981818,0.964286
OTH,0.0,0.0,0.0
PART,0.890538,0.987654,0.936585
PRON,0.951613,0.827103,0.885
SYM,1.0,1.0,1.0


In [42]:
# Micro-averaged scores over every test token, computed in the order
# recall, precision, F1.
recall, precision, fscore = (
    scorer(y_true=y_true, y_pred=y_pred, average='micro')
    for scorer in (recall_score, precision_score, f1_score)
)

pd.DataFrame(
    data=[recall, precision, fscore],
    index=['Recall', 'Precision', 'F1 Score'],
    columns=['Overall'],
)

Unnamed: 0,Overall
Recall,0.801949
Precision,0.801949
F1 Score,0.801949
