In [10]:
import pandas as pd
from sklearn.metrics import classification_report
import re

import pathlib

In [18]:
class RulesAspectClassifier:
    """Rule-based classifier that assigns an aspect label to a single word form.

    The rules are applied in order to the lower-cased word:

    1. the word starts with one of the prefixes in ``perf_suf_tmpl`` -> ``"perf"``
    2. the word ends with one of the endings in ``imp_suf_tmpl``    -> ``"imp"``
    3. the word ends with ``"ti"``                                  -> ``"imp"``
    4. otherwise the configured fallback label ``biase`` is returned.
    """

    # NOTE: despite the "suf" in its name, this pattern is anchored with "^",
    # so it matches at the beginning of the word (prefixes).
    perf_suf_tmpl = re.compile("^(po|pro|na|iz|za|s|u|do|raz|od)")
    # Anchored with "$": matches one of the listed endings at the end of the word.
    imp_suf_tmpl  = re.compile("(iva|ava|ova|uva)$")

    def __init__(self, biase:str="imp"):
        """Store the fallback label.

        Args:
            biase: label returned by ``predict_`` when no rule fires. It is
                returned as-is, so any string (e.g. "imp", "perf", "both")
                is accepted.
        """
        self.biase = biase

    def predict_(self, x: str) -> str:
        """Classify a single word.

        Args:
            x: the word form to classify; matching is case-insensitive
                because the word is lower-cased first.

        Returns:
            ``"perf"``, ``"imp"``, or the fallback label ``self.biase``.
        """
        word = x.lower()

        # The prefix rule is checked first, so it wins over the ending rules.
        if self.perf_suf_tmpl.match(word):
            return "perf"

        # Both ending rules lead to the same label.
        if self.imp_suf_tmpl.search(word) or word.endswith("ti"):
            return "imp"

        # Nothing matched: fall back to the configured bias.
        return self.biase

    def predict(self, X:list[str]) -> list[str]:
        """Classify every word in ``X``.

        Args:
            X: the words to classify; it is only iterated over, so any
                iterable of strings works.

        Returns:
            A list with one label per input word, in input order.
        """
        return [self.predict_(word) for word in X]

In [12]:
# Location of the dataset, resolved relative to the current working directory.
data_path = pathlib.Path.cwd().joinpath("data", "forms2sents.csv")

In [14]:
# The file is read as tab-separated (despite its .csv extension), and its
# first column is used as the DataFrame index.
df = pd.read_csv(
    data_path,
    sep="\t",
    index_col=0,
)
# Preview the first two rows.
df.head(2)

Unnamed: 0,lemma,word,aspect,feats,nsubj,obj,iobj,obl,rel,text
3013,bacati,bacali,imp,Gender=Masc|Number=Plur|Tense=Past|VerbForm=Pa...,"šlepovi,NOUN,Case=Nom|Gender=Masc|Number=Plur","ga,PRON,Case=Acc|Gender=Masc|Number=Sing|Perso...",,"obale,NOUN,Case=Gen|Gender=Fem|Number=Sing",root,"""Umesto da bacaju mulj 25-30 km prema pučini, ..."
4147,bacati,bacaju,imp,Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbF...,"razgovori,NOUN,Case=Nom|Gender=Masc|Number=Plur","senku,NOUN,Case=Acc|Gender=Fem|Number=Sing",,"ekonomiju,NOUN,Case=Acc|Gender=Fem|Number=Sing",root,Blokirani razgovori sa MMF bacaju senku na nek...


In [15]:
# Inputs are the "word" column; reference labels are the "aspect" column.
X, y = df["word"], df["aspect"]

In [19]:
# One classifier per fallback label; they differ only in what is returned
# when none of the rules fires.
rules_model_imp_biase, rules_model_perf_biase, rules_model_both_biase = (
    RulesAspectClassifier(biase=fallback_label)
    for fallback_label in ("imp", "perf", "both")
)

In [20]:
# Run every classifier over the same inputs.
y_imp_pred, y_perf_pred, y_both_pred = (
    rules_model.predict(X)
    for rules_model in (
        rules_model_imp_biase,
        rules_model_perf_biase,
        rules_model_both_biase,
    )
)

In [25]:
# Per-class metrics for the classifier that falls back to "imp".
# zero_division=0.0 reports 0 instead of warning when a metric is undefined.
report_imp = classification_report(y_true=y, y_pred=y_imp_pred, zero_division=0.0)
print(report_imp)

              precision    recall  f1-score   support

        both       0.00      0.00      0.00       212
         imp       0.60      0.66      0.63      2348
        perf       0.66      0.65      0.66      2448

    accuracy                           0.63      5008
   macro avg       0.42      0.44      0.43      5008
weighted avg       0.60      0.63      0.62      5008



In [28]:
# Per-class metrics for the classifier that falls back to "perf".
# zero_division=0.0 reports 0 instead of warning when a metric is undefined.
report_perf = classification_report(y_true=y, y_pred=y_perf_pred, zero_division=0.0)
print(report_perf)

              precision    recall  f1-score   support

        both       0.00      0.00      0.00       212
         imp       0.56      0.05      0.08      2348
        perf       0.49      0.97      0.65      2448

    accuracy                           0.49      5008
   macro avg       0.35      0.34      0.25      5008
weighted avg       0.50      0.49      0.36      5008



In [29]:
# Per-class metrics for the classifier that falls back to "both".
# zero_division=0.0 reports 0 instead of warning when a metric is undefined.
report_both = classification_report(y_true=y, y_pred=y_both_pred, zero_division=0.0)
print(report_both)

              precision    recall  f1-score   support

        both       0.08      0.89      0.14       212
         imp       0.56      0.05      0.08      2348
        perf       0.66      0.65      0.66      2448

    accuracy                           0.38      5008
   macro avg       0.43      0.53      0.30      5008
weighted avg       0.59      0.38      0.37      5008

