In [1]:
import sklearn_crfsuite # Important

In [2]:
import sys

# Add the parent directory to the module search path so that the crf_glossing
# imports in the next cell can be resolved.
# NOTE: '../' is a relative entry, so it is interpreted against the current
# working directory at import time.
sys.path.insert(1, '../') # To import the modules below

In [3]:
import crf_glossing.features as cgfeat
import crf_glossing.majority_label as ml
import crf_glossing.process_file as cgpf
import crf_glossing.utils as utils

In [4]:
# Move to workspace - edit the path after `cd` as needed
# NOTE(review): `!cd` is executed in a temporary subshell, so it does NOT change
# the notebook's working directory; the relative paths used in the cells below
# still resolve against the directory the kernel was started in. IPython's `%cd`
# magic is the form that persists, but switching to it would also change what
# those relative paths point to.
!cd .. #PATH_TO_WORKSPACE

### 0. Loading the training and test datasets

In [5]:
# Load datasets: training and test files in the SIGMORPHON Shared Task format
## Change file paths here to read the desired data:
train_file = './git-train-track2-uncovered'
test_file = './git-test-track2-uncovered'


def _read_corpus_text(path):
    """Read a corpus file and drop its final newline character, if any.

    The file is opened as UTF-8 (the same encoding used when the predictions
    are written) inside a ``with`` block, so the handle is always closed.

    Args:
        path: Path of the text file to read.

    Returns:
        The file content as a string. If the content ends with a newline,
        'Empty line' is printed and exactly one trailing newline is removed.
        An empty file is returned unchanged.
    """
    with open(path, 'r', encoding='utf-8') as corpus_file:
        text = corpus_file.read()

    # endswith() is safe on an empty string, whereas text[-1] would raise
    # IndexError for an empty file.
    if text.endswith('\n'):
        print('Empty line')
        text = text[:-1]
    return text


train = _read_corpus_text(train_file)
test = _read_corpus_text(test_file)

Empty line
Empty line


In [6]:
# Wrap the raw text of each split in an IGT_Corpus object; the `test` flag tells
# the corpus whether it is the training (False) or the test (True) dataset.
# The summary lines shown in this cell's output are produced by these two calls.
mukri_train_corpus = cgpf.IGT_Corpus(train, test=False)
mukri_test_corpus = cgpf.IGT_Corpus(test, test=True)

There are 31 sentences.
This corpus is a training dataset.
There are 37 sentences.
This corpus is a test dataset.


In [7]:
# Convert the training corpus into the CRF input format.
# NOTE(review): the exact effect of stem=True is defined in
# crf_glossing.process_file - confirm there before changing it.
train_sents = mukri_train_corpus.convert_to_crf_format(stem=True) 

In [8]:
# Convert the test corpus into the CRF input format, with the same stem=True
# setting that is used for the training corpus.
test_sents = mukri_test_corpus.convert_to_crf_format(stem=True)

In [9]:
# Build the CRF inputs (feature/label helpers adapted from CRFsuite).
# Every entry of train_sents yields one feature sequence and one label sequence;
# for test_sents only the features are extracted here.

X_train = list(map(cgfeat.sent2features, train_sents))
y_train = list(map(cgfeat.sent2labels, train_sents))

X_test = list(map(cgfeat.sent2features, test_sents))


### 1. Training the CRF

In [10]:
# Fit the CRF on the training features and labels (settings from CRFsuite).
# The hyperparameters are fixed values, not tuned in this notebook:
#   - algorithm: L-BFGS optimisation
#   - c1 / c2: L1 / L2 regularisation coefficients (per the sklearn-crfsuite API)
#   - max_iterations: upper bound on optimiser iterations
crf_settings = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
}
crf = sklearn_crfsuite.CRF(**crf_settings)

crf.fit(X_train, y_train)
    

In [11]:
# Predict the labels for the test set with the fitted CRF.
# X_test holds the features extracted from test_sents, so y_pred follows the
# order of the test sentences.
y_pred = crf.predict(X_test)


### 2. Predicting the lexical glosses

In [12]:
# Predicting the lexical glosses
# A majority-label dictionary is built from the training corpus and then applied
# to the CRF predictions for the test corpus.
majority_dictionary = ml.create_majority_dict(mukri_train_corpus)

# NOTE(review): y_pred is overwritten with its own post-processed version, so
# this cell is not idempotent - re-running it alone feeds the already
# post-processed labels back in. Re-run the CRF prediction cell first.
y_pred = ml.apply_majority_label(y_pred, majority_dictionary, mukri_test_corpus)

In [13]:
# Convert the final predictions into gloss lines; each item of gloss_sent_list
# is written after a '\g ' marker when the predictions are saved.
gloss_sent_list = cgpf.convert_to_igt_format(y_pred)

In [14]:
# Write the predictions (only) to a text file, one record per predicted gloss line.
# Only the \g line carries content; the \t, \m and \l lines are written as bare
# markers, and a blank line closes each record.
## Change file path here
output_path = 'prediction.txt'

with open(output_path, 'w', encoding='utf-8') as out:
    for gloss_line in gloss_sent_list:
        out.writelines([
            '\\t' + '\n',
            '\\m' + '\n',
            '\\g ' + gloss_line + '\n',
            '\\l' + '\n',
            '\n',
        ])

### 3. Evaluating the predictions

In [15]:
# Evaluate the prediction quality (code from the SIGMORPHON Shared Task, modified to ignore the BLEU score)
## Change paths of output and gold files

!python3 ../crf_glossing/simple_eval.py --pred ./prediction.txt \
--gold ./git-test-track2-uncovered


  return re.split("\s|-", self.glosses)
{
    "classes": {
        "gram": {
            "f1": 0.5866666666666667,
            "prec": 0.49201277955271566,
            "rec": 0.7264150943396226
        },
        "stem": {
            "f1": 0.30120481927710846,
            "prec": 0.7692307692307693,
            "rec": 0.18726591760299627
        }
    },
    "morpheme_level": {
        "accuracy": 0.5180897250361794,
        "average_accuracy": 0.5321524093443845
    },
    "word_level": {
        "accuracy": 0.2916666666666667,
        "average_accuracy": 0.3124685727626905
    }
}


{
    "classes": {
        "gram": {
            "f1": 0.5866666666666667,
            "prec": 0.49201277955271566,
            "rec": 0.7264150943396226
        },
        "stem": {
            "f1": 0.30120481927710846,
            "prec": 0.7692307692307693,
            "rec": 0.18726591760299627
        }
    },
    "morpheme_level": {
        "accuracy": 0.5180897250361794,
        "average_accuracy": 0.5321524093443845
    },
    "word_level": {
        "accuracy": 0.2916666666666667,
        "average_accuracy": 0.3124685727626905
    }
}