In [1]:
!pip install --no-cache-dir transformers sentencepiece &> /dev/null 
!pip install datasets &> /dev/null 
!pip install evaluate &> /dev/null 

In [2]:
#from huggingface_hub import notebook_login
#notebook_login()

In [3]:
import torch
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cpu



In [4]:
data_folder = '/content/drive/MyDrive/UNLP/unlp-2023-shared-task-main/data/'
gec_fluency_valid_src = data_folder + 'gec-fluency/valid.src.txt'
gec_fluency_valid_tgt = data_folder + 'gec-fluency/valid.tgt.txt'
gec_only_valid_src = data_folder + 'gec-only/valid.src.txt'
gec_only_valid_tgt = data_folder + 'gec-only/valid.tgt.txt'

In [5]:
import pandas as pd
from datasets import Dataset

# reading the output data
with open(gec_fluency_valid_src, 'r') as f:
    valid_src = [line[:-1] for line in f.readlines()]

with open(gec_fluency_valid_tgt, 'r') as f:
    valid_tgt = [line[:-1] for line in f.readlines()]

# converting sentences to pandas dataframe
valid_sentences = [[valid_src[i], valid_tgt[i]] for i in range(len(valid_src))]

valid_df = pd.DataFrame(valid_sentences)
valid_df.columns = ['source', 'target']

valid_dataset = Dataset.from_pandas(valid_df)

# Translator

In [6]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "StopFuture/future_13_2ep"
translator = pipeline("translation", src_lang="uk_UA", tgt_lang="uk_UA", model=model_checkpoint, device=0)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
output_sent = translator(valid_dataset['source'], batch_size=64)

In [None]:
import os
model_name = model_checkpoint.split('/')[1]

output_folder = f'/content/drive/MyDrive/UNLP/results/{model_name}'

# creating the output folder
if not os.path.exists(output_folder):
  os.mkdir(output_folder)

output_path = output_folder + '/output.txt'

output_sentences = [sent['translation_text'] + '\n' for sent in output_sent]
with open(output_path, 'w') as f:
  f.writelines(output_sentences)

# Analyzing results

In [7]:
import os
model_checkpoint = "StopFuture/future_13_2ep"
model_name = model_checkpoint.split('/')[1]
output_folder = f'/content/drive/MyDrive/UNLP/results/{model_name}'

# creating the output folder
if not os.path.exists(output_folder):
  os.mkdir(output_folder)

output_path = output_folder + '/output.txt'

In [8]:
corrected = output_path
m2 = '/content/drive/MyDrive/UNLP/unlp-2023-shared-task-main/data/gec-fluency/valid.m2'

In [9]:
!pip install ua-gec==2.0.0 &> /dev/null
!pip install errant==2.3.3 &> /dev/null
!pip install spacy>=2.2.0,<3 &> /dev/null
!pip install stanza==1.4.2 &> /dev/null

/bin/bash: 3: No such file or directory


In [10]:
import subprocess
import sys
import tempfile
from pathlib import Path

import spacy
import stanza


def tokenize(text: str) -> [str]:
    if not hasattr(tokenize, "nlp"):
        tokenize.nlp = stanza.Pipeline(lang="uk", processors="tokenize")
    nlp = tokenize.nlp

    tokenized = " ".join([t.text for t in nlp(text).iter_tokens()])
    return tokenized


def tokenize_file(input_file: Path, output_file: Path):
    with open(input_file, encoding="utf-8") as f, open(output_file, "w", encoding="utf-8") as out:
        for line in f:
            line = line.rstrip("\n")
            tokenized = tokenize(line)
            out.write(tokenized + "\n")

tmp = Path(tempfile.gettempdir())
print("Tokenizing submission...", file=sys.stderr)
tokenized_path = tmp / f"unlp.target.tok"
tokenize_file(corrected, tokenized_path)
print(f"Tokenized: {tokenized_path}", file=sys.stderr)


try:
  spacy.load("en")
except OSError:
  print("Downloading spacy resources...", file=sys.stderr)
  subprocess.run(["python", "-m", "spacy", "download", "en"], check=True)

  
# Get the source text out of m2
source_path = tmp / f"unlp.source.tok"
with open(m2, encoding="utf-8") as f, open(source_path, "w", encoding="utf-8") as out:
  for line in f:
    if line.startswith("S "):
      out.write(line[2:])

# Align tokenized submission with the original text with Errant
m2_target = output_folder + "/unlp.target.m2"
subprocess.run(["errant_parallel", "-orig", source_path, "-cor", tokenized_path, "-out", m2_target], check=True)
print(f"Aligned submission: {m2_target}", file=sys.stderr)

Tokenizing submission...
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …



Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.4.1/models/tokenize/iu.pt:   0%|          …

Downloading https://huggingface.co/stanfordnlp/stanza-uk/resolve/v1.4.1/models/mwt/iu.pt:   0%|          | 0.0…

INFO:stanza:Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| mwt       | iu      |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Done loading processors!
Tokenized: /tmp/unlp.target.tok
Downloading spacy resources...
Aligned submission: /content/drive/MyDrive/UNLP/results/future_13_2ep/unlp.target.m2


# Filter

In [11]:
import pandas as pd

In [12]:
with open(m2_target, 'r') as f:
  sentences = f.read().split('\n\n')[:-1]

sentence_split = []
for sentence in sentences:
  sentence_split.append(sentence.split('\n'))

sent_df = []
for sent in sentence_split:

  edits = []
  for edit in sent[1:]:
    edit_lst = edit.split('|||')
    edit_type = edit_lst[1]
    ref = edit_lst[-1]

    edits.append([edit_type, ref])
  sent_dict = {'sent':sent[0], 
               'all':sent, 
               'total-edits':len(sent)-1,
               'all-edits':edits}
  sent_df.append(sent_dict)
sent_df = pd.DataFrame(sent_df)

big_edits = []
for edits in sent_df['all-edits']:
  for edit in edits:
    big_edits.append(edit[0])

all_cats = list(set(big_edits))

for cat in all_cats:
  sent_df[cat] = 0

sent_df['ref0'] = 0
sent_df['ref1'] = 0

for i in range(len(sent_df)):
  edits = sent_df['all-edits'][i]
  for edit in edits:
    sent_df[edit[0]][i] += 1
    sent_df[f'ref{edit[1]}'][i] +=1

drop_idx = list(sent_df[sent_df['total-edits']>4].index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_df[edit[0]][i] += 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_df[f'ref{edit[1]}'][i] +=1


In [13]:
sent_df['punct-edits'] = sent_df['M:PUNCT'] +  sent_df['R:PUNCT'] + sent_df['U:PUNCT']
drop_df = sent_df[sent_df['total-edits']-sent_df['punct-edits']>3]
drop_df

Unnamed: 0,sent,all,total-edits,all-edits,R:WO,M:NOUN,M:PUNCT,R:ORTH,M:OTHER,U:PREP,...,R:PUNCT,U:PUNCT,R:OTHER,R:DET,R:NOUN,R:VERB,U:NOUN,ref0,ref1,punct-edits
43,S Очерет ставав все густіший - що з однієї сто...,[S Очерет ставав все густіший - що з однієї ст...,7,"[[R:SPELL, 0], [R:SPELL, 0], [R:PUNCT, 0], [M:...",0,0,1,0,0,0,...,1,0,2,0,1,0,0,7,0,2
178,S Розглянемо типовий приклад в самому центрі м...,[S Розглянемо типовий приклад в самому центрі ...,5,"[[R:SPELL, 0], [R:OTHER, 0], [R:OTHER, 0], [R:...",0,0,1,0,0,0,...,0,0,3,0,0,0,0,5,0,1
280,S Отжу я поклав бутилку ліків в портфель та пі...,[S Отжу я поклав бутилку ліків в портфель та п...,4,"[[R:NOUN, 0], [R:NOUN, 0], [R:NOUN, 0], [R:SPE...",0,0,0,0,0,0,...,0,0,0,0,3,0,0,4,0,0


In [14]:
drop_idx = list(drop_df.index)

In [15]:
drop_idx

[43, 178, 280]

In [16]:
with open(output_path, 'r') as f:
  output_sentences = [sent[:-1] for sent in f.readlines()]

data_folder = '/content/drive/MyDrive/UNLP/unlp-2023-shared-task-main/data/'
gec_fluency_valid_src = data_folder + 'gec-fluency/valid.src.txt'

with open(gec_fluency_valid_src, 'r') as f:
  valid_src = [line[:-1] for line in f.readlines()]

In [17]:
for id in drop_idx:
  output_sentences[id] = valid_src[id]

In [18]:
output_sentences = [sent + '\n' for sent in output_sentences]
with open(output_path[:-4]+"filter.txt", 'w') as f:
  f.writelines(output_sentences)

#### Analyzing results

In [20]:
import os
model_checkpoint = "StopFuture/future_13_2ep"
model_name = model_checkpoint.split('/')[1]
output_folder = f'/content/drive/MyDrive/UNLP/results/{model_name}'
filter_path = output_folder + '/outputfilter.txt'

In [21]:
corrected = filter_path
m2 = '/content/drive/MyDrive/UNLP/unlp-2023-shared-task-main/data/gec-fluency/valid.m2'

In [22]:
import subprocess
import sys
import tempfile
from pathlib import Path

import spacy
import stanza


def tokenize(text: str) -> [str]:
    if not hasattr(tokenize, "nlp"):
        tokenize.nlp = stanza.Pipeline(lang="uk", processors="tokenize")
    nlp = tokenize.nlp

    tokenized = " ".join([t.text for t in nlp(text).iter_tokens()])
    return tokenized


def tokenize_file(input_file: Path, output_file: Path):
    with open(input_file, encoding="utf-8") as f, open(output_file, "w", encoding="utf-8") as out:
        for line in f:
            line = line.rstrip("\n")
            tokenized = tokenize(line)
            out.write(tokenized + "\n")

tmp = Path(tempfile.gettempdir())
print("Tokenizing submission...", file=sys.stderr)
tokenized_path = tmp / f"unlp.target.tok"
tokenize_file(corrected, tokenized_path)
print(f"Tokenized: {tokenized_path}", file=sys.stderr)


try:
  spacy.load("en")
except OSError:
  print("Downloading spacy resources...", file=sys.stderr)
  subprocess.run(["python", "-m", "spacy", "download", "en"], check=True)

  
# Get the source text out of m2
source_path = tmp / f"unlp.source.tok"
with open(m2, encoding="utf-8") as f, open(source_path, "w", encoding="utf-8") as out:
  for line in f:
    if line.startswith("S "):
      out.write(line[2:])

# Align tokenized submission with the original text with Errant
m2_target_filter = output_folder + "/filter.target.m2"
subprocess.run(["errant_parallel", "-orig", source_path, "-cor", tokenized_path, "-out", m2_target_filter], check=True)
print(f"Aligned submission: {m2_target_filter}", file=sys.stderr)

Tokenizing submission...
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

INFO:stanza:Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| mwt       | iu      |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Done loading processors!
Tokenized: /tmp/unlp.target.tok
Aligned submission: /content/drive/MyDrive/UNLP/results/future_13_2ep/filter.target.m2


In [23]:
class Args():
  def __init__(self):
    self.hyp = m2_target_filter
    self.ref = m2
    self.ds = False
    self.dt = False
    self.single = False
    self.multi = False
    self.filt = []
    self.cse = False
    self.verbose = False # SET THIS TO TRUE TO SEE SENTENCE INFO
    self.beta = 0.5
    self.cat = 3
args = Args()

#### Evaluate script

In [24]:
from collections import Counter

# Input: An m2 format sentence with edits.
# Output: A list of lists. Each edit: [start, end, cat, cor, coder]
def simplify_edits(sent):
    out_edits = []
    # Get the edit lines from an m2 block.
    edits = sent.split("\n")[1:]
    # Loop through the edits
    for edit in edits:
        # Preprocessing
        edit = edit[2:].split("|||") # Ignore "A " then split.
        span = edit[0].split()
        start = int(span[0])
        end = int(span[1])
        cat = edit[1]
        cor = edit[2]
        coder = int(edit[-1])
        out_edit = [start, end, cat, cor, coder]
        out_edits.append(out_edit)
    return out_edits

# Input 1: A list of edits. Each edit: [start, end, cat, cor, coder]
# Input 2: Command line args
# Output: A dict; key is coder, value is edit dict.
def process_edits(edits, args):
    coder_dict = {}
    # Add an explicit noop edit if there are no edits.
    if not edits: edits = [[-1, -1, "noop", "-NONE-", 0]]
    # Loop through the edits
    for edit in edits:
        # Name the edit elements for clarity
        start = edit[0]
        end = edit[1]
        cat = edit[2]
        cor = edit[3]
        coder = edit[4]
        # Add the coder to the coder_dict if necessary
        if coder not in coder_dict: coder_dict[coder] = {}

        # Optionally apply filters based on args
        # 1. UNK type edits are only useful for detection, not correction.
        if not args.dt and not args.ds and cat == "UNK": continue
        # 2. Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1
        if args.single and (end-start >= 2 or len(cor.split()) >= 2): continue
        # 3. Only evaluate multi token edits; i.e. 2+:n or n:2+
        if args.multi and end-start < 2 and len(cor.split()) < 2: continue
        # 4. If there is a filter, ignore the specified error types
        if args.filt and cat in args.filt: continue

        # Token Based Detection
        if args.dt:
            # Preserve noop edits.
            if start == -1:
                if (start, start) in coder_dict[coder].keys():
                    coder_dict[coder][(start, start)].append(cat)
                else:
                    coder_dict[coder][(start, start)] = [cat]
            # Insertions defined as affecting the token on the right
            elif start == end and start >= 0:
                if (start, start+1) in coder_dict[coder].keys():
                    coder_dict[coder][(start, start+1)].append(cat)
                else:
                    coder_dict[coder][(start, start+1)] = [cat]
            # Edit spans are split for each token in the range.
            else:
                for tok_id in range(start, end):
                    if (tok_id, tok_id+1) in coder_dict[coder].keys():
                        coder_dict[coder][(tok_id, tok_id+1)].append(cat)
                    else:
                        coder_dict[coder][(tok_id, tok_id+1)] = [cat]

        # Span Based Detection
        elif args.ds:
            if (start, end) in coder_dict[coder].keys():
                coder_dict[coder][(start, end)].append(cat)
            else:
                coder_dict[coder][(start, end)] = [cat]

        # Span Based Correction
        else:
            # With error type classification
            if args.cse:
                if (start, end, cat, cor) in coder_dict[coder].keys():
                    coder_dict[coder][(start, end, cat, cor)].append(cat)
                else:
                    coder_dict[coder][(start, end, cat, cor)] = [cat]
            # Without error type classification
            else:
                if (start, end, cor) in coder_dict[coder].keys():
                    coder_dict[coder][(start, end, cor)].append(cat)
                else:
                    coder_dict[coder][(start, end, cor)] = [cat]
    return coder_dict

# Input 1: A hyp dict; key is coder_id, value is dict of processed hyp edits.
# Input 2: A ref dict; key is coder_id, value is dict of processed ref edits.
# Input 3: A dictionary of the best corpus level TP, FP and FN counts so far.
# Input 4: Sentence ID (for verbose output only)
# Input 5: Command line args
# Output 1: A dict of the best corpus level TP, FP and FN for the input sentence.
# Output 2: The corresponding error type dict for the above dict.
def evaluate_edits(hyp_dict, ref_dict, best, sent_id, original_sentence, args, big_table):
    # Verbose output: display the original sentence
    sentence_dict = {}
    if args.verbose:
        print('{:-^40}'.format(""))
        print("Original sentence " + str(sent_id) + ": " + original_sentence)
    
        sentence_dict["sent_id"]=sent_id,
        sentence_dict["original-sentence"]=original_sentence
    # Store the best sentence level scores and hyp+ref combination IDs
    # best_f is initialised as -1 cause 0 is a valid result.
    best_tp, best_fp, best_fn, best_f, best_hyp, best_ref = 0, 0, 0, -1, 0, 0
    best_cat = {}
    # Compare each hyp and ref combination
    for hyp_id in hyp_dict.keys():
        for ref_id in ref_dict.keys():
            # Get the local counts for the current combination.
            tp, fp, fn, cat_dict, vector_of_cat = compareEdits(hyp_dict[hyp_id], ref_dict[ref_id])
            # Compute the local sentence scores (for verbose output only)
            loc_p, loc_r, loc_f = computeFScore(tp, fp, fn, args.beta)
            # Compute the global sentence scores
            p, r, f = computeFScore(
                tp+best["tp"], fp+best["fp"], fn+best["fn"], args.beta)
            # Save the scores if they are better in terms of:
            # 1. Higher F-score
            # 2. Same F-score, higher TP
            # 3. Same F-score and TP, lower FP
            # 4. Same F-score, TP and FP, lower FN
            if     (f > best_f) or \
                (f == best_f and tp > best_tp) or \
                (f == best_f and tp == best_tp and fp < best_fp) or \
                (f == best_f and tp == best_tp and fp == best_fp and fn < best_fn):
                best_tp, best_fp, best_fn = tp, fp, fn
                best_f, best_hyp, best_ref = f, hyp_id, ref_id
                best_cat = cat_dict
                best_vector = vector_of_cat
            # Verbose output
            if args.verbose:
                # Prepare verbose output edits.
                hyp_verb = list(sorted(hyp_dict[hyp_id].keys()))
                ref_verb = list(sorted(ref_dict[ref_id].keys()))
                # add categories
                # hyp_dict[hyp_id] looks like (0, 1, "str")
                # hyp_dict[hyp_id][h] is a list, always length one, of the corresponding category
                hyp_verb = [h + (hyp_dict[hyp_id][h][0],) for h in hyp_verb]
                ref_verb = [r + (ref_dict[ref_id][r][0],) for r in ref_verb]
                # Ignore noop edits
                if not hyp_verb or hyp_verb[0][0] == -1: hyp_verb = []
                if not ref_verb or ref_verb[0][0] == -1: ref_verb = []
                # Print verbose info
                print('{:-^40}'.format(""))
                print("SENTENCE "+str(sent_id)+" - HYP "+str(hyp_id)+" - REF "+str(ref_id))
                print("HYPOTHESIS EDITS :", hyp_verb)
                print("REFERENCE EDITS  :", ref_verb)
                print("Local TP/FP/FN   :", str(tp), str(fp), str(fn))
                print("Local P/R/F"+str(args.beta)+"  :", str(loc_p), str(loc_r), str(loc_f))
                print("Global TP/FP/FN  :", str(tp+best["tp"]), str(fp+best["fp"]), str(fn+best["fn"]))
                print("Global P/R/F"+str(args.beta)+"  :", str(p), str(r), str(f))
                print(" ".join(best_vector))
                sentence_dict[f'hyp{str(hyp_id)}-edits']= hyp_verb
                sentence_dict[f'ref{str(ref_id)}-edits'] = ref_verb
                sentence_dict['best_vector_of_cat'] = best_vector

    # Verbose output: display the best hyp+ref combination
    if args.verbose:
        print('{:-^40}'.format(""))
        print("^^ HYP "+str(best_hyp)+", REF "+str(best_ref)+" chosen for sentence "+str(sent_id))
        print("Local results:")
        header = ["Category", "TP", "FP", " FN"]
        body = [[k, *v] for k, v in best_cat.items()]
        print_table([header] + body)
    # Save the best TP, FP and FNs as a dict, and return this and the best_cat dict
    best_dict = {"tp":best_tp, "fp":best_fp, "fn":best_fn}
    sentence_dict[f'best-tp'] = best_tp
    sentence_dict[f'best-fp']  = best_fp
    sentence_dict['best-fn'] = best_fn
    sentence_dict['best_ref'] = best_ref
    big_table.append(sentence_dict)
    return best_dict, best_cat

# Input 1: A dictionary of hypothesis edits for a single system.
# Input 2: A dictionary of reference edits for a single annotator.
# Output 1-3: The TP, FP and FN for the hyp vs the given ref annotator.
# Output 4: A dictionary of the error type counts.
def compareEdits(hyp_edits, ref_edits):
    tp = 0    # True Positives
    fp = 0    # False Positives
    fn = 0    # False Negatives
    cat_dict = {} # {cat: [tp, fp, fn], ...}
    vector_of_cat = [] # ["tp", "fp", "fp", "fn", ...]
    

    for h_edit, h_cats in hyp_edits.items():
        # noop hyp edits cannot be TP or FP
        if h_cats[0] == "noop": continue
        # TRUE POSITIVES
        if h_edit in ref_edits.keys():
            # On occasion, multiple tokens at same span.
            for h_cat in ref_edits[h_edit]: # Use ref dict for TP
                tp += 1
                vector_of_cat.append("tp")
                # Each dict value [TP, FP, FN]
                if h_cat in cat_dict.keys():
                    cat_dict[h_cat][0] += 1
                else:
                    cat_dict[h_cat] = [1, 0, 0] #Andre: what does it do?
        # FALSE POSITIVES
        else:
            # On occasion, multiple tokens at same span.
            for h_cat in h_cats:
                fp += 1
                vector_of_cat.append("fp")
                # Each dict value [TP, FP, FN]
                if h_cat in cat_dict.keys():
                    cat_dict[h_cat][1] += 1
                else:
                    cat_dict[h_cat] = [0, 1, 0]
    for r_edit, r_cats in ref_edits.items():
        # noop ref edits cannot be FN
        if r_cats[0] == "noop": continue
        # FALSE NEGATIVES
        if r_edit not in hyp_edits.keys():
            # On occasion, multiple tokens at same span.
            for r_cat in r_cats:
                fn += 1
                vector_of_cat.append("fn")
                # Each dict value [TP, FP, FN]
                if r_cat in cat_dict.keys():
                    cat_dict[r_cat][2] += 1
                else:
                    cat_dict[r_cat] = [0, 0, 1]

    return tp, fp, fn, cat_dict, vector_of_cat

# Input 1-3: True positives, false positives, false negatives
# Input 4: Value of beta in F-score.
# Output 1-3: Precision, Recall and F-score rounded to 4dp.
def computeFScore(tp, fp, fn, beta):
    p = float(tp)/(tp+fp) if fp else 1.0
    r = float(tp)/(tp+fn) if fn else 1.0
    f = float((1+(beta**2))*p*r)/(((beta**2)*p)+r) if p+r else 0.0
    return round(p, 4), round(r, 4), round(f, 4)

# Input 1-2: Two error category dicts. Key is cat, value is list of TP, FP, FN.
# Output: The dictionaries combined with cumulative TP, FP, FN.
def merge_dict(dict1, dict2):
    for cat, stats in dict2.items():
        if cat in dict1.keys():
            dict1[cat] = [x+y for x, y in zip(dict1[cat], stats)]
        else:
            dict1[cat] = stats
    return dict1

# Input 1: A dict; key is error cat, value is counts for [tp, fp, fn]
# Input 2: Integer value denoting level of error category granularity.
# 1: Operation tier; e.g. M, R, U.  2: Main tier; e.g. NOUN, VERB  3: Everything.
# Output: A dictionary of category TP, FP and FN based on Input 2.
def processCategories(cat_dict, setting):
    # Otherwise, do some processing.
    proc_cat_dict = {}
    for cat, cnt in cat_dict.items():
        if cat == "UNK":
            proc_cat_dict[cat] = cnt
            continue
        # M, U, R or UNK combined only.
        if setting == 1:
            if cat[0] in proc_cat_dict.keys():
                proc_cat_dict[cat[0]] = [x+y for x, y in zip(proc_cat_dict[cat[0]], cnt)]
            else:
                proc_cat_dict[cat[0]] = cnt
        # Everything without M, U or R.
        elif setting == 2:
            if cat[2:] in proc_cat_dict.keys():
                proc_cat_dict[cat[2:]] = [x+y for x, y in zip(proc_cat_dict[cat[2:]], cnt)]
            else:
                proc_cat_dict[cat[2:]] = cnt
        # All error category combinations
        else:
            return cat_dict
    return proc_cat_dict

# Input 1: A dict of global best TP, FP and FNs
# Input 2: A dict of error types and counts for those TP, FP and FNs
# Input 3: Command line args
def print_results(best, best_cats, args):
    category_df = []
    total_results = {}
    # Prepare output title.
    if args.dt: title = " Token-Based Detection "
    elif args.ds: title = " Span-Based Detection "
    elif args.cse: title = " Span-Based Correction + Classification "
    else: title = " Span-Based Correction "

    # Category Scores
    if args.cat:
        best_cats = processCategories(best_cats, args.cat)
        print("")
        print('{:=^66}'.format(title))
        print("Category".ljust(24), "TP".ljust(8), "FP".ljust(8), "FN".ljust(8),
            "P".ljust(8), "R".ljust(8), "F"+str(args.beta))
        for cat, cnts in sorted(best_cats.items()):
            cat_p, cat_r, cat_f = computeFScore(cnts[0], cnts[1], cnts[2], args.beta)
            print(cat.ljust(24), str(cnts[0]).ljust(8), str(cnts[1]).ljust(8),
                str(cnts[2]).ljust(8), str(cat_p).ljust(8), str(cat_r).ljust(8), cat_f)
            cat_dict = {'Category':cat,
                        'TP' : cnts[0],
                        'FP' : cnts[1],
                        'FN' : cnts[2],
                        'P' : cat_p,
                        'R' : cat_r,
                        'F' : cat_f}
            category_df.append(cat_dict)


    # Print the overall results.
    print("")
    print('{:=^46}'.format(title))
    print("\t".join(["TP", "FP", "FN", "Prec", "Rec", "F"+str(args.beta)]))
    print("\t".join(map(str, [best["tp"], best["fp"],
        best["fn"]]+list(computeFScore(best["tp"], best["fp"], best["fn"], args.beta)))))
    print('{:=^46}'.format(""))
    print("")
    eval = list(computeFScore(best["tp"], best["fp"], best["fn"], args.beta))
    total_results =  {'TP':best["tp"],
                      'FP':best['fp'],
                      'FN':best['fn'],
                      'Prec': eval[0],
                      'Rec': eval[1],
                      'F0.5':eval[2]}
    category_df = pd.DataFrame(category_df)
    return category_df, total_results

def print_table(table):
    longest_cols = [
        (max([len(str(row[i])) for row in table]) + 3)
        for i in range(len(table[0]))
    ]
    row_format = "".join(["{:>" + str(longest_col) + "}" for longest_col in longest_cols])
    for row in table:
        print(row_format.format(*row))

def append_table(table, big_table):
  big_table.append(table)

In [None]:
def main(args):
    big_table = []
    # Parse command line args
    # Open hypothesis and reference m2 files and split into chunks
    hyp_m2 = open(args.hyp).read().strip().split("\n\n")
    ref_m2 = open(args.ref).read().strip().split("\n\n")
    # Make sure they have the same number of sentences
    assert len(hyp_m2) == len(ref_m2)

    # Store global corpus level best counts here
    best_dict = Counter({"tp":0, "fp":0, "fn":0})
    best_cats = {}
    # Process each sentence
    sents = zip(hyp_m2, ref_m2)
    for sent_id, sent in enumerate(sents):
        # Simplify the edits into lists of lists
        hyp_edits = simplify_edits(sent[0])
        ref_edits = simplify_edits(sent[1])
        # Process the edits for detection/correction based on args
        hyp_dict = process_edits(hyp_edits, args)
        ref_dict = process_edits(ref_edits, args)
        # original sentence for logging
        original_sentence = sent[0][2:].split("\nA")[0]
        # Evaluate edits and get best TP, FP, FN hyp+ref combo.
        count_dict, cat_dict = evaluate_edits(
            hyp_dict, ref_dict, best_dict, sent_id, original_sentence, args, big_table)
        # Merge these dicts with best_dict and best_cats
        best_dict += Counter(count_dict)
        best_cats = merge_dict(best_cats, cat_dict)
    # Print results
    category_df, total_results = print_results(best_dict, best_cats, args)
    #not saving dfs for now
    big_table = pd.DataFrame(big_table)
    return category_df, total_results, big_table

## evaluate

In [None]:
category_df, total_results, big_table = main(args)

In [None]:
category_df.to_csv(output_folder + f'/{model_name}-category-data.csv')

In [None]:
# Parse command line args
def parse_args():
    parser = argparse.ArgumentParser(
        description="Calculate F-scores for error detection and/or correction.\n"
            "Flags let you evaluate at different levels of granularity.",
        formatter_class=argparse.RawTextHelpFormatter,
        usage="%(prog)s [options] -hyp HYP -ref REF")
    parser.add_argument(
        "-hyp",
        help="A hypothesis M2 file.",
        required=True)
    parser.add_argument(
        "-ref",
        help="A reference M2 file.",
        required=True)
    parser.add_argument(
        "-b",
        "--beta",
        help="Value of beta in F-score. (default: 0.5)",
        default=0.5,
        type=float)
    parser.add_argument(
        "-v",
        "--verbose",
        help="Print verbose output.",
        action="store_true")
    eval_type = parser.add_mutually_exclusive_group()
    eval_type.add_argument(
        "-dt",
        help="Evaluate Detection in terms of Tokens.",
        action="store_true")
    eval_type.add_argument(
        "-ds",
        help="Evaluate Detection in terms of Spans.",
        action="store_true")
    eval_type.add_argument(
        "-cs",
        help="Evaluate Correction in terms of Spans. (default)",
        action="store_true")
    eval_type.add_argument(
        "-cse",
        help="Evaluate Correction in terms of Spans and Error types.",
        action="store_true")

    parser.add_argument(
        "-cat",
        help="Show error category scores.\n"
            "1: Only show operation tier scores; e.g. R.\n"
            "2: Only show main tier scores; e.g. NOUN.\n"
            "3: Show all category scores; e.g. R:NOUN.",
        choices=[1, 2, 3],
        type=int)
    args = parser.parse_args()
    return args

# Generate results to submit

In [19]:
with open("/content/drive/MyDrive/UNLP/test-gec-fluency/test.src.txt", 'r') as f:
    test_data = [line[:-1] for line in f.readlines()]
test_out = translator(test_data, batch_size=16)

NameError: ignored

In [None]:
!pip install stanza==1.4.2 &> /dev/null
# provided tokenizer
import stanza

def tokenize(text):
    if not hasattr(tokenize, "nlp"):
        tokenize.nlp = stanza.Pipeline(
            lang="uk",
            processors="tokenize",
            download_method=stanza.DownloadMethod.REUSE_RESOURCES,
        )
    nlp = tokenize.nlp

    tokenized = " ".join([t.text for t in nlp(text).iter_tokens()])
    return tokenized

In [None]:
output_sentences = [tokenize(sent['translation_text']) + '\n' for sent in test_out]

model_name = model_checkpoint.split('/')[1]
output_folder = f'/content/drive/MyDrive/UNLP/competition-outputs/{model_name}'

import os
# creating the output folder
if not os.path.exists(output_folder):
  os.mkdir(output_folder)

with open(output_folder+'/test.tgt.tok', 'w') as f:
  f.writelines(output_sentences)