# Run all cells once, then run `Run Code` section for each iteration

In [1]:
from collections import Counter
from sklearn import metrics
import matplotlib.pyplot as plt
import string
import itertools
import datetime
import re

## Prepare data files before training Transformer 

Files will be written in the format the Transformer script needs and saved in the `fortransfer/` folder. 

Also, the `_log.txt` file with statistics about the training data will be written to the `reports/` folder.

In [2]:
def splitclitics(line):
    '''Split one word into morphemes, keeping clitic boundaries marked.

    The word is first broken on '-'. A '=' in the first piece is treated
    as a proclitic boundary and a '=' in the last piece as an enclitic
    boundary. The clitic keeps its '=' so the boundary type is not lost.

    line: a single word, e.g. 'pro=stem-suffix' or 'stem-suffix=enc'
    returns: list of morphemes, e.g. ['pro=', 'stem', 'suffix'] or
             ['stem', 'suffix', '=enc']

    Note: a word with a single '=' and no '-' is always read as
    proclitic + host, e.g. 'a=b' -> ['a=', 'b'].
    '''
    brokenline = line.split('-')
    # proclitic: peel everything before the first '=' off the first piece
    if '=' in brokenline[0]:
        first, _, second = brokenline[0].partition('=')
        brokenline[0] = second
        brokenline.insert(0, first+'=')
    # enclitic: peel everything after the last '=' off the last piece
    if '=' in brokenline[-1]:
        first, _, second = brokenline[-1].rpartition('=')
        # the host must lose the clitic, otherwise the clitic appears twice
        brokenline[-1] = first
        brokenline.append('='+second)
    return brokenline

def formatting(line):
    '''Apply language-specific text normalisation to one line.

    Only for preprocessing that has to happen after the first training
    iteration. For team code 'bkft' the line is lowercased and the
    accented vowels listed below are replaced by their plain
    counterparts; for any other team code the line is returned unchanged.
    '''
    if TEAMCODE != 'bkft':
        return line
    normalised = line.lower()
    for accented, plain in (('ó', 'o'), ('á', 'a'), ('í', 'i')):
        normalised = normalised.replace(accented, plain)
    return normalised


def grouplines(file):
    '''Read an interlinear text file and group its lines by sentence.

    Sentences are separated by blank lines. Each non-blank line is
    stripped, passed through `formatting`, and gets `EOS` appended.

    file: path of a UTF-8 text file
    returns: list of groups, one per sentence, each a list of lines
    '''
    igtgroups = []
    curr_igt = []
    # `with` guarantees the handle is closed once the file has been read
    with open(file, encoding='utf8') as infile:
        for rawline in infile:
            line = formatting(rawline.strip())
            if line:
                curr_igt.append(line+EOS)
            elif curr_igt:
                # a blank line closes the sentence; repeated blank lines
                # are ignored so that no empty group is produced
                igtgroups.append(curr_igt)
                curr_igt = []
    # keep the last sentence when the file does not end with a blank line
    if curr_igt:
        igtgroups.append(curr_igt)
    return igtgroups


def retrieve(igtdata, devsize=0):
    '''Collect words, segmentations and glosses from interlinear data.

    igtdata: either a list of sentence groups (used as is) or a file
        path that is read with `grouplines`
    devsize: for a path ending in 'train', the number of leading
        sentence groups to hold out as the dev split
    returns: (orig_words, seg_words, gloss_words, devigt) where
        orig_words is a flat list of the tokens on each group's 1st line,
        seg_words / gloss_words hold one list of morphemes per token on
        the 2nd / 3rd line, and devigt is the held-out groups ([] unless
        a 'train' path was given)
    raises: IndexError if a group has fewer than three lines, or if the
        gloss lines have fewer tokens than the segmentation lines
    '''
    devigt = []
    # isinstance also accepts list subclasses, unlike an exact type match
    if isinstance(igtdata, list):  # dev igt
        igt = igtdata
    else:
        igt = grouplines(igtdata)
        # create dev split by sentence
        if igtdata.endswith('train'):
            devigt = igt[:devsize]
            igt = igt[devsize:]
    # break lines into words
    orig_words = []
    morphseg = []
    gloss = []
    for group in igt:
        orig_words.extend(group[0].split())
        morphseg.extend(group[1].split())
        gloss.extend(group[2].split())
    # break into morphemes
    seg_words = []
    gloss_words = []
    for i, word in enumerate(morphseg):
        # the segmentation decides how both tiers are split, so the
        # gloss is cut the same way as its word
        if '=' in word:
            seg_words.append(splitclitics(word))
            gloss_words.append(splitclitics(gloss[i]))
        else:
            seg_words.append(word.split('-'))
            gloss_words.append(gloss[i].split('-'))

    return orig_words, seg_words, gloss_words, devigt


def join_igt(igttuples):
    '''Render each word's (morpheme, gloss) pairs as a single string.

    Every pair becomes 'morpheme%%gloss'; the pairs of one word are
    separated by single spaces.

    igttuples: one sequence of (morpheme, gloss) pairs per word
    returns: list with one string per word
    '''
    def render(word):
        pieces = [morph + '%%' + gloss for morph, gloss in word]
        return ' '.join(pieces)

    return [render(word) for word in igttuples]


def prepoutput(segs, glosses):
    '''Build the target-side strings for the Transformer, one per word.

    When TASK is SEG_GLS each morpheme is paired with the gloss at the
    same position and rendered by `join_igt`; otherwise the morphemes
    alone are joined with spaces.
    '''
    if TASK != SEG_GLS:
        return [' '.join(seg) for seg in segs]
    paired = []
    for idx, seg in enumerate(segs):
        # zip stops at the shorter of the two, so unmatched items are dropped
        paired.append(list(zip(seg, glosses[idx])))
    return join_igt(paired)


def toupload(datasplit, indata, outdata):
    '''Write the `.input` (and, if given, `.output`) file for one split.

    datasplit: file-name prefix such as 'train.'
    indata: source-side items; the elements of each item are joined
        with spaces, one item per line
    outdata: target-side strings, one per line; a falsy value means no
        `.output` file is written
    '''
    stem = TRANSFERDIR + datasplit + KEY + ITERATION
    if outdata:
        with open(stem + '.output', 'w', encoding='utf8') as target:
            target.write('\n'.join(outdata))
    spaced = [' '.join(word) for word in indata]
    with open(stem + '.input', 'w', encoding='utf8') as source:
        source.write('\n'.join(spaced))

In [3]:
def logstats(trainout):
    '''Write the `_log.txt` report about the training targets.

    The log holds the report path, a timestamp, the NOTES text, the
    number of training items and every space-separated tag found in
    them with its frequency, most frequent first.

    trainout: list of target strings, one per training item
    '''
    tokens = len(trainout)

    # count every space-separated tag across all training targets
    tag_counts = Counter()
    for word in trainout:
        tag_counts.update(word.split())
    sort_string = ''.join(
        "{: <10}\t{}\n".format(tag, val) for tag, val in tag_counts.most_common()
    )

    logstring = str(datetime.datetime.now()) + '\n\n'
    # NOTES has no trailing newline of its own; without a separator it
    # would run straight into the token count
    notesreport = NOTES + '\n\n'
    tokenreport = str(tokens) + " training tokens" + '\n\n'
    tagreport = "SEG_GLS in training data:\n" + sort_string + '\n'

    msg = logstring + notesreport + tokenreport + tagreport
    filepath = REPORTDIR+KEY+ITERATION
    with open(filepath+'_log.txt', 'w', encoding='utf8') as l:
        l.write(filepath+'\n\n' + msg)

In [4]:
def prepare():
    '''Read this iteration's data and write the Transformer input files.

    Reads the `.test` and `.train` files of the current ITERATION and
    the `.predict` file of PREVITERATION, prints the train and dev
    sizes, writes the predict/train/dev/test files through `toupload`
    and logs statistics about the training targets.
    '''
    current = DATADIR + KEY + ITERATION
    X_test, segs_test, gloss_test, __ = retrieve(current + '.test')
    # NOTE(review): devsize is the number of test *tokens*, but retrieve()
    # uses it to slice whole sentence groups - confirm this is intended.
    X_train, segs_train, gloss_train, dev = retrieve(
        current + '.train', devsize=len(X_test))
    X_dev, segs_dev, gloss_dev, __ = retrieve(dev)
    to_predict, __, __, __ = retrieve(DATADIR + KEY + PREVITERATION + '.predict')

    # sanity check: the three numbers on each line should be equal
    print(len(X_train), len(segs_train), len(gloss_train))
    print(len(X_dev), len(segs_dev), len(gloss_dev))
    print()

    y_train = prepoutput(segs_train, gloss_train)
    y_dev = prepoutput(segs_dev, gloss_dev)
    y_test = prepoutput(segs_test, gloss_test)

    # files for the transformer; the predict split has no target side
    toupload('predict.', to_predict, None)
    for split, sources, targets in (
            ('train.', X_train, y_train),
            ('dev.', X_dev, y_dev),
            ('test.', X_test, y_test)):
        toupload(split, sources, targets)

    # statistics about the training targets go to the reports folder
    logstats(y_train)

# Post process after training Transformer

#### reformat data for editing in files

In [5]:
def confusionMatrix(gold_tags, predicted_tags):
    '''Build, display and save a confusion matrix of gold vs. predicted tags.

    Rows are normalised over the gold labels (normalize="true"). The
    plot is saved as `_matrix.png` in the reports folder and then shown.

    gold_tags: flat list of gold labels
    predicted_tags: flat list of predicted labels, same length
    returns: the matrix as tab-separated text, one row per gold label,
        with the labels in sorted order
    '''
    alpha_taglist = sorted(set(gold_tags))
    confusion_matrix = metrics.confusion_matrix(gold_tags,predicted_tags,labels=alpha_taglist,normalize="true")
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix,display_labels=alpha_taglist)

    plt.rcParams["figure.figsize"] = (17,17)
    plt.rcParams.update({'font.size': 12})
    disp.plot(colorbar=False)
    # save before showing: once show() has run, the current figure may
    # already be closed and savefig would write an empty image
    plt.savefig(REPORTDIR+KEY+ITERATION+'_matrix.png')
    plt.show() # display below

    header = '{0:5s}'.format(' ') + '\t'.join(['{0:^4s}'.format(tag) for tag in alpha_taglist]) + '\n'
    rows = []
    for i,row in enumerate(confusion_matrix):
        cols = '\t'.join(['{:.2f}'.format(round(col,2)) for col in row])
        rows.append('{0:6s}'.format(alpha_taglist[i]) + cols + '\n')

    return header + ''.join(rows)

def pad(test_gold, test_guess):
    '''Make gold and guess carry the same number of annotations per line.

    A guess line that is too short is extended with '@@@'; a gold line
    that is too short is extended with 'NOMORPHEME'. Both lists are
    updated in place and also returned.
    '''
    for idx, gold_line in enumerate(test_gold):
        guess_line = test_guess[idx]
        shortfall = len(gold_line) - len(guess_line)
        if shortfall > 0:
            # fewer segments guessed than the gold line has
            test_guess[idx] = guess_line + ['@@@'] * shortfall
        else:
            # too many segments guessed (or the same number: nothing is added)
            test_gold[idx] = gold_line + ['NOMORPHEME'] * (-shortfall)
    return test_gold, test_guess
    
def logResults(test_gold, test_guess):
    '''Write the classification report for the test split to `_results.txt`.

    Gold and guess lines are padded to equal length, flattened and
    scored; the report is written with a timestamp in front. The
    confusion matrix is not part of the report.
    '''
    padded_gold, padded_guess = pad(test_gold, test_guess)
    gold_flat = [tag for line in padded_gold for tag in line]
    guess_flat = [tag for line in padded_guess for tag in line]
    classreport = metrics.classification_report(
        gold_flat, guess_flat, zero_division=0.0)
    stamp = str(datetime.datetime.now())
    body = '\n\nClassification Report\n\n{}'.format(classreport)
    results_path = REPORTDIR + KEY + ITERATION + '_results.txt'
    with open(results_path, 'w', encoding='utf8') as results:
        results.write(stamp + body)

In [6]:
def resegment(wordannotation):
    '''Join morphemes (or glosses) back into one hyphenated word.

    Clitic boundaries stay marked by '=' alone: the hyphen that the
    join puts next to a '=' is removed again.
    '''
    rejoined = '-'.join(wordannotation)
    for seam in ('-=', '=-'):
        rejoined = rejoined.replace(seam, '=')
    return rejoined


def regroup(prediction_output):
    '''Rebuild scored sentences from word-level predictions.

    Row i of prediction_output belongs to line i of the
    `predict.….input` file. A row whose input word is '@EOS@' closes
    the current sentence; sentences without any word are skipped.

    prediction_output: list of strings with three tab-separated fields;
        the second is the score and the third the predicted annotation
    returns: list of (score, words, annotations) tuples, one per
        sentence; score is the mean of the word scores, and each
        annotation is a (segmentation, gloss) pair when TASK is SEG_GLS,
        otherwise a segmentation string
    '''
    # `with` closes the input file once its lines have been read
    with open(TRANSFERDIR+'predict.'+KEY+ITERATION+'.input', encoding='utf8') as infile:
        predictwords = [word.strip() for word in infile]

    scores = []
    sentences = []
    annotations = []
    sent_score = 0.0
    sent_words = []
    sent_annotats = []

    for i, line in enumerate(prediction_output):
        _, score, prediction = line.split('\t')
        word = ''.join(predictwords[i].split())  # remove spaces
        # recreate lines
        if word == '@EOS@':
            # an empty sentence has no mean score (division by zero)
            # and nothing to annotate, so it is left out
            if sent_words:
                scores.append(sent_score/len(sent_words))
                sentences.append(sent_words)
                annotations.append(sent_annotats)
            sent_score = 0.0
            sent_words = []
            sent_annotats = []
            continue
        # the prediction is parsed for real words only; whatever was
        # predicted for the sentence marker is never used
        if TASK == SEG_GLS:
            annotation = list(zip(*[mg.split('%%') for mg in prediction.split()]))
            annotation = (resegment(annotation[0]), resegment(annotation[1]))
        else:
            annotation = resegment(prediction.split())
        sent_score += float(score)
        sent_words.append(word)
        sent_annotats.append(annotation)

    return list(zip(scores, sentences, annotations))

def predictFile(sorted_scored_lines):
    '''Write the new `.predict` file for editing, without the scores.

    Each sentence becomes a block of lines (text, segmentation and,
    when TASK is SEG_GLS, glosses) followed by one blank line.

    sorted_scored_lines: (score, words, annotations) tuples in the
        order they should appear in the file
    '''
    newpredict = []
    for line in sorted_scored_lines:
        s, text, annotations = line
        newline = ' '.join(text)+'\n'
        if TASK == SEG_GLS:
            # turn per-word (segmentation, gloss) pairs into two tiers
            annotations = list(zip(*annotations))
            newline += ' '.join(annotations[0]) + '\n'
            newline += ' '.join(annotations[1]) + '\n'
        else:
            newline += ' '.join(annotations) + '\n'
        newpredict.append(newline)

    with open(DATADIR+KEY+ITERATION+'.predict', 'w', encoding='utf8') as new:
        # every block, the last one included, ends with a blank line:
        # grouplines() only closes a sentence when it meets one
        new.write(''.join(block + '\n' for block in newpredict))
        
def reformat(prediction_output):
    '''Turn raw predictions into the interlinear `.predict` file.

    Sentences are written in ascending order of their
    (score, words, annotations) tuples.
    '''
    ranked = sorted(regroup(prediction_output))
    predictFile(ranked)

In [7]:
def postprocess():
    '''Evaluate the test predictions and write the new `.predict` file.

    Reads the `.testpredict`, `test.….output` and `.confidence` files
    from the transfer folder, writes the results report and reformats
    the predictions for editing.
    '''
    def read_lines(path):
        # `with` closes each file as soon as its lines have been read
        with open(path, encoding='utf8') as handle:
            return [line.strip() for line in handle]

    test_guess = [line.split() for line in read_lines(TRANSFERDIR+KEY+ITERATION+'.testpredict')]
    test_gold = [line.split() for line in read_lines(TRANSFERDIR+'test.'+KEY+ITERATION+'.output')]
    prediction_output = read_lines(TRANSFERDIR+KEY+ITERATION+'.confidence')

    logResults(test_gold, test_guess)
    reformat(prediction_output)

# Run Code (all cells below)

During the Machine-in-the-Loop activity, help team members through these steps.

    - Preprocess data:
        -- Check data file names are updated to new iteration number
        -- Keep notes of changes to data.
    - Adjust and prepare to run code that trains new model:
        -- Update variables.
        -- Add notes or comments in `NOTES` variable, if desired
        -- Run the `prepare()` function
        -- Eyeball log file
    - Train, test, predict:
        -- Upload files to server
        -- Run transformer script to train, test, and predict
        -- Download predictions and test predictions files
    - Evaluate results by running `postprocess()` function
    
### Update variables

In [8]:
# Marker appended to every line that is read in; as a word of its own
# it tells the postprocessing where a sentence ends
EOS = ' @EOS@'

# Set once per task
TEAMCODE = 'bkft'
MODEL = 'Trans'
SEG_GLS = '_igt'
TASK = SEG_GLS

# Set again for every iteration
PREVITERATION = '2'
ITERATION = '3'
NOTES = 'no changes to test data'

# Folders, all inside the team's own directory
DATADIR = './{}/data/'.format(TEAMCODE)
TRANSFERDIR = './{}/fortransfer/'.format(TEAMCODE)
REPORTDIR = './{}/reports/'.format(TEAMCODE)

# Common part of all data and report file names
KEY = ''.join((TEAMCODE, TASK, MODEL))

### Preprocess data

In [11]:
# Write this iteration's transfer files and the training-data log;
# prints the train and dev sizes as a sanity check
prepare()

546 546 546
297 297 297



### Upload, run Transformer, download

1. Upload the files from the `fortransfer/` folder to the server.
    
    Note: You can delete these files from the `fortransfer/` folder once they are safely uploaded.
    
2. Train, test, and predict with Transformer
3. Download the `.confidence` and the `.testpredict` files to `fortransfer/` directory.

### Evaluate and postprocess results

In [12]:
# Write the test results report and the new `.predict` file for editing
postprocess()