# *If you are not interested in how the models work or do not plan to customize the code, jump to the <a href='#run-section'>"Run Code"</a> section.*

In [1]:
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn_crfsuite
import matplotlib.pyplot as plt
import string
import itertools
import datetime
import re
import pickle

# Data Prep Functions

In [2]:
def retrieve(filepath):
    pairswbar = [line.strip().split() for line in open(filepath)]
    tuple_lines = []
    for line in pairswbar:
        tuple_lines.append([tuple(pair.split(SEPARATOR)) for pair in line])
    mismatched = [i for i,line in enumerate(tuple_lines) for pair in line if len(pair) < 2]
    print([tuple_lines[i-1] for i in mismatched])
    return tuple_lines


def inandout(tuple_lines):
    inputlines = []
    outputlines = []
    for line in tuple_lines:
        inline = []
        outline = []
        for pair in line:
            inline.append(pair[0])
            outline.append(pair[1])
        inputlines.append(' '.join(inline))
        outputlines.append(' '.join(outline))
    return inputlines, outputlines


def datafile(filename, data):
    with open(DATADIR+filename, 'w') as T:
        T.write('\n'.join(data))

In [3]:
def logstats(train_tuples,filepath):
    # List all POS tags used in the data
    taglist = [tag for sent in train_tuples for word,tag in sent]
    tagset = list(set(taglist))
    tokens = len(train_tuples)
    sorted_tags = sorted(Counter(taglist).items(), key=lambda x:x[1], reverse=True)
    sort_string = ''
    for tag,val in sorted_tags:
        sort_string += "{: <10}\t{}\n".format(tag, val)
    
    logstring = str(datetime.datetime.now()) + '\n\n'    
    tokenreport = str(tokens) + " training tokens"  + '\n\n'
    tagreport = "Tags in training data:\n" + sort_string  + '\n'
    #print(tagreport) 
    #print(tokenreport)
    #print(tagcounts)
    
    msg = logstring+NOTES+tokenreport+tagreport
    
    with open(filepath+'_log.txt', 'w') as l:
        l.write(filepath + '\n\n' + msg)
    
    return tagset

In [4]:
def prepare(modelname):
    '''write data sets to files for printing and sharing'''
    fname = TEAMCODE+TASK+modelname
    unannotated_tuples = retrieve(DATADIR+fname+PREVITERATION+'.predict')
    train_pairs = retrieve(DATADIR+fname+ITERATION+'.train')
    test_pairs = retrieve(DATADIR+fname+ITERATION+'.test')
    
    # Log training data statistics
    uniqtags = logstats(train_pairs, REPORTDIR+fname+ITERATION)
        
    return train_pairs, test_pairs, unannotated_tuples, uniqtags

# Results Helper Functions

In [5]:
def cm(gold_tags, predicted_tags, taglist, modelname):
    '''builds and display a confusion matrix so we 
    can evaluate where our tagger is making wrong 
    predictions, after we test a POS tagger'''
    
    alpha_taglist = sorted(set(taglist))
    confusion_matrix = metrics.confusion_matrix(gold_tags,predicted_tags,labels=alpha_taglist,normalize="true")
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix,display_labels=alpha_taglist)

    plt.rcParams["figure.figsize"] = (17,17)
    plt.rcParams.update({'font.size': 12})
    disp.plot(colorbar=False)
    plt.show() # display below
    #save as file
    plt.savefig(REPORTDIR+TEAMCODE+TASK+modelname+ITERATION+'_matrix.png')    
    
    matrixstring = '{0:5s}'.format(' ') + '\t'.join(['{0:^4s}'.format(tag) for tag in alpha_taglist]) + '\n'
    for i,row in enumerate(confusion_matrix):
        cols = '\t'.join(['{:.2f}'.format(round(col,2)) for col in row])
        matrixstring+='{0:6s}'.format(alpha_taglist[i]) + cols + '\n'
    
    return matrixstring

    
def logResults(testgold, testpredict, confusionreport, modelname):
    time = str(datetime.datetime.now())
    testgold = list(itertools.chain.from_iterable(testgold))
    testpredict = list(itertools.chain.from_iterable(testpredict))
    classreport = metrics.classification_report(testgold, testpredict, zero_division=0.0)
    report = '\n\nClassification Report\n\n{}\n\nConfusion Matrix\n\n{}\n'.format(classreport, confusionreport)
    with open(REPORTDIR+TEAMCODE+TASK+modelname+ITERATION+'_results.txt', 'w') as R:
        R.write(time + report)
        
        
def printPredictions(confscores, w_t_predictions, modelname):
    '''sort auto-annotated sentences based on how "confident" 
    the model was at it predictions of each sentence's POS tags, 
    by decreasing "confidence", 
    i.e., lower probability == less confidence.
    Writes to file.'''
    
    with_confidence = list(zip(w_t_predictions, confscores))
    with_confidence.sort(key = lambda x: x[1])
    sorted_predictions = [z[0] for z in with_confidence]
    
    datastring = []
    for sent in sorted_predictions:
        datastring.append(' '.join([pair[0]+SEPARATOR+pair[1] for pair in sent]))

    datafile(TEAMCODE+TASK+modelname+ITERATION+'.predict', datastring)

# POS Tagger

With any model, we take the same three basic steps.

1) Train the model. The models use some statistical patterns or features of the training data with its POS tags to build a predictive model. 

2) Use the trained model to predict POS tags over the test sentences. Then compare those predictions to the correct tags and produce evaluation metrics for that model. We use precision, recall, F1 scores, and a confusion matrix.

3)  Predict tags over our unannotated sentences. Then sort these sentences based on the model's "confidence" in those predictions. That is, the probabilities that the POS tagger computed for that particular sequence of labels. A higher aggregate probablity == higher confidence in the predicted sequence of POS tags. 

## Conditional Random Fields (CRF) 

This sequence to sequence model requires feature engineering. It is set with features listed in https://towardsdatascience.com/pos-tagging-using-crfs-ea430c5fb78b#1c6a. Feel free to edit the features to see which combination gives the best results. 

**Log any features you add or change in the `_log.txt` file!** 

In [6]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.lower()': word.lower(),
        'word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word.lower()),
        'word.ispunctuation': (word in string.punctuation),
        'word.isdigit()': word.isdigit()}
    
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word': word1,
            '-1:len(word)': len(word1),
            '-1:word.lower()': word1.lower(),
            '-1:word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word1.lower()),
            '-1:word[:3]': word1[:3],
            '-1:word[:2]': word1[:2],
            '-1:word[-3:]': word1[-3:],
            '-1:word[-2:]': word1[-2:],
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.ispunctuation': (word1 in string.punctuation)})     
    else:
        features['BOS'] = True

    if i > 1:
        word2 = sent[i-2][0]
        features.update({
            '-2:word': word2,
            '-2:len(word)': len(word2),
            '-2:word.lower()': word2.lower(),
            '-2:word[:3]': word2[:3],
            '-2:word[:2]': word2[:2],
            '-2:word[-3:]': word2[-3:],
            '-2:word[-2:]': word2[-2:],
            '-2:word.isdigit()': word2.isdigit(),
            '-2:word.ispunctuation': (word2 in string.punctuation),
        })

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word': word1,
            '+1:len(word)': len(word1),
            '+1:word.lower()': word1.lower(),
            '+1:word[:3]': word1[:3],
            '+1:word[:2]': word1[:2],
            '+1:word[-3:]': word1[-3:],
            '+1:word[-2:]': word1[-2:],
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.ispunctuation': (word1 in string.punctuation),
        })

    else:
        features['EOS'] = True    
    
    if i < len(sent) - 2:
        word2 = sent[i+2][0]
        features.update({
            '+2:word': word2,
            '+2:len(word)': len(word2),
            '+2:word.lower()': word2.lower(),
            '+2:word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word2.lower()),
            '+2:word[:3]': word2[:3],
            '+2:word[:2]': word2[:2],
            '+2:word[-3:]': word2[-3:],
            '+2:word[-2:]': word2[-2:],
            '+2:word.isdigit()': word2.isdigit(),
            '+2:word.ispunctuation': (word2 in string.punctuation),
        })

    return features

In [7]:
# words and labels, feed words to feature extractor
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [word[1] for word in sent]

def sent2tokens(sent):
    return [word[0] for word in sent]

### Main Function for CRF

In [8]:
def mainCRF():
    
    train, test, to_predict, tags = prepare('CRF')
    
    # extracting features from all the sentences
    train_ftrs = [sent2features(s) for s in train]
    train_tags = [sent2labels(s) for s in train]

    test_ftrs = [sent2features(s) for s in test]
    test_tags = [sent2labels(s) for s in test]
    
    to_predict_ftrs = [sent2features(s) for s in to_predict]
    to_predict_words = [sent2tokens(s) for s in to_predict]
    
    # training parameters
    crf = sklearn_crfsuite.CRF(
        algorithm = 'lbfgs',
        c1 = 0.25,
        c2 = 0.3,
        max_iterations = 100,
        all_possible_transitions=True)
    #train
    crf.fit(train_ftrs, train_tags)
    # save model
    with open(MODELDIR+TEAMCODE+TASK+'CRF'+ITERATION+'-model.pkl','wb') as f:
        pickle.dump(crf,f)
    
    # testing
    test_output = crf.predict(test_ftrs)
    # get test reports
    matrix = cm(test_tags, test_output, tags, 'CRF')
    logResults(test_tags, test_output, matrix, 'CRF')
    
    # predict
    predicted_labels = crf.predict(to_predict_ftrs)
    print(predicted_labels)
    predicted_sequences = [list(zip(to_predict_words[i], predicted_labels[i])) for i in range(len(predicted_labels))]
    # get confidence score
    all_probs = crf.predict_marginals(to_predict_ftrs)
    confidences = []
    for s,sent in enumerate(predicted_sequences):
        confidences.append(sum(all_probs[s][i][wordpair[1]] for i,wordpair in enumerate(sent))/len(sent))
    printPredictions(confidences, predicted_sequences, 'CRF')

# Run Code (all cells below)

During the Machine-in-the-Loop activity help team members through these steps.

    1) Form teams
    2) Analyse Errors:
        -- Examine evaluation metrics and confusion matrix in the display or the `.results' file.
    3) Preprocess training data and/or CRFcustom code to improve Previous Model:
        -- Make any changes to train data and unannotated file
        -- Change .train and .predict filenames.
        -- Code any customized model features in custom_word2features()
        -- Make note of changes or other comments.
    4) Adjust and prepare to run code that trains new model:
        -- Update variables.
        -- Add note of changes or other comments in `NOTES` variable.
    5) Train, test, predict:
        -- Run the `main` function for chosen model. 
        -- Eyeball log file. Edit notes
        -- Eyeball files in `reports/` folder
        -- Debug and rerun code if something seems off in those files.
    6) Rinse and repeat

#### IMPORTANT Notes about Data and Files

- The three data sets are stored in files (with extensions: `.train`, `.test`, `.predict`) for easy printing and sharing and editing. Workshop participants can make changes to these data files, except the test data. Changes should be changed in new files with **the same file extensions!** These changed data will be used to train a new (and better?) POS tagger. 

- A `_log.txt` file will be written in the `reports/` folder before every training iteration. It contains the number of training tokens, the list and number of each POS tag in the training data.  **Please document any changes made since the last iteration which are not tracked with this code (e.g. "30 tags were corrected in the training data")**

##  Update Variables, Filenames and Notes

In [9]:
''' File naming key: <teamcode>_<task><modelname><iter#>'''

#UPDATE THESE
TEAMCODE = 'tau' 
TASK = '_pos' 

PREVITERATION = '' # previous iteration #; predictions from prev iter.
ITERATION = '0'    # current iteration #; used for edited train/test files
NOTES = ''

# Update these the first time only
DATADIR = r'../../'+TEAMCODE+'/data/'
REPORTDIR = r'../../'+TEAMCODE+'/reports/'
MODELDIR = r'../../'+TEAMCODE+'/models/'

SEPARATOR = '|'  # make sure it matches separator in preprocess code

## Train, Test, Predict CRF Model

In [10]:
# YOU MUST RESTART KERNEL FOR EVERY ITERATION. Otherwise predict file will preserve old POS tags
#TODO: Use dictionaries instead lists where possible to avoid need to restart kernel
mainCRF()

[]
[]
[]


ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.