In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import re
import random

def extractlexc(Replines, Flags=True, Boundaries=True):
    '''Build a {UR string: [IR strings]} table from tokenised transducer output.

    Replines   -- iterable of lines, each a list of elements. An element is
                  either "upper:lower" (a transition) or a bare symbol that
                  stands for itself on both sides.
    Flags      -- when true, flag diacritics listed in MORPH_FLAGS are kept
                  on both sides; every other '@...' element is always dropped.
    Boundaries -- when true, '^' is kept on the output side.

    The symbol '0' is dropped on whichever side it appears. Symbols are
    joined with single spaces. Returns a dict mapping each input string to
    the list of output strings seen for it (repeats are kept).'''

    MORPH_FLAGS = frozenset((
        '@U.IC.Yes@', '@U.StemInitial.Yes@', '@U.StemFinal.Yes@',
        '@U.PERNUM.3SGSAI@', '@U.PERNUM.3PLSAI@', '@U.PERNUM.4SGSAI@', '@U.PERNUM.4PLSAI@',
        '@U.PERNUM.3SGTI@', '@U.PERNUM.3PLTI@', '@U.PERNUM.4SGTI@', '@U.PERNUM.4PLTI@',
        '@U.PERNUM.3S4S@', '@U.PERNUM.3S4PL@', '@U.PERNUM.4S4S@', '@U.PERNUM.4S4PL@',
        '@U.PERNUM.3PL4S@', '@U.PERNUM.3PL4PL@', '@U.PERNUM.4PL4S@', '@U.PERNUM.4PL4PL@',
        '@U.PERNUM.4S3S@', '@U.PERNUM.4PL3S@', '@U.PERNUM.4S3PL@', '@U.PERNUM.4PL3PL@',
        '@U.Polarity.Negative@', '@U.TENSE.Past@', '@U.Mood.Question@',
        '@U.Order.NAFF@', '@U.StemType.II@', '@U.StemType.AI@',
    ))

    translib = {}

    for line in Replines:
        upper_symbols = []
        lower_symbols = []
        for element in line:
            # "upper:lower" is a transition; anything else maps to itself
            if ':' in element:
                inp, outp = element.split(':')
            else:
                inp = outp = element

            # input side: drop nulls, keep a flag only if it is a wanted morph flag
            if inp != '0':
                if not inp.startswith('@') or (Flags and inp in MORPH_FLAGS):
                    upper_symbols.append(inp)

            # output side: same flag rule, plus the optional '^' boundary
            if outp != '0':
                if outp.startswith('@'):
                    keep = Flags and outp in MORPH_FLAGS
                elif outp == '^':
                    keep = Boundaries
                else:
                    keep = True
                if keep:
                    lower_symbols.append(outp)

        # one input may map to several outputs, so collect them in a list
        translib.setdefault(' '.join(upper_symbols), []).append(' '.join(lower_symbols))

    return translib

def extractfoma(Replines, Flags=True):
    '''Build a {UR string: [SR strings]} table from tokenised foma output.

    Replines -- iterable of lines, each a list of elements. An element is
                either "upper:lower" (a transition) or a bare symbol that
                stands for itself on both sides.
    Flags    -- when true, flag diacritics listed in MORPH_FLAGS are kept in
                the UR; all other '@...' elements are dropped. When false,
                every '@...' element is dropped from the UR.

    The SR side never keeps flags or the '^' boundary. The symbol '0' is
    dropped on whichever side it appears. Symbols are joined with single
    spaces. Returns a dict mapping each UR string to the list of SR strings
    seen for it (repeats are kept).

    Fix: previously Flags=False sent every flag through the "ordinary
    symbol" branch, so all flags were KEPT in the UR. The flag handling now
    matches extractlexc, so URs built by the two functions are comparable.'''

    MORPH_FLAGS = set(['@U.IC.Yes@','@U.StemInitial.Yes@','@U.StemFinal.Yes@','@U.PERNUM.3SGSAI@','@U.PERNUM.3PLSAI@',
                   '@U.PERNUM.4SGSAI@','@U.PERNUM.4PLSAI@','@U.PERNUM.3SGTI@','@U.PERNUM.3PLTI@','@U.PERNUM.4SGTI@',
                   '@U.PERNUM.4PLTI@','@U.PERNUM.3S4S@','@U.PERNUM.3S4PL@','@U.PERNUM.4S4S@','@U.PERNUM.4S4PL@',
                   '@U.PERNUM.3PL4S@','@U.PERNUM.3PL4PL@','@U.PERNUM.4PL4S@','@U.PERNUM.4PL4PL@','@U.PERNUM.4S3S@',
                   '@U.PERNUM.4PL3S@','@U.PERNUM.4S3PL@','@U.PERNUM.4PL3PL@',
                   '@U.Polarity.Negative@','@U.TENSE.Past@','@U.Mood.Question@',
                   '@U.Order.NAFF@','@U.StemType.II@','@U.StemType.AI@'])
    translib = {}

    for line in Replines:
        inputline = []
        outputline = []
        for element in line:
            #split if transition
            if ':' in element:
                inp,outp = element.split(':')
            #if no transition, element is both input and output
            else:
                inp = element
                outp = element
            #leave null elements out
            if inp != '0':
                if inp.startswith('@'):
                    #a flag survives only when flags are wanted AND it is a morph flag
                    if Flags and inp in MORPH_FLAGS:
                        inputline.append(inp)
                else:
                    inputline.append(inp)
            #filter out nulls, boundaries and flags from SR
            if outp != '0' and not outp.startswith('@') and outp != '^':
                outputline.append(outp)
        inputstring = ' '.join(inputline)
        outputstring = ' '.join(outputline)
        #add input:output to dict, allow for multiple SR representations
        translib.setdefault(inputstring, []).append(outputstring)

    return translib

In [2]:
def combineLibs(lib1, lib2):
    '''Merge two representation tables that share UR keys.

    lib1 -- dict UR -> list of first representations (e.g. IR)
    lib2 -- dict UR -> list of second representations (e.g. SR)

    Returns a dict UR -> [lib1 value] or [lib1 value, lib2 value]; the second
    item is present only when the UR also occurs in lib2. Only URs from lib1
    appear in the result. Prints the number of URs that were missing from
    lib2.'''

    merged = {}
    missing = 0
    for ur, first_reps in lib1.items():
        entry = [first_reps]
        if ur in lib2:
            entry.append(lib2[ur])
        else:
            missing += 1
        merged[ur] = entry
    print(missing)
    return merged

def unambiguate(combineddict):
    '''Takes {UR:[[IR],[SR]]} and limits to those with 1-1-1 mapping.

    An entry is kept only when it has both an IR list and an SR list, each
    holding exactly one string, and neither string contains one of the
    symbols in dbmistakes ('a', 'l') as a whitespace-separated token.

    Returns three parallel lists: URs, IR lists and SR lists (the IR/SR items
    are the original one-element lists, not bare strings).

    Fixes:
    * a rejected entry is now skipped; previously `break` stopped the scan,
      silently discarding every later entry;
    * the mistake check now looks at the symbols inside each representation.
      Previously it tested membership in the one-element list, which only
      matched when the entire representation was exactly 'a' or 'l'.'''

    URlist = []
    IRlist = []
    SRlist = []
    dbmistakes = ('a','l')

    def _has_mistake(rep):
        # split() also discards the trailing newline that the last symbol may carry
        return any(symbol in dbmistakes for symbol in rep.split())

    for UR,reps in combineddict.items():
        if len(reps) != 2:
            continue
        IRreps, SRreps = reps
        if len(IRreps) != 1 or len(SRreps) != 1:
            continue
        if _has_mistake(IRreps[0]) or _has_mistake(SRreps[0]):
            continue
        URlist.append(UR)
        IRlist.append(IRreps)
        SRlist.append(SRreps)

    return URlist,IRlist,SRlist

    
def unambiguous2File(URlist,IRlist,SRlist,genRparse):
    '''Split parallel UR/IR/SR lists 80/10/10 and write nine text files.

    URlist    -- list of UR strings
    IRlist    -- list of IR items; each item is joined with '' before writing
    SRlist    -- list of SR items; each item is joined with '' before writing
    genRparse -- filename prefix; files are named <prefix>-<name>.txt with
                 name in URtrain/URdev/URgold, IRtrain/..., SRtrain/...

    The three lists are shuffled together so rows stay aligned. No separator
    is added between entries when writing. Not used.'''

    # 80% train, then the held-out 20% is halved into dev and test
    URtrain, URheld, IRtrain, IRheld, SRtrain, SRheld = train_test_split(
        URlist, IRlist, SRlist, test_size=.2)
    URdev, URtest, IRdev, IRtest, SRdev, SRtest = train_test_split(
        URheld, IRheld, SRheld, test_size=.5)

    outputs = (
        ('URtrain', URtrain), ('URdev', URdev), ('URgold', URtest),
        ('IRtrain', IRtrain), ('IRdev', IRdev), ('IRgold', IRtest),
        ('SRtrain', SRtrain), ('SRdev', SRdev), ('SRgold', SRtest),
    )

    for name, rows in outputs:
        cur_file = genRparse + '-' + name + '.txt'
        with open(cur_file, 'w') as handle:
            if name.startswith('UR'):
                # UR rows are already strings
                text = ''.join(rows)
            else:
                # IR/SR rows are sequences of strings
                text = ''.join([''.join(element) for element in rows])
            handle.write(text)
                

def formatInputOutput(inputlist,outputlist,thirdlist,filename):
    '''Interleave three parallel lists into one master file.

    Takes the three lists produced by unambiguate. For every index the file
    receives the input string, then the joined output item, then the joined
    third item, in that order, which is the layout processdump reads.
    No separators are added; entries are written exactly as given.

    Returns the number of entries written (three per input item).'''

    records = []
    for position, first in enumerate(inputlist):
        records.extend((first,
                        ''.join(outputlist[position]),
                        ''.join(thirdlist[position])))

    with open(filename, 'w') as handle:
        handle.write(''.join(records))

    return len(records)

In [3]:
def writeOpenNMTfiles(data,rep,seed):
    '''Helper for processdump: split one list of lines and write three files.

    data -- list of lines (written as-is, no newline added)
    rep  -- filename prefix
    seed -- random_state passed to both splits, so parallel lists split
            with the same seed stay aligned

    55% of the data goes to <rep>-test1train2.txt; the remaining 45% is
    divided 80/20 into <rep>-train1.txt and <rep>-val1.txt
    (roughly 36/9/55 overall).'''

    # first cut: hold out 55% for the second stage
    first_stage, test1train2 = train_test_split(data, test_size=0.55, random_state=seed)
    # second cut: 80/20 train/validation on what is left
    train1, val1 = train_test_split(first_stage, test_size=0.2, random_state=seed)

    targets = (('-val1.txt', val1),
               ('-test1train2.txt', test1train2),
               ('-train1.txt', train1))
    for suffix, rows in targets:
        with open(rep + suffix, 'w') as handle:
            handle.writelines(rows)
                
def reduceTraining(q,num_ex):
    '''Returns q random indices into a collection of num_ex data lines.

    q      -- how many indices to draw
    num_ex -- number of available examples; every index is in [0, num_ex)

    Indices are drawn independently, so the same index may occur more than
    once. Raises ValueError (from random.randrange) if q > 0 and num_ex < 1.

    Fix: random.randint(0, num_ex) includes num_ex itself, which is one past
    the last valid list index; randrange excludes the upper bound.'''
    return [random.randrange(num_ex) for _ in range(q)]
                
def processdump(masterfile,representation,quantity=5000):
    '''Turn a master file into OpenNMT split files, plain and with repeated lines.

    masterfile     -- file from formatInputOutput: lines come in groups of
                      three, in the order UR, IR, SR
    representation -- six filename prefixes, in the order UR, IR, SR,
                      repeated-UR, repeated-IR, repeated-SR
    quantity       -- number of examples to draw at random; sampling only
                      happens when quantity is smaller than the number of
                      LINES in masterfile (three per example)

    For each example a "repeated" variant is built as the line without its
    last character followed by the full line again.
    NOTE(review): no separator is inserted between the two copies, so this
    presumably relies on every line ending in ' \\n' - confirm against the data.

    All six lists are split with the same seed so their rows stay aligned.
    Raises IndexError if the file does not hold a whole number of triples.

    Fix: the master file is now closed after reading.'''

    with open(masterfile) as master:
        datalines = master.readlines()

    inputline = []
    outputline = []
    thirdline = []
    inputline2 = []
    outputline2 = []
    thirdline2 = []
    #mistakes carried over from Arapaho database
    #dbmistakes = ['?','E','I','N','K','O','a','l']

    #walk the file one UR/IR/SR triple at a time
    for linum in range(0, len(datalines), 3):
        line = datalines[linum]
        #get gold lines
        inputline.append(line)
        outp = datalines[linum+1]
        third = datalines[linum+2]
        outputline.append(outp)
        thirdline.append(third)
        #Repeat lines for oNMT training
        inputline2.append(line[:-1] + line)
        outputline2.append(outp[:-1] + outp)
        thirdline2.append(third[:-1] + third)

    datasets = [inputline, outputline, thirdline, inputline2, outputline2, thirdline2]

    #Same random split for surf & tags
    seed = random.randint(1,100)
    #To randomize and reduce data
    if quantity < len(datalines):
        rand_indices = reduceTraining(quantity,len(inputline))
        #the same indices are applied to every list to keep rows parallel
        datasets = [[rows[idx] for idx in rand_indices] for rows in datasets]
    #splits data and writes files
    for position, rows in enumerate(datasets):
        writeOpenNMTfiles(rows,representation[position],seed)

In [None]:
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
lexcfile = os.path.join(r'C:\Users\Sarah R M\Documents\CU\Arapaho\ComputELppr', 'lexcWordsFlagsCorrected3')
# Tokenise each line on single spaces (the last token keeps its newline).
# The context manager closes the file once it has been read.
with open(lexcfile) as lexc_handle:
    URIRlines = [line.split(' ') for line in lexc_handle]

In [None]:
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
fwBwFfile = os.path.join(r'C:\Users\Sarah R M\Documents\CU\Arapaho\ComputELppr', 'fomaNoBoundaryNoFlagWithSpaceCorrected3')
# Tokenise each line on single spaces (the last token keeps its newline).
# The context manager closes the file once it has been read.
with open(fwBwFfile) as foma_handle:
    URSRlines = [line.split(' ') for line in foma_handle]

In [None]:
# Build the UR->IR table from the lexc lines and the UR->SR table from the
# foma lines. Both extractors are called with Flags=False, and extractlexc
# additionally with Boundaries=False (no '^' on the IR side).
URIRlib = extractlexc(URIRlines,Flags=False,Boundaries=False)
#URIRnFlagslib = extractRep(URIRlines, generation=False)
URSRlib = extractfoma(URSRlines, Flags=False)

In [None]:
#unambiguous2File(unambiguate(combineLibs(URIRlib,URSRlib)), 'parse')

# Pair each UR's IR list with its SR list. combineLibs also prints how many
# URs from the IR table had no entry in the SR table.
comboURIRSR = combineLibs(URIRlib,URSRlib)

In [None]:
# Keep only URs that have exactly one IR and exactly one SR; the three
# returned lists are parallel.
UR,IR,SR = unambiguate(comboURIRSR)

In [None]:
#unambiguous2File(UR,IR,SR, 'gen')
# Write the UR/IR/SR triples to one master file; fulllines is the number of
# entries written (three per UR).
fulllines = formatInputOutput(UR,IR,SR,'MasterFileAL2nFnB')

In [23]:
#processdump('MasterFileALnFnB',['URnFnB125kf0','IRnFnB125kf0','SRnFnB125f0','URnFnB125k2f0','IRnFnB125k2f0','SRnFnB125k2f0'],quantity=125000)
# Active configuration: write the OpenNMT split files for quantity=100000.
# The six prefixes are, in order: UR, IR, SR, then the repeated-line
# variants of UR, IR, SR.
# NOTE(review): this reads 'MasterFileALnFnB', while the formatInputOutput
# cell writes 'MasterFileAL2nFnB' - confirm which master file is intended.
processdump('MasterFileALnFnB',['URnFnB100kf0','IRnFnB100kf0','SRnFnB100kf0','URnFnB100k2f0','IRnFnB100k2f0','SRnFnB100k2f0'],quantity=100000)
#processdump('MasterFileALnFnB',['URnFnB75k','IRnFnB75k','SRnFnB75k','URnFnB75k2','IRnFnB75k2','SRnFnB75k2'],quantity=75000)
#processdump('MasterFilenFnB',['URnFnB12k','IRnFnB12k','SRnFnB12k','URnFnB12k2','IRnFnB12k2','SRnFnB12k2'],quantity=12000)
#processdump('MasterFileALnFnB',['URnFnB87k','IRnFnB87k','SRnFnB87k','URnFnB87k2','IRnFnB87k2','SRnFnB87k2'],quantity=87500)
#processdump('MasterFileALnFnB',['JURnFnB50k','JIRnFnB50k','JSRnFnB50k','JURnFnB50k2','JIRnFnB50k2','JSRnFnB50k2'],quantity=50000)
#processdump('MasterFilenFnB',['URnFnB','IRnFnB','SRnFnB','URnFnB2','IRnFnB2','SRnFnB2'],quantity=len(open('MasterFilenFnB').readlines()))

# Evaluate

In [4]:
import numpy as np
import codecs

def distance(str1, str2):
    """Levenshtein edit distance between two sequences, used by evalOpenNMT.

    Insertions, deletions and substitutions each cost 1.
    Returns the distance as an int.
    """
    rows = len(str2) + 1
    cols = len(str1) + 1
    table = np.zeros([rows, cols])
    # distance from the empty prefix is simply the prefix length
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, sym2 in enumerate(str2, start=1):
        for c, sym1 in enumerate(str1, start=1):
            substitution = table[r-1][c-1] + (0 if sym1 == sym2 else 1)
            deletion = table[r-1][c] + 1
            insertion = table[r][c-1] + 1
            table[r][c] = min(deletion, insertion, substitution)

    return int(table[rows-1][cols-1])


def delEndTags(predfile,goldfile):
    '''Trim predicted lines to the length of their gold lines.

    predfile -- path to the prediction file (space-separated tokens)
    goldfile -- path to the gold file (space-separated tokens)

    Each line is stripped and split on single spaces. A predicted line with
    more tokens than the gold line at the same position is cut to the gold
    length; shorter or equal lines are returned unchanged.

    Returns (reduced prediction lines, gold lines), both as lists of token
    lists. Raises IndexError if predfile has more lines than goldfile.

    Fix: both files are now closed after reading.'''
    with open(goldfile) as gold:
        testlines = [line.strip().split(' ') for line in gold]
    with open(predfile) as pred:
        predlines = [line.strip().split(' ') for line in pred]

    reducedlines = []
    for linidx,predline in enumerate(predlines):
        # slicing past the end is a no-op, so this covers both cases
        reducedlines.append(predline[:len(testlines[linidx])])
    return reducedlines, testlines

def evalOpenNMT(predfile,goldfile,note,file='Report-',interm=False):
    '''Compare an OpenNMT prediction file with its gold file and write a report.

    predfile -- prediction file; also used to name the output files
    goldfile -- gold file, one line per prediction line
    note     -- free text copied into the report header
    file     -- prefix of the report filename (report is file + predfile)
    interm   -- when true, return the (truncated) prediction lines

    Predictions are first trimmed to gold length by delEndTags. The report
    holds whole-line precision/recall/F1, the average Levenshtein distance
    over the gold lines, overall tag precision/recall/F1, and a per-tag
    table. Mismatching lines are appended to 'diffPred-' + predfile.
    Prints the counts of correct tags, incorrect tags and gold tags.

    Returns the truncated prediction lines if interm is true, else None.

    Fixes:
    * any metric whose denominator is zero is reported as 0 instead of
      raising (the old ZeroDivisionError handler divided by zero again and
      could reference avgdist before assignment; all-tags precision/F1 were
      not guarded at all);
    * uniq_tags is computed before the per-tag loop, so empty gold data no
      longer raises NameError;
    * the unreachable IndexError handler (which would itself have raised)
      is gone;
    * the diff file is opened once per call rather than once per wrong line.'''

    def _ratio(numerator, denominator):
        # a metric with an empty denominator counts as 0
        return numerator / denominator if denominator else 0.0

    evallines, goldlines = delEndTags(predfile,goldfile)

    TPFNlines = len(goldlines)
    TPFPlines = len(evallines)
    lines_correct = 0
    correct_tagcts = {}
    incorrect_tagcts = {}
    gold_tagscts = {}
    reportstring = ''
    dist = 0.
    diff_entries = []

    #get total gold tags
    for line in goldlines:
        for gold_element in line:
            gold_tagscts[gold_element] = gold_tagscts.get(gold_element, 0) + 1

    #tag/element accuracy
    for linidx, line in enumerate(evallines):
        goldline = goldlines[linidx]
        #line counts
        if line == goldline:
            lines_correct += 1
        else:
            predstr = ''.join(line)
            goldstr = ''.join(goldline)
            diff_entries.append("Pred: " + predstr + '\n' +
                                'Gold: ' + goldstr +'\n')
            dist += distance(goldstr, predstr)

        #allow for different line lengths: predicted tags beyond the gold
        #length have nothing to match and count as incorrect
        if len(line) > len(goldline):
            for tag in line[len(goldline):]:
                incorrect_tagcts[tag] = incorrect_tagcts.get(tag,0) + 1
        #count tags position by position over the shared length;
        #a mismatch is charged to the predicted tag
        for pred_tag, gold_tag in zip(line, goldline):
            if pred_tag == gold_tag:
                correct_tagcts[pred_tag] = correct_tagcts.get(pred_tag, 0) + 1
            else:
                incorrect_tagcts[pred_tag] = incorrect_tagcts.get(pred_tag, 0) + 1

    #record mismatching lines (appended, so earlier runs are preserved)
    if diff_entries:
        with open('diffPred-' + predfile, 'a') as D:
            D.writelines(diff_entries)

    #overall accuracy
    linesP = _ratio(lines_correct, TPFPlines)
    linesR = _ratio(lines_correct, TPFNlines)
    avgdist = round(_ratio(dist, TPFNlines), 2)
    if linesP + linesR:
        reportstring += "%s\n%s\n\n Num lines: %d\n All lines Prec: %.4f\n All lines Recall: %.4f\n All lines F1: %.4f\n\n levenshtein:\t%.2f\n" % (predfile,note,TPFNlines,round(linesP,5),round(linesR,5),round((2*linesP*linesR)/(linesP+linesR),15),avgdist)
    else:
        #no line was fully correct (or there were no lines): F1 is undefined
        reportstring += '%s\n%s\n\n Num lines: %d\n All lines Accuracy: %.4f\n\n levenshtein:\t%.2f\n' %(predfile,note,TPFNlines,round(linesP),avgdist)

    #all tags in file, PR&F1
    num_allcorrect = sum(correct_tagcts.values())
    num_allincorrect = sum(incorrect_tagcts.values())
    num_gold = sum(gold_tagscts.values())
    allPrec = _ratio(num_allcorrect, num_allcorrect+num_allincorrect)
    allRecall = _ratio(num_allcorrect, num_gold)
    allF1 = _ratio(2*allPrec*allRecall, allPrec+allRecall)
    reportstring += "\n  All tags Precision: %.4f\n  All tags Recall: %.4f\n  All tags F1: %.4f\n" % (round(allPrec,5),round(allRecall,5),round(allF1,10))

    #individual tags, PR&F1
    reportstring += "\n              TAG   \t PREC\t RECALL\t  F1\tinstances \n"
    avgP = 0.00
    avgR = 0.00
    avgF1 = 0.00
    uniq_tags = len(gold_tagscts)
    for tag in sorted(gold_tagscts.keys()):
        tag_correct = correct_tagcts.get(tag, 0)
        tag_precision = _ratio(tag_correct, tag_correct + incorrect_tagcts.get(tag, 0))
        tag_recall = _ratio(tag_correct, gold_tagscts[tag])
        tag_F1 = _ratio(2 * tag_precision * tag_recall, tag_precision + tag_recall)
        avgP += tag_precision
        avgR += tag_recall
        avgF1 += tag_F1
        reportstring += "  %20s\t %.4f\t %.4f\t %.4f\t %d\n" % (tag,
                                                           tag_precision,
                                                           tag_recall,
                                                           tag_F1,
                                                           gold_tagscts[tag])
    #macro average over the distinct gold tags
    reportstring += "    total/average: \t %.4f\t %.4f\t %.4f\t %d" %(round(_ratio(avgP, uniq_tags),5),
                                                                     round(_ratio(avgR, uniq_tags),5),
                                                                     round(_ratio(avgF1, uniq_tags),5),
                                                                     num_gold)

    print(num_allcorrect, num_allincorrect, num_gold)
    with open(file+predfile, "w") as R:
        R.write(reportstring)

    if interm:
        return evallines


In [6]:
#evalOpenNMT('SR-URnFnB25k-eval.txt','URnFnB25k-test2.txt','\n 40/10/10 split. No flags or boundaries.\nRepeated lines in training and test, deleted for evaluation.\n25 epochs 1.05 perplexity',file='Report-')
#evalOpenNMT('SR-IRnFnB25k-eval1.txt','IRnFnB25k-test1train2.txt','\n 40/10/40 split. No flags or boundaries.\nRepeated lines in training and test, deleted for evaluation.\n25 epochs 1.15 perplexity',file='Report-')
#evalOpenNMT('SR-IR-URnFnB25k-eval.txt','URnFnB25k-test2.txt','\n 40/10/10 split. No flags or boundaries.\nRepeated lines in training and test, deleted for evaluation.\n25 epochs ?',file='Report-')
# Active evaluation. interm=True makes evalOpenNMT return the prediction
# lines (trimmed to gold length); they are kept in IRALreduced for newsplit.
IRALreduced = evalOpenNMT('UR-SRnFnB125k2f0-output.txt','SRnFnB125f0-test1train2.txt','\n 45K training lines  36/9/55 split. NO flags or boundaries.\nRepeated lines in training and test, deleted for evaluation.\n20 epochs, 1.00 PPL',file='Report-',interm=True)
#evalOpenNMT('IR-SRnFnB125k2f0-output2.txt','SRnFnB100k2f0-test2.txt','\n 36301 training lines. 66/12/12 split.\nNO flags or boundaries.\nRepeated lines in training and test, deleted for evaluation.\n20 epochs. 1.?? perplexity', file='Report-')
#evalOpenNMT('SR-URnFnB125k2-pred.txt','URnFnB125k-test2.txt','\n 45375 training lines. 66/12/12 split.\nNO flags or boundaries.\nRepeated lines in training and test, deleted for evaluation.\n20 epochs',file='Report-')

1088154 6668 1094822


In [29]:
def repeatLines(predtesttrain):
    '''Double every predicted line for training the second LSTM in OpenNMT.

    predtesttrain -- list of token lists (intermediate predictions)

    Returns a list of strings; each is the space-joined tokens, a space,
    the same space-joined tokens again, and a newline.'''

    doubled = []
    for tokens in predtesttrain:
        joined = ' '.join(tokens)
        doubled.append(joined + ' ' + joined + '\n')
    return doubled
    
        
def newsplit(seeddata,rep):
    '''Write second-stage train/dev/test files for OpenNMT.

    seeddata -- either a list of token lists (unrepeated intermediate
                predictions, which are doubled with repeatLines) or the path
                of a test1train2 data file whose lines are used as-is
    rep      -- filename prefix

    66% of the lines go to <rep>-train2.txt; the remaining 34% is divided
    evenly between <rep>-test2.txt and <rep>-val2.txt. Both splits use
    random_state=45, so the same input always gives the same files.

    Fix: the input file is now closed after reading.'''

    if isinstance(seeddata, list):
        data = repeatLines(seeddata)
    else:
        with open(seeddata) as source:
            data = source.readlines()

    #3 way split
    split1,train2 = train_test_split(data,test_size=0.66, random_state=45)
    test2,val2 = train_test_split(split1,test_size=0.5, random_state=45)

    exports = (('-val2.txt', val2), ('-train2.txt', train2), ('-test2.txt', test2))
    for suffix, export in exports:
        #write to file
        with open(rep + suffix, 'w') as F:
            F.writelines(export)

In [30]:
# Second-stage splits. IRALreduced is a list, so its lines are doubled by
# newsplit; the SR and UR inputs are file paths and are used as-is.
# NOTE(review): IRALreduced was produced from the '125k2f0' output file but
# is written under a '100k2f0' prefix - confirm this is intended.
newsplit(IRALreduced, 'IRnFnB100k2f0')
newsplit('SRnFnB100k2f0-test1train2.txt','SRnFnB100k2f0')
newsplit('URnFnB100k2f0-test1train2.txt','URnFnB100k2f0')
# NOTE(review): identical to the previous call; with the fixed random_state
# it rewrites the same files - presumably a copy/paste leftover, confirm.
newsplit('URnFnB100k2f0-test1train2.txt','URnFnB100k2f0')

In [7]:
# Sanity check: number of lines in the first-stage IR training file.
len(open('IRnFnB125k2f0-train1.txt').readlines())

45000