In [1]:
from sklearn.model_selection import train_test_split
import random
from collections import Counter
import numpy as np
import pandas as pd
import pprint, time

# Get Data

Get training and test data from `_T` files. Get incomplete annotations from `_U` files.
Write three data sets to files for printing and sharing.

In [2]:
def getdata(filepath):
    X_all = [line.strip() for line in open(filepath+'.input')]
    y_all = [line.strip() for line in open(filepath+'.output')]
    return X_all, y_all

In [3]:
def stats(trainlabels):
    # List all POS tags used in the data
    taglist = [tag for sent in trainlabels for tag in sent.split()]
    tagset = list(set(taglist))
    tokens = len(trainlabels)
    sorted_tags = sorted(Counter(taglist).items(), key=lambda x:x[1], reverse=True)
    sort_string = ''
    for tag,val in sorted_tags:
        sort_string += "{: <10}\t{}\n".format(tag, val)
        
    tagreport = "Tags in training data:\n" + ', '.join(tagset) + '\n'
    tokenreport = str(tokens) + " training tokens"  + '\n'
    tagcounts = "Frequency of tags in training data:\n" + sort_string  + '\n'
    print(tagreport)
    print(tokenreport)
    print(tagcounts)
    
    return tagreport+tokenreport+tagcounts

In [4]:
def POS2file(xlist, ylist, filepath):
    '''Format data for human readable files'''
    line_tuples = list(zip(xlist, ylist))
    data = []
    for sent in line_tuples:
        pairs = zip(sent[0].split(), sent[1].split())
        data.append(' '.join([word+'|'+tag for word,tag in pairs]))

    with open(filepath, 'w') as T:
        T.write('\n'.join(data))

In [5]:
def main():
    '''write data sets to files for printing and sharing
    Remove POS tags if data is supposed to be unannotated'''
    
    X_unannotated, y_unannotated = getdata(r'./Alberta/tau/'+LANG+'_U'+TASK)
    X_annotated, y_annotated = getdata(FROMDIR+LANG+'_T'+TASK)
    
    X_train, X_test, y_train, y_test = train_test_split(X_annotated, y_annotated, test_size=.1, random_state=42)
    
    if TASK == '_pos':
        POS2file(X_train, y_train, DATADIR+CRF_FILENAME+'0.train')
        POS2file(X_test, y_test, DATADIR+CRF_FILENAME+'0.test')
        POS2file(X_unannotated, y_unannotated, DATADIR+CRF_FILENAME+'.predict')
        with open(REPORTDIR+CRF_FILENAME+'_log.txt', 'w') as l:
            l.write(CRF_FILENAME + '\n\n' + stats(y_train))
            
        POS2file(X_train, y_train, DATADIR+TRANSFORMER_FILENAME+'0.train')
        POS2file(X_test, y_test, DATADIR+TRANSFORMER_FILENAME+'0.test')
        POS2file(X_unannotated, y_unannotated, DATADIR+TRANSFORMER_FILENAME+'.predict')
        with open(REPORTDIR+TRANSFORMER_FILENAME+'_log.txt', 'w') as l:
            l.write(TRANSFORMER_FILENAME + '\n\n' + stats(y_train))

In [6]:
LANG = 'tau'
TASK = '_pos'

DATADIR = r'./Alberta/'+LANG+'/data/'
REPORTDIR = r'./Alberta/'+LANG+'/reports/'

TRANSFORMER_FILENAME = LANG+TASK+'Trans'
CRF_FILENAME = LANG+TASK+'CRF'

main()

FileNotFoundError: [Errno 2] No such file or directory: './Alberta/tau/data/tau_T_pos.input'