In [12]:
import nltk
import random
from collections import Counter
import numpy as np
import pandas as pd
import pprint, time

# Get Data

Get a few hundred words of training and test data from the Brown Corpus via nltk.
To imitate field data, randomly change the POS tags of about 20% of the training set.
Extract a few thousand words to simulate the rest of the field data, which has not been annotated yet.

Write the three data sets to files for printing and sharing. 

In [13]:
# Select 300 distinct random sentences from the Brown corpus (universal tagset).
# random.sample draws WITHOUT replacement, so the same sentence cannot end up in
# more than one of the slices later taken from `text` (random.choices could
# return duplicates and leak training sentences into the test data).
random.seed(42)  # fixed seed so Restart & Run All reproduces the same data sets
text = random.sample(list(nltk.corpus.brown.tagged_sents(tagset="universal")), k=300)

In [14]:
# Partition the sampled sentences: first 10 for training, next 25 for testing,
# everything after that plays the role of not-yet-annotated field data
train_set, test_set, untagged_set = text[:10], text[10:35], text[35:]

# Collect every POS tag that occurs in the training sentences
tags = [pair[1] for sentence in train_set for pair in sentence]
tagset = list(set(tags))
tokens = len(tags)

print("Tag list:", tagset) # should match nltk Universal POS Tagset
print("Train Tokens:", tokens)
print("Untagged tokens:", sum(len(sentence) for sentence in untagged_set))
print("Sample sentences:\n", train_set[:2])

Tag list: ['ADP', 'NUM', 'CONJ', 'ADJ', 'DET', 'NOUN', 'PRT', 'VERB', '.', 'ADV', 'PRON']
Train Tokens: 308
Untagged tokens: 5782
Sample sentences:
 [[('Total', 'NOUN'), ('distance', 'NOUN'), ('is', 'VERB'), ('about', 'ADV'), ("21/64''", 'NOUN'), ("''", '.'), ('.', '.')], [('Over', 'ADP'), ('a', 'DET'), ('relatively', 'ADV'), ('short', 'ADJ'), ('period', 'NOUN'), ('of', 'ADP'), ('time', 'NOUN'), (',', '.'), ('usually', 'ADV'), ('about', 'ADV'), ('four', 'NUM'), ('to', 'ADP'), ('twelve', 'NUM'), ('weeks', 'NOUN'), (',', '.'), ('the', 'DET'), ('worker', 'NOUN'), ('must', 'VERB'), ('be', 'VERB'), ('able', 'ADJ'), ('to', 'PRT'), ('shift', 'VERB'), ('the', 'DET'), ('focus', 'NOUN'), (',', '.'), ('back', 'ADV'), ('and', 'CONJ'), ('forth', 'ADV'), (',', '.'), ('between', 'ADP'), ('immediate', 'ADJ'), ('external', 'ADJ'), ('stressful', 'ADJ'), ('exigencies', 'NOUN'), ('(', '.'), ('``', '.'), ('precipitating', 'VERB'), ('stress', 'NOUN'), ("''", '.'), (')', '.'), ('and', 'CONJ'), ('the', 'DET')

In [15]:
# How often each POS tag occurs in the (still unmodified) training data
tag_freq = Counter()
tag_freq.update(tags)
print(tag_freq)

Counter({'NOUN': 66, '.': 48, 'VERB': 39, 'ADP': 39, 'DET': 36, 'ADJ': 28, 'ADV': 27, 'CONJ': 9, 'PRON': 9, 'PRT': 5, 'NUM': 2})


In [16]:
# don't allow punctuation tag as possible mistake
# Rebuild the list instead of calling tagset.remove('.'): re-running this cell is
# then a harmless no-op rather than a ValueError, and a training sample that
# happens to contain no punctuation is handled as well.
tagset = [tag for tag in tagset if tag != '.']
print(tagset)

['ADP', 'NUM', 'CONJ', 'ADJ', 'DET', 'NOUN', 'PRT', 'VERB', 'ADV', 'PRON']


In [17]:
# Randomly re-tag a fixed number of training tokens to create "mistakes".
# Notes:
#   * positions that hold punctuation keep their original tag, and the random
#     replacement may coincide with the original tag, so the number of tags that
#     actually change is somewhat lower than N_SWITCH
#   * the sample size is capped at the token count so a small training sample
#     does not make random.sample raise ValueError
N_SWITCH = 55
switch_idx = random.sample(range(0, tokens), min(N_SWITCH, tokens))
switch_positions = set(switch_idx)  # set gives O(1) membership tests in the loop

random_train_set = []
wrd_counter = 0  # 0-based running token position, same numbering as switch_idx
for sent in train_set:
    random_sentence = []
    for pair in sent:
        if wrd_counter in switch_positions and pair[1] != '.':
            # selected non-punctuation token: replace its tag with a random one
            random_sentence.append((pair[0], random.choice(tagset)))
        else:
            # punctuation or unselected token: keep the original (word, tag) pair
            random_sentence.append(pair)
        # advance AFTER the check so position 0 can be selected and the counter
        # never runs past the sampled index range
        wrd_counter += 1
    random_train_set.append(random_sentence)

In [18]:
# Tag frequencies after the random re-tagging, to compare with the clean counts
newtags = [pair[1] for sentence in random_train_set for pair in sentence]
new_tag_freq = Counter()
new_tag_freq.update(newtags)
print(new_tag_freq)

Counter({'NOUN': 58, '.': 48, 'DET': 39, 'ADP': 38, 'VERB': 35, 'ADJ': 28, 'ADV': 24, 'CONJ': 14, 'PRT': 10, 'PRON': 9, 'NUM': 5})


In [19]:
def dataFiles(tuple_dataset, filename, tagged=True):
    '''Write a data set to a text file for printing and sharing.

    Each sentence becomes one line of space-separated tokens. Lines are joined
    with a newline; no newline is written after the last sentence.

    Parameters:
        tuple_dataset: iterable of sentences, each an iterable of (word, tag) pairs
        filename: path of the file to create (an existing file is overwritten)
        tagged: if True, write tokens as "word/tag"; if False, write only the
            word so the data looks unannotated
    '''

    lines = []
    for sent in tuple_dataset:
        if tagged:
            tokens_out = [pair[0] + '/' + pair[1] for pair in sent]
        else:
            tokens_out = [pair[0] for pair in sent]
        lines.append(' '.join(tokens_out))

    # Explicit UTF-8 keeps the shared files identical across platforms and avoids
    # encode errors for non-ASCII tokens under a non-UTF-8 default locale.
    with open(filename, 'w', encoding='utf-8') as T:
        T.write('\n'.join(lines))

In [22]:
# Export the three data sets: the noisy training set and the test set keep their
# tags, the remaining sentences are written as plain words (unannotated)
for dataset, path, keep_tags in (
    (random_train_set, 'DIU_origin0.train', True),
    (test_set, 'DIU_origin0.test', True),
    (untagged_set, 'DIU_origin0.predict', False),
):
    dataFiles(dataset, path, tagged=keep_tags)