In [2]:
import nltk
import random
from collections import Counter
import numpy as np
import pandas as pd
import pprint, time

# Get Data

Get a few hundred words for training and test data from the Brown Corpus via nltk. 
To imitate field data, randomly change POS tags for 20% of train set.
Extract a few thousands words to simulate the rest of the field data which has not been annotated yet. 

Write the three data sets to files for printing and sharing. 

![image.png](attachment:image.png)

In [3]:
# Select random sentences from Brown corpus
text = random.choices(list(nltk.corpus.brown.tagged_sents(tagset="universal")),k=300)

In [4]:
# Split data into training, test, and untagged set
train_set = text[:10]
test_set = text[10:35]
untagged_set = text[35:]

# List all POS tags used in the data
tags = [tag for sent in train_set for word,tag in sent]
tagset = list(set(tags))
tokens = len(tags)

print("Tag list:", tagset) # should match nltk Universal POS Tagset
print("Train Tokens:", tokens)
print("Untagged tokens:", len([count for sent in untagged_set for count in sent]))
print("Sample sentences:\n", train_set[:2])

Tag list: ['ADP', 'CONJ', 'ADJ', 'DET', 'NOUN', 'PRT', 'VERB', '.', 'ADV', 'PRON']
Train Tokens: 106
Untagged tokens: 5470
Sample sentences:
 [[('We', 'PRON'), ('write', 'VERB'), ('this', 'DET'), ('Af', 'NOUN'), ('.', '.')], [('I', 'PRON'), ('knew', 'VERB'), ('it', 'PRON'), ("wouldn't", 'VERB'), ('be', 'VERB'), ('the', 'DET'), ('same', 'ADJ'), ('.', '.')]]


In [5]:
# Tag statistics
tag_freq = Counter(tags)
print(tag_freq)

Counter({'VERB': 23, 'NOUN': 20, '.': 17, 'DET': 12, 'ADJ': 11, 'PRON': 9, 'ADP': 5, 'ADV': 4, 'PRT': 3, 'CONJ': 2})


In [6]:
# don't allow punctuation tag as possible mistake
tagset.remove('.')
print(tagset)

['ADP', 'CONJ', 'ADJ', 'DET', 'NOUN', 'PRT', 'VERB', 'ADV', 'PRON']


In [7]:
# randomly change about 25% of POS tags to create "mistakes"
switch_idx = random.sample(range(0,tokens), 55)

random_train_set = []
wrd_counter = 0
for sent in train_set:
    random_sentence = []
    for pair in sent:
        wrd_counter+=1
        if wrd_counter in switch_idx and pair[1] != '.': # if punctuation, skip switch, change tag to PUNCT
            random_sentence.append((pair[0], random.choice(tagset)))
        else:
            random_sentence.append(pair)
    random_train_set.append(random_sentence)

In [8]:
# Tag statistics
newtags = [p for sent in random_train_set for w,p in sent ]
new_tag_freq = Counter(newtags)
print(new_tag_freq)

Counter({'.': 17, 'VERB': 16, 'DET': 15, 'ADP': 13, 'NOUN': 12, 'PRON': 10, 'ADJ': 9, 'ADV': 6, 'CONJ': 4, 'PRT': 4})


In [9]:
def dataFiles(tuple_dataset, filename, tagged=True):
    '''write data sets to files for printing and sharing
    Remove POS tags if data is supposed to be unannotated'''
    
    datastring = []
    for sent in tuple_dataset:
        sentence = []
        for pair in sent:
            if tagged:
                sentence.append(pair[0] + '/' + pair[1])
            else: 
                sentence.append(pair[0])
        datastring.append(' '.join(sentence))

    with open(filename, 'w') as T:
        T.write('\n'.join(datastring))

In [10]:
dataFiles(random_train_set, 'pos.train')
dataFiles(test_set, 'pos.test')
dataFiles(untagged_set, 'pos.unann', tagged=False)