In [6]:
from nltk.corpus import treebank, ptb
from collections import Counter
import nltk
import pickle as pkl
import random

## Data loading and analysis

In [7]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/rj1408/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [8]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /home/rj1408/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [9]:
nltk.download('ptb')

[nltk_data] Downloading package ptb to /home/rj1408/nltk_data...
[nltk_data]   Package ptb is already up-to-date!


True

In [16]:
#total number of words/sents in corpus
len(ptb.tagged_sents()), len(ptb.tagged_words())

(3914, 100676)

In [26]:
allfileids = ptb.fileids()
allfileids = list(map(lambda fid: (fid, fid.split('/')[1]), allfileids))
trainfileids = list(filter(lambda tup: int(tup[1]) <= 18, allfileids))
valfileids = list(filter(lambda tup: int(tup[1]) > 18 and int(tup[1]) <= 21, allfileids))
testfileids = list(filter(lambda tup: int(tup[1]) >= 22, allfileids))
trainfileids = list(map(lambda tup: tup[0], trainfileids))
valfileids = list(map(lambda tup: tup[0], valfileids))
testfileids = list(map(lambda tup: tup[0], testfileids))

In [54]:
traindict = {}
traindict['tagged_words'] = list(ptb.tagged_words(fileids=trainfileids))
traindict['tagged_sents'] = list(ptb.tagged_sents(fileids=trainfileids))
valdict = {}
valdict['tagged_words'] = list(ptb.tagged_words(fileids=valfileids))
valdict['tagged_sents'] = list(ptb.tagged_sents(fileids=valfileids))
testdict = {}
testdict['tagged_words'] = list(ptb.tagged_words(fileids=testfileids))
testdict['tagged_sents'] = list(ptb.tagged_sents(fileids=testfileids))

In [56]:
with open("/scratch/rj1408/pos_lm/ptb_wsj_pos/train.p","wb") as f:
    pkl.dump( traindict, f)
with open("/scratch/rj1408/pos_lm/ptb_wsj_pos/val.p","wb") as f:
    pkl.dump( valdict, f)
with open("/scratch/rj1408/pos_lm/ptb_wsj_pos/test.p","wb") as f:
    pkl.dump( testdict, f)

In [6]:
with open("/misc/vlgscratch4/BrunaGroup/rj1408/nlu/ptb_wsj_pos/train.p","rb") as f:
    traindict = pkl.load(f)
with open("/misc/vlgscratch4/BrunaGroup/rj1408/nlu/ptb_wsj_pos/val.p","rb") as f:
    valdict = pkl.load(f)
with open("/misc/vlgscratch4/BrunaGroup/rj1408/nlu/ptb_wsj_pos/test.p","rb") as f:
    testdict = pkl.load(f)

In [7]:
len(traindict['tagged_words']), len(valdict['tagged_words']), len(testdict['tagged_words'])

(974254, 140551, 138208)

In [12]:
#tag analysis
with open('tagset.txt') as f:
    alltags = f.read()

alltags = alltags.split('\n')    
alltags = set(alltags)

In [6]:
#get distribution of tags
tagcntr = Counter(map(lambda tupl: tupl[1], filter(lambda tup: tup[1] in alltags, traindict['tagged_words'])))
tagcntr.most_common()

[('NN', 127563),
 ('IN', 94760),
 ('NNP', 87693),
 ('DT', 78777),
 ('-NONE-', 61910),
 ('JJ', 58957),
 ('NNS', 57860),
 (',', 46526),
 ('.', 37884),
 ('CD', 34891),
 ('RB', 29621),
 ('VBD', 28311),
 ('VB', 25489),
 ('CC', 22832),
 ('TO', 21462),
 ('VBZ', 20982),
 ('VBN', 19333),
 ('PRP', 16766),
 ('VBG', 14350),
 ('VBP', 12326),
 ('MD', 9437),
 ('POS', 8284),
 ('PRP$', 7989),
 ('$', 6952),
 ('``', 6782),
 ("''", 6622),
 (':', 4696),
 ('WDT', 4194),
 ('JJR', 3174),
 ('RP', 2515),
 ('NNPS', 2505),
 ('WP', 2285),
 ('WRB', 2051),
 ('JJS', 1867),
 ('RBR', 1675),
 ('-RRB-', 1321),
 ('-LRB-', 1305),
 ('EX', 833),
 ('RBS', 435),
 ('PDT', 333),
 ('FW', 224),
 ('WP$', 166),
 ('#', 127),
 ('UH', 87),
 ('SYM', 55),
 ('LS', 47)]

## Base model - Majority class voting

In [8]:
def getTokenAccuracy(labels, preds):
    alllabels = [tok for lis in labels for tok in lis]
    allpreds= [tok for lis in preds for tok in lis]
    cnt = 0
    for i,tok in enumerate(alllabels):
        if alllabels[i]==allpreds[i]:
            cnt += 1
    return float(cnt)/len(alllabels)    

def getSentAccuracy(labels, preds):
    cnt = 0
    for i,sent in enumerate(labels):
        if labels[i]==preds[i]:
            cnt += 1
    return float(cnt)/len(labels)

In [9]:
def pruneNonLabels(labels, preds, alltags):
    useless = list(map(lambda lis: list(filter(lambda tup: tup[1] not in alltags, list(enumerate(lis)))), labels))
    useless = list(map(lambda lis: set(map(lambda tup: tup[0], lis)), useless))
    prunedLabels = list(map(lambda lis: list(filter(lambda lab: lab in alltags, lis)), labels))
    prunedPreds = list(map(lambda tup: list(filter(lambda tupl: tupl[0] not in useless[tup[0]], list(enumerate(tup[1])))), list(enumerate(preds))))
    prunedPreds = list(map(lambda lis: list(map(lambda tupl: tupl[1], lis)), prunedPreds))
    itemsLabels = [tok for lis in prunedLabels for tok in lis]
    itemsPruned = [tok for lis in prunedPreds for tok in lis]
    assert len(itemsLabels) == len(itemsPruned)
    return prunedLabels, prunedPreds

def getRandomPredictions(lisoflisof_tokens, wrd_cntr):
    return list(map(lambda lisoftokens: list(map(lambda tok: "-1" if tok not in wrd_cntr else random.choice(list(wrd_cntr[tok].items()))[0][0] , lisoftokens)), lisoflisof_tokens))

def getPredictions(lisoflisof_tokens, wrd_cntr):
    return list(map(lambda lisoftokens: list(map(lambda tok: "-1" if tok not in wrd_cntr else wrd_cntr[tok].most_common(1)[0][0], lisoftokens)), lisoflisof_tokens))

In [13]:
lis = traindict['tagged_words']
wrd_cntr = {}
for tup in lis:
    if tup[0] not in wrd_cntr:
        wrd_cntr[tup[0]] = Counter()
    wrd_cntr[tup[0]][tup[1]] += 1

In [14]:
#Predict majority

test_sents = list(map(lambda lis: list(map(lambda tup: tup[0], lis)), testdict['tagged_sents']))
predictions = getPredictions(test_sents, wrd_cntr)
labels = list(map(lambda lis: list(map(lambda tup: tup[1], lis)), testdict['tagged_sents']))
labels, predictions = pruneNonLabels(labels, predictions, alltags)

In [15]:
getTokenAccuracy(labels, predictions), getSentAccuracy(labels, predictions)

(0.9180872308404723, 0.16935188575613327)

In [16]:
#Predict Random

test_sents = list(map(lambda lis: list(map(lambda tup: tup[0], lis)), testdict['tagged_sents']))
predictions = getRandomPredictions(test_sents, wrd_cntr)
labels = list(map(lambda lis: list(map(lambda tup: tup[1], lis)), testdict['tagged_sents']))
labels, predictions = pruneNonLabels(labels, predictions, alltags)

In [17]:
getTokenAccuracy(labels, predictions), getSentAccuracy(labels, predictions)

(0.10105782588562168, 0.0)