In [1]:
from nltk.corpus import treebank, ptb
from collections import Counter
import nltk
import pickle as pkl

In [3]:
# Load the pickled train / validation / test splits from a single data directory.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point DATA_DIR at files from a trusted source.
DATA_DIR = "/data/rj1408/ptb_wsj_pos"


def _load_split(name):
    """Return the object unpickled from ``<DATA_DIR>/<name>.p``."""
    with open(DATA_DIR + "/" + name + ".p", "rb") as f:
        return pkl.load(f)


traindict = _load_split("train")
valdict = _load_split("val")
testdict = _load_split("test")

In [4]:
#tag analysis
# Build the set of known tags from the second tab-separated column of tagset.txt.
# Blank lines (including the empty tail produced by a trailing newline) are
# skipped; a non-blank line without a tab still raises IndexError so that a
# malformed file is not silently accepted.
with open('tagset.txt') as f:
    alltags = {
        line.rstrip('\n').split('\t')[1]
        for line in f
        if line.strip()
    }

In [5]:
# Per-tag frequency tables: tag_cntr[tag][item] counts how often `item`
# (element 0 of each entry, presumably the word form) appears with `tag`
# (element 1). Entries whose tag is not in `alltags` are ignored.
lis = traindict['tagged_words']
tag_cntr = {}
for entry in lis:
    item, tag = entry[0], entry[1]
    if tag not in alltags:
        continue
    per_tag = tag_cntr.get(tag)
    if per_tag is None:
        per_tag = tag_cntr[tag] = Counter()
    per_tag[item] += 1

In [12]:
# Number of distinct items counted under each tag, in dict insertion order
[(tag, len(per_tag)) for tag, per_tag in tag_cntr.items()]

[('NNP', 11127),
 ('CD', 4597),
 ('NNS', 4872),
 ('JJ', 7958),
 ('MD', 27),
 ('VB', 2056),
 ('DT', 48),
 ('NN', 9149),
 ('IN', 187),
 ('VBZ', 1123),
 ('VBG', 1987),
 ('CC', 33),
 ('VBD', 1560),
 ('VBN', 2089),
 ('RB', 1283),
 ('TO', 4),
 ('PRP', 46),
 ('RBR', 51),
 ('WDT', 12),
 ('VBP', 975),
 ('RP', 35),
 ('PRP$', 15),
 ('JJS', 103),
 ('POS', 3),
 ('EX', 2),
 ('WP', 8),
 ('JJR', 151),
 ('WRB', 14),
 ('NNPS', 562),
 ('WP$', 1),
 ('PDT', 13),
 ('RBS', 8),
 ('FW', 101),
 ('UH', 39),
 ('SYM', 15),
 ('LS', 18)]

In [6]:
# Frequency of element 0 of every entry in `lis`, regardless of tag
vocab_cntr = Counter(entry[0] for entry in lis)

In [17]:
# Number of distinct keys in vocab_cntr, i.e. the vocabulary size of `lis`
len(vocab_cntr)

43813

In [21]:
# Sentence-length analysis: share of training sentences with at most 60 entries.
# `sentlens` is a list of (length, number_of_sentences) pairs, most frequent first.
sentlens = Counter(map(len, traindict['tagged_sents'])).most_common()

num = sum(count for length, count in sentlens if length <= 60)
den = len(traindict['tagged_sents'])
num / den

0.9921243360632146

In [23]:
def getPercentile(voc_cntr, topk):
    """Return the share of all occurrences covered by the ``topk`` most common keys.

    Despite the name, the result is a coverage ratio in [0, 1] for
    non-negative counts, not a statistical percentile.

    Args:
        voc_cntr: a ``collections.Counter`` mapping items to their counts.
        topk: how many of the most frequent items to include; passed straight
            to ``Counter.most_common`` (so ``None`` means every item).

    Returns:
        ``topksize / totalsize`` as a float, or ``0.0`` when the counter holds
        no occurrences (the ratio would otherwise divide by zero).
    """
    totalsize = sum(voc_cntr.values())
    if totalsize == 0:
        # Empty counter: nothing to cover, avoid ZeroDivisionError.
        return 0.0
    topksize = sum(count for _, count in voc_cntr.most_common(topk))
    return topksize/totalsize

In [26]:
# Share of all counted occurrences in vocab_cntr covered by its 15000 most frequent keys
getPercentile(vocab_cntr, 15000)

0.9592498465492572

In [40]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle as pkl
from collections import defaultdict,deque,Counter,OrderedDict
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler
import os
import time
import copy

from models import LM_latent
from vocab import Vocabulary

# Sanity check: four ways of leaving padded targets (label -1) out of a mean
# cross-entropy loss must all produce the same gradient on `pred`.
N = 10
# Local generator: reproducible inputs without reseeding the global RNG.
gen = torch.Generator().manual_seed(0)
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=-1)
groundtruth = torch.rand(N, generator=gen).ge(0.5).type(torch.LongTensor)
groundtruth[7:] = -1  # mark the last three positions as padding
pred = torch.rand(N, 2, generator=gen, requires_grad=True)
loss = criterion(pred, groundtruth)
loss.backward()
print(pred.grad)
grad_ref = pred.grad.clone()  # reference gradient every other approach must match

# Manual approach
# Drop padded rows, then average the negative log-likelihood by hand.
pred.grad.zero_()
target = groundtruth[groundtruth!=-1]
output = pred[groundtruth!=-1]
loss_manual = -1 * F.log_softmax(output, 1).gather(1, target.unsqueeze(1))
loss_manual = loss_manual.mean()
loss_manual.backward()
print(pred.grad)
assert torch.allclose(pred.grad, grad_ref, atol=1e-6)

#third Manual approach
# Drop padded rows, then use the stock criterion without ignore_index.
pred.grad.zero_()
target_2 = groundtruth[groundtruth!=-1]
output_2 = pred[groundtruth!=-1]
criterion3 = nn.CrossEntropyLoss(reduction='mean')
loss_manual_3 = criterion3(output_2, target_2)
loss_manual_3.backward()
print(pred.grad)
assert torch.allclose(pred.grad, grad_ref, atol=1e-6)

#another manual approach
# Per-element losses on all rows, zero out the padded ones, average over the rest.
pred.grad.zero_()
criterion2 = nn.CrossEntropyLoss(reduction='none')
msk = (groundtruth == -1)
# criterion2 keeps the default ignore_index (-100), so a raw -1 target is an
# invalid class index and current PyTorch raises "Target -1 is out of bounds".
# Substitute a placeholder class on padded rows; their loss is zeroed below.
safe_target = groundtruth.masked_fill(msk, 0)
loss2 = criterion2(pred, safe_target)
loss2 = loss2.masked_fill(msk, 0)
nonzeroitems = (~msk).sum().item()
loss2 = torch.sum(loss2)
loss2 = loss2/nonzeroitems
loss2.backward()
print(pred.grad)
assert torch.allclose(pred.grad, grad_ref, atol=1e-6)

tensor([[ 0.0973, -0.0973],
        [ 0.0682, -0.0682],
        [ 0.0671, -0.0671],
        [-0.0660,  0.0660],
        [ 0.0605, -0.0605],
        [-0.0781,  0.0781],
        [-0.0750,  0.0750],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000]])
tensor([[ 0.0973, -0.0973],
        [ 0.0682, -0.0682],
        [ 0.0671, -0.0671],
        [-0.0660,  0.0660],
        [ 0.0605, -0.0605],
        [-0.0781,  0.0781],
        [-0.0750,  0.0750],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000]])
tensor([[ 0.0973, -0.0973],
        [ 0.0682, -0.0682],
        [ 0.0671, -0.0671],
        [-0.0660,  0.0660],
        [ 0.0605, -0.0605],
        [-0.0781,  0.0781],
        [-0.0750,  0.0750],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000],
        [ 0.0000,  0.0000]])
tensor([[ 0.0973, -0.0973],
        [ 0.0682, -0.0682],
        [ 0.0671, -0.0671],
        [-0.0660,  0.0660],
        [ 0.0605, -0.0605],
        [-0.0781,