In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np

torch.manual_seed(1)

from tqdm import tqdm_notebook as tqdm
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

## Load data
source is https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [2]:
data_path = '/Users/tyler/Documents/programming/pytorch_nlp/data/word2vec-nlp-tutorial/'

In [3]:
df = pd.read_csv(data_path+'labeledTrainData.tsv',sep='\t')

In [4]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
test = df[:7500]
train = df[7500:]

In [5]:
len(train),len(test)

(17500, 7500)

In [6]:
word_tokenize('d asdf asdfsd. $5')

['d', 'asdf', 'asdfsd', '.', '$', '5']

In [7]:
def process_review(review):
    tokens = word_tokenize(review)
    tokens = [t.lower() for t in tokens]
    tokens = [t.replace('/','') for t in tokens]
    tokens = [t.replace('\\','') for t in tokens]
    tokens = [t.replace('>','') for t in tokens]
    tokens = [t.replace('<','') for t in tokens]
    return tokens

## Make vocab and train data

In [10]:
labels = list(train.sentiment)
reviews = list(train.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

train_data = list(zip(all_words,labels))

HBox(children=(IntProgress(value=0, max=17500), HTML(value='')))




In [11]:
flat_list = [item for sublist in all_words for item in sublist]
vocab = set(flat_list)

len(vocab)

word_to_idx = {word:idx for idx,word in enumerate(list(vocab))}

counts = Counter(flat_list)

In [47]:
idx_to_word = {idx:word for word,idx in word_to_idx.items()}

## Make test data

In [12]:
labels = list(test.sentiment)
reviews = list(test.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

test_data = list(zip(all_words,labels))

HBox(children=(IntProgress(value=0, max=7500), HTML(value='')))




## Set up model

In [13]:
VOCAB_SIZE = len(word_to_idx)
NUM_LABELS = 2


class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        self.linear = nn.Linear(vocab_size, num_labels)


    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec), dim=1)


def make_bow_vector(sentence, word_to_idx):
    vec = torch.zeros(len(word_to_idx))
    for word in sentence:
        if word in word_to_idx:
            vec[word_to_idx[word]] += 1
    return vec.view(1, -1)


def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])


model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)


In [14]:
vec = make_bow_vector('this is aasd fast d sentence'.split(),word_to_idx)

In [15]:
vec.sum()

tensor(5.)

In [17]:
label_to_ix = {'negative': 0, 'positive': 1}

In [18]:
#all_words[0]

In [19]:
with torch.no_grad():
    sample = all_words[0]
    bow_vector = make_bow_vector(sample[0], word_to_idx)
    log_probs = model(bow_vector)
    print(log_probs)

tensor([[-0.6948, -0.6915]])


## Train

In [20]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [24]:
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0
    for sentence, label in train_data:
        model.zero_grad()

        vec = make_bow_vector(sentence, word_to_idx)
        target = torch.LongTensor([label])

        log_probs = model(vec)

        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}: loss of {round(total_loss,3)}')

Epoch 1/5: loss of 860449.916
Epoch 2/5: loss of 612736.484
Epoch 3/5: loss of 491620.345
Epoch 4/5: loss of 440858.004
Epoch 5/5: loss of 375019.431


In [25]:
total_loss = 0
num_correct = 0
to_eval = train_data
for sentence, label in to_eval:
    model.eval()

    vec = make_bow_vector(sentence, word_to_idx)
    target = torch.LongTensor([label])

    log_probs = model(vec)
    pred = log_probs.argmax().detach().numpy()
    correct = int(pred == label)
    num_correct += correct
    loss = loss_function(log_probs, target)

    total_loss += loss.item()
print(f'Train loss of {round(total_loss,3)}')
print(f'Train accuracy of {round(num_correct*100/len(to_eval),2)}')

Train loss of 162082.578
Train accuracy of 89.65


## Test

In [26]:
num_epochs = 5
total_loss = 0
num_correct = 0
to_eval = test_data
for sentence, label in to_eval:
    model.eval()

    vec = make_bow_vector(sentence, word_to_idx)
    target = torch.LongTensor([label])

    log_probs = model(vec)
    pred = log_probs.argmax().detach().numpy()
    correct = int(pred == label)
    num_correct += correct
    loss = loss_function(log_probs, target)

    total_loss += loss.item()
print(f'Test loss of {round(total_loss,3)}')
print(f'Test accuracy of {round(num_correct*100/len(to_eval),2)}')


Test loss of 125092.704
Test accuracy of 86.01


## Model Parameters

In [54]:
params = []
for p in model.parameters():
    print(p)
    a = p.detach().numpy()
    params.append(a)

Parameter containing:
tensor([[ 7.8233e-01,  1.4648e-01,  4.0102e-01,  ...,  2.3998e-03,
         -6.2271e-04, -1.0051e-01],
        [-7.7985e-01, -1.4611e-01, -3.9964e-01,  ..., -8.9297e-04,
         -9.6313e-04,  1.0101e-01]], requires_grad=True)
Parameter containing:
tensor([ 5.4877, -5.4903], requires_grad=True)


In [59]:
a = params[0][1]

In [60]:
a.shape

(95091,)

In [88]:
sorted_idx = a.argsort()

In [89]:
for idx in sorted_idx[:10]:
    print(idx_to_word[idx],round(a[idx],3))

worst -49.409
waste -33.08
awful -31.459
boring -29.581
terrible -24.071
worse -23.892
fails -23.09
mess -22.583
dull -22.37
unfortunately -22.146


In [90]:
for idx in sorted_idx[-10:]:
    print(idx_to_word[idx],round(a[idx],3))

brilliant 17.592
710 18.039
beautiful 18.873
wonderful 20.09
definitely 20.4
today 20.702
loved 21.29
amazing 23.117
perfect 31.693
excellent 33.634


In [107]:
examples = [(sentence,label) for sentence,label in train_data if '710' in sentence]

In [127]:
def show_text(example):
    sentence,label = example
    print(f'label is {label}'.upper())
    print('----------------------------------------')
    print(''.join(f'{w} ' for w in sentence))

In [129]:
show_text(examples[8])

LABEL IS 1
----------------------------------------
was'nt really bad for raw 's first ppv of 006 . but the ending was really really shocking to everyone in attendance & the ones who were watching at home.  br    br   first match- ric flair vs . edge w lita for the wwe intercontinental championship not a bad opener , these two can seriously put on a great match if they had more time to put on a wrestling match . flair wins by dq after edge slams him with his mitb briefcase . 310 second match- trish stratus vs. mickie james for the wwe women 's championship not bad noticing the fact that this is the first time these divas faced off in the ring together . mickie goes for a modified chick kick , but trish ducks & nails her own chick kick for the win to retain her title . 310 third match- triple h vs. big show seriously good this match was , really . the whole match hhh focuses on big show 's injured arm but big show still fights back . later hhh is able to topple down big show & nails a p

In [121]:
len(examples)

133

In [124]:
sum([label for _,label in examples])

128