In [70]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np

torch.manual_seed(1)

#from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

## Load data
source is https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [2]:
data_path = '/Users/tyler/Documents/programming/pytorch_nlp/data/word2vec-nlp-tutorial/'

In [3]:
df = pd.read_csv(data_path+'labeledTrainData.tsv',sep='\t')

In [4]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
train = df[:15000]
val = df[15000:20000]
test = df[20000:]

In [6]:
len(train),len(val),len(test)

(15000, 5000, 5000)

In [7]:
def process_review(review):
    chars = ['/','\\','>','<','-','br']
    chars.extend('1 2 3 4 5 6 7 8 9 0'.split())
    for char in chars:
        review = review.replace(char,'')
    
    tokens = word_tokenize(review)
    tokens = [t.lower() for t in tokens]
    return tokens

## Make vocab and train data

In [8]:
labels = list(train.sentiment)
reviews = list(train.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

train_data = list(zip(all_words,labels))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))




In [10]:
flat_list = [item for sublist in all_words for item in sublist]
vocab = set(flat_list)

len(vocab)

word_to_idx = {word:idx for idx,word in enumerate(list(vocab))}

counts = Counter(flat_list)

In [166]:
counts = Counter(flat_list).most_common()

In [169]:
start = 10
keep = counts[start:20000+start]

In [172]:
keep[:5]

[('i', 52046), ('this', 45732), ('that', 44178), ("'s", 37794), ('was', 30368)]

In [173]:
vocab = [word for word,count in keep]
vocab.append('UNK')

In [176]:
word_to_idx = {word:idx for idx,word in enumerate(list(vocab))}
idx_to_word = {idx:word for word,idx in word_to_idx.items()}

## Make validation and test data

In [12]:
labels = list(test.sentiment)
reviews = list(test.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

test_data = list(zip(all_words,labels))

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [13]:
labels = list(val.sentiment)
reviews = list(val.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

val_data = list(zip(all_words,labels))

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




## CBOW --> Embeddings

In [20]:
data = []
for sentence, label in tqdm(train_data):
    for i in range(2, len(sentence) - 2):
        context = [sentence[i - 2], sentence[i - 1],sentence[i + 1], sentence[i + 2]]
        target = sentence[i]
        data.append((context, target))

HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))




In [178]:
tensor_data = []

for context,target_word in data:
    inputs = make_context_vector(context, word_to_idx)
    target = to_idx(target_word)
    tensor_data.append((inputs,target))

In [179]:
data_loader = DataLoader(tensor_data, batch_size=1000, shuffle=True)

In [90]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size,batch_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds_sum = embeds.sum(dim=1)
        out = F.relu(self.linear1(embeds_sum))
        out = self.linear2(out)
        out = out.view(batch_size,-1)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [109]:
def to_idx(w):
    idx = word_to_idx.get(w)
    if idx is None:
        idx = 20000
    return idx

In [112]:
def make_context_vector(context, word_to_idx):
    
    idxs = [to_idx(w) for w in context]
    return torch.tensor(idxs, dtype=torch.long)

In [185]:
vocab_size = len(vocab) + 1
embedding_dim = 10
context_size = 4
batch_size = 1000
cbow = CBOW(vocab_size, embedding_dim, context_size,batch_size)

In [186]:
losses = []
loss_function = nn.NLLLoss()
optimizer = optim.SGD(cbow.parameters(), lr=0.001)

In [191]:
losses = []
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = []
    idx = 0
    for context,target_word in data_loader:
        if idx  < 200:
            #inputs = make_context_vector(context, word_to_idx)
            #target = torch.tensor([word_to_idx[target_word]], dtype=torch.long)
            inputs = context
            target = target_word
            cbow.zero_grad()

            log_probs = cbow(inputs)
            loss = loss_function(log_probs, target)
            loss.backward()
            optimizer.step()
            idx += 1

    train_loss.append(loss.item())
    mean_train_loss = np.mean(train_loss)
    losses.append(round(mean_train_loss,2))

    print(f'Epoch {epoch+1}/{num_epochs}: train loss of {mean_train_loss:.3f}')

Epoch 1/10: train loss of 8.339
Epoch 2/10: train loss of 8.000
Epoch 3/10: train loss of 8.044
Epoch 4/10: train loss of 7.909
Epoch 5/10: train loss of 7.744
Epoch 6/10: train loss of 7.975
Epoch 7/10: train loss of 7.644
Epoch 8/10: train loss of 7.706
Epoch 9/10: train loss of 7.805
Epoch 10/10: train loss of 7.698


In [396]:
for context,target_word in data_loader:
    inputs = context
    target = target_word
    cbow.eval()
    log_probs = cbow(inputs)
    argmax = log_probs.argmax(dim=1)
    
    for i in range(20):
        c = context[i]
        c1 = idx_to_word[int(c[0])]
        c2 = idx_to_word[int(c[1])]
        c3 = idx_to_word[int(c[2])]
        c4 = idx_to_word[int(c[3])]
        prediction = idx_to_word[int(argmax[i])]
        t = idx_to_word[int(target[i])]
        print(f'{c1} {c2} [{t}] {c3} {c4} ==> {prediction}')

    break

waiting UNK [get] married ... ==> UNK
UNK worst [UNK] all UNK ==> UNK
irish filmmakers [kim] UNK UNK ==> UNK
really UNK [UNK] UNK me ==> UNK
forced UNK [UNK] 's an ==> UNK
UNK UNK [greek] tragedy UNK ==> UNK
horror movie [UNK] supposed UNK ==> UNK
( UNK [UNK] climax UNK ==> UNK
UNK annoying [UNK] UNK bit ==> this
tv show [UNK] i wo ==> UNK
interesting drama [UNK] considering UNK ==> UNK
UNK rather [bad] UNK though ==> UNK
again that [he] UNK absolutely ==> UNK
film UNK [you] can assume ==> UNK
suspects ) [UNK] found their ==> UNK
UNK despicable [judge] who allows ==> UNK
UNK london [UNK] she UNK ==> UNK
arjun does [reach] boiling point ==> UNK
but now [that] i 've ==> UNK
transformed from [being] that romantic ==> UNK


In [194]:
save_path = 'model_checkpoints/embeddings.pt'

In [196]:
torch.save(cbow.state_dict(),save_path)

In [197]:
weights = cbow.embeddings.weight.detach().cpu().numpy()

## Embeddings --> Prediction

In [230]:
def make_target(label, label_to_idx):
    return torch.LongTensor([label_to_idx[label]])

def make_bow_vector(sentence, word_to_idx):
    vec = torch.zeros(len(word_to_idx),dtype=torch.long)
    for word in sentence:
        if word in word_to_idx:
            vec[word_to_idx[word]] += 1
    return vec.view(1, -1)

In [313]:
class embed_classifier(nn.Module):
    def __init__(self, num_labels, vocab_size, embedding_dim,hidden):
        super(embed_classifier, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, hidden)
        self.linear_2 = nn.Linear(hidden, num_labels)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds_sum = embeds.sum(dim=1)
        x = self.linear(embeds_sum)
        #rint(x.shape)
        x = self.linear_2(x)
        #print(x.shape)
        return F.log_softmax((x), dim=1)

In [378]:
VOCAB_SIZE = len(word_to_idx) + 1
NUM_LABELS = 2
embedding_dim = 10
hidden = 10
model = embed_classifier(NUM_LABELS, VOCAB_SIZE,embedding_dim,hidden)

In [379]:
len(word_to_idx)

20001

In [380]:
model.embeddings = nn.Embedding.weight.data.copy_(torch.tensor(weights))
model.embeddings.weight.requires_grad = False

In [381]:
loss_function = nn.NLLLoss()
lr = .001
#optimizer = optim.SGD(model.parameters(), lr=0.0001)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [382]:
with torch.no_grad():
    sample = all_words[4]
    bow_vector = make_bow_vector(sample[:20], word_to_idx)
    log_probs = model(bow_vector)
    print(log_probs)
    loss = -loss_function(log_probs, target)

tensor([[-2718.1111,     0.0000]])


In [392]:
save_path = 'model_checkpoints/embed_classifier.pt'
val_loss_min = np.Inf
val_loss_min = 0.694
num_epochs = 10
loss_history = []
for epoch in range(num_epochs):
    train_loss = []
    for sentence, label in train_data:
        model.zero_grad()

        vec = make_bow_vector(sentence, word_to_idx)
        vec = torch.LongTensor(vec)
        target = torch.LongTensor([label])

        log_probs = model(vec)

        loss = loss_function(log_probs, target)
        #print(loss)
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
        
    mean_train_loss = np.mean(train_loss)
    val_loss = []
    for sentence, label in val_data:
        model.eval()

        vec = make_bow_vector(sentence, word_to_idx)
        target = torch.LongTensor([label])

        log_probs = model(vec)
        pred = log_probs.argmax().detach().numpy()
        loss = loss_function(log_probs, target)

        val_loss.append(loss.item())
        
    mean_val_loss = np.mean(val_loss)
    
    loss_history.append((mean_train_loss,mean_val_loss))
    print(f'Epoch {epoch+1}/{num_epochs}: train loss of {mean_train_loss:.3f}, val loss of {mean_val_loss:.3f}')
    
    if mean_val_loss <= val_loss_min:
        print(f'Val loss decreased {val_loss_min:.3f} --> {mean_val_loss:.3f} saving model...')
        torch.save(model.state_dict(),save_path)
        val_loss_min = mean_val_loss


Epoch 1/10: train loss of 1.122, val loss of 0.695
Epoch 2/10: train loss of 1.101, val loss of 0.696
Epoch 3/10: train loss of 1.125, val loss of 0.699
Epoch 4/10: train loss of 1.090, val loss of 0.697
Epoch 5/10: train loss of 1.114, val loss of 0.700
Epoch 6/10: train loss of 1.086, val loss of 0.714


KeyboardInterrupt: 

In [388]:
names = 'train val test'.split()
data_list = [train_data,val_data,test_data]

for name,data in zip(names,data_list):
    eval_loss = []
    num_correct = 0
    to_eval = test_data
    for sentence, label in data:
        model.eval()

        vec = make_bow_vector(sentence, word_to_idx)
        target = torch.LongTensor([label])

        log_probs = model(vec)
        pred = log_probs.argmax().detach().numpy()
        correct = int(pred == label)
        num_correct += correct
        loss = loss_function(log_probs, target)

        eval_loss.append(loss.item())
    
    mean_loss = np.mean(eval_loss)
    print(f'----{name} set----'.upper())
    print(f'{name} loss of {round(mean_loss,3)}')
    print(f'{name} accuracy of {round(num_correct*100/len(data),2)}')


----TRAIN SET----
train loss of 0.695
train accuracy of 50.09
----VAL SET----
val loss of 0.695
val accuracy of 50.3
----TEST SET----
test loss of 0.696
test accuracy of 49.44


## Use pretained word vectors

In [508]:
class glove_classifier(nn.Module):
    def __init__(self, num_labels, vocab_size, embedding_dim,hidden,glove):
        super(glove_classifier, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(glove.vectors))
        self.embeddings.requires_grad = False
        self.linear = nn.Linear(embedding_dim, hidden)
        self.linear_2 = nn.Linear(hidden, num_labels)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds_sum = embeds.sum(dim=1)
        x = self.linear(embeds_sum)
        #rint(x.shape)
        x = self.linear_2(x)
        #print(x.shape)
        return F.softmax((x), dim=1)

In [398]:
from gensim.models import KeyedVectors

In [404]:
path = '/Users/tyler/Documents/programming/embeddings/models/glove.840B.300d.model'


In [405]:
glove = KeyedVectors.load_word2vec_format(path,limit=100000)


In [406]:
weights = torch.FloatTensor(glove.vectors)

In [526]:
NUM_LABELS = 2
VOCAB_SIZE = 100000
embedding_dim = 300
hidden = 100

model = glove_classifier(NUM_LABELS, VOCAB_SIZE,embedding_dim,hidden,glove)

loss_function = nn.NLLLoss()
loss_function = nn.BCELoss()

lr = .005
#optimizer = optim.SGD(model.parameters(), lr=0.0001)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [525]:
a = torch.tensor([0,1],dtype=torch.float)
b = torch.tensor([.5,.5],dtype=torch.float)
loss_function(a,b)

tensor(13.8155)

In [498]:
weights.shape

torch.Size([100000, 300])

In [499]:
glove.index2word[5]

'of'

In [447]:
word2idx = {word:idx for idx,word in enumerate(glove.vocab.keys())}

In [451]:
word2idx['episode']

2534

In [490]:
def make_input(sentence, word2idx):
    vec = torch.zeros(len(word2idx),dtype=torch.long)
    for word in sentence:
        if word in word2idx:
            vec[word2idx[word]] = 1
    return vec.view(1, -1)

In [493]:
sentence = ', , this is a sentence asdf'.split()
x = make_input(sentence, word2idx)

In [510]:
x

tensor([[1, 0, 0,  ..., 0, 0, 0]])

In [511]:
target = torch.tensor([0,1],dtype=torch.float)

In [527]:
with torch.no_grad():
    sample = all_words[4]
    vec = make_input(sample[:20], word_to_idx)
    log_probs = model(vec)
    print(log_probs)
    loss = loss_function(log_probs, target)

tensor([[1., 0.]])


In [528]:
save_path = 'model_checkpoints/glove_classifier.pt'
val_loss_min = np.Inf
#val_loss_min = 0.694
num_epochs = 10
loss_history = []
for epoch in range(num_epochs):
    train_loss = []
    for sentence, label in train_data[:1000]:
        model.zero_grad()

        vec = make_input(sentence, word2idx)
        vec = torch.LongTensor(vec)
        if label == 0:
            target = torch.tensor([1,0],dtype=torch.float)
        else:
            target = torch.tensor([0,1],dtype=torch.float)
        #target = torch.LongTensor([label])

        log_probs = model(vec)

        loss = loss_function(log_probs, target)
        #print(loss)
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
        
    mean_train_loss = np.mean(train_loss)
    val_loss = []
    for sentence, label in val_data[:100]:
        model.eval()

        vec = make_input(sentence, word2idx)
        if label == 0:
            target = torch.tensor([1,0],dtype=torch.float)
        else:
            target = torch.tensor([0,1],dtype=torch.float)
        #target = torch.LongTensor([label])

        log_probs = model(vec)
        pred = log_probs.argmax().detach().numpy()
        loss = loss_function(log_probs, target)

        val_loss.append(loss.item())
        
    mean_val_loss = np.mean(val_loss)
    
    loss_history.append((mean_train_loss,mean_val_loss))
    print(f'Epoch {epoch+1}/{num_epochs}: train loss of {mean_train_loss:.3f}, val loss of {mean_val_loss:.3f}')
    
    if mean_val_loss <= val_loss_min:
        print(f'Val loss decreased {val_loss_min:.3f} --> {mean_val_loss:.3f} saving model...')
        torch.save(model.state_dict(),save_path)
        val_loss_min = mean_val_loss


Epoch 1/10: train loss of 13.318, val loss of 14.092
Val loss decreased inf --> 14.092 saving model...
Epoch 2/10: train loss of 13.318, val loss of 14.092
Val loss decreased 14.092 --> 14.092 saving model...
Epoch 3/10: train loss of 13.318, val loss of 14.092
Val loss decreased 14.092 --> 14.092 saving model...
Epoch 4/10: train loss of 13.318, val loss of 14.092
Val loss decreased 14.092 --> 14.092 saving model...
Epoch 5/10: train loss of 13.318, val loss of 14.092
Val loss decreased 14.092 --> 14.092 saving model...
Epoch 6/10: train loss of 13.318, val loss of 14.092
Val loss decreased 14.092 --> 14.092 saving model...


KeyboardInterrupt: 

In [514]:
mean_val_loss

13.539200534820557

In [487]:
pred

array(0)

In [515]:
log_probs

tensor([[0., 1.]], grad_fn=<SoftmaxBackward>)

In [489]:
names = 'train val test'.split()
num = 100
data_list = [train_data[:num],val_data[:num],test_data[:num]]

for name,data in zip(names,data_list):
    eval_loss = []
    num_correct = 0
    to_eval = test_data
    for sentence, label in data:
        model.eval()

        vec = make_input(sentence, word2idx)
        target = torch.LongTensor([label])

        log_probs = model(vec)
        pred = log_probs.argmax().detach().numpy()
        correct = int(pred == label)
        num_correct += correct
        loss = loss_function(log_probs, target)

        eval_loss.append(loss.item())
    
    mean_loss = np.mean(eval_loss)
    print(f'----{name} set----'.upper())
    print(f'{name} loss of {round(mean_loss,3)}')
    print(f'{name} accuracy of {round(num_correct*100/len(data),2)}')


----TRAIN SET----
train loss of 4116.119
train accuracy of 55.0
----VAL SET----
val loss of 4664.869
val accuracy of 49.0
----TEST SET----
test loss of 4756.277
test accuracy of 48.0


In [473]:
data.index((sentence, label))

9665