In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np

torch.manual_seed(1)

#from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from gensim.models import KeyedVectors
from torch.nn.utils.rnn import pad_sequence
from glob import glob

In [2]:
%load_ext tensorboard

In [3]:
from tensorboard import notebook

In [4]:
notebook.list()

No known TensorBoard instances running.


## Load data
source is https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [5]:
data_path = '/Users/tyler/Documents/programming/pytorch_nlp/data/word2vec-nlp-tutorial/'

In [6]:
df = pd.read_csv(data_path+'labeledTrainData.tsv',sep='\t')

In [7]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [8]:
train = df[:15000]
val = df[15000:20000]
test = df[20000:]

In [9]:
len(train),len(val),len(test)

(15000, 5000, 5000)

In [10]:
def process_review(review):
    chars = ['/','\\','>','<','-','br']
    chars.extend('1 2 3 4 5 6 7 8 9 0'.split())
    for char in chars:
        review = review.replace(char,'')
    
    tokens = word_tokenize(review)
    tokens = [t.lower() for t in tokens]
    return tokens

## Make vocab and train data

In [11]:
labels = list(train.sentiment)
reviews = list(train.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

train_data = list(zip(all_words,labels))

HBox(children=(FloatProgress(value=0.0, max=15000.0), HTML(value='')))




In [12]:
flat_list = [item for sublist in all_words for item in sublist]
vocab = set(flat_list)

len(vocab)

word_to_idx = {word:idx for idx,word in enumerate(list(vocab))}

counts = Counter(flat_list)

In [13]:
counts = Counter(flat_list).most_common()

In [14]:
start = 10
keep = counts[start:20000+start]

In [15]:
keep[:5]

[('i', 52046), ('this', 45732), ('that', 44178), ("'s", 37794), ('was', 30368)]

In [16]:
vocab = [word for word,count in keep]
vocab.append('UNK')

In [17]:
word_to_idx = {word:idx for idx,word in enumerate(list(vocab))}
idx_to_word = {idx:word for word,idx in word_to_idx.items()}

## Make validation and test data

In [18]:
labels = list(test.sentiment)
reviews = list(test.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

test_data = list(zip(all_words,labels))

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [19]:
labels = list(val.sentiment)
reviews = list(val.review.values)

all_words = [process_review(review) for review in tqdm(reviews)]

val_data = list(zip(all_words,labels))

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




## Use pretained word vectors

In [20]:
path = '/Users/tyler/Documents/programming/embeddings/models/glove.840B.300d.model'
glove = KeyedVectors.load_word2vec_format(path,limit=100000)
weights = torch.FloatTensor(glove.vectors)

In [21]:
glove.index2word[5]

'of'

In [66]:
word2idx = {word:idx for idx,word in enumerate(glove.vocab.keys())}

In [67]:
def make_input(sentence, word2idx):
    vec = torch.zeros(len(word2idx),dtype=torch.long)
    for word in sentence:
        if word in word2idx:
            vec[word2idx[word]] = 1
    return vec.view(1, -1)

In [83]:
def make_input(sentence, word2idx):
    vec = torch.zeros(len(word2idx),dtype=torch.long)
    word_idx = []
    for word in sentence:
        if word in word2idx:
            word_idx.append(word2idx[word])
    return torch.tensor(word_idx,dtype=torch.long)

In [84]:
sentence = 'this is a sentence'.split()
make_input(sentence, word2idx)

tensor([  27,   10,    6, 4757])

In [23]:
def make_padded_input(sentence, word2idx):
    vec = np.zeros((50,300))
    vec_idx = 0
    for word in sentence:
        if vec_idx < 50:
            if word in glove:
                vec[vec_idx] = glove[word]
                vec_idx += 1
                
    #vec = vec.mean(axis=0)
    ten = torch.tensor(vec,dtype=torch.float)
    
    return ten

In [24]:
sentence = ', , this is a sentence asdf'.split()
x = make_padded_input(sentence, word2idx)

In [25]:
x.shape

torch.Size([50, 300])

In [100]:
import torch.autograd as autograd

In [251]:
class lstm_clf(nn.Module):
    def __init__(self, num_labels, vocab_size, embedding_dim, hidden, weight,max_len):
        
        super(lstm_clf, self).__init__()
        self.hidden_dim = hidden
        self.embedding_dim = embedding_dim
        self.linear = nn.Linear(embedding_dim * max_len, self.hidden_dim)
        self.linear_2 = nn.Linear(self.hidden_dim, num_labels)
        self.dropout = nn.Dropout(p=.3)
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weight))
        self.embedding.requires_grad = False
        
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=1,batch_first=True)
        self.hidden = self.init_hidden()

    def forward(self, inputs,h):
        #x = inputs.view(1,-1)
        embeds = self.embedding(torch.LongTensor(inputs))
        embeds = embeds.view(1,-1,self.embedding_dim)
        #print(embeds.shape)
        
        #self.hidden = self.init_hidden(self.hidden_dim)
        #print(self.hidden[0].shape)
        lstm_out, h = self.lstm(embeds,h)
        #print(lstm_out.shape)
        
        x = self.linear_2(lstm_out[:,-1])
        #x = self.dropout(x)
        probs = F.softmax((x), dim=1)
        return probs[0],h
    
    def init_hidden(self):
        #return (autograd.Variable(torch.zeros(1, 1, size)),autograd.Variable(torch.zeros(1, 1, size)))
        batch_size = 1
        h0 = torch.zeros((1,batch_size,self.hidden_dim))
        c0 = torch.zeros((1,batch_size,self.hidden_dim))
        hidden = (h0,c0)
        return hidden

In [252]:
NUM_LABELS = 2
VOCAB_SIZE = 100000
embedding_dim = 300
hidden = 100
max_len = 50

model = lstm_clf(NUM_LABELS, VOCAB_SIZE,embedding_dim,hidden,glove.vectors,max_len)


loss_function = nn.NLLLoss()
loss_function = nn.BCELoss()

lr = .001
#optimizer = optim.SGD(model.parameters(), lr=0.0001)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [253]:
sentence = ', , this is a cool sentence'.split()
x = make_input(sentence, word2idx)

In [254]:
x

tensor([   0,    0,   27,   10,    6,  970, 4757])

In [260]:
h = model.init_hidden()
probs,h = model(x,h)

In [261]:
probs

tensor([0.4098, 0.5902], grad_fn=<SelectBackward>)

In [218]:
len(val_data)

5000

In [221]:
import tensorboard
from torch.utils.tensorboard import SummaryWriter

In [222]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [283]:
len(files)

0

In [284]:
files

[]

In [288]:
def get_run_version():
    files = glob('logs/*')
    return f'v{str(len(files))}'

In [290]:
get_run_version()

'v0'

In [292]:
log_dir = get_run_version()
writer = SummaryWriter(f'logs/{log_dir}')
print(log_dir)

v0


In [293]:
save_path = 'model_checkpoints/lstm.pt'
val_loss_min = np.Inf
#val_loss_min = 0.694
num_epochs = 10
loss_history = []
batch_size = 1
for epoch in range(num_epochs):
    train_loss = []
    
    h = model.init_hidden()
    
    for sentence, label in train_data[:2000]:
        h = tuple([each.data for each in h])
        model.zero_grad()

        vec = make_input(sentence, word2idx)
        if label == 0:
            target = torch.tensor([1,0],dtype=torch.float)
        else:
            target = torch.tensor([0,1],dtype=torch.float)

        probs,h = model(vec,h)

        loss = loss_function(probs, target)
        #print(loss)
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.item())
        
    mean_train_loss = np.mean(train_loss)
    
    val_h = model.init_hidden()
    val_loss = []
    for sentence, label in val_data[:1000]:
        val_h = tuple([each.data for each in val_h])
        model.eval()

        vec = make_input(sentence, word2idx)
        if label == 0:
            target = torch.tensor([1,0],dtype=torch.float)
        else:
            target = torch.tensor([0,1],dtype=torch.float)

        probs,val_h = model(vec,val_h)
        pred = probs.argmax().detach().numpy()
        loss = loss_function(probs, target)

        val_loss.append(loss.item())
        
    mean_val_loss = np.mean(val_loss)
    
    loss_history.append((mean_train_loss,mean_val_loss))
    print(f'Epoch {epoch+1}/{num_epochs}: train loss of {mean_train_loss:.3f}, val loss of {mean_val_loss:.3f}')
    
    if mean_val_loss <= val_loss_min:
        print(f'Val loss decreased {val_loss_min:.3f} --> {mean_val_loss:.3f} saving model...')
        torch.save(model.state_dict(),save_path)
        val_loss_min = mean_val_loss
    
    writer.add_scalar('Loss (train)', mean_train_loss, epoch)
    writer.add_scalar('Loss (val)', mean_val_loss, epoch)
writer.close()


Epoch 1/10: train loss of 0.238, val loss of 0.409
Val loss decreased inf --> 0.409 saving model...
Epoch 2/10: train loss of 0.162, val loss of 0.523
Epoch 3/10: train loss of 0.084, val loss of 0.599
Epoch 4/10: train loss of 0.046, val loss of 0.705
Epoch 5/10: train loss of 0.036, val loss of 0.828


KeyboardInterrupt: 

## Evaluate Model

In [294]:
names = 'train val test'.split()
num = 1000
data_list = [train_data[:num],val_data[:num],test_data[:num]]
#data_list = [train_data,val_data,test_data]

for name,data in zip(names,data_list):
    h = model.init_hidden()
    eval_loss = []
    num_correct = 0
    to_eval = test_data
    for sentence, label in data:
        h = tuple([each.data for each in h])
        model.eval()

        vec = make_input(sentence, word2idx)        
        if label == 0:
            target = torch.tensor([1,0],dtype=torch.float)
        else:
            target = torch.tensor([0,1],dtype=torch.float)

        probs,h = model(vec,h)
        pred = probs.argmax().detach().numpy()
        correct = int(pred == label)
        num_correct += correct
        loss = loss_function(probs, target)

        eval_loss.append(loss.item())
    
    mean_loss = np.mean(eval_loss)
    print(f'----{name} set----'.upper())
    print(f'{name} loss of {round(mean_loss,3)}')
    print(f'{name} accuracy of {round(num_correct*100/len(data),2)}')


----TRAIN SET----
train loss of 0.033
train accuracy of 99.2
----VAL SET----
val loss of 0.827
val accuracy of 78.0
----TEST SET----
test loss of 0.812
test accuracy of 77.8
