In [1]:
import spacy
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the long-review IMDB dataset.
# Raw string: the Windows path contains backslashes (`\D`, `\d`, `\i`) that are
# invalid escape sequences in a normal string literal and trigger a
# SyntaxWarning/DeprecationWarning on recent Python versions.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(r"E:\Data\datasets\imdb_long_text_dataset.csv")

In [3]:
# Preview the first rows of the loaded frame as a quick sanity check
df.head()

Unnamed: 0,review,sentiment,token_lengths
0,So im not a big fan of Boll's work but then ag...,negative,563
1,"""The Cell"" is an exotic masterpiece, a dizzyin...",positive,749
2,'War movie' is a Hollywood genre that has been...,positive,845
3,"Taut and organically gripping, Edward Dmytryk'...",positive,608
4,One of the most significant quotes from the en...,positive,908


In [4]:
# Fixed seed so the train/val/test partition (and every number derived from it)
# is reproducible across kernel restarts.
SPLIT_SEED = 42

# Hold out 20% of the data for testing, stratified on the sentiment label
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'],
    stratify=df['sentiment'], test_size=0.2, random_state=SPLIT_SEED,
)
# Carve a further 20% of the remaining training data out as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    stratify=y_train, test_size=0.2, random_state=SPLIT_SEED,
)

In [5]:
# Show the normalised class balance of every split (test, validation, train)
for _header, _split_labels in (("y_test\n", y_test), ("y_val\n", y_val), ("y_train\n", y_train)):
    print (_header, _split_labels.value_counts(normalize=True), '\n')

y_test
 sentiment
positive    0.514089
negative    0.485911
Name: proportion, dtype: float64 

y_val
 sentiment
positive    0.514605
negative    0.485395
Name: proportion, dtype: float64 

y_train
 sentiment
positive    0.514071
negative    0.485929
Name: proportion, dtype: float64 



In [6]:
# # Loading spacy as sentence chunker
# nlp = spacy.load("en_core_web_sm")

In [7]:
# Chunking each document
# X_train_chunked = [nlp(each_sent) for each_sent in X_train]
# X_train_chunked = [[sent for sent in nlp(each_sent).sents] for each_sent in X_train[:20]]


# # Spacy takes too long, will chunk lexically first
# X_train_chunked = [each_sent.split(". ") for each_sent in X_train]
# X_val_chunked = [each_sent.split(". ") for each_sent in X_val
# X_test_chunked = [each_sent.split(". ") for each_sent in X_test]

# Spacy takes too long, so documents are chunked lexically on ". " instead
def _split_into_chunks(documents):
    """Split every document string into a list of chunks on the literal ". "."""
    return [document.split(". ") for document in documents]


# Only the first 20 documents of each split are chunked for now
X_train_chunked = _split_into_chunks(X_train[:20])
X_val_chunked = _split_into_chunks(X_val[:20])
X_test_chunked = _split_into_chunks(X_test[:20])

In [8]:
# Report the largest number of chunks found in a single document, per split
for _message, _chunked_docs in (
    ("Max chunk length for X_train: ", X_train_chunked),
    ("Max chunk length for X_val: ", X_val_chunked),
    ("Max chunk length for X_test: ", X_test_chunked),
):
    print (_message, max(map(len, _chunked_docs)))

Max chunk length for X_train:  35
Max chunk length for X_val:  67
Max chunk length for X_test:  33


In [9]:
# Number of documents in the full training split
len(X_train)

4655

In [10]:
# Number of training documents that were actually chunked (the [:20] subset)
len(X_train_chunked)

20

In [11]:
# Local directory of the pre-trained encoder (named for distilbert-base-uncased).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
encoder_path = "D:\\DSAI\\Pre-Trained Models\\distilbert-base-uncased"

# Load the tokenizer stored in that directory; the encoder itself is not loaded here
tokenizer = AutoTokenizer.from_pretrained(encoder_path)
# encoder = AutoModel.from_pretrained(encoder_path)



In [12]:
def _tokenize_chunked_docs(chunked_docs):
    """Tokenize every chunk of every document, keeping the doc -> chunk nesting.

    Each chunk is tokenized on its own, padded to the tokenizer's max length,
    truncated if too long, and returned as PyTorch tensors.
    """
    tokenized_docs = []
    for chunks in chunked_docs:
        tokenized_docs.append([
            tokenizer(chunk, padding='max_length', truncation=True, return_tensors='pt')
            for chunk in chunks
        ])
    return tokenized_docs


X_train_tokenized = _tokenize_chunked_docs(X_train_chunked)
X_val_tokenized = _tokenize_chunked_docs(X_val_chunked)
X_test_tokenized = _tokenize_chunked_docs(X_test_chunked)

In [13]:
# X_train_tokenized[0][0]

In [14]:
# X_train_tokenized[0][0]['input_ids'].clone().detach()

In [15]:
# train_seq = [[sent['input_ids'].clone().detach() for sent in doc] for doc in X_train_tokenized]
# train_mask = [[sent['attention_mask'].clone().detach() for sent in doc] for doc in X_train_tokenized]

# val_seq = [[sent['input_ids'].clone().detach() for sent in doc] for doc in X_val_tokenized]
# val_mask = [[sent['attention_mask'].clone().detach() for sent in doc] for doc in X_val_tokenized]

# test_seq = [[sent['input_ids'].clone().detach() for sent in doc] for doc in X_test_tokenized]
# test_mask = [[sent['attention_mask'].clone().detach() for sent in doc] for doc in X_test_tokenized]

# train_label = torch.tensor(y_train.map({'positive':1, 'negative':0}).tolist())
# val_label = torch.tensor(y_val.map({'positive':1, 'negative':0}).tolist())
# test_label = torch.tensor(y_test.map({'positive':1, 'negative':0}).tolist())

In [16]:
def get_doc_tensors(document_corpus, embedding_to_extract, max_chunks=30, max_sentence_token_len=512):
    """Build one fixed-shape tensor holding a tokenizer field for every document.

    Args:
        document_corpus: iterable of documents; each document is a sequence of
            per-sentence mappings (e.g. tokenizer outputs) whose
            ``embedding_to_extract`` entry is a 2-D tensor. Only its first row
            is used.
        embedding_to_extract: key to read from each sentence mapping,
            e.g. ``'input_ids'`` or ``'attention_mask'``.
        max_chunks: number of sentence rows every document ends up with.
            Shorter documents are padded at the FRONT with all-zero rows,
            longer ones keep only their first ``max_chunks`` sentences.
        max_sentence_token_len: width of the all-zero padding rows; it must
            equal the token length of the sentence tensors.

    Returns:
        Tensor of shape ``(n_docs, max_chunks, token_len)`` with the same dtype
        as the extracted sentence tensors.
    """
    doc_list = []
    for doc in document_corpus:
        # [0] takes the first row of each sentence tensor before stacking
        sent_seqs = torch.stack(
            [sent[embedding_to_extract].detach().clone()[0] for sent in doc],
            dim=0,
        )

        n_missing = max_chunks - sent_seqs.size(0)
        if n_missing > 0:
            # Pad with the SAME dtype/device as the data: a default float32
            # zeros tensor would promote integer token ids / masks to float
            # when concatenated, which breaks embedding lookups downstream.
            empty_sent_to_pad = torch.zeros(
                n_missing,
                max_sentence_token_len,
                dtype=sent_seqs.dtype,
                device=sent_seqs.device,
            )
            sent_seqs = torch.cat((empty_sent_to_pad, sent_seqs), dim=0)
        else:
            # Keep only the first max_chunks sentences
            sent_seqs = sent_seqs[:max_chunks, :]

        doc_list.append(sent_seqs)

    return torch.stack(doc_list, dim=0)

In [17]:
# Token-id and attention-mask tensors for each split, built field by field
train_seq, train_mask = (
    get_doc_tensors(X_train_tokenized, embedding_to_extract=field)
    for field in ('input_ids', 'attention_mask')
)

val_seq, val_mask = (
    get_doc_tensors(X_val_tokenized, embedding_to_extract=field)
    for field in ('input_ids', 'attention_mask')
)

test_seq, test_mask = (
    get_doc_tensors(X_test_tokenized, embedding_to_extract=field)
    for field in ('input_ids', 'attention_mask')
)

# Sentiment strings -> integer class ids
label_ids = {'positive':1, 'negative':0}

# Labels are cut to the first 20 entries to line up with the 20-document
# subset that was chunked and tokenized; drop the slice when using full splits.
train_label = torch.tensor(y_train.map(label_ids).tolist()[:20])
val_label = torch.tensor(y_val.map(label_ids).tolist()[:20])
test_label = torch.tensor(y_test.map(label_ids).tolist()[:20])

In [18]:
# doc_list = []
# for doc in X_train_tokenized:
#     sent_list = []
#     for sent in doc:
#         sent_list.append(sent['input_ids'].clone().detach()[0])
        
#     sent_seqs = torch.stack(sent_list, dim=0)

#     if sent_seqs.size()[0] < 30: # keep it below 30 sentences for now
#         empty_sent_to_pad = torch.zeros(30-sent_seqs.size()[0], 512)

#         sent_seqs = torch.cat((empty_sent_to_pad, sent_seqs), dim=0)

#     else:
#         sent_seqs = sent_seqs[:30, :]

#     doc_list.append(sent_seqs)

In [19]:
# sent_seqs.size()

In [20]:
# torch.zeros(5, 512).size()

In [21]:
# sent_seqs[:2, :]

In [22]:
# sent_list

In [23]:
# doc_seq = torch.stack(doc_list, dim=0) # Need to pad to max length for this one! Else the shapes won't fit

# Probably have to use the torch.zeros method and slowly fill in the tensor?
## Not needed: the tensor can simply be padded manually. - Solo

In [24]:
# doc_seq.size()

In [29]:
# len(X_train_tokenized)

In [30]:
# len(X_train_tokenized[-1])

In [31]:
# len(X_train_tokenized[-1])

In [32]:
# sent_seqs[-1].size()

In [34]:
# doc_seq.size()

In [35]:
# # FOR TRAINING
# # Define batch size
# batch_size = 8

# # Wrap tensors
# train_data = TensorDataset(doc_seq)
# # Sampler for sampling the data during training
# train_sampler = SequentialSampler(train_data)
# # Dataloader for train set
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [36]:
# train_data

In [37]:
# for num, batch in enumerate(train_dataloader):
#     see = batch

#     print (see[0].size())

In [38]:
# see[0][-1]

In [39]:
# sent_seqs

In [40]:
# torch.equal(see[0][-1], sent_seqs)

In [41]:
# doc_seq[-2]

In [42]:
# see[0][-2]

In [43]:
# torch.equal(see[0][-2], doc_seq[-2])

In [44]:
# y_train.map({'positive':1, 'negative':0})

In [45]:
# torch.tensor(y_train.map({'positive':1, 'negative':0}).tolist())

In [48]:
# FOR TRAINING
# Number of documents per batch
batch_size = 8


def _build_loader(seqs, masks, labels, sampler_cls):
    """Wrap three aligned tensors in a dataset, a sampler and a DataLoader."""
    dataset = TensorDataset(seqs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Train set: batches are drawn in random order
train_data, train_sampler, train_dataloader = _build_loader(
    train_seq, train_mask, train_label, RandomSampler
)

# Validation set: fixed sequential order
val_data, val_sampler, val_dataloader = _build_loader(
    val_seq, val_mask, val_label, SequentialSampler
)

# Test set: fixed sequential order
test_data, test_sampler, test_dataloader = _build_loader(
    test_seq, test_mask, test_label, SequentialSampler
)

In [None]:
# len(train_anchor_seq)

In [None]:
# len(train_anchor_seq[0])

In [None]:
# train_anchor_seq[0][0]

In [None]:
# len(X_train_tokenized)

In [None]:
# X_train_chunked[0]

In [None]:
# len(X_train_tokenized[0])

---
## Training

---

In [None]:
# `device` was never defined in the notebook: use the GPU when one is
# available and fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): `model` is not constructed in any visible cell — it must be
# built before this line can run on a fresh kernel.
model = model.to(device)

In [None]:
from torch.optim import AdamW

# Define optimiser: AdamW over all parameters of `model` with learning rate 1e-5
optimizer = AdamW(model.parameters(), lr=1e-5)

In [None]:
# Weight for the positive class in BCEWithLogitsLoss: (#negative / #positive).
# Counts are looked up by label instead of by position: value_counts() is
# sorted by frequency, so positional access depends on which class happens to
# be larger (and integer access on a string-indexed Series is deprecated).
_class_counts = y_train.value_counts()
pos_weight = np.array(_class_counts['negative'] / _class_counts['positive'])

In [None]:
# Convert the positive-class weight (`pos_weight`) to a float tensor.
# The original referenced an undefined name `weight`.
weights = torch.tensor(pos_weight, dtype=torch.float)

# Push weights to the same device as the model
weights = weights.to(device)

# Define loss function: binary cross-entropy on raw logits, re-weighting the
# positive class. `torch.nn` is used directly because `nn` is never imported.
cross_entropy = torch.nn.BCEWithLogitsLoss(pos_weight=weights)

In [None]:
def train(train_dataloader):
    """Run one training epoch and return the mean loss per batch.

    Uses the notebook-level ``model``, ``device``, ``optimizer`` and
    ``cross_entropy`` objects.

    The previous body unpacked six tensors per batch (anchor/positive/negative
    triplets) and called the undefined ``loss_fn`` / ``mean_pooling``. The
    loaders built in this notebook yield ``(sequence, mask, label)`` and the
    loss defined here is ``cross_entropy`` (BCEWithLogitsLoss), so the loop is
    written for that setup.

    Args:
        train_dataloader: iterable of ``(sequence, mask, label)`` tensor batches.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()

    total_loss = 0

    # Iterate over batches
    for step, batch in enumerate(train_dataloader):
        # Progress update for every 50 batches
        if step%50==0 and not step==0:
            print ('Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # Push batch to the model's device
        doc_seq, doc_mask, labels = [r.to(device) for r in batch]

        # Clear previously calculated gradients
        model.zero_grad()

        # NOTE(review): assumes model(doc_seq, doc_mask) returns one raw logit
        # per document (shape (batch,) or (batch, 1)) — TODO confirm against
        # the model definition.
        logits = model(doc_seq, doc_mask)

        # BCEWithLogitsLoss needs float targets with the same shape as the logits
        loss = cross_entropy(logits.reshape(-1), labels.float())

        # Add on to the total loss
        total_loss = total_loss + loss.item()

        # Backward pass to calculate gradients
        loss.backward()

        # Update parameters
        optimizer.step()

    # Compute training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    return avg_loss

In [None]:
def evaluate():
    """Compute the mean validation loss per batch over ``val_dataloader``.

    Uses the notebook-level ``model``, ``device``, ``val_dataloader`` and
    ``cross_entropy`` objects. No gradients are computed and the model is put
    in eval mode.

    The previous body unpacked six tensors per batch (anchor/positive/negative
    triplets) and called the undefined ``loss_fn`` / ``mean_pooling``. The
    validation loader built in this notebook yields ``(sequence, mask, label)``
    and the loss defined here is ``cross_entropy`` (BCEWithLogitsLoss).

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    print ('\nEvaluating...')

    # Deactivate dropout layers
    model.eval()

    total_loss = 0

    # Iterate over batches
    for step, batch in enumerate(val_dataloader):
        # Progress update for every 50 batches
        if step%50==0 and not step==0:
            print ('Batch {:>5,} of {:>5,}.'.format(step, len(val_dataloader)))

        # Push batch to the model's device
        doc_seq, doc_mask, labels = [t.to(device) for t in batch]

        # Deactivate autograd
        with torch.no_grad():
            # NOTE(review): assumes model(doc_seq, doc_mask) returns one raw
            # logit per document (shape (batch,) or (batch, 1)) — TODO confirm.
            logits = model(doc_seq, doc_mask)

            # BCEWithLogitsLoss needs float targets with the same shape as the logits
            loss = cross_entropy(logits.reshape(-1), labels.float())

            total_loss = total_loss + loss.item()

    # Compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    return avg_loss

In [None]:
import os

# Set initial loss to infinite
best_valid_loss = float('inf')

# Empty lists to store training and validation loss of each epoch
train_losses = []
valid_losses = []

# torch.save does not create missing directories, so make sure the target exists
os.makedirs('saved_model_weights', exist_ok=True)

# NOTE(review): `epochs` is not defined in any visible cell — set it before
# running this cell.
# For each epoch
for epoch in range(epochs):
    print ('\nEpoch {:}/ {:}'.format(epoch+1, epochs))

    # Train model: train() takes the dataloader and returns a single average loss
    train_loss = train(train_dataloader)

    # Evaluate model: evaluate() also returns a single average loss
    valid_loss = evaluate()

    # Save the best model (lowest validation loss so far)
    if valid_loss<best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_model_weights/pytorch_bilstm.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print (f"\nTraining Loss: {train_loss:.5f}")
    print (f"Validation Loss: {valid_loss:.5f}")

In [None]:
import matplotlib.pyplot as plt

# Training (green) vs. validation (red) loss per epoch, labelled so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(train_losses, 'g', label='Training loss')
ax.plot(valid_losses, 'r', label='Validation loss')
ax.set(xlabel='Epoch', ylabel='Loss', title='Training vs. validation loss')
ax.legend();

---
## Model Evaluation

---

In [None]:
# Path of the checkpoint written by the training loop. It was previously
# commented out, so the load below raised NameError on a fresh kernel.
model_path = 'saved_model_weights/pytorch_bilstm.pt'

# Restore the saved weights into the existing `model` instance
model.load_state_dict(torch.load(model_path))