In [1]:
import torch
import torch.nn as nn
import numpy as np
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from multilabel_model import miltilabel_model
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1.0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1.0,0.0,0.0,0.0,0.0,0.0
1,2.0,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1.0,0.0,0.0,0.0,0.0,0.0
2,3.0,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0.0,0.0,1.0,0.0,0.0,0.0
3,4.0,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0.0,0.0,1.0,0.0,0.0,0.0
4,5.0,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1.0,0.0,0.0,1.0,0.0,0.0


In [4]:
X = df['ABSTRACT']
y = df[['Computer Science', 'Physics']]

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [6]:
if torch.cuda.is_available():
    device = torch.device('cuda')

In [7]:
encoder_path = "D:\\DSAI\\Pre-Trained Models\\distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(encoder_path)
encoder = AutoModel.from_pretrained(encoder_path)



In [8]:
train_seq = tokenizer(X_train.tolist(), padding='max_length', return_tensors='pt')['input_ids']
train_mask = tokenizer(X_train.tolist(), padding='max_length', return_tensors='pt')['attention_mask']

val_seq = tokenizer(X_val.tolist(), padding='max_length', return_tensors='pt')['input_ids']
val_mask = tokenizer(X_val.tolist(), padding='max_length', return_tensors='pt')['attention_mask']


train_label = torch.tensor(y_train.to_numpy())
val_label = torch.tensor(y_val.to_numpy())

In [9]:
# FOR TRAINING
# Define batch size
batch_size = 2

# Wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_label)
# Sampler for sampling the data during training
# train_sampler = SequentialSampler(train_data)
train_sampler = RandomSampler(train_data)
# Dataloader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


# Wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_label)
# Sampler for sampling the data during validation for training
val_sampler = SequentialSampler(val_data)
# Dataloader for val set
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)


# # Wrap tensors
# test_data = TensorDataset(test_seq, test_mask, test_label)
# # Sampler for sampling the data for testing
# test_sampler = SequentialSampler(test_data)
# # Dataloader for test set
# test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [10]:
model = miltilabel_model(encoder, dropout=0.2, device=device)

In [11]:
model = model.to(device)

In [12]:
from torch.optim import AdamW

# Define optimiser
optimizer = AdamW(model.parameters(), lr=2e-5)

In [13]:
weight = np.array([0.5, 12])

In [14]:
# Converting list of class weights to a tensor
weights = torch.tensor(weight, dtype=torch.float)

# Push weights to GPU
weights = weights.to(device)

# Define loss function
cross_entropy = nn.BCEWithLogitsLoss(pos_weight=weights)

In [15]:
def train(train_dataloader):
    model.train()
    
    total_loss, total_accuracy = 0, 0
    
    # Empty list to save model predictions
    total_preds = []
    
    # Iterate over batches
    for step, batch in enumerate(train_dataloader):
        # Progress update for every 50 batches
        if step%10==0 and not step==0:
            print ('Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # Push batch to GPU
        batch = [r.to(device) for r in batch]
        train_input_seq, train_input_mask, train_input_label = batch

        # Clear previously calculated gradients
        model.zero_grad()

        # Get model predictions for the current batch
        train_output = model(train_input_seq, train_input_mask)
        
        """
        nn.CosineSimilarity measures similarity between 2 outputs, the more similar, the bigger the score.
        However for triplet loss, the positive cases are supposed to be closer and have a smaller score.
        To make things easier, we flipped the negative and positive positions
        i.e. loss(anchor, positive, negative) --> loss(anchor, negative, positive)
        """

        # print (train_output, train_input_label)
        # print (torch.squeeze(train_output))
        
        # Compute loss 
        # loss = cross_entropy(train_output, train_input_label)
        loss = cross_entropy(torch.squeeze(train_output), train_input_label.float())

        # Add on to the total loss
        total_loss = total_loss + loss.item()

        # Backward pass to calculate gradients
        loss.backward()

        # Update parameters
        optimizer.step()

    # Compute training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    return avg_loss

In [16]:
def evaluate(val_dataloader):
    print ('\nEvaluating...')
    
    # Deactivate dropout layers
    model.eval()
    
    total_loss, total_accuracy = 0, 0
    
    # Empty list to save model predictions
    total_preds = []
    
    # Iterate over batches
    for step, batch in enumerate(val_dataloader):
        # Progress update for every 50 batches
        if step%10==0 and not step==0:
            print ('Batch {:>5,} of {:>5,}.'.format(step, len(val_dataloader)))

        # Push batch to GPU
        batch = [t.to(device) for t in batch]
        val_input_seq, val_input_mask, val_input_label = batch

        # Deactivate autograd()
        with torch.no_grad():

            
            # Get model predictions for the current batch
            val_output = model(val_input_seq, val_input_mask)
        
            """
            nn.CosineSimilarity measures similarity between 2 outputs, the more similar, the bigger the score.
            However for triplet loss, the positive cases are supposed to be closer and have a smaller score.
            To make things easier, we flipped the negative and positive positions
            i.e. loss(anchor, positive, negative) --> loss(anchor, negative, positive)
            """

            # Compute loss 
            # loss = cross_entropy(val_output, val_input_label)
            loss = cross_entropy(torch.squeeze(val_output), val_input_label.float())

            total_loss = total_loss + loss.item()

    # Compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    return avg_loss

In [17]:
epochs = 20

# Set initial loss to infinite
best_valid_loss = float('inf')

# Empty lists to store training and validation loss of each epoch
train_losses = []
valid_losses = []

# For each epoch
for epoch in range(epochs):
    print ('\nEpoch {:}/ {:}'.format(epoch+1, epochs))
    
    # Train model
    train_loss = train(train_dataloader)
    
    # Evaluate model
    valid_loss = evaluate(val_dataloader)
    
    # Save the best model
    if valid_loss<best_valid_loss:
        best_valid_loss = valid_loss
        # torch.save(model.state_dict(), 'test_model.pt')
        
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print (f"\nTraining Loss: {train_loss:.5f}")
    print (f"Validation Loss: {valid_loss:.5f}")


Epoch 1/ 20
torch.Size([2, 512, 768])
torch.Size([2, 1, 768])
tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0')
torch.Size([2, 2, 768])
torch.Size([2, 1, 768])


ValueError: Target size (torch.Size([2, 2])) must be the same as input size (torch.Size([4]))

In [None]:
import matplotlib.pyplot as plt
plt.plot(train_losses, 'g', valid_losses, 'r')