## Mounting Drive

In [None]:
# Mount Google Drive at /content/drive; the data and checkpoint paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Installing and Importing Required Libraries

In [None]:
# Install the Hugging Face transformers package quietly (-q).
# NOTE(review): the version is not pinned, so a fresh runtime may resolve to a
# different release than the one this notebook was originally run with —
# consider pinning once the intended version is confirmed.
!pip install -q transformers

[K     |████████████████████████████████| 778kB 8.4MB/s 
[K     |████████████████████████████████| 1.1MB 24.5MB/s 
[K     |████████████████████████████████| 3.0MB 59.8MB/s 
[K     |████████████████████████████████| 890kB 56.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import os

import numpy as np
import pandas as pd
import torch
import transformers
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import get_linear_schedule_with_warmup

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the CUDA device name when CUDA is present: get_device_name(0)
# raises on a CPU-only runtime, which would defeat the fallback above.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

'Tesla T4'

## Data Preprocessing

In [None]:
# Read the training CSV and pack every column from position 3 onward into one
# per-row Python list stored under 'list' (presumably the per-class label
# flags — confirm against the CSV header). Only TITLE, ABSTRACT and that list
# are carried forward in new_df.
df = pd.read_csv("/content/drive/My Drive/bert-multilabel/train.csv")
df['list'] = df.iloc[:, 3:].values.tolist()
new_df = df.loc[:, ['TITLE', 'ABSTRACT', 'list']].copy()
new_df.head()

Unnamed: 0,TITLE,ABSTRACT,list
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,"[1, 0, 0, 0, 0, 0]"
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,"[1, 0, 0, 0, 0, 0]"
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,"[0, 0, 1, 0, 0, 0]"
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,"[0, 0, 1, 0, 0, 0]"
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,"[1, 0, 0, 1, 0, 0]"


## Model Configurations

In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512  # passed as max_len when the train/test datasets are built
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the evaluation DataLoader
EPOCHS = 3  # number of train + validation passes in the training loop
# NOTE(review): the optimizer cell passes lr=3e-5 as a literal and never reads
# this constant, so changing it currently has no effect.
LEARNING_RATE = 1e-05
# Tokenizer matching the 'roberta-base' checkpoint used by the model.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




## Creating Custom Dataset class

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes each row's abstract and title separately.

    The dataframe must expose ``ABSTRACT``, ``TITLE`` and ``list`` columns;
    ``list`` holds the per-row target vector.

    Args:
        dataframe: source rows.
        tokenizer: object with an ``encode_plus`` method returning a mapping
            that contains ``input_ids`` and ``attention_mask``.
        max_len: padded/truncated token length of the abstract.
        max_len_title: padded/truncated token length of the title
            (defaults to 200, the value that used to be hard-coded).

    Note:
        Earlier revisions of this class accidentally fed the abstract text
        into the title branch as well. Checkpoints trained with that revision
        never saw real titles and should be retrained.
    """

    def __init__(self, dataframe, tokenizer, max_len, max_len_title=200):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.abstract = dataframe.ABSTRACT
        self.title = dataframe.TITLE
        self.targets = self.data.list
        self.max_len = max_len
        self.max_len_title = max_len_title

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.abstract)

    def _encode(self, text, max_length):
        """Collapse whitespace in ``text`` and tokenize it to ``max_length``."""
        # str() guards against non-string cells (e.g. NaN); split/join squeezes
        # newlines and repeated spaces into single spaces.
        text = " ".join(str(text).split())
        return self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=max_length,
            # 'max_length' padding replaces the deprecated pad_to_max_length flag.
            padding='max_length',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with ``ids_abstract``/``mask_abstract`` and
            ``ids_title``/``mask_title`` (``torch.long``) plus ``targets``
            (``torch.float``).
        """
        # Positional access keeps the dataset correct even when the dataframe
        # index is not a fresh 0..n-1 range.
        inputs_abstract = self._encode(self.abstract.iloc[index], self.max_len)
        # The title is encoded from the TITLE column (not from the abstract).
        inputs_title = self._encode(self.title.iloc[index], self.max_len_title)

        return {
            'ids_abstract': torch.tensor(inputs_abstract['input_ids'], dtype=torch.long),
            'mask_abstract': torch.tensor(inputs_abstract['attention_mask'], dtype=torch.long),
            'ids_title': torch.tensor(inputs_title['input_ids'], dtype=torch.long),
            'mask_title': torch.tensor(inputs_title['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [None]:
# Draw a reproducible 80% sample for training; the rows that were not sampled
# become the held-out set. Both frames get a fresh 0..n-1 index.
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Wrap both splits so they yield tokenized tensors.
training_set = CustomDataset(dataframe=train_dataset, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = CustomDataset(dataframe=test_dataset, tokenizer=tokenizer, max_len=MAX_LEN)

FULL Dataset: (20972, 3)
TRAIN Dataset: (16778, 3)
TEST Dataset: (4194, 3)


In [None]:
# DataLoader keyword arguments for the two splits.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
# NOTE(review): the evaluation loader is shuffled as well. The validation
# function collects outputs and targets together, so metrics are unaffected,
# but the batch order differs from run to run.
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(dataset=training_set, **train_params)
testing_loader = DataLoader(dataset=testing_set, **test_params)

## Roberta Model

In [None]:
# Custom model: one shared RoBERTa encoder processes two inputs (abstract and title); the two encoder outputs are concatenated and passed through dropout and two dense layers to produce the 6 output logits.

class RobertaMultiheadClass(torch.nn.Module):
    """Two-input classifier built on a single shared ``roberta-base`` encoder.

    Each input (``ids_1``/``mask_1`` and ``ids_2``/``mask_2``) is run through
    the same encoder. The two 768-wide encoder outputs are concatenated
    (1536), then passed through dropout -> Linear(1536, 768) -> dropout ->
    Linear(768, 6). The result is raw logits; no sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        self.roberta = transformers.RobertaModel.from_pretrained('roberta-base')
        self.drop = torch.nn.Dropout(0.3)
        self.linear_1 = torch.nn.Linear(1536, 768)
        # NOTE(review): there is no non-linearity between the two linear
        # layers; kept as-is so existing checkpoints behave identically.
        self.linear_2 = torch.nn.Linear(768, 6)

    def _pooled(self, ids, mask):
        """Return the second element of the encoder output for one input."""
        # Index instead of tuple-unpacking: older transformers releases return
        # a plain tuple, newer ones a ModelOutput whose iteration yields keys
        # rather than tensors. Integer indexing works for both.
        return self.roberta(ids, attention_mask=mask)[1]

    def forward(self, ids_1, mask_1, ids_2, mask_2):
        """Compute the 6 logits for a batch of paired inputs."""
        output_1 = self._pooled(ids_1, mask_1)
        output_2 = self._pooled(ids_2, mask_2)

        output = torch.cat((output_1, output_2), dim=1)
        output = self.drop(output)
        output = self.linear_1(output)
        output = self.drop(output)
        output = self.linear_2(output)

        return output

# Build the model and move its parameters to the selected device; as the last
# expression in the cell, model.to(device) also displays the module structure.
model = RobertaMultiheadClass()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




RobertaMultiheadClass(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), ep

## Hyperparameters & Loss function

In [None]:
def loss_fn(outputs, targets):
    """Return the binary cross-entropy between raw logits and float targets.

    The sigmoid is applied inside the criterion, so ``outputs`` must be
    un-activated logits. The default reduction yields a scalar tensor.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    loss = criterion(outputs, targets)
    return loss

In [None]:
# Split the parameters into two groups: everything whose name contains one of
# the no_decay substrings (biases and LayerNorm parameters) is excluded from
# weight decay; all remaining parameters get a decay of 0.001.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

# NOTE(review): the learning rate is the literal 3e-5; the LEARNING_RATE
# constant defined in the configuration cell (1e-05) is not used here.
optimizer = torch.optim.AdamW(optimizer_parameters, lr=3e-5)

# The scheduler is stepped once per batch, so its horizon must be the number
# of batches per epoch times the number of epochs. Using len(training_loader)
# counts the final partial batch, which the previous
# int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS) silently dropped.
num_training_steps = len(training_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

## Train & Eval Functions

In [None]:
def train(epoch):
    """Run one training pass over ``training_loader``.

    For every batch: move the tensors to ``device``, run the forward pass,
    compute ``loss_fn``, back-propagate, and step both the optimizer and the
    learning-rate scheduler. The loss is printed every 1000 batches.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    for step, data in tqdm(enumerate(training_loader, 0), total=len(training_loader)):
        ids_1 = data['ids_abstract'].to(device, dtype = torch.long)
        mask_1 = data['mask_abstract'].to(device, dtype = torch.long)

        ids_2 = data['ids_title'].to(device, dtype = torch.long)
        mask_2 = data['mask_title'].to(device, dtype = torch.long)

        targets = data['targets'].to(device, dtype = torch.float)

        # Clear gradients once per batch (the duplicate call was redundant).
        optimizer.zero_grad()
        outputs = model(ids_1, mask_1, ids_2, mask_2)
        loss = loss_fn(outputs, targets)
        if step % 1000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()
        # The schedule advances per batch, not per epoch.
        scheduler.step()

def validation(epoch):
    """Score every batch of ``testing_loader`` without tracking gradients.

    Args:
        epoch: accepted for symmetry with ``train``; not used in the body.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists of per-sample lists, the
        sigmoid probabilities and the matching targets, in loader order.
    """
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for batch in tqdm(testing_loader, total=len(testing_loader)):
            abstract_ids = batch['ids_abstract'].to(device, dtype=torch.long)
            abstract_mask = batch['mask_abstract'].to(device, dtype=torch.long)
            title_ids = batch['ids_title'].to(device, dtype=torch.long)
            title_mask = batch['mask_title'].to(device, dtype=torch.long)
            labels = batch['targets'].to(device, dtype=torch.float)

            logits = model(abstract_ids, abstract_mask, title_ids, title_mask)
            probabilities = torch.sigmoid(logits)

            fin_targets.extend(labels.cpu().detach().numpy().tolist())
            fin_outputs.extend(probabilities.cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

## Training Model

In [None]:
MODEL_PATH = "/content/drive/My Drive/roberta-multilabel/model.bin"
# Make sure the checkpoint folder exists before training starts; otherwise
# torch.save would fail only after a full epoch has already been spent.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

best_micro = 0
for epoch in range(EPOCHS):
    train(epoch)
    outputs, targets = validation(epoch)
    # Turn probabilities into hard 0/1 decisions at the 0.5 threshold.
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    # Keep only the checkpoint with the best micro-F1 seen so far.
    if f1_score_micro > best_micro:
        torch.save(model.state_dict(), MODEL_PATH)
        best_micro = f1_score_micro

HBox(children=(FloatProgress(value=0.0, max=2098.0), HTML(value='')))

Epoch: 0, Loss:  0.6962360143661499
Epoch: 0, Loss:  0.18524011969566345
Epoch: 0, Loss:  0.2449212670326233



HBox(children=(FloatProgress(value=0.0, max=1049.0), HTML(value='')))


Accuracy Score = 0.6351931330472103
F1 Score (Micro) = 0.8064823641563394
F1 Score (Macro) = 0.6840674658832094


HBox(children=(FloatProgress(value=0.0, max=2098.0), HTML(value='')))

Epoch: 1, Loss:  0.1575789898633957
Epoch: 1, Loss:  0.16945797204971313
Epoch: 1, Loss:  0.1378794014453888



HBox(children=(FloatProgress(value=0.0, max=1049.0), HTML(value='')))


Accuracy Score = 0.6711969480209824
F1 Score (Micro) = 0.8208742024569089
F1 Score (Macro) = 0.7249638008935323


HBox(children=(FloatProgress(value=0.0, max=2098.0), HTML(value='')))

Epoch: 2, Loss:  0.08451372385025024
Epoch: 2, Loss:  0.06796027719974518
Epoch: 2, Loss:  0.09773905575275421



HBox(children=(FloatProgress(value=0.0, max=1049.0), HTML(value='')))


Accuracy Score = 0.6719122556032427
F1 Score (Micro) = 0.8254743565658464
F1 Score (Macro) = 0.7635743400645257


## Predictions

In [None]:
PATH = "/content/drive/My Drive/roberta-multilabel/model.bin"
model = RobertaMultiheadClass()
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
model.load_state_dict(torch.load(PATH, map_location=device))
model.to(device)
# Switch to inference mode (disables dropout); also displays the module.
model.eval()

RobertaMultiheadClass(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), ep

In [None]:
def predict(id, abstract, title, max_len_abstract=512, max_len_title=200):
    """Predict the 0/1 label vector for a single (abstract, title) pair.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        id: row identifier, placed at position 0 of the returned array.
        abstract: abstract text.
        title: title text.
        max_len_abstract: padded/truncated token length for the abstract
            (default 512, the previously hard-coded value).
        max_len_title: padded/truncated token length for the title
            (default 200, the previously hard-coded value).

    Returns:
        1-D numpy array ``[id, label_0, ..., label_k]`` where each label is
        0.0 or 1.0.
    """
    def _encode(text, max_length):
        # Apply the same str() + whitespace collapsing that
        # CustomDataset.__getitem__ applies, so inference sees text prepared
        # the same way as during training.
        text = " ".join(str(text).split())
        encoded = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=max_length,
            # 'max_length' padding replaces the deprecated pad_to_max_length flag.
            padding='max_length',
            truncation=True,
        )
        # unsqueeze(0) adds the batch dimension expected by the model.
        ids = torch.tensor(encoded['input_ids'], dtype=torch.long).unsqueeze(0)
        mask = torch.tensor(encoded['attention_mask'], dtype=torch.long).unsqueeze(0)
        return ids.to(device), mask.to(device)

    ids_1, mask_1 = _encode(abstract, max_len_abstract)
    ids_2, mask_2 = _encode(title, max_len_title)

    with torch.no_grad():
        outputs = model(ids_1, mask_1, ids_2, mask_2)

    probabilities = torch.sigmoid(outputs).squeeze().cpu().numpy()
    # Same ">= 0.5" decision rule as the validation loop. np.round rounds an
    # exact 0.5 down to 0, which disagreed with that rule.
    labels = (probabilities >= 0.5).astype(probabilities.dtype)

    return np.insert(labels, 0, id)

In [None]:
def submit():
    """Build the submission frame by running ``predict`` on every test row.

    Reads the test CSV and the sample submission CSV from Drive; the sample
    file is used only for its column names.

    Returns:
        DataFrame of integers with one row per test row, using the sample
        submission's columns.
    """
    test_df = pd.read_csv('/content/drive/My Drive/bert-multilabel/test.csv')
    sample_submission = pd.read_csv('/content/drive/My Drive/bert-multilabel/sample_submission_UVKGLZE.csv')

    rows = zip(test_df['ID'], test_df['ABSTRACT'], test_df['TITLE'])
    predictions = [
        predict(row_id, abstract, title)
        for row_id, abstract, title in tqdm(rows, total=len(test_df))
    ]
    y = np.array(predictions)
    return pd.DataFrame(y, columns=sample_submission.columns).astype(int)

In [None]:
# Run inference over the whole test set and display the resulting frame.
submission = submit()
submission

HBox(children=(FloatProgress(value=0.0, max=8989.0), HTML(value='')))




Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,20973,0,0,0,1,0,0
1,20974,0,1,0,0,0,0
2,20975,1,0,0,0,0,0
3,20976,0,1,0,0,0,0
4,20977,1,0,0,0,0,0
...,...,...,...,...,...,...,...
8984,29957,1,0,0,0,0,0
8985,29958,0,0,1,0,0,0
8986,29959,0,0,0,0,1,0
8987,29960,0,0,1,1,0,0


In [None]:
# Write the predictions to the working directory without the row index.
submission.to_csv("roberta_baseline.csv", index=False)