## Installing & importing necessary libs

In [None]:
!pip install -q transformers

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import LongformerModel, LongformerTokenizer
from tqdm.notebook import tqdm
from transformers import get_linear_schedule_with_warmup

In [None]:
# Use CUDA when a GPU is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Display the accelerator name. Querying a CUDA device name on a CPU-only
# machine raises, so only ask for it when CUDA is actually available.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

## Data Preprocessing

In [None]:
# Load the training CSV, then pack every column from the fourth onward into a
# single per-row Python list stored under 'list' (presumably the label columns
# -- confirm against the CSV schema).
df = pd.read_csv("../input/avjantahack/data/train.csv")
df['list'] = df.iloc[:, 3:].values.tolist()
# Work on an independent copy that holds only the text and the packed targets.
new_df = df.loc[:, ['ABSTRACT', 'list']].copy()
new_df.head()

## Model configurations

In [None]:
# Key settings shared by the data pipeline and the training cells below.
MAX_LEN = 1024           # token budget handed to CustomDataset as max_len
TRAIN_BATCH_SIZE = 4     # batch size of the training DataLoader
VALID_BATCH_SIZE = 4     # batch size of the evaluation DataLoader
EPOCHS = 3               # epoch count used when sizing the LR schedule
LEARNING_RATE = 3e-5
# Tokenizer matching the pretrained Longformer checkpoint used by the model.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

## Custom Dataset Class

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes abstracts and pairs them with targets.

    Args:
        dataframe: frame with an ``ABSTRACT`` text column and a ``list``
            column holding each row's target values.
        tokenizer: object exposing ``encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask``.
        max_len: length every encoded sequence is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.abstract = dataframe['ABSTRACT']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.abstract)

    def __getitem__(self, index):
        """Return the encoded abstract and target tensor for row ``index``.

        Returns:
            dict with ``ids`` and ``mask`` (long tensors built from the
            tokenizer output) and ``targets`` (float tensor).
        """
        # Positional lookup: the DataLoader hands out 0..len-1, which only
        # coincides with index labels when the frame's index was reset.
        abstract = str(self.abstract.iloc[index])
        # Collapse runs of whitespace (including newlines) to single spaces.
        abstract = " ".join(abstract.split())

        # `padding='max_length'` is the supported replacement for the
        # deprecated `pad_to_max_length=True` flag.
        inputs = self.tokenizer.encode_plus(
            abstract,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [None]:
# Randomly keep 80% of the rows for training (fixed seed for repeatability);
# whatever was not sampled becomes the held-out set. Both frames get a fresh
# 0..n-1 index.
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the shape of each split.
print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Wrap both frames so that they yield tokenized tensors.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

In [None]:
# Training batches are reshuffled every epoch.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}

# Evaluation gains nothing from shuffling; a fixed order keeps the collected
# predictions aligned with the row order of the held-out frame across runs.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Longformer model

In [None]:
# Customized model: a dropout and a dense layer stacked on top of the
# pretrained Longformer encoder to produce the final per-label logits.

class LongformerClass(torch.nn.Module):
    """Longformer encoder followed by dropout and a linear classification head.

    Args:
        num_labels: number of output logits (defaults to 6, the value the
            head was previously hard-coded to).
    """

    def __init__(self, num_labels=6):
        super().__init__()
        self.longformer = LongformerModel.from_pretrained('allenai/longformer-base-4096')
        self.drop = torch.nn.Dropout(0.3)
        # Size the head from the checkpoint's config instead of assuming 768.
        self.linear = torch.nn.Linear(self.longformer.config.hidden_size, num_labels)

    def forward(self, ids, mask):
        """Return raw (pre-sigmoid) logits for a batch.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask tensor passed as ``attention_mask``.
        """
        encoder_outputs = self.longformer(ids, attention_mask=mask)
        # Index rather than tuple-unpack: when the encoder returns a
        # dict-like ModelOutput, unpacking iterates over its *keys* and
        # yields strings, whereas integer indexing works for both the
        # ModelOutput and the plain-tuple return styles.
        pooled = encoder_outputs[1]
        return self.linear(self.drop(pooled))

# Instantiate the classifier and move its parameters to the selected device.
model = LongformerClass()
model.to(device)

## Hyperparameters & Loss function

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, averaged over all elements.

    Args:
        outputs: logits produced by the model (no sigmoid applied).
        targets: float tensor of the same shape as ``outputs``.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Two parameter groups: biases and LayerNorm parameters are excluded from
# weight decay, every other parameter is decayed.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

# Use the configured learning rate instead of repeating the literal.
optimizer = torch.optim.AdamW(optimizer_parameters, lr=LEARNING_RATE)

# One scheduler step is taken per batch. The loader keeps the final partial
# batch, so count batches from the loader itself rather than dividing the row
# count by the batch size (which rounds the last batch away).
num_training_steps = len(training_loader) * EPOCHS

# Linear decay from the initial learning rate to zero, with no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

## Train & Eval Functions



In [None]:
def train(epoch):
    """Run one pass over ``training_loader``, updating the model in place.

    Uses the module-level ``model``, ``optimizer``, ``scheduler`` and
    ``loss_fn``. The loss is printed every 1000 batches.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    for step, data in tqdm(enumerate(training_loader), total=len(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients once per batch, before the forward/backward pass.
        optimizer.zero_grad()

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        if step % 1000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()
        # The schedule advances per batch, not per epoch.
        scheduler.step()

def validation(epoch):
    """Score every batch of ``testing_loader`` without tracking gradients.

    Args:
        epoch: accepted for symmetry with ``train``; not used.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per example,
        holding the sigmoid probabilities and the target values respectively.
    """
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for batch in tqdm(testing_loader, total=len(testing_loader)):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask)
            probabilities = torch.sigmoid(logits)

            # Bring results back to the host as plain nested Python lists.
            fin_targets.extend(targets.detach().cpu().tolist())
            fin_outputs.extend(probabilities.detach().cpu().tolist())
    return fin_outputs, fin_targets

## Training Model

In [None]:
EPOCHS = 2
MODEL_PATH = "/kaggle/working/model.bin"

# EPOCHS has just been overridden, so a schedule sized for a different epoch
# count would stop decaying part-way. Rebuild it for the number of optimizer
# steps this loop really takes (one per batch); `train` picks up the rebound
# module-level `scheduler`.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(training_loader) * EPOCHS,
)

# Best micro-F1 seen so far; a checkpoint is written whenever it improves.
best_micro = 0
for epoch in range(EPOCHS):
    train(epoch)
    outputs, targets = validation(epoch)
    # Turn probabilities into hard 0/1 decisions at a 0.5 threshold.
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    if f1_score_micro > best_micro:
        torch.save(model.state_dict(), MODEL_PATH)
        best_micro = f1_score_micro

In [None]:
def predict(id, abstract):
    """Predict the 0/1 label vector for one abstract.

    The text goes through the same preprocessing as the training data:
    whitespace is collapsed and the sequence is truncated/padded to the
    module-level ``MAX_LEN``.

    Args:
        id: identifier of the example; placed first in the returned row.
        abstract: raw abstract text.

    Returns:
        1-D float NumPy array ``[id, label_0, label_1, ...]`` where each
        label is 1.0 if its sigmoid probability is >= 0.5, else 0.0.
    """
    # Same whitespace normalization that CustomDataset applies.
    abstract = " ".join(str(abstract).split())

    # `padding='max_length'` replaces the deprecated `pad_to_max_length`.
    inputs = tokenizer.encode_plus(
        abstract,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
    )

    # Add a leading batch dimension of size one.
    ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0).to(device)
    mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0).to(device)

    # Make sure dropout is disabled even if the model was left in train mode.
    model.eval()
    with torch.no_grad():
        logits = model(ids, mask)

    probabilities = torch.sigmoid(logits).squeeze(0).cpu().numpy()
    # Threshold exactly as the validation metrics do (>= 0.5); float64 keeps
    # the prepended id exact.
    labels = (probabilities >= 0.5).astype(np.float64)

    return np.insert(labels, 0, id)

In [None]:
def submit():
    """Predict every test abstract and assemble the submission frame.

    Reads the test CSV and the sample submission, calls ``predict`` once per
    row, and returns an integer DataFrame whose columns are taken from the
    sample submission.
    """
    test_df = pd.read_csv('../input/avjantahack/data/test.csv')
    sample_submission = pd.read_csv('../input/avjantahack/data/sample_submission_UVKGLZE.csv')

    id_text_pairs = zip(test_df['ID'], test_df['ABSTRACT'])
    rows = [
        predict(row_id, text)
        for row_id, text in tqdm(id_text_pairs, total=len(test_df))
    ]

    # Each row is [id, labels...]; stack them and cast everything to int.
    frame = pd.DataFrame(np.array(rows), columns=sample_submission.columns)
    return frame.astype(int)

In [None]:
# Run inference over the whole test set and display the resulting frame.
submission = submit()
submission

In [None]:
# Write the predictions to the Kaggle working directory, without the index column.
submission.to_csv("/kaggle/working/submission_longformer_base_line.csv", index=False)