<a href="https://colab.research.google.com/github/shikha-aggarwal/researchpaperlikes/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets transformers pytorch-lightning

In [32]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

import transformers

from datasets import (
    list_datasets,
    load_dataset,
    list_metrics,
    load_metric)

import json
import os
import re
import argparse
import random
import time
import string
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler

import matplotlib.pyplot as plt

## Colab accessing Google drive
from google.colab import drive

In [33]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
cd /content/drive/MyDrive/Colab\ Notebooks/research\ paper

/content/drive/MyDrive/Colab Notebooks/research paper


In [35]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

## Loading the dataset

In [36]:
from datasets import load_dataset
from ml_project.data_loaders import load_data

In [37]:
data_dir = './data'
tags_list = load_data.load_tags(data_dir)
article_tags_2d_list = load_data.load_article_tags(data_dir)
# citations_2d_np = load_data.load_citations(data_dir, num_articles = )
user_items_2d_np = load_data.load_user_article_likes(data_dir)
articles_df, user_article_likes_2d_np = load_data.load_articles_and_user_article_likes(data_dir)

In [38]:
articles_df.columns

Index(['doc.id', 'title', 'citeulike.id', 'raw.title', 'raw.abstract'], dtype='object')

In [39]:
articles_df.rename(columns = {'citeulike.id':'citeulike_id', 'doc.id': 'doc_id', 'raw.title': 'raw_title', 'raw.abstract': 'raw_abstract'}, inplace = True)

In [40]:
articles_df.head()

Unnamed: 0,doc_id,title,citeulike_id,raw_title,raw_abstract
0,1,the metabolic world of escherichia coli is not...,42.0,The metabolic world of Escherichia coli is not...,To elucidate the organizational and evolutiona...
1,2,reverse engineering of biological complexity,43.0,Reverse Engineering of Biological Complexity,Advanced technologies and biology have extreme...
2,3,exploring complex networks,44.0,Exploring complex networks,"The study of networks pervades all of science,..."
3,4,comparative assessment of largescale data sets...,46.0,Comparative assessment of large-scale data set...,Comprehensive protein protein interaction maps...
4,5,navigation in a small world,47.0,Navigation in a small world,The small-world phenomenon â the principle t...


In [41]:
articles_df.iloc[10].to_dict()

{'citeulike_id': 61.0,
 'doc_id': 11,
 'raw_abstract': ' Recent research has revealed general principles in the structural and functional organization of complex networks which are shared by various natural, social and technological systems. This review examines these principles as applied to the organization, development and function of complex brain networks. Specifically, we examine the structural properties of large-scale anatomical and functional brain networks and discuss how they might arise in the course of network growth and rewiring. Moreover, we examine the relationship between the structural substrate of neuroanatomy and more dynamic functional and effective connectivity patterns that underlie human cognition. We suggest that network analysis offers new fundamental insights into global and integrative aspects of brain function, including the origin of flexible and coherent cognitive states within the neural architecture.',
 'raw_title': 'Organization, development and functi

In [50]:
user_article_likes_2d_np.shape ## num_users * num_articles [less articles here because new articles added?]

(5551, 13584)

In [46]:
articles_df.shape

(16980, 5)

In [47]:
articles_df["tags"] = article_tags_2d_list

In [49]:
articles_df.head()

Unnamed: 0,doc_id,title,citeulike_id,raw_title,raw_abstract,tags
0,1,the metabolic world of escherichia coli is not...,42.0,The metabolic world of Escherichia coli is not...,To elucidate the organizational and evolutiona...,"[4276, 32443, 37837, 3378, 7650, 44590, 42810,..."
1,2,reverse engineering of biological complexity,43.0,Reverse Engineering of Biological Complexity,Advanced technologies and biology have extreme...,"[40070, 39891, 9827, 39406, 45156, 20392, 1831..."
2,3,exploring complex networks,44.0,Exploring complex networks,"The study of networks pervades all of science,...","[35531, 33478, 46208, 24430, 13634, 30698, 314..."
3,4,comparative assessment of largescale data sets...,46.0,Comparative assessment of large-scale data set...,Comprehensive protein protein interaction maps...,"[2383, 36288, 43395, 41495, 9102, 35490, 1858,..."
4,5,navigation in a small world,47.0,Navigation in a small world,The small-world phenomenon â the principle t...,"[30816, 24430, 40491, 32766, 20309, 1223, 3069..."


In [71]:
train_df, validate_df, test_df = np.split(articles_df.sample(frac=1, random_state=42), 
                       [int(.6*len(articles_df)), int(.8*len(articles_df))])

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
input_length = 100

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [92]:
len(tags_list)

46391

In [110]:
class ArticlesDataset(Dataset):
    def __init__(self, df, tokenizer, input_length):
        self.pandas_df = df
        self.tokenizer = tokenizer         
        self.input_length = input_length
  
    def __len__(self):
        return len(self.pandas_df)
    
    def clean_text(self, text):
        text = text.replace('\n','')
        text = text.replace('``', '')
        text = text.replace('"', '')
        
        return text
    
    def convert_to_features(self, example_batch):
        sentence_1 = self.clean_text(example_batch['raw_title'] + example_batch['raw_abstract'])  
        source = self.tokenizer.batch_encode_plus([sentence_1], 
                                                  max_length=self.input_length,
                                                  padding='max_length',
                                                  truncation=True,
                                                  return_tensors="pt")
        
        return source
  
    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        num_classes = len(tags_list)
        source = self.convert_to_features(self.pandas_df.iloc[index])
        
        source_ids = source["input_ids"].squeeze()
        src_mask    = source["attention_mask"].squeeze()
        tags = self.pandas_df.iloc[index]['tags']
        indices = []
        for i in range(len(tags_list)):
            if tags_list[i] in tags:
                indices.append(i)
        targets = torch.nn.functional.one_hot(torch.tensor(indices, dtype = torch.int64),
                                              num_classes = len(tags_list))

        return {"input_ids": source_ids, "labels": targets}

In [88]:
# num_classes = len(tags_list)

# loss_fn = torch.nn.BCELoss()

# outputs_before_sigmoid = torch.randn(batch_size, num_classes)
# sigmoid_outputs = torch.sigmoid(outputs_before_sigmoid)
# target_classes = torch.randint(0, 2, (batch_size, num_classes))

# loss = loss_fn(sigmoid_outputs, target_classes)

# # alternatively, use BCE with logits, on outputs before sigmoid.
# loss_fn_2 = torch.nn.BCEWithLogitsLoss()
# loss2 = loss_fn_2(outputs_before_sigmoid, target_classes)
# assert loss == loss2

## Fine-tuning the model

In [94]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [95]:
args = TrainingArguments(
    "test_citation", ## output dir
    evaluation_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = 5,
    weight_decay = 0.01,
    load_best_model_at_end = True,
)

In [105]:
import torch.nn.functional as F

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return F.cosine_similarity(predictions.squeeze(), labels.squeeze(), dim=-1)

In [111]:
class DistilbertFineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(DistilbertFineTuner, self).__init__()
        self.save_hyperparameters(hparams)
        self.hparams = hparams
        self.model = AutoModelForSequenceClassification.from_pretrained(self.hparams.tokenizer_name_or_path, num_labels = len(tags_list))
        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.tokenizer_name_or_path)
        self.model_dir = self.hparams.model_dir
        
    def forward(self, input_ids, attention_mask, labels):
        return self.model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            decoder_attention_mask = decoder_attention_mask,
            labels = labels,
            )
        
    def _step(self, batch):
        labels = batch['labels']
        ## set padding token label to = -100 so that it is ignored
        labels[labels[:, :] == self.tokenizer.pad_token_id] = -100
        outputs = self.forward(
            input_ids = batch['input_ids'],
            labels = labels
        )
        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log('train_loss', loss, on_step = True, on_epoch = True, 
                 prog_bar = True, logger = True)
        return {"loss": loss}

    def train_dataloader(self):
        train_dataset = ArticlesDataset(train_df, tokenizer, input_length = input_length)
        sampler = RandomSampler(train_dataset)

        dataloader = DataLoader(train_dataset,
                                sampler = sampler,
                                batch_size = self.hparams.train_batch_size,
                                drop_last = True,
                                num_workers = 2)
        return dataloader

    def val_dataloader(self):
        validation_dataset = ArticlesDataset(validation_df, tokenizer, input_length = input_length)
        sampler = RandomSampler(validation_dataset)

        return DataLoader(validation_dataset,
                          batch_size = self.hparams.eval_batch_size,
                          sampler = sampler,
                          num_workers = 2)
    

    def configure_optimizers(self):
        ## Set bias decay to zero
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },]
        optimizer = AdamW(optimizer_grouped_parameters, lr = self.hparams.learning_rate)
        # decreasing learning rate (linear) after increasing in num_warmup_steps
        scheduler = get_linear_schedule_with_warmup(
            optimizer = optimizer,
            num_warmup_steps = self.hparams.warmup_steps,
            num_training_steps = self.hparams.training_steps
        )
        self.lr_scheduler = scheduler
        self.log('configured optimizer: ', optimizer)
        return optimizer

In [112]:
saved_models_path = '/content/drive/MyDrive/Colab Notebooks/research paper/saved_models'
args_dict = dict(
    model_dir               = saved_models_path, # path to save the checkpoints
    model_name_or_path      = 't5-base',
    tokenizer_name_or_path  = 't5-base',
    learning_rate           = 1e-5,
    weight_decay            = 0.0,
    adam_epsilon            = 1e-8,
    warmup_steps            = 0,
    train_batch_size        = 16,
    eval_batch_size         = 16,
    num_train_epochs        = 2,
    accumulate_grad_batches = 10,
    training_steps          = 10000,
    # training_steps          = 10,
    n_gpu                   = 1,
    resume_from_checkpoint  = None, ## USE THIS!!
    val_check_interval      = 0.5, 
    early_stop_callback     = False,
    num_workers             = 0,
    fp_16                   = False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level               = 'O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm           = 1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
)
args = argparse.Namespace(**args_dict)
print(args)

logger = TensorBoardLogger("tb_logs", version = 1, name = 't5_trivia')


Namespace(accumulate_grad_batches=10, adam_epsilon=1e-08, early_stop_callback=False, eval_batch_size=16, fp_16=False, learning_rate=1e-05, max_grad_norm=1.0, model_dir='/content/drive/MyDrive/Colab Notebooks/research paper/saved_models', model_name_or_path='t5-base', n_gpu=1, num_train_epochs=2, num_workers=0, opt_level='O1', resume_from_checkpoint=None, tokenizer_name_or_path='t5-base', train_batch_size=16, training_steps=10000, val_check_interval=0.5, warmup_steps=0, weight_decay=0.0)


In [108]:
model = DistilbertFineTuner(
    args
)

In [None]:
dataset = load_dataset("glue", actual_task)