<a href="https://colab.research.google.com/github/shikha-aggarwal/researchpaperlikes/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### 1. Install and Import

In [None]:
!pip install datasets transformers pytorch-lightning --quiet

In [None]:
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

import transformers
from transformers import (
    DistilBertTokenizer,
    DistilBertForMultipleChoice,
    AdamW,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup
)

import json
import os
import re
import argparse
import random
import time
import string
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler

import matplotlib.pyplot as plt

## Colab accessing Google drive
from google.colab import drive
drive.mount('/content/drive')
## navigate here to your main project folder
## or place this file in the main folder.

from datasets import load_dataset
from ml_project.data_loaders import load_data

#### 2. Load dataset

In [171]:
# Folder handed to every project-local loader below.
data_dir = './data'
# Tag vocabulary; ArticlesDataset uses the position of each tag in this list
# as its index in the label vector.
tags_list = load_data.load_tags(data_dir)
# Per-article tag collections; attached to articles_df as the "tags" column
# in the next cell.
article_tags_2d_list = load_data.load_article_tags(data_dir)
# NOTE(review): not referenced anywhere else in the visible notebook, and the
# next call loads user/article likes again — confirm whether this is needed.
user_items_2d_np = load_data.load_user_article_likes(data_dir)
# articles_df presumably holds one row per article (the dataset class reads
# its 'raw_title' and 'raw_abstract' columns) — TODO confirm against load_data.
articles_df, user_article_likes_2d_np = load_data.load_articles_and_user_article_likes(data_dir)

In [172]:
# Attach each article's tags as a new column.
# Assumes article_tags_2d_list has one entry per row of articles_df, in the
# same row order — TODO confirm against load_data.
articles_df["tags"] = article_tags_2d_list

In [71]:
# Shuffle the articles once with a fixed seed, then cut the shuffled frame at
# the 60% and 80% marks to get train / validate / test partitions.
# Positional slicing yields the same three pieces as
# np.split(shuffled, [60%, 80%]) without routing a DataFrame through
# numpy's split helper, which relies on the deprecated DataFrame.swapaxes.
shuffled_articles_df = articles_df.sample(frac=1, random_state=42)
train_end = int(.6 * len(articles_df))
validate_end = int(.8 * len(articles_df))

train_df = shuffled_articles_df.iloc[:train_end]
validate_df = shuffled_articles_df.iloc[train_end:validate_end]
test_df = shuffled_articles_df.iloc[validate_end:]

#### 3. Various constants

In [174]:
# Fixed token length per article; passed to ArticlesDataset, which pads or
# truncates every encoded text to this many tokens.
input_length = 100
# Name of the pretrained DistilBERT checkpoint.
MODEL_CHECKPOINT = 'distilbert-base-uncased'

#### 4. Dataset class for trainer

In [176]:
class ArticlesDataset(Dataset):
    """Map-style dataset that turns article rows into DistilBERT inputs.

    Each item is the tokenized title + abstract of one article plus a
    multi-hot label vector with one slot per entry of the module-level
    ``tags_list``.

    Args:
        df: pandas DataFrame whose rows provide ``raw_title``,
            ``raw_abstract`` and ``tags``.
        tokenizer: tokenizer used to encode the text. If ``None``, a
            ``DistilBertTokenizer`` is loaded from ``MODEL_CHECKPOINT``.
        input_length: fixed sequence length; texts are padded or truncated
            to exactly this many tokens.
    """

    def __init__(self, df, tokenizer, input_length):
        self.pandas_df = df
        # Honour the tokenizer supplied by the caller; the previous
        # argument-less from_pretrained() call cannot work because it has
        # no checkpoint to load.
        if tokenizer is None:
            tokenizer = DistilBertTokenizer.from_pretrained(MODEL_CHECKPOINT)
        self.tokenizer = tokenizer
        self.input_length = input_length

        # Build the tag -> label position lookup once instead of scanning
        # tags_list for every tag of every item. setdefault keeps the first
        # position of a duplicated tag, matching list.index().
        # Assumes tags are hashable (e.g. strings or ints).
        self.num_tags = len(tags_list)
        self.tag_to_index = {}
        for position, tag in enumerate(tags_list):
            self.tag_to_index.setdefault(tag, position)

    def __len__(self):
        """Return the number of articles in the underlying DataFrame."""
        return len(self.pandas_df)

    def clean_text(self, text):
        """Strip newlines, double backticks and double quotes from ``text``."""
        text = text.replace('\n', '')
        text = text.replace('``', '')
        text = text.replace('"', '')

        return text

    def convert_to_features(self, example_batch):
        """Tokenize one article row into fixed-length tensors.

        Args:
            example_batch: a single row providing ``raw_title`` and
                ``raw_abstract``.

        Returns:
            The tokenizer output for a one-element batch, padded or
            truncated to ``self.input_length`` and returned as PyTorch
            tensors.
        """
        # Separate title and abstract with a space so the last title word
        # and the first abstract word are not fused into one token.
        sentence_1 = self.clean_text(example_batch['raw_title'] + ' ' +
                                     example_batch['raw_abstract'])
        source = self.tokenizer.batch_encode_plus([sentence_1],
                                                  max_length=self.input_length,
                                                  padding='max_length',
                                                  truncation=True,
                                                  return_tensors="pt")

        return source

    def __getitem__(self, index):
        """Return the model inputs and multi-hot tag labels for one article.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the batch
            dimension removed) and ``labels``, a tensor holding 1 at the
            position of every tag of the article that appears in
            ``tags_list`` and 0 elsewhere.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        row = self.pandas_df.iloc[index]
        source = self.convert_to_features(row)

        # squeeze(0) removes only the one-element batch dimension added by
        # batch_encode_plus, even when input_length happens to be 1.
        source_ids = source["input_ids"].squeeze(0)
        src_mask = source["attention_mask"].squeeze(0)

        indices = [0] * self.num_tags
        for tag in row['tags']:
            position = self.tag_to_index.get(tag)
            # Tags missing from the vocabulary are ignored.
            if position is not None:
                indices[position] = 1

        return {"input_ids": source_ids, "attention_mask": src_mask, "labels": torch.tensor(indices)}

#### 5. PyTorch Lightning Module

In [155]:
class DistilbertFineTuner(pl.LightningModule):
    """LightningModule that fine-tunes DistilBERT to predict article tags.

    Every article can carry several tags, so the model is a sequence
    classifier configured for multi-label classification: one logit per
    tag, trained against a multi-hot target vector.

    Args:
        hparams: hyperparameter namespace. Fields read by this class:
            ``model_name_or_path``, ``tokenizer_name_or_path``,
            ``num_labels``, ``model_dir``, ``train_batch_size``,
            ``eval_batch_size``, ``weight_decay``, ``learning_rate``,
            ``warmup_steps`` and ``training_steps``.
    """

    def __init__(self, hparams):
        super().__init__()
        # save_hyperparameters() populates self.hparams. Assigning
        # self.hparams directly is redundant and fails on Lightning
        # versions where hparams is a read-only property.
        self.save_hyperparameters(hparams)
        # A multiple-choice head expects (batch, num_choices, seq_len)
        # inputs and a single class index as label, which does not match
        # the multi-hot tag vectors produced by ArticlesDataset.
        self.model = transformers.DistilBertForSequenceClassification.from_pretrained(
            self.hparams.model_name_or_path,
            num_labels=self.hparams.num_labels,
            problem_type="multi_label_classification",
        )
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.hparams.tokenizer_name_or_path)
        self.model_dir = self.hparams.model_dir

    def forward(self, input_ids, labels, attention_mask=None):
        """Run the classifier and return its output (loss first).

        Args:
            input_ids: token ids of the batch.
            labels: multi-hot float targets, one column per tag.
            attention_mask: optional mask marking real tokens versus
                padding; ``None`` lets the model attend to every position.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch coming from ArticlesDataset."""
        # Multi-label training uses a binary cross-entropy loss, which
        # needs float targets. The labels are class indicators, not token
        # ids, so they must not be masked against the pad token id.
        labels = batch['labels'].float()
        outputs = self.forward(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=labels,
        )
        # The loss is the first element of the model output when labels
        # are supplied.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` and log it."""
        loss = self._step(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True,
                 prog_bar=True, logger=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for ``batch`` and log it."""
        loss = self._step(batch)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return {"val_loss": loss}

    def train_dataloader(self):
        """Build the shuffled training DataLoader over ``train_df``."""
        # Reuse the module's own tokenizer; there is no global `tokenizer`.
        train_dataset = ArticlesDataset(train_df, self.tokenizer, input_length=input_length)

        return DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=self.hparams.train_batch_size,
                          drop_last=True,
                          num_workers=2)

    def val_dataloader(self):
        """Build the validation DataLoader over ``validate_df``."""
        # The split cell names the validation partition `validate_df`.
        validation_dataset = ArticlesDataset(validate_df, self.tokenizer, input_length=input_length)

        # Validation is iterated in order: shuffling adds nothing to an
        # averaged loss.
        return DataLoader(validation_dataset,
                          batch_size=self.hparams.eval_batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        """Create AdamW plus a linear warmup/decay learning-rate schedule.

        Returns:
            A pair ``([optimizer], [scheduler_config])``; the scheduler is
            stepped after every optimizer step.
        """
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        decay_params = []
        no_decay_params = []
        for name, param in self.model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {"params": decay_params, "weight_decay": self.hparams.weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate)
        # Learning rate rises linearly for num_warmup_steps, then decays
        # linearly to zero at num_training_steps.
        scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.hparams.training_steps,
        )
        self.lr_scheduler = scheduler
        # The scheduler has to be returned, otherwise Lightning never
        # steps it and the learning rate stays constant. self.log() is
        # meant for numeric metrics inside step hooks, so the optimizer
        # object is not logged here.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

#### 6. Train

In [None]:
# Folder on the mounted Google Drive used as the Trainer's root directory.
saved_models_path = '/content/drive/MyDrive/Colab Notebooks/research paper/saved_models'
args_dict = dict(
    model_dir               = saved_models_path,
    model_name_or_path      = 'distilbert-base-uncased',
    # The tokenizer must come from the same checkpoint as the model; this
    # value was previously left empty, which is a syntax error.
    tokenizer_name_or_path  = 'distilbert-base-uncased',
    num_labels              = len(tags_list),
    learning_rate           = 1e-5,
    weight_decay            = 0.0,
    adam_epsilon            = 1e-8,
    warmup_steps            = 0,
    train_batch_size        = 1,
    eval_batch_size         = 1,
    num_train_epochs        = 2,
    accumulate_grad_batches = 10,
    training_steps          = 10000,
    n_gpu                   = 1,
    resume_from_checkpoint  = None,
    val_check_interval      = 0.5,
    early_stop_callback     = False,
    num_workers             = 0,
    fp_16                   = False,
    opt_level               = 'O1',
    max_grad_norm           = 1.0,
)
args = argparse.Namespace(**args_dict)
logger = TensorBoardLogger("tb_logs", version = 1, name = 'distilbert')

# Fix every random seed so runs are repeatable.
pl.seed_everything(42)
Distilbert_finetuner = DistilbertFineTuner(args)

train_params = dict(
    default_root_dir = args.model_dir,
    # Take the value from args instead of a hard-coded 1, so the configured
    # gradient accumulation is actually applied.
    accumulate_grad_batches = args.accumulate_grad_batches,
    gpus = args.n_gpu,
    max_epochs = args.num_train_epochs,
    precision = 16 if args.fp_16 else 32,
    amp_level = args.opt_level,
    resume_from_checkpoint = args.resume_from_checkpoint,
    gradient_clip_val = args.max_grad_norm,
    val_check_interval = args.val_check_interval,
    progress_bar_refresh_rate = 20,
    logger = logger
)

trainer = pl.Trainer(**train_params)

# The module supplies its own train/validation dataloaders.
trainer.fit(Distilbert_finetuner)

#### TODO:

- This model will give us per article classification across all tags.
- When any article is added, we can run this model and store the corresponding classification vector.
- For a particular user, we can build a tag_vector from either the last paper accessed or the mean of the classification vectors of all papers accessed, and use it to find the nearest matching articles.