<a href="https://colab.research.google.com/github/shikha-aggarwal/researchpaperlikes/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### 1. Install and Import

In [None]:
!pip install datasets transformers pytorch-lightning --quiet

In [96]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

import transformers
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    AdamW,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup
)
from transformers.modeling_outputs import SequenceClassifierOutput

import json
import os
import re
import argparse
import random
import time
import string
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler

import matplotlib.pyplot as plt

## Colab accessing Google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
cd '/content/drive/My Drive/Colab Notebooks/research paper/'

/content/drive/My Drive/Colab Notebooks/research paper


In [8]:
from datasets import load_dataset
from ml_project.data_loaders import load_data

####2. Load dataset

In [9]:
data_dir = './data'
tags_list = load_data.load_tags(data_dir)
article_tags_2d_list = load_data.load_article_tags(data_dir)
user_items_2d_np = load_data.load_user_article_likes(data_dir)
articles_df, user_article_likes_2d_np = load_data.load_articles_and_user_article_likes(data_dir)

In [10]:
articles_df["tags"] = article_tags_2d_list

In [31]:
articles_df.columns

Index(['doc.id', 'title', 'citeulike.id', 'raw.title', 'raw.abstract', 'tags'], dtype='object')

In [32]:
articles_df.rename(columns = {'doc.id': 'doc_id',
                              'citeulike.id': 'citeulike_id',
                              'raw.title': 'raw_title',
                              'raw.abstract': 'raw_abstract'}, 
                              inplace = True)

In [33]:
articles_df.columns

Index(['doc_id', 'title', 'citeulike_id', 'raw_title', 'raw_abstract', 'tags'], dtype='object')

In [34]:
train_df, validate_df, test_df = np.split(articles_df.sample(frac=1, random_state=42), 
                       [int(.6*len(articles_df)), int(.8*len(articles_df))])

####3. Various constants

In [16]:
input_length = 100
MODEL_CHECKPOINT = 'distilbert-base-uncased'
saved_models_path = '/content/drive/MyDrive/Colab Notebooks/research paper/saved_models'

#### 4. Dataset class for trainer

In [147]:
class ArticlesDataset(Dataset):

    def __init__(self, df, tokenizer, input_length):
        self.pandas_df = df
        self.tokenizer = tokenizer        
        self.input_length = input_length
  

    def __len__(self):
        return len(self.pandas_df)
    

    def clean_text(self, text):
        text = text.replace('\n','')
        text = text.replace('``', '')
        text = text.replace('"', '')
        
        return text
    

    def convert_to_features(self, example_batch):
        sentence_1 = self.clean_text(example_batch['raw_title'] + \
                                     example_batch['raw_abstract'])  
        source = self.tokenizer.batch_encode_plus([sentence_1], 
                                                  max_length=self.input_length,
                                                  padding='max_length',
                                                  truncation=True,
                                                  return_tensors="pt")
        
        return source
  
  
    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()

        num_classes = len(tags_list)
        source = self.convert_to_features(self.pandas_df.iloc[index])
        
        source_ids = source["input_ids"].squeeze()
        src_mask    = source["attention_mask"].squeeze()
        tags = self.pandas_df.iloc[index]['tags']

        indices = [0] * len(tags_list)
        for tag in tags:
            indices[tag] = 1
        
        return {"input_ids": source_ids, "attention_mask": src_mask, "label": torch.tensor(indices, dtype=torch.float)}

#### 5. PyTorch Lightning Module

In [148]:
class DistilbertFineTuner(pl.LightningModule):

    def __init__(self, hparams):
        super(DistilbertFineTuner, self).__init__()
        self.save_hyperparameters(hparams)

        self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.tokenizer_name_or_path)
        self.model = AutoModel.from_pretrained(self.hparams.tokenizer_name_or_path)
        self.config_used = AutoConfig.from_pretrained(self.hparams.tokenizer_name_or_path)
        self.linear_layer = nn.Linear(self.config_used.dim, self.config_used.dim)
        self.classifier_layer = nn.Linear(self.config_used.dim, self.hparams.output_dims)
        self.dropout = nn.Dropout(self.config_used.seq_classif_dropout)

        self.model_dir = self.hparams.model_dir
        

    def forward(self, input_ids, attention_mask, labels):
        distilbert_output = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask
            )
        hidden_state = distilbert_output[0]  # (batch, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # Take the last output (batch, dim)
        pooled_output = self.linear_layer(pooled_output)
        pooled_output = nn.ReLU()(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier_layer(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits.view(-1), labels.view(-1))

        return SequenceClassifierOutput(
            loss = loss,
            logits = logits,
            hidden_states = distilbert_output.hidden_states,
            attentions = distilbert_output.attentions,
        )


    def _step(self, batch):
        outputs = self.forward(
            input_ids = batch['input_ids'],
            attention_mask = batch['attention_mask'],
            labels = batch['label']
        )
        loss = outputs[0]
        return loss


    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log('train_loss', loss, on_step = True, on_epoch = True, 
                 prog_bar = True, logger = True)
        return {"loss": loss}


    def train_dataloader(self):
        train_dataset = ArticlesDataset(train_df,
                                        self.tokenizer,
                                        input_length = input_length)
        sampler = RandomSampler(train_dataset)

        dataloader = DataLoader(train_dataset,
                                sampler = sampler,
                                batch_size = self.hparams.train_batch_size,
                                drop_last = True,
                                num_workers = 2)
        return dataloader


    def val_dataloader(self):
        validation_dataset = ArticlesDataset(validate_df,
                                             self.tokenizer,
                                             input_length = input_length)
        sampler = RandomSampler(validation_dataset)

        return DataLoader(validation_dataset,
                          batch_size = self.hparams.eval_batch_size,
                          sampler = sampler,
                          num_workers = 2)
    

    def configure_optimizers(self):
        ## Set bias decay to zero
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },]
        optimizer = AdamW(optimizer_grouped_parameters, lr = self.hparams.learning_rate)
        # decreasing learning rate (linear) after increasing in num_warmup_steps
        scheduler = get_linear_schedule_with_warmup(
            optimizer = optimizer,
            num_warmup_steps = self.hparams.warmup_steps,
            num_training_steps = self.hparams.training_steps
        )
        self.lr_scheduler = scheduler
        self.log('configured optimizer: ', optimizer)
        return optimizer

#### 6. Train

In [149]:
args_dict = dict(
    model_dir               = saved_models_path,
    model_name_or_path      = 'distilbert-base-uncased',
    tokenizer_name_or_path  = 'distilbert-base-uncased',
    # num_labels              = len(tags_list),
    output_dims             = len(tags_list),
    learning_rate           = 1e-5,
    weight_decay            = 0.0,
    adam_epsilon            = 1e-8,
    warmup_steps            = 0,
    train_batch_size        = 4,
    eval_batch_size         = 4,
    num_train_epochs        = 2,
    accumulate_grad_batches = 10,
    training_steps          = 10000,
    n_gpu                   = 1,
    resume_from_checkpoint  = None,
    val_check_interval      = 0.5, 
    early_stop_callback     = False,
    num_workers             = 0,
    fp_16                   = False, 
    opt_level               = 'O1',
    max_grad_norm           = 1.0,
)
args = argparse.Namespace(**args_dict)
logger = TensorBoardLogger("tb_logs", version = 1, name = 'distilbert')

pl.seed_everything(42)
Distilbert_finetuner = DistilbertFineTuner(args)

train_params = dict(
    default_root_dir = args.model_dir,
    accumulate_grad_batches = 1,
    gpus = args.n_gpu,
    max_epochs = args.num_train_epochs,
    precision = 16 if args.fp_16 else 32,
    amp_level = args.opt_level,
    resume_from_checkpoint = args.resume_from_checkpoint,
    gradient_clip_val = args.max_grad_norm,
    val_check_interval = args.val_check_interval,
    progress_bar_refresh_rate = 20,
    logger = logger
)

trainer = pl.Trainer(**train_params)

trainer.fit(Distilbert_finetuner)

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type            | Params
-----------------------------------------------------
0 | model            | DistilBertModel | 66.4 M
1 | linear_layer     | Linear          | 590 K 
2 | classifier_layer | Linear          | 35.7 M
3 | dropout          | Dropout         | 0     
-----------------------------------------------------
102 M     Trainable params
0         Non-trainable params
102 M     Total params
410.513   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




### 7. Predictions

In [163]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
validation_dataset = ArticlesDataset(validate_df,
                                     tokenizer = tokenizer,
                                     input_length = input_length)

sampler = RandomSampler(validation_dataset)

loader = DataLoader(validation_dataset,
                  batch_size = 32,
                  sampler = sampler,
                  num_workers = 2)


it = iter(loader)

sample_batch = next(it)
print(sample_batch["input_ids"].shape)

torch.Size([32, 100])


In [164]:
Distilbert_finetuner.to('cuda')

test_outputs = Distilbert_finetuner.forward(
    sample_batch["input_ids"].cuda(),
    sample_batch["attention_mask"].cuda(),
    sample_batch["label"].cuda())

print(test_outputs[1].shape)

torch.Size([32, 46391])


In [170]:
for index in range(min(len(sample_batch["input_ids"]), 5)):
    print(tokenizer.decode(sample_batch["input_ids"][index]))
    list_tags_golden = []
    list_tags_output = []
    for i in range(len(tags_list)):
        if sample_batch["label"][index][i] == 1.:
            list_tags_golden.append(i)
            list_tags_output.append(test_outputs[1][index][i].item())
        # if test_outputs[1][index][i] > 0.5:
        #     list_tags_output.append(i)

    print(list_tags_golden)
    print(list_tags_output)

    print("*" * 10)

[CLS] learning to recognize reliable users and content in social media with coupled mutual reinforcementcommunity question answering ( cqa ) has emerged as a popular forum for users to pose questions for other users to answer. over the last few years, cqa portals such as naver and yahoo! answers have exploded in popularity, and now provide a viable alternative to general purpose web search. at the same time, the answers to past questions submitted in cqa sites comprise a valuable knowledge repository which could be a [SEP]
[1896, 2061, 3447, 7754, 9700, 12147, 12832, 13244, 14186, 14493, 14977, 15772, 17782, 22686, 25606, 28485, 28641, 32286, 33433, 35500, 36185, 40928, 41534, 42694]
[-0.011366932652890682, 0.006085234694182873, -0.015387983992695808, -0.016205474734306335, -0.011165891773998737, 0.01911221444606781, -0.0015007118927314878, 0.034339889883995056, -0.027376720681786537, 0.017517006024718285, 0.0064484551548957825, -0.03416009247303009, -0.02070225030183792, -0.0292108729

#### TODO:

- This model will give us per article classification across all tags.
- When any article is added, we can run this model and store the corresponding classification vector.
- For a particular user, we can take either the last paper accessed or mean of the papers accessed to construct a tag_vector which can be used to find the nearest matching articles.