 <h1 style="font-family:verdana;"> <center>Prompt Tuning BERT</center> </h1>


📌[The Paper:The Power of Scale for Parameter-Efficient Prompt Tuning](https://arxiv.org/pdf/2104.08691v1.pdf)

# Let's start


<p style="color:#159364; font-family:cursive;">INSTALL THE TRANSFORMERS PACKAGE FROM THE HUGGING FACE LIBRARY</center></p>


In [22]:
!pip install transformers



# <p style="color:#159364; font-family:cursive;">IMPORT THE LIBRARIES</center></p>

In [23]:
import os
import gc
import copy
import datetime
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import transformers
from transformers import BertTokenizer,BertForSequenceClassification, BertModel, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from collections import defaultdict
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
import warnings
warnings.filterwarnings("ignore")


# <p style="color:#159364; font-family:cursive;">DEFINE PROMPT EMBEDDINGS CLASS</center></p>

Reference:https://github.com/kipgparker/

In [24]:
class PROMPTEmbedding(nn.Module):
    def __init__(self, 
                wte: nn.Embedding,
                n_tokens: int = 10, 
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        super(PROMPTEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens, 
                                                                               random_range, 
                                                                               initialize_from_vocab))
            
    def initialize_embedding(self, 
                             wte: nn.Embedding,
                             n_tokens: int = 10, 
                             random_range: float = 0.5, 
                             initialize_from_vocab: bool = True):
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(wte.weight.size(1), n_tokens).uniform_(-random_range, random_range)
            
    def forward(self, tokens):
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

# <p style="color:#159364; font-family:cursive;">LOOK AT THE DATA</center></p>

In [25]:
df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv",usecols=["id","excerpt"])
print('Number of training sentences: {:,}\n'.format(df.shape[0]))
df.sample(10)

Number of training sentences: 2,834



Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
204,c77f6c356,,,Though young Whittier was a wide-awake boy and...,-0.037242,0.488942
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
788,bacdd6a73,https://www.africanstorybook.org/,CC BY 4.0,"Once upon a time, there lived a prince who had...",0.642449,0.500603
2703,2131211ce,,,Then rose the King and moved his host by night...,-2.994848,0.540557
731,695bb0ad0,https://simple.wikipedia.org/wiki/Radar,CC BY-SA 3.0 and GFDL,Radar is a machine that uses radio waves for e...,-1.572898,0.442499
1668,ced008e99,,,The forms of hats that are least injurious are...,-1.461231,0.532954
2636,44ed4b49d,,,The sisters made up their minds from the first...,-0.030671,0.501501
2419,dac1a1d1a,,,The three girls ran lightly out of the basemen...,0.335737,0.494981
1182,f04e03fd8,,,"Jupiter, two hours high, was the herald of the...",-3.229761,0.551435
771,a666c1db9,https://simple.wikipedia.org/wiki/Capitalism,CC BY-SA 3.0 and GFDL,"In capitalism, people may sell or lend their p...",0.231689,0.517093


# <p style="color:#159364; font-family:cursive;">A BIT OF PREPROCESSING</center></p>

In [26]:
def prep_text(text_df):
    text_df = text_df.str.replace("\n","",regex=False) 
    return text_df.str.replace("\'s",r"s",regex=True).values
df["excerpt"] = prep_text(df["excerpt"])
test_df["excerpt"] = prep_text(test_df["excerpt"])

# <p style="color:#159364; font-family:cursive;">CREATE FOLDS</center></p>

Code taken from:https://www.kaggle.com/abhishek/step-1-create-folds

In [27]:
def create_folds(data, num_splits):
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1
    
    # the next step is to randomize the rows of the data
    data = data.sample(frac=1).reset_index(drop=True)

    # calculate number of bins by Sturge's rule
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))
    
    # bin targets
    data.loc[:, "bins"] = pd.cut(
        data["target"], bins=num_bins, labels=False
    )
    
    # initiate the kfold class from model_selection module
    kf = StratifiedKFold(n_splits=num_splits)
    
    # fill the new kfold column
    # note that, instead of targets, we use bins!
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f
    
    # drop the bins column
    data = data.drop("bins", axis=1)

    # return dataframe with folds
    return data


# create folds
df = create_folds(df, num_splits=5)

# <p style="color:#159364; font-family:cursive;">TRAINING CONFIGURATION</center></p>

In [28]:
class CONFIG:
    seed = 42
    max_len = 331
    train_batch = 16
    valid_batch = 32
    epochs = 10
    n_tokens=20
    learning_rate = 2e-5
    splits = 5
    scaler = amp.GradScaler()
    model='bert-base-cased'
    tokenizer = BertTokenizer.from_pretrained(model, do_lower_case=True)
    tokenizer.save_pretrained('./tokenizer')
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# <p style="color:#159364; font-family:cursive;">REPRODUCIBILITY</center></p>

In [29]:
def set_seed(seed = CONFIG.seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
set_seed(CONFIG.seed)

# <p style="color:#159364; font-family:cursive;">DEFINE THE DATASET CLASS</center></p>

In [30]:
class BERTDataset(Dataset):
    def __init__(self,df):
        self.text = df['excerpt'].values
        self.target = df['target'].values
        self.max_len = CONFIG.max_len
        self.tokenizer = CONFIG.tokenizer
        self.n_tokens=CONFIG.n_tokens
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = self.text[index]
        text = ' '.join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        inputs['input_ids']=torch.cat((torch.full((1,self.n_tokens), 500).resize(CONFIG.n_tokens),torch.tensor(inputs['input_ids'], dtype=torch.long)))
        inputs['attention_mask'] = torch.cat((torch.full((1,self.n_tokens), 1).resize(CONFIG.n_tokens), torch.tensor(inputs['attention_mask'], dtype=torch.long)))

        return {
            'ids': inputs['input_ids'],
            'mask': inputs['attention_mask'],
    
            'target': torch.tensor(self.target[index], dtype=torch.float)
        }
    

# <p style="color:#159364; font-family:cursive;">MODEL:BERT FOR SEQUENCE CLASSIFICATION from 🤗 </center></p>

In [31]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 1,
    output_attentions = False,
    output_hidden_states = False, 
)
prompt_emb = PROMPTEmbedding(model.get_input_embeddings(), 
                      n_tokens=20, 
                      initialize_from_vocab=True)
model.set_input_embeddings(prompt_emb)
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): PROMPTEmbedding(
        (wte): Embedding(30522, 768, padding_idx=0)
      )
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

# <p style="color:#159364; font-family:cursive;">GET THE PREPARED DATA</center></p>

In [32]:
def get_data(fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    train_dataset = BERTDataset(df_train)
    valid_dataset = BERTDataset(df_valid)

    train_loader = DataLoader(train_dataset, batch_size=CONFIG.train_batch, 
                              num_workers=4, shuffle=True, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG.valid_batch, 
                              num_workers=4, shuffle=False, pin_memory=True)
    
    return train_loader, valid_loader

# <p style="color:#159364; font-family:cursive;">FOLD:0</center></p>

In [33]:
train_dataloader,validation_dataloader=get_data(0)
len(train_dataloader)

142

# <p style="color:#159364; font-family:cursive;">OPTIMIZER</center></p>

In [34]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
     'weight_decay': 0.0001},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
     'weight_decay': 0.0}
    ]  

optimizer = AdamW(optimizer_parameters, lr=CONFIG.learning_rate)


# <p style="color:#159364; font-family:cursive;">LEARNING RATE SCHEDULER</center></p>

In [35]:
# Defining LR Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_dataloader)*CONFIG.epochs
)

lrs = []
for epoch in range(1, CONFIG.epochs + 1):
    if scheduler is not None:
        scheduler.step()
    lrs.append(optimizer.param_groups[0]["lr"])
layout = go.Layout(template= "plotly_dark",title='Learning_rate')
fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(x=list(range(CONFIG.epochs)), y=lrs,
                    mode='lines+markers',
                    name='Learning_rate'))
fig.show()

# <p style="color:#159364; font-family:cursive;">DEFINE LOSS AND TIME FUNCTIONS</center></p>

In [36]:
def loss_fn(output,target):
     return torch.sqrt(nn.MSELoss()(output,target))
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

# <p style="color:#159364; font-family:cursive;">DEFINE THE FUNCTION FOR TRAINING,VALIDATION AND RUNNING</center></p>

In [37]:
def run(model,optimizer,scheduler):
    set_seed(40)
    scaler=CONFIG.scaler
    training_stats = []
    total_t0 = time.time()
    best_rmse = np.inf
    epochs=CONFIG.epochs
    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        t0 = time.time()
        total_train_loss = 0
        data_size=0
        model.train()
        for step, batch in enumerate(train_dataloader):
            tr_loss=[]
            b_input_ids = batch['ids'].to(CONFIG.device)
            b_input_mask = batch['mask'].to(CONFIG.device)
            b_labels = batch['target'].to(CONFIG.device)
            batch_size = b_input_ids.size(0)
            model.zero_grad() 
            with amp.autocast(enabled=True):
                output= model(b_input_ids,attention_mask=b_input_mask)          
                output=output["logits"].squeeze(-1)
                loss = loss_fn(output,b_labels)
                tr_loss.append(loss.item()/len(output))
            scheduler.step()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        avg_train_loss = np.mean(tr_loss)    
        training_time = format_time(time.time() - t0)
        gc.collect()
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))
        print("")
        print("Running Validation...")

        t0 = time.time()
        model.eval()
        val_loss = 0
        allpreds = []
        alltargets = []
        for batch in validation_dataloader:
            losses = []
            with torch.no_grad():
                device=CONFIG.device
                ids = batch["ids"].to(device)
                mask = batch["mask"].to(device)
                output = model(ids,mask)
                output = output["logits"].squeeze(-1)
                target = batch["target"].to(device)
                loss = loss_fn(output,target)
                losses.append(loss.item()/len(output))
                allpreds.append(output.detach().cpu().numpy())
                alltargets.append(target.detach().squeeze(-1).cpu().numpy())
                
        allpreds = np.concatenate(allpreds)
        alltargets = np.concatenate(alltargets)
        val_rmse=mean_squared_error(alltargets, allpreds, squared=False)
        losses = np.mean(losses)
        gc.collect() 
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(losses))
        print("  Validation took: {:}".format(validation_time))
        
        if val_rmse <= best_rmse:
            print(f"Validation RMSE Improved ({best_rmse} -> {val_rmse})")
            best_rmse = val_rmse
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "rmse{:.4f}_epoch{:.0f}.bin".format(best_rmse, epoch_i)
            torch.save(model.state_dict(), PATH)
            print("Model Saved")
            
        training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': losses,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    ) 
    print("")
    print("Training complete!")
    return training_stats  

# <p style="color:#159364; font-family:cursive;">VISUALIZATION FUNCTION </center></p>

In [38]:
def Visualizations(training_stats):
    pd.set_option('precision', 2)
    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('epoch')
    layout = go.Layout(template= "plotly_dark")
    fig = go.Figure(layout=layout)
    fig.add_trace(go.Scatter(x=df_stats.index, y=df_stats['Training Loss'],
                    mode='lines+markers',
                    name='Training Loss'))
    fig.add_trace(go.Scatter(x=df_stats.index, y=df_stats['Valid. Loss'],
                    mode='lines+markers',
                    name='Validation Loss'))
    fig.show()


 <p style="color:#159364; font-family:cursive;">RUN THE MODEL WITH PROMPT EMBEDDINGS ON FOLD 0 </center></p>

In [39]:
df=run(model,optimizer,scheduler)
Visualizations(df)


Training...

  Average training loss: 0.09
  Training epoch took: 0:01:29

Running Validation...
  Validation Loss: 0.04
  Validation took: 0:00:07
Validation RMSE Improved (inf -> 0.8142972588539124)
Model Saved

Training...

  Average training loss: 0.05
  Training epoch took: 0:01:29

Running Validation...
  Validation Loss: 0.04
  Validation took: 0:00:07
Validation RMSE Improved (0.8142972588539124 -> 0.7832609415054321)
Model Saved

Training...

  Average training loss: 0.06
  Training epoch took: 0:01:29

Running Validation...
  Validation Loss: 0.04
  Validation took: 0:00:07
Validation RMSE Improved (0.7832609415054321 -> 0.738349199295044)
Model Saved

Training...

  Average training loss: 0.07
  Training epoch took: 0:01:29

Running Validation...
  Validation Loss: 0.04
  Validation took: 0:00:07

Training...

  Average training loss: 0.04
  Training epoch took: 0:01:29

Running Validation...
  Validation Loss: 0.04
  Validation took: 0:00:07

Training...

  Average trainin


![Upvote!](https://img.shields.io/badge/Upvote-If%20you%20like%20my%20work-07b3c8?style=for-the-badge&logo=kaggle)
