In [1]:
# Visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import string
from nltk.corpus import stopwords

import numpy as np

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
!pip install transformers



In [3]:
import os
import gc
import copy
import datetime
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import transformers
from transformers import BertTokenizer,BertForSequenceClassification, BertModel, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from collections import defaultdict
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
import warnings
warnings.filterwarnings("ignore")

In [4]:
class PROMPTEmbedding(nn.Module):
    def __init__(self,
                wte: nn.Embedding,
                n_tokens: int = 10,
                random_range: float = 0.5,
                initialize_from_vocab: bool = True):
        super(PROMPTEmbedding, self).__init__()
        self.wte = wte
        self.n_tokens = n_tokens
        self.learned_embedding = nn.parameter.Parameter(self.initialize_embedding(wte,
                                                                               n_tokens,
                                                                               random_range,
                                                                               initialize_from_vocab))

    def initialize_embedding(self,
                             wte: nn.Embedding,
                             n_tokens: int = 10,
                             random_range: float = 0.5,
                             initialize_from_vocab: bool = True):
        if initialize_from_vocab:
            return self.wte.weight[:n_tokens].clone().detach()
        return torch.FloatTensor(wte.weight.size(1), n_tokens).uniform_(-random_range, random_range)

    def forward(self, tokens):
        input_embedding = self.wte(tokens[:, self.n_tokens:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0), 1, 1)
        return torch.cat([learned_embedding, input_embedding], 1)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/My Drive/soft_prompts/train.csv"

df = pd.read_csv(file_path)
test_df = pd.read_csv("/content/drive/My Drive/soft_prompts/test.csv",usecols=["id","excerpt"])
print('Number of training sentences: {:,}\n'.format(df.shape[0]))
df.sample(10)

Mounted at /content/drive
Number of training sentences: 2,834



Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
2029,28692531a,,,"""That poor woman looks sad and discouraged,"" s...",0.847764,0.606161
2497,c67974fcb,,,"After a great, great while, it was afternoon, ...",-1.074494,0.546319
2768,b56b87069,https://drive.google.com/file/d/0B6pcGxi_-Y9Pd...,CC BY 3.0,Have you heard the word 'sphere' before? Do yo...,0.176243,0.490506
2203,6dc503900,,,We moved westward about mid-afternoon over a r...,-1.440499,0.495897
300,e83d8c94b,https://simple.wikipedia.org/wiki/Cold_War,CC BY-SA 3.0 and GFDL,"In February 1917, Tsar (King) Nicholas II of t...",-0.856382,0.451953
954,1f3711a4c,https://www.commonlit.org/texts/the-south-secedes,CC BY 4.0,"After Davis' and Lincoln's inaugurations, pres...",-0.550399,0.481398
374,04245deb8,https://simple.wikipedia.org/wiki/Frog,CC BY-SA 3.0 and GFDL,Frogs are amphibians of the order Anura. There...,-0.644071,0.503932
421,3fbefb41a,https://simple.wikipedia.org/wiki/Information,CC BY-SA 3.0 and GFDL,"The word ""information"" is used in many differe...",-0.286443,0.472696
2370,da4cec179,,,"It is true that, to a traveller approaching th...",-2.421414,0.509662
934,b81cf476e,https://www.africanstorybook.org/,CC BY 4.0,"Once upon a time, there was a man who had thre...",0.607158,0.49603


In [6]:
def prep_text(text_df):
    text_df = text_df.str.replace("\n","",regex=False)
    return text_df.str.replace("\'s",r"s",regex=True).values
df["excerpt"] = prep_text(df["excerpt"])
test_df["excerpt"] = prep_text(test_df["excerpt"])

In [7]:
def create_folds(data, num_splits):
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1

    # the next step is to randomize the rows of the data
    data = data.sample(frac=1).reset_index(drop=True)

    # calculate number of bins by Sturge's rule
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # bin targets
    data.loc[:, "bins"] = pd.cut(
        data["target"], bins=num_bins, labels=False
    )

    # initiate the kfold class from model_selection module
    kf = StratifiedKFold(n_splits=num_splits)

    # fill the new kfold column
    # note that, instead of targets, we use bins!
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f

    # drop the bins column
    data = data.drop("bins", axis=1)

    # return dataframe with folds
    return data


# create folds
df = create_folds(df, num_splits=5)

In [8]:
class CONFIG:
    seed = 42
    max_len = 331
    train_batch = 16
    valid_batch = 32
    epochs = 10
    n_tokens=20
    learning_rate = 2e-5
    splits = 5
    scaler = amp.GradScaler()
    model='bert-base-cased'
    tokenizer = BertTokenizer.from_pretrained(model, do_lower_case=True)
    tokenizer.save_pretrained('./tokenizer')
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
def set_seed(seed = CONFIG.seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
set_seed(CONFIG.seed)

In [10]:
class BERTDataset(Dataset):
    def __init__(self,df):
        self.text = df['excerpt'].values
        self.target = df['target'].values
        self.max_len = CONFIG.max_len
        self.tokenizer = CONFIG.tokenizer
        self.n_tokens=CONFIG.n_tokens

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        text = ' '.join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        inputs['input_ids']=torch.cat((torch.full((1,self.n_tokens), 500).resize(CONFIG.n_tokens),torch.tensor(inputs['input_ids'], dtype=torch.long)))
        inputs['attention_mask'] = torch.cat((torch.full((1,self.n_tokens), 1).resize(CONFIG.n_tokens), torch.tensor(inputs['attention_mask'], dtype=torch.long)))

        return {
            'ids': inputs['input_ids'],
            'mask': inputs['attention_mask'],

            'target': torch.tensor(self.target[index], dtype=torch.float)
        }

In [11]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 1,
    output_attentions = False,
    output_hidden_states = False,
)
prompt_emb = PROMPTEmbedding(model.get_input_embeddings(),
                      n_tokens=20,
                      initialize_from_vocab=True)
model.set_input_embeddings(prompt_emb)
model.to('cuda')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): PROMPTEmbedding(
        (wte): Embedding(30522, 768, padding_idx=0)
      )
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [12]:
def get_data(fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = BERTDataset(df_train)
    valid_dataset = BERTDataset(df_valid)

    train_loader = DataLoader(train_dataset, batch_size=CONFIG.train_batch,
                              num_workers=4, shuffle=True, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG.valid_batch,
                              num_workers=4, shuffle=False, pin_memory=True)

    return train_loader, valid_loader

In [13]:
train_dataloader,validation_dataloader=get_data(0)
len(train_dataloader)

142

In [14]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.0001},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_parameters, lr=CONFIG.learning_rate)

In [15]:
# Defining LR Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader)*CONFIG.epochs
)

lrs = []
for epoch in range(1, CONFIG.epochs + 1):
    if scheduler is not None:
        scheduler.step()
    lrs.append(optimizer.param_groups[0]["lr"])
layout = go.Layout(template= "plotly_dark",title='Learning_rate')
fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(x=list(range(CONFIG.epochs)), y=lrs,
                    mode='lines+markers',
                    name='Learning_rate'))
fig.show()

In [16]:
def loss_fn(output,target):
     return torch.sqrt(nn.MSELoss()(output,target))
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [17]:
def run(model,optimizer,scheduler):
    set_seed(40)
    scaler=CONFIG.scaler
    training_stats = []
    total_t0 = time.time()
    best_rmse = np.inf
    epochs=CONFIG.epochs
    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        t0 = time.time()
        total_train_loss = 0
        data_size=0
        model.train()
        for step, batch in enumerate(train_dataloader):
            tr_loss=[]
            b_input_ids = batch['ids'].to(CONFIG.device)
            b_input_mask = batch['mask'].to(CONFIG.device)
            b_labels = batch['target'].to(CONFIG.device)
            batch_size = b_input_ids.size(0)
            model.zero_grad()
            with amp.autocast(enabled=True):
                output= model(b_input_ids,attention_mask=b_input_mask)
                output=output["logits"].squeeze(-1)
                loss = loss_fn(output,b_labels)
                tr_loss.append(loss.item()/len(output))
            scheduler.step()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        avg_train_loss = np.mean(tr_loss)
        training_time = format_time(time.time() - t0)
        gc.collect()
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))
        print("")
        print("Running Validation...")

        t0 = time.time()
        model.eval()
        val_loss = 0
        allpreds = []
        alltargets = []
        for batch in validation_dataloader:
            losses = []
            with torch.no_grad():
                device=CONFIG.device
                ids = batch["ids"].to(device)
                mask = batch["mask"].to(device)
                output = model(ids,mask)
                output = output["logits"].squeeze(-1)
                target = batch["target"].to(device)
                loss = loss_fn(output,target)
                losses.append(loss.item()/len(output))
                allpreds.append(output.detach().cpu().numpy())
                alltargets.append(target.detach().squeeze(-1).cpu().numpy())

        allpreds = np.concatenate(allpreds)
        alltargets = np.concatenate(alltargets)
        val_rmse=mean_squared_error(alltargets, allpreds, squared=False)
        losses = np.mean(losses)
        gc.collect()
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(losses))
        print("  Validation took: {:}".format(validation_time))

        if val_rmse <= best_rmse:
            print(f"Validation RMSE Improved ({best_rmse} -> {val_rmse})")
            best_rmse = val_rmse
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "rmse{:.4f}_epoch{:.0f}.bin".format(best_rmse, epoch_i)
            torch.save(model.state_dict(), PATH)
            print("Model Saved")

        training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': losses,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
    print("")
    print("Training complete!")
    return training_stats

In [18]:
def Visualizations(training_stats):
#     pd.set_option('precision', 2)
    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('epoch')
    layout = go.Layout(template= "plotly_dark")
    fig = go.Figure(layout=layout)
    fig.add_trace(go.Scatter(x=df_stats.index, y=df_stats['Training Loss'],
                    mode='lines+markers',
                    name='Training Loss'))
    fig.add_trace(go.Scatter(x=df_stats.index, y=df_stats['Valid. Loss'],
                    mode='lines+markers',
                    name='Validation Loss'))
    fig.show()

In [19]:
df=run(model,optimizer,scheduler)


Training...

  Average training loss: 0.08
  Training epoch took: 0:00:59

Running Validation...
  Validation Loss: 0.03
  Validation took: 0:00:14
Validation RMSE Improved (inf -> 0.8127800822257996)
Model Saved

Training...

  Average training loss: 0.08
  Training epoch took: 0:00:58

Running Validation...
  Validation Loss: 0.03
  Validation took: 0:00:14
Validation RMSE Improved (0.8127800822257996 -> 0.7371999621391296)
Model Saved

Training...

  Average training loss: 0.07
  Training epoch took: 0:00:58

Running Validation...
  Validation Loss: 0.03
  Validation took: 0:00:14
Validation RMSE Improved (0.7371999621391296 -> 0.7235888242721558)
Model Saved

Training...

  Average training loss: 0.03
  Training epoch took: 0:00:58

Running Validation...
  Validation Loss: 0.04
  Validation took: 0:00:14

Training...

  Average training loss: 0.03
  Training epoch took: 0:00:58

Running Validation...
  Validation Loss: 0.03
  Validation took: 0:00:14

Training...

  Average traini

In [20]:
# pd.set_option("display.max_columns",5)
# pd.set_option("precision", 5)
Visualizations(df)

In [23]:
def run(model, optimizer, scheduler):
    set_seed(40)
    scaler = CONFIG.scaler
    training_stats = []
    total_t0 = time.time()
    best_rmse = np.inf
    epochs = CONFIG.epochs
    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        t0 = time.time()
        total_train_loss = 0
        data_size = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            tr_loss = []
            b_input_ids = batch['ids'].to(CONFIG.device)
            b_input_mask = batch['mask'].to(CONFIG.device)
            b_labels = batch['target'].to(CONFIG.device)
            batch_size = b_input_ids.size(0)
            model.zero_grad()
            with amp.autocast(enabled=True):
                output = model(b_input_ids, attention_mask=b_input_mask)
                output = output["logits"].squeeze(-1)
                loss = loss_fn(output, b_labels)
                tr_loss.append(loss.item() / len(output))
            scheduler.step()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        avg_train_loss = np.mean(tr_loss)
        training_time = format_time(time.time() - t0)
        gc.collect()
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))
        print("")
        print("Running Validation...")

        t0 = time.time()
        model.eval()
        val_loss = 0
        allpreds = []
        alltargets = []
        inference_times = []
        allpreds = []
        alltargets = []
        losses = []

        for batch in validation_dataloader:
          t0_batch = time.time()
          with torch.no_grad():
            device = CONFIG.device
            ids = batch["ids"].to(device)
            mask = batch["mask"].to(device)
            output = model(ids, mask)
            output = output["logits"].squeeze(-1)
            target = batch["target"].to(device)
            loss = loss_fn(output, target)
            losses.append(loss.item() / len(output))
            allpreds.append(output.detach().cpu().numpy())
            alltargets.append(target.detach().squeeze(-1).cpu().numpy())
          inference_time_batch = time.time() - t0_batch
          inference_times.append(inference_time_batch)

        average_inference_time = np.mean(inference_times)
        print(f"Average Inference Time per Batch: {average_inference_time} seconds")

        allpreds = np.concatenate(allpreds)
        alltargets = np.concatenate(alltargets)
        val_rmse = mean_squared_error(alltargets, allpreds, squared=False)
        losses = np.mean(losses)
        gc.collect()
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(losses))
        print("  Validation took: {:}".format(validation_time))

        if val_rmse <= best_rmse:
            print(f"Validation RMSE Improved ({best_rmse} -> {val_rmse})")
            best_rmse = val_rmse
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "rmse{:.4f}_epoch{:.0f}.bin".format(best_rmse, epoch_i)
            torch.save(model.state_dict(), PATH)
            print("Model Saved")

        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': losses,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    # Calculate total training time
    total_training_time = format_time(time.time() - total_t0)

    return training_stats, total_training_time


In [26]:
training_stats, total_training_time = run(model, optimizer, scheduler)


Training...

  Average training loss: 0.02
  Training epoch took: 0:00:59

Running Validation...
Average Inference Time per Batch: 0.7063767247729831 seconds
  Validation Loss: 0.02
  Validation took: 0:00:14
Validation RMSE Improved (inf -> 0.7112174034118652)
Model Saved

Training...

  Average training loss: 0.03
  Training epoch took: 0:00:58

Running Validation...
Average Inference Time per Batch: 0.715358681148953 seconds
  Validation Loss: 0.02
  Validation took: 0:00:14
Validation RMSE Improved (0.7112174034118652 -> 0.7112174034118652)
Model Saved

Training...

  Average training loss: 0.02
  Training epoch took: 0:00:58

Running Validation...
Average Inference Time per Batch: 0.7171236276626587 seconds
  Validation Loss: 0.02
  Validation took: 0:00:14
Validation RMSE Improved (0.7112174034118652 -> 0.7112174034118652)
Model Saved

Training...

  Average training loss: 0.02
  Training epoch took: 0:00:58

Running Validation...
Average Inference Time per Batch: 0.719511389732