In [None]:
## This notebook fine-tunes the Sentbert-PP model on the PARADE dataset

# Please upload PARADE_dataset-main.zip (included in the code base) to the Colab notebook before running
# The code is intended to run on a GPU runtime

# **Install Libraries and Unzip the PARADE Dataset**

In [None]:
!pip install transformers
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, htt

In [None]:
from transformers import *
import torch
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
from torch.utils.data import DataLoader
from sklearn.utils import shuffle



In [None]:
!unzip PARADE_dataset-main.zip

Archive:  PARADE_dataset-main.zip
   creating: PARADE_dataset-main/
  inflating: PARADE_dataset-main/PARADE_test.txt  
  inflating: PARADE_dataset-main/PARADE_train.txt  
  inflating: PARADE_dataset-main/PARADE_validation.txt  
  inflating: PARADE_dataset-main/README.md  


# **Preprocess PARADE Dataset**

In [None]:
def pre_process_parade(filename):
    """Parse a tab-separated PARADE split file into parallel column lists.

    The first line of the file is treated as a header and skipped. For every
    remaining non-blank line, fields 3 and 4 (0-based) are stored as the two
    texts of a pair and field 1 is parsed as an integer label.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        dict with three equally long lists under the keys ``'input1'``,
        ``'input2'`` and ``'similarity_score'``. All lists are empty when the
        file contains no data lines.

    Raises:
        IndexError: If a non-blank data line has fewer than five fields.
        ValueError: If field 1 of a data line is not an integer.
    """
    data = {'input1': [], 'input2': [], 'similarity_score': []}

    # Explicit encoding keeps parsing independent of the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            # Remove the line terminator so it does not leak into the last field.
            line = line.rstrip('\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line at EOF).
                continue
            split_line = line.split('\t')
            data['input1'].append(split_line[3])
            data['input2'].append(split_line[4])
            data['similarity_score'].append(int(split_line[1]))

    return data

In [None]:
# Parse the PARADE training split into parallel lists of texts and labels.
train_data_parade = pre_process_parade('./PARADE_dataset-main/PARADE_train.txt')

# Training columns consumed by the fine-tuning step. The dict is built in one
# literal so each key is spelled exactly once and no stray key is left behind.
# The lists are shared with train_data_parade (no copy is made).
train_data = {
    'input1': train_data_parade['input1'],
    'input2': train_data_parade['input2'],
    'similarity_score': train_data_parade['similarity_score'],
}

# **Finetune Sentbert-PP**

In [None]:
def finetune_sentbert_pp(data=None, model_name='paraphrase-TinyBERT-L6-v2', epochs=3, batch_size=32):
    """Fine-tune a pretrained SentenceTransformer on sentence pairs with ContrastiveLoss.

    Called with no arguments it behaves as before: it trains
    'paraphrase-TinyBERT-L6-v2' for 3 epochs with batch size 32 on the
    notebook-level ``train_data`` dict.

    Args:
        data: dict with the keys ``'input1'``, ``'input2'`` and
            ``'similarity_score'`` holding equally long lists. Defaults to the
            notebook-level ``train_data``.
        model_name: Name or path of the pretrained model to start from.
        epochs: Number of training epochs passed to ``model.fit``.
        batch_size: Batch size of the training DataLoader.

    Returns:
        The fine-tuned ``SentenceTransformer`` instance.

    Raises:
        ValueError: If the three columns of ``data`` differ in length.
    """
    if data is None:
        # Backward-compatible default: use the dict prepared earlier in the notebook.
        data = train_data

    first_texts = data['input1']
    second_texts = data['input2']
    labels = data['similarity_score']
    # Fail fast, before downloading/loading the model, if the columns are misaligned.
    if not (len(first_texts) == len(second_texts) == len(labels)):
        raise ValueError(
            "data['input1'], data['input2'] and data['similarity_score'] must have the same length"
        )

    model = SentenceTransformer(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # One InputExample per pair; the label is handed to the loss unchanged.
    # NOTE(review): ContrastiveLoss is documented to take 1 for similar and 0 for
    # dissimilar pairs - confirm the dataset labels follow that convention.
    train_ex = [
        InputExample(texts=[text_a, text_b], label=label)
        for text_a, text_b, label in zip(first_texts, second_texts, labels)
    ]

    train_dataloader = DataLoader(train_ex, shuffle=True, batch_size=batch_size)
    train_loss = losses.ContrastiveLoss(model=model)

    model.fit([(train_dataloader, train_loss)], epochs=epochs, show_progress_bar=True)
    return model

In [None]:
# Run the fine-tuning; `model` is the SentenceTransformer returned by
# finetune_sentbert_pp() and is reused by the save cell below.
model = finetune_sentbert_pp()

Downloading (…)35b5f/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)964b235b5f/README.md:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading (…)4b235b5f/config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)35b5f/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading (…)964b235b5f/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b235b5f/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

loading configuration file /root/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-TinyBERT-L6-v2/config.json
Model config BertConfig {
  "_name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-TinyBERT-L6-v2/",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file /root/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-TinyBERT-L6-v2/pytorch_model.bin
All model checkpoi

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/236 [00:00<?, ?it/s]

Iteration:   0%|          | 0/236 [00:00<?, ?it/s]

Iteration:   0%|          | 0/236 [00:00<?, ?it/s]

# **Save the Fine-tuned model to Google Drive**

In [None]:
# Write only the model weights (state_dict) to a .pt file in the current
# working directory.
# NOTE(review): reloading presumably requires re-creating the same
# SentenceTransformer and calling load_state_dict - confirm with the consumer.
torch.save(model.state_dict(),'finetuned-sentbert-pp-parade-train.pt')

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific google.colab module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r "/content/finetuned-sentbert-pp-parade-train.pt" "/content/drive/MyDrive"