In [None]:
# Code to fine-tune the SPECTER and Sci-NCL models with a triplet loss.
# By default it fine-tunes the SPECTER model.
# To fine-tune the Sci-NCL model instead, comment/uncomment the marked portions of the code (indicated by ##### comments).

# Takes around 3 hours to complete on a GPU.
# The dataset is downloaded from Google Drive.

# **Install libraries and Prepare dataset**

In [None]:
!pip install transformers
!pip install datasets

In [None]:
!gdown https://drive.google.com/uc?id=1b0g06yMQCjOoO0VaRAs3fNk5Kvqni8Ka  

Downloading...
From: https://drive.google.com/uc?id=1b0g06yMQCjOoO0VaRAs3fNk5Kvqni8Ka
To: /content/hic_cs_papers.zip
100% 56.0M/56.0M [00:00<00:00, 145MB/s]


In [None]:
!unzip hic_cs_papers.zip

Archive:  hic_cs_papers.zip
  inflating: dataset.arrow           
  inflating: dataset_info.json       
  inflating: state.json              


In [None]:
from datasets import load_from_disk

# The archive was extracted into the current directory (dataset.arrow,
# dataset_info.json, state.json), so the saved dataset is loaded from '.'.
dataset = load_from_disk('.')

In [None]:
# Show the dataset's feature names and number of rows.
print(dataset)

Dataset({
    features: ['query', 'candidates'],
    num_rows: 6165
})


In [None]:
# Most papers in the dataset are from the CS domain; spot-check this by
# displaying the query paper of one randomly chosen row.
import random

# Take the upper bound from the dataset itself instead of a hard-coded row count.
n = random.randrange(len(dataset))
dataset[n]['query']

{'doc_id': '3744479',
 'title': 'Analyzing and capturing articulated hand motion in image sequences',
 'abstract': 'Capturing the human hand motion from video involves the estimation of the rigid global hand pose as well as the nonrigid finger articulation. The complexity induced by the high degrees of freedom of the articulated hand challenges many visual tracking techniques. For example, the particle filtering technique is plagued by the demanding requirement of a huge number of particles and the phenomenon of particle degeneracy. This paper presents a novel approach to tracking the articulated hand in video by learning and integrating natural hand motion priors. To cope with the finger articulation, this paper proposes a powerful sequential Monte Carlo tracking algorithm based on importance sampling techniques, where the importance function is based on an initial manifold model of the articulation configuration space learned from motion-captured data. In addition, this paper present

In [None]:
def prepare_triples(dataset_,tokenizer,seed=None):
  """Build (anchor, positive, negative) text triples for triplet-loss training.

  Every text is a paper's title and abstract joined by ``tokenizer.sep_token``.
  For each row, candidates with ``score == 1`` are positives and all other
  candidates are negatives. Both groups are shuffled and paired one-to-one, so
  a row contributes ``min(#positives, #negatives)`` triples (none when either
  group is empty).

  Args:
    dataset_: iterable of rows, each with a 'query' dict ('title', 'abstract')
      and a 'candidates' list of dicts ('title', 'abstract', 'score').
    tokenizer: object exposing a ``sep_token`` string.
    seed: optional seed. When given, shuffling uses a private
      ``random.Random(seed)`` so the returned triples are reproducible. When
      None (the default), the global ``random`` module state is used.

  Returns:
    List of ``(anchor, positive, negative)`` string tuples.
  """
  rng = random if seed is None else random.Random(seed)
  sep = tokenizer.sep_token

  def as_text(paper):
    # Title and abstract joined by the tokenizer's separator token.
    return paper['title'] + sep + paper['abstract']

  triples = []
  for row in dataset_:
    anchor = as_text(row['query'])
    candidates = row['candidates']
    pos_list = [c for c in candidates if c['score']==1]
    neg_list = [c for c in candidates if not c['score']==1]
    rng.shuffle(pos_list)
    rng.shuffle(neg_list)
    # zip stops at the shorter list, which gives min(#pos, #neg) pairs and
    # naturally skips rows where either list is empty.
    for pos, neg in zip(pos_list, neg_list):
      triples.append((anchor, as_text(pos), as_text(neg)))
  return triples

# **Finetuning the models using triplet loss**

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AdamW

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
#class for triples dataset
class TripletDataset(Dataset):
    """Dataset of (anchor, positive, negative) text triples, tokenized on access.

    Each item is a dict with keys 'anchor', 'positive' and 'negative'; every
    value is the tokenizer's output for that text, padded/truncated to
    ``max_length`` and requested as PyTorch tensors.
    """

    _ROLES = ('anchor', 'positive', 'negative')

    def __init__(self, triples, tokenizer, max_length):
        self.triples = triples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.triples)

    def _encode(self, text):
        # Shared tokenizer settings for all three members of a triple.
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        anchor, positive, negative = self.triples[idx]
        texts = (anchor, positive, negative)
        return {role: self._encode(text) for role, text in zip(self._ROLES, texts)}

In [None]:
# loss function
class TripletLoss(torch.nn.Module):
    """Margin-based triplet loss on Euclidean (p=2) pairwise distances.

    For each example the loss is ``relu(margin + d(anchor, positive) -
    d(anchor, negative))``; the mean over all examples is returned.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, anchor_embeddings, positive_embeddings, negative_embeddings):
        functional = torch.nn.functional
        d_pos = functional.pairwise_distance(anchor_embeddings, positive_embeddings, p=2)
        d_neg = functional.pairwise_distance(anchor_embeddings, negative_embeddings, p=2)
        # Hinge: only triples where the negative is not at least `margin`
        # farther away than the positive contribute.
        hinge = functional.relu(self.margin + d_pos - d_neg)
        return hinge.mean()

In [None]:
##### Comment below 2 lines to finetune scincl after restarting runtime #####
# Default checkpoint: SPECTER. The tokenizer and the model are both loaded from
# the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

##### Uncomment this to finetune scincl after restarting runtime #####
# tokenizer = AutoTokenizer.from_pretrained('malteos/scincl')
# model = AutoModel.from_pretrained('malteos/scincl')

Downloading (…)okenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
#fine-tuning code, written using PyTorch
def finetune_model(train_dataset, optimizer, loss_fn,batch_size):
    """Run one training epoch of triplet fine-tuning on the notebook-level ``model``.

    Batches are drawn in shuffled order from ``train_dataset``. For each of the
    anchor, positive and negative texts, the hidden state at sequence position 0
    of ``last_hidden_state`` is used as the embedding, and
    ``loss_fn(anchor, positive, negative)`` is minimised with ``optimizer``.
    Reads the notebook-level names ``model`` and ``device``.

    Args:
        train_dataset: dataset whose items are dicts with 'anchor', 'positive'
            and 'negative' entries, each a mapping of tokenizer output tensors
            (assumed to be of shape (1, seq_len) per item, so batches are
            (batch, 1, seq_len) -- TODO confirm if the dataset changes).
        optimizer: optimizer over the model's parameters.
        loss_fn: callable taking (anchor, positive, negative) embeddings and
            returning a scalar loss tensor.
        batch_size: number of triples per optimisation step.

    Returns:
        The sum of the per-batch loss values over the epoch (also printed).
    """
    model.train()
    tot_loss = 0.0
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    for step, batch in enumerate(train_loader):
        if step%100==0:
            print('Total steps:',end=' ')
            print(step)

        embeddings = []
        for role in ('anchor', 'positive', 'negative'):
            # squeeze(1) removes only the singleton dimension; a bare squeeze()
            # would also drop the batch dimension when a batch holds a single
            # example. The result of .to(device) is kept explicitly instead of
            # relying on the container moving its tensors in place.
            inputs = {
                name: tensor.squeeze(1).to(device)
                for name, tensor in batch[role].items()
            }
            embeddings.append(model(**inputs).last_hidden_state[:,0,:])

        loss = loss_fn(*embeddings)
        tot_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('Total loss:',end=' ')
    print(tot_loss)
    return tot_loss

In [None]:
# Training hyperparameters.
batch_size = 4  # triples per optimisation step
epochs = 2  # passes over the training triples
lr = 2e-5  # learning rate passed to AdamW

model.to(device)

# Build the (anchor, positive, negative) text triples from the loaded dataset.
triples = prepare_triples(dataset,tokenizer)

# Each text is padded/truncated to 512 tokens when an item is fetched.
train_dataset = TripletDataset(triples, tokenizer, 512)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
loss_func = TripletLoss()  # default margin of 1.0

In [None]:
# Number of training triples.
print(len(train_dataset))

16907


In [None]:
# Run the configured number of passes over the training triples.
for epoch in range(epochs):
    print('Epoch:', epoch)
    finetune_model(train_dataset, optimizer, loss_func, batch_size)

Epoch: 0
Total steps: 0
Total steps: 100
Total steps: 200
Total steps: 300
Total steps: 400
Total steps: 500
Total steps: 600
Total steps: 700
Total steps: 800
Total steps: 900
Total steps: 1000
Total steps: 1100
Total steps: 1200
Total steps: 1300
Total steps: 1400
Total steps: 1500
Total steps: 1600
Total steps: 1700
Total steps: 1800
Total steps: 1900
Total steps: 2000
Total steps: 2100
Total steps: 2200
Total steps: 2300
Total steps: 2400
Total steps: 2500
Total steps: 2600
Total steps: 2700
Total steps: 2800
Total steps: 2900
Total steps: 3000
Total steps: 3100
Total steps: 3200
Total steps: 3300
Total steps: 3400
Total steps: 3500
Total steps: 3600
Total steps: 3700
Total steps: 3800
Total steps: 3900
Total steps: 4000
Total steps: 4100
Total steps: 4200
Total loss: 3200.567248106003
Epoch: 1
Total steps: 0
Total steps: 100
Total steps: 200
Total steps: 300
Total steps: 400
Total steps: 500
Total steps: 600
Total steps: 700
Total steps: 800
Total steps: 900
Total steps: 1000
Tota

# **Save model to Google Drive**

In [None]:
##### Comment below line if fine-tuning scincl #####
# Only the model weights (state_dict) are saved, to a file in the current
# working directory.
torch.save(model.state_dict(), "specter_triplet_4.pt")

##### Uncomment below line if finetuning scincl #####
# torch.save(model.state_dict(), "scincl_triplet_4.pt")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the saved weights can be copied into it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
##### Comment below line if fine-tuning scincl #####
# Copy the saved weights file from /content into the mounted Drive folder.
!cp -r "/content/specter_triplet_4.pt" "/content/drive/MyDrive"

##### Uncomment below line if fine-tuning scincl #####
# !cp -r "/content/scincl_triplet_4.pt" "/content/drive/MyDrive"