In [1]:
import lightning as lt
import torch
import torch.nn as nn
from transformers import BertModel,BertTokenizer
from torch import optim
from torch.utils.data import Dataset, DataLoader
import random
from pytorch_metric_learning import losses
import torchmetrics

## Downloading abstract

In [None]:
import urllib

In [None]:
res = urllib.request.urlopen('https://ieeexplore.ieee.org/document/9551191')

In [None]:
r = res.read()

In [None]:
import bs4
soup = bs4.BeautifulSoup(r, "html.parser")

In [None]:
soup.find_all("meta" ,{"property":"og:description"})[0]['content']

In [None]:
for i in range(100):
    try:
        start = 9551191
        res = urllib.request.urlopen(f'https://ieeexplore.ieee.org/document/{start+i}')
        r = res.read()
        soup = bs4.BeautifulSoup(r, "html.parser")
#         print(soup.find_all("meta" ,{"property":"og:description"})[0]['content'])
        with open(f'data/orginal/{i}.txt', 'w') as f:
            f.write(soup.find_all("meta" ,{"property":"og:description"})[0]['content'])
            
    except:
        continue

## Dataset

In [2]:
import glob
import os
class ParaDataset(Dataset):
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.paraphrased_dir = os.path.join(root_dir,'paraphrased')
        self.original_dir = os.path.join(root_dir,'orginal')
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        
        self.original = set(
            [
                os.path.basename(x) for x in
                glob.glob(os.path.join(self.original_dir, "*.txt"), recursive=True)
            ]
        
        )
        self.paraphrased = set(
            [
                os.path.basename(x) for x in
                glob.glob(os.path.join(self.paraphrased_dir, "*.txt"), recursive=True)
            ]
        )
        self.text_files = list(self.original.intersection(self.paraphrased)) 

    def __len__(self):
        return len(self.text_files)
    
    
    def __getitem__(self, idx):
        
        
        get_same_pair = random.randint(0,1)
        
        if get_same_pair:
            with open(os.path.join(self.original_dir, self.text_files[idx]), 'r') as f:
                input1 = f.read()
            with open(os.path.join(self.paraphrased_dir, self.text_files[idx]), 'r') as f:
                input2 = f.read()
        else:
            with open(os.path.join(self.original_dir, self.text_files[idx]), 'r') as f:
                input1 = f.read()
            diff_text = random.choice(list(self.paraphrased-set(self.text_files[idx])))
            
            with open(os.path.join(self.paraphrased_dir, diff_text), 'r') as f:
                input2 = f.read()
        encoded_text_1 = self.tokenizer(
                input1,
                return_tensors="pt",
                padding="max_length",
                max_length=200,
                add_special_tokens=True,
                truncation=True,
            )
        
        encoded_text_2 = self.tokenizer(
                input2,
                return_tensors="pt",
                padding="max_length",
                max_length=200,
                add_special_tokens=True,
                truncation=True,
            )
        
        return encoded_text_1, encoded_text_2, get_same_pair
            

In [3]:
dataset = ParaDataset('data')

## Model architecture

In [12]:
class SiameseNetwork(lt.LightningModule):
    def __init__(self, lr = 0.0001):
        super(SiameseNetwork, self).__init__()
        
        self.lr = lr
        
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")

        for param in self.bert_model.parameters():
            param.requires_grad = False
        
        self.fc = nn.Sequential(
            nn.Linear(768*2, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )
        self.val_precision = \
        torchmetrics.classification.precision_recall.BinaryPrecision()
        self.train_precision = \
        torchmetrics.classification.precision_recall.BinaryPrecision()
        
        self.val_recall = \
        torchmetrics.classification.precision_recall.BinaryPrecision()
        self.train_recall = \
        torchmetrics.classification.precision_recall.BinaryPrecision()
        
        
    def configure_optimizers(self):
        return optim.Adadelta(self.parameters(), lr=self.lr)
    
    def forward_once(self,x):
        input_bert = x["input_ids"].view(x["input_ids"].size(0), 200 )
        atten_bert = x["attention_mask"].view(x["attention_mask"].size(0),200)
        _, pool  = self.bert_model(
            input_ids=input_bert, attention_mask=atten_bert, return_dict=False
        )
        return pool
    
    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        
        output = torch.cat([output1, output2], 1)
        
        output = self.fc(output)
        
        return output
    
    def training_step(self, train_batch, batch_idx):
        input1, input2, y = train_batch
        
        out = self(input1, input2)
        criterion = losses.ContrastiveLoss()
        loss = criterion(out, y)
        self.train_precision(out.reshape(-1).round(), y)
        self.train_recall(out.reshape(-1).round(), y)
        
        self.log("Training Loss", loss, on_epoch=True)
        self.log("Training Precision", self.train_precision, on_epoch=True)
        self.log("Training Recall", self.train_recall, on_epoch=True)
        
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        input1, input2, y = val_batch
        
        out = self(input1, input2)
        criterion = losses.ContrastiveLoss()
        loss = criterion(out, y)
        self.val_precision(out.reshape(-1).round(), y)
        self.val_recall(out.reshape(-1).round(), y)
        
        self.log("Validation Loss", loss, on_epoch=True)
        self.log("Validation Precision", self.val_precision, on_epoch=True)
        self.log("Validation Recall", self.val_recall, on_epoch=True)

In [13]:
net = SiameseNetwork()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
train_set, val_set = torch.utils.data.random_split(
        dataset,
        [0.8, 0.2],
    )

In [15]:
BATCH_SIZE = 2
MAX_EPOCH = 100

In [16]:
train_dataloader = DataLoader(
        train_set,
        batch_size=BATCH_SIZE,
        num_workers=10,
        drop_last=True,
        shuffle=True,
        #sampler=sampler,
    )

In [17]:
 test_dataloader = DataLoader(
        val_set,
        batch_size=BATCH_SIZE,
        shuffle=True,
        #sampler = test_sampler,
        num_workers=10,
        drop_last=True,
    )

In [18]:
trainer = lt.Trainer(
        max_epochs=MAX_EPOCH,
        #gradient_clip_val=1,
#         logger=llogger,
#         callbacks = [checkpoint_callback]
    )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(
        net,
        train_dataloader,
        test_dataloader,
    )

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type            | Params
----------------------------------------------------
0 | bert_model      | BertModel       | 109 M 
1 | fc              | Sequential      | 394 K 
2 | val_precision   | BinaryPrecision | 0     
3 | train_precision | BinaryPrecision | 0     
4 | val_recall      | BinaryPrecision | 0     
5 | train_recall    | BinaryPrecision | 0     
----------------------------------------------------
394 K     Trainable params
109 M     Non-trainable params
109 M     Total params
439.506   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
len(val_set)