In [3]:
"""
In the first phase, a domain classifier is trained on a pair of datasets.
The goal is to teach the model how to distinguish between the two datasets
by learning their unique features and characteristics.
"""

'\nIn the first phase, a domain classifier is trained on a pair of datasets.\nThe goal is to teach the model how to distinguish between the two datasets\nby learning their unique features and characteristics.\n'

In [4]:
# Import the required libraries
import torch.nn.functional as F
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Load the two datasets
data1 = pd.read_csv('sample_data.csv', nrows = 4000)
data1["label"] = "Patents"
data1['text'] = data1['patent_title'].astype(str) + ',' + data1['patent_abstract'].astype(str) + ',' + data1['summary_text'].astype(str)

data2 = pd.read_csv('abstract_title_text_RD.csv', nrows = 4000)
data2["label"] = "RD"

data1 = data1[["text", "label"]].astype(str)
data2 = data2[["text", "label"]].astype(str)

In [51]:
# Merge the datasets into a single dataframe
data = pd.concat([data1, data2], ignore_index=True)

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2)

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize the input data and convert to tensors
def tokenize(data):
    label_map = {label: i for i, label in enumerate(set(data['label'].to_list()))}
    labels = [label_map[label] for label in data['label'].to_list()]
    labels = torch.tensor(labels)
    return tokenizer(data['text'].to_list(), padding=True, truncation=True, max_length=400, return_tensors='pt'), labels

train_encodings, train_labels = tokenize(train_data)
val_encodings, val_labels = tokenize(val_data)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
num_epochs = 1
total_steps = len(train_data) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train the model
for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()

    # Reset the loss for this epoch
    total_loss = 0
    total_mask = []
    all_diffs = []
    
    # Train the model on batches of data
    for i in range(0, len(train_data), 32):
        # Clear gradients
        optimizer.zero_grad()

        # Move the data to the device
        batch_encodings = {key: val[i:i+32].to(device) for key, val in train_encodings.items()}
        batch_labels = train_labels[i:i+32].to(device)

        # Forward pass
        outputs = model(**batch_encodings, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        # print(outputs.logits)
        # apply softmax along the second dimension (classes)
        probs = F.softmax(outputs.logits, dim=1)
        # convert probabilities tensor to a NumPy array
        probs_np = probs.detach().cpu().numpy()
        diff = abs(probs_np[:, 1] - probs_np[:, 0]).tolist()
        all_diffs.append(diff)

        # # create boolean mask to select rows with probability between 0.5 and 0.7
        # mask = np.logical_and(probs_np[:,0] > 0.4, probs_np[:,0] < 0.6)
        # total_mask.append(mask)
        # # select rows using boolean mask
        # selected_rows = data1[mask]

        # # print selected rows
        # print(selected_rows)
        # print(probs)

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()
        scheduler.step()

    # Print the average loss for this epoch
    print(f"Epoch {epoch+1} loss: {total_loss/len(train_data)}")

    # Set the model to evaluation mode
    model.eval()

    # Evaluate the model on the validation set
    with torch.no_grad():
        val_loss = 0
        total = 0
        total_correct = 0
        num_correct = 0
        for i in range(0, len(val_data), 32):
            # Move the data to the device
            batch_encodings = {key: val[i:i+32].to(device) for key, val in val_encodings.items()}
            batch_labels = val_labels[i:i+32].to(device)

            # Forward pass
            outputs = model(**batch_encodings, labels=batch_labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            total = total + len(predictions)
            num_correct = torch.sum(predictions == batch_labels).item()
            total_correct = total_correct + num_correct
        print(total_correct/total)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1 loss: 0.0021838047000346705
0.99875


In [52]:
"""
In the second phase, the source domain training samples are ranked based on the output from the domain classifier. 
This ranking process identifies which samples in the source domain are most similar to the target domain. 
A subset of the top-ranked data points is then selected from the source domain training set.
"""

'\nIn the second phase, the source domain training samples are ranked based on the output from the domain classifier. \nThis ranking process identifies which samples in the source domain are most similar to the target domain. \nA subset of the top-ranked data points is then selected from the source domain training set.\n'

In [54]:
import itertools
flattened = list(itertools.chain.from_iterable(all_diffs))

[0.1351872980594635,
 0.18749549984931946,
 0.18018165230751038,
 0.15372371673583984,
 0.255646675825119,
 0.1785520613193512,
 0.19542551040649414,
 0.021359652280807495,
 0.26059383153915405,
 0.14540478587150574,
 0.25760963559150696,
 0.20162546634674072,
 0.1452009677886963,
 0.007253885269165039,
 0.14641380310058594,
 0.23690757155418396,
 0.26235729455947876,
 0.2821299433708191,
 0.11306524276733398,
 0.23385867476463318,
 0.1729470193386078,
 0.09736809134483337,
 0.1443699598312378,
 0.11820340156555176,
 0.16527783870697021,
 0.19538092613220215,
 0.0249767005443573,
 0.047631144523620605,
 0.16127610206604004,
 0.09536507725715637,
 0.10507968068122864,
 0.1187359094619751,
 0.1210012435913086,
 0.46510830521583557,
 0.4032003879547119,
 0.2391432821750641,
 0.3408139944076538,
 0.2807919979095459,
 0.3477397859096527,
 0.06221318244934082,
 0.008292943239212036,
 0.051774442195892334,
 0.005140513181686401,
 0.38611742854118347,
 0.25598832964897156,
 0.3064034879207611,

In [55]:
train_data['diff'] = flattened

In [59]:
# sort the dataframe by the 'diff' column in ascending order
train_data_sorted = train_data.sort_values('diff')

# show only the rows where the 'label' column is 'RD'
rd_rows = train_data_sorted[train_data_sorted['label'] == 'RD']
rd_rows.reset_index(inplace = True)
# print the resulting dataframe
display(rd_rows)

Unnamed: 0,index,text,label,diff
0,5934,Bichromates of organic amines and onium salts ...,RD,0.000813
1,4883,Diazotype compositions and photographic elemen...,RD,0.001396
2,4735,Photosensitive silver halide materials \n\n902...,RD,0.004962
3,5115,274002\n\nA method for synchronizing two clock...,RD,0.005141
4,4568,Pseudo-end-capped Polyamic-acids \n\n8535\n\nN...,RD,0.008293
...,...,...,...,...
3217,7542,14447\n\nPositive-working immobile photographi...,RD,0.999324
3218,7034,296023\n\nConstruction to Reduce Stress in TCM...,RD,0.999333
3219,6345,Improvements in the analyses of metals \n\n124...,RD,0.999339
3220,6927,295032\n\nStepper Motor Phase Protection Circu...,RD,0.999380
