In [1]:
"""
In the first phase, a domain classifier is trained on a pair of datasets.
The goal is to teach the model how to distinguish between the two datasets
by learning their unique features and characteristics.
"""

'\nIn the first phase, a domain classifier is trained on a pair of datasets.\nThe goal is to teach the model how to distinguish between the two datasets\nby learning their unique features and characteristics.\n'

In [2]:
# Import the required libraries
import torch.nn.functional as F
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

2023-07-05 12:59:47.211378: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-05 12:59:47.920364: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import re
# function to remove all digits from the abstract
def clean_abstract(text):
    return re.sub(r"\d", "", text)

In [4]:
data1 = pd.read_csv('final_patent.csv')
data1 = data1.dropna()
data1 = data1.reset_index(drop=True)
data1['text'] = data1['patent_title'].astype(str) + ' ' + data1['patent_abstract'].astype(str)
data1['text'] = data1['text'].apply(lambda x: x.lower())
data1['text'] = data1['text'].apply(clean_abstract)
data1["label"] = "Patents"
data1[["text", "label"]] = data1[["patent_abstract", "label"]].astype(str)
data1["year"] = pd.to_datetime(data1['patent_date']).dt.year
data1 = data1[["text", "cpc_code", "year", "label"]]

  data1 = pd.read_csv('final_patent.csv')


In [5]:
data1.head(2)

Unnamed: 0,text,cpc_code,year,label
0,A frequency modulated (coherent) laser detecti...,G01,2018,Patents
1,The injection molding machine includes a fixed...,B29,2018,Patents


In [6]:
grouped_data = data1.groupby(['cpc_code', 'year'])

data1 = grouped_data.apply(lambda x: x.sample(n=min(200, len(x)), random_state=42))
data1 = data1.reset_index(drop=True)
data1


Unnamed: 0,text,cpc_code,year,label
0,An improved combine waste collection system i...,A01,1976,Patents
1,A class of aminimides structurally characteri...,A01,1976,Patents
2,A flow control valve for use with fluid under...,A01,1976,Patents
3,A tractor having a pair of parallel catch arm...,A01,1976,Patents
4,A portable electric fence post is disclosed f...,A01,1976,Patents
...,...,...,...,...
937644,A three-dimensional memory device is provided....,H10,2022,Patents
937645,A display device is disclosed that includes on...,H10,2022,Patents
937646,"A memory device may be provided, including a b...",H10,2022,Patents
937647,An energy recovery unit (8) for use in a vehic...,H10,2022,Patents


In [7]:
data2 = pd.read_csv('abstract_title_text_RD.csv')
data2 = data2.dropna()
data2 = data2.reset_index(drop=True)

data2['abstract'] = data2['abstract'].apply(lambda x: x.lower())
data2['abstract'] = data2['abstract'].apply(clean_abstract)
data2['abstract'] = data2['abstract'].str.rsplit('.', 1).str[0]
data2['year'] = pd.to_datetime(data2['date']).dt.year
# test_data = rd

data2 = data2.dropna()
data2 = data2.reset_index(drop=True)
data2["label"] = "RD"
data2[["text", "label"]] = data2[["abstract", "label"]].astype(str)
data2 = data2[["text", "year", "label"]]

  data2['abstract'] = data2['abstract'].str.rsplit('.', 1).str[0]


In [8]:
data2.head(2)

Unnamed: 0,text,year,label
0,esterification of acids low-molecular-weight p...,1968,RD
1,advanced metallurgy process an advanced metal...,1985,RD


In [9]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
import transformers
from transformers import AutoTokenizer, AutoConfig
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
from tqdm import tqdm
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=250):
    input_ids = []
    attention_mask = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i + chunk_size]
        encs = tokenizer.batch_encode_plus(
            text_chunk,
            max_length=maxlen,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )        
        input_ids.append(encs['input_ids'])
        attention_mask.append(encs['attention_mask'])

    return {
        'input_ids': torch.cat(input_ids, dim=0).squeeze(),
        'attention_mask': torch.cat(attention_mask, dim=0).squeeze()
    }

In [11]:
# Merge the datasets into a single dataframe
data = pd.concat([data1, data2], ignore_index=True)

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2)

train_data_text = train_data['text'].to_list()
val_data_text = val_data['text'].to_list()


In [12]:
data

Unnamed: 0,text,cpc_code,year,label
0,An improved combine waste collection system i...,A01,1976,Patents
1,A class of aminimides structurally characteri...,A01,1976,Patents
2,A flow control valve for use with fluid under...,A01,1976,Patents
3,A tractor having a pair of parallel catch arm...,A01,1976,Patents
4,A portable electric fence post is disclosed f...,A01,1976,Patents
...,...,...,...,...
988408,a fast adder using an optical carry chain ca...,,1994,RD
988409,"aiaa guidance, navigation, and control confer...",,2011,RD
988410,automatic personalized interests graph constr...,,2013,RD
988411,stabilizer compound for dye enhanced photothe...,,1978,RD


In [13]:
train_encodings = fast_encode(train_data_text, tokenizer)
val_encodings = fast_encode(val_data_text, tokenizer)

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

100%|██████████| 3089/3089 [02:09<00:00, 23.79it/s]
100%|██████████| 773/773 [00:32<00:00, 23.78it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [14]:
# Tokenize the input data and convert to tensors
def labels(data):
    label_map = {label: i for i, label in enumerate(set(data['label'].to_list()))}
    labels = [label_map[label] for label in data['label'].to_list()]
    labels = torch.tensor(labels)
    return labels
train_labels = labels(train_data)
val_labels = labels(val_data)

In [15]:
# Load the two datasets
# data1 = pd.read_csv('sample_data.csv', nrows = 4000)
# data1['text'] = data1['patent_title'].astype(str) + ',' + data1['patent_abstract'].astype(str) + ',' + data1['summary_text'].astype(str)

In [16]:
# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
num_epochs = 1
total_steps = len(train_data) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train the model
for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()

    # Reset the loss for this epoch
    total_loss = 0
    total_mask = []
    all_diffs = []
    
    # Train the model on batches of data
    for i in range(0, len(train_data), 32):
        # Clear gradients
        optimizer.zero_grad()

        # Move the data to the device
        batch_encodings = {key: val[i:i+32].to(device) for key, val in train_encodings.items()}
        batch_labels = train_labels[i:i+32].to(device)

        # Forward pass
        outputs = model(**batch_encodings, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        # print(outputs.logits)
        # apply softmax along the second dimension (classes)
        probs = F.softmax(outputs.logits, dim=1)
        # convert probabilities tensor to a NumPy array
        probs_np = probs.detach().cpu().numpy()
        diff = abs(probs_np[:, 1] - probs_np[:, 0]).tolist()
        all_diffs.append(diff)

        # # create boolean mask to select rows with probability between 0.5 and 0.7
        # mask = np.logical_and(probs_np[:,0] > 0.4, probs_np[:,0] < 0.6)
        # total_mask.append(mask)
        # # select rows using boolean mask
        # selected_rows = data1[mask]

        # # print selected rows
        # print(selected_rows)
        # print(probs)

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()
        scheduler.step()

    # Print the average loss for this epoch
    print(f"Epoch {epoch+1} loss: {total_loss/len(train_data)}")

    # Set the model to evaluation mode
    model.eval()

    # Evaluate the model on the validation set
    with torch.no_grad():
        val_loss = 0
        total = 0
        total_correct = 0
        num_correct = 0
        for i in range(0, len(val_data), 32):
            # Move the data to the device
            batch_encodings = {key: val[i:i+32].to(device) for key, val in val_encodings.items()}
            batch_labels = val_labels[i:i+32].to(device)

            # Forward pass
            outputs = model(**batch_encodings, labels=batch_labels)
            loss = outputs.loss
            val_loss += loss.item()

            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            total = total + len(predictions)
            num_correct = torch.sum(predictions == batch_labels).item()
            total_correct = total_correct + num_correct
        print(total_correct/total)




Epoch 1 loss: 0.00010090165303754426
0.9983863053474502


In [17]:
"""
In the second phase, the source domain training samples are ranked based on the output from the domain classifier. 
This ranking process identifies which samples in the source domain are most similar to the target domain. 
A subset of the top-ranked data points is then selected from the source domain training set.
"""

'\nIn the second phase, the source domain training samples are ranked based on the output from the domain classifier. \nThis ranking process identifies which samples in the source domain are most similar to the target domain. \nA subset of the top-ranked data points is then selected from the source domain training set.\n'

In [18]:
import itertools
flattened = list(itertools.chain.from_iterable(all_diffs))

In [19]:
train_data['diff'] = flattened

In [20]:
# sort the dataframe by the 'diff' column in ascending order
train_data_sorted = train_data.sort_values('diff')

# show only the rows where the 'label' column is 'Patent'
patent_rows = train_data_sorted[train_data_sorted['label'] == 'Patents']
patent_rows.reset_index(inplace = True)
# print the resulting dataframe
display(patent_rows)

Unnamed: 0,index,text,cpc_code,year,label,diff
0,733708,"A furnace for preparing test specimens, such ...",F27,1976,Patents,0.001711
1,415226,High molecular weight poly(dihalophosphazenes...,C07,1981,Patents,0.007467
2,533788,"A device having at least one, preferably a pl...",D04,1981,Patents,0.008148
3,851654,A high-hat cymbal stand of the type including...,G10,1984,Patents,0.009876
4,908975,Voltage controlled variable attenuators are de...,H03,2014,Patents,0.012645
...,...,...,...,...,...,...
750232,829987,A wireless device access system employs short-...,G07,2016,Patents,0.999998
750233,844425,Each display control block processes color pi...,G09,1995,Patents,0.999998
750234,387202,A submerged biofiltration purifying apparatus ...,C02,2012,Patents,0.999999
750235,353932,A variable counterweight system includes a cou...,B66,2011,Patents,0.999999


In [21]:
train_data_sorted.to_csv("selection_based_on_paper.csv")

In [26]:
train_data_sorted[:790]

Unnamed: 0,text,cpc_code,year,label,diff
733708,"A furnace for preparing test specimens, such ...",F27,1976,Patents,0.001711
958372,optical radiation coupling into an optical fi...,,1996,RD,0.006974
415226,High molecular weight poly(dihalophosphazenes...,C07,1981,Patents,0.007467
533788,"A device having at least one, preferably a pl...",D04,1981,Patents,0.008148
851654,A high-hat cymbal stand of the type including...,G10,1984,Patents,0.009876
...,...,...,...,...,...
378828,"Although U.S. Pat. No. 8,182,784 teaches the r...",C01,2017,Patents,0.772762
942280,improved process for preparing diazinon diazi...,,1990,RD,0.773025
946890,test method for priority circuit of data proc...,,1984,RD,0.773394
982632,ethylidene diacetate as a precursor to n-viny...,,1995,RD,0.773489
