In [None]:
"""
In the first phase, a domain classifier is trained on a pair of datasets.
The goal is to teach the model how to distinguish between the two datasets
by learning their unique features and characteristics.
"""

In [None]:
# Import the required libraries
import torch.nn.functional as F
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [None]:
import re
def clean_abstract(text):
    """Return ``text`` with every digit character removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without any character matched by the regex ``\\d``.
    """
    digit_pattern = re.compile(r"\d")
    return digit_pattern.sub("", text)

In [None]:
# Load the patent dataset and drop every row that has a missing value
data1 = pd.read_csv('final_patent.csv')
data1 = data1.dropna()
data1 = data1.reset_index(drop=True)

# Model input: title + abstract, lower-cased, with all digits stripped
data1['text'] = data1['patent_title'].astype(str) + ' ' + data1['patent_abstract'].astype(str)
data1['text'] = data1['text'].str.lower()
data1['text'] = data1['text'].apply(clean_abstract)

# Domain label used by the classifier. The cleaned 'text' column is kept as
# built above; it must not be re-assigned from the raw 'patent_abstract'
# column, which would silently discard the lower-casing and digit removal.
data1["label"] = "Patents"
data1["year"] = pd.to_datetime(data1['patent_date']).dt.year

# Keep only the columns used downstream
data1 = data1[["text", "cpc_code", "year", "label"]]

In [None]:
# Preview the first two rows of the prepared patent frame
data1.head(2)

In [None]:
# Cap every (cpc_code, year) group at 200 randomly sampled rows (fixed seed).
grouped_data = data1.groupby(['cpc_code', 'year'])

# Iterating the groups (instead of groupby.apply) hands each sample the full
# sub-frame, so 'cpc_code' and 'year' stay as ordinary columns regardless of
# how groupby.apply treats grouping columns in the installed pandas version.
sampled_groups = [
    group.sample(n=min(200, len(group)), random_state=42)
    for _, group in grouped_data
]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns
data1 = pd.concat(sampled_groups, ignore_index=True) if sampled_groups else data1.iloc[0:0]
data1


In [None]:
# Load the RD dataset and drop every row that has a missing value
data2 = pd.read_csv('abstract_title_text_RD.csv')
data2 = data2.dropna()
data2 = data2.reset_index(drop=True)

# Normalise the abstract: lower-case and strip all digits
data2['abstract'] = data2['abstract'].str.lower()
data2['abstract'] = data2['abstract'].apply(clean_abstract)
# Keep only the text before the last '.'; text without a '.' is left unchanged.
# `n` is passed by keyword because it is keyword-only in pandas >= 2.0.
data2['abstract'] = data2['abstract'].str.rsplit('.', n=1).str[0]
data2['year'] = pd.to_datetime(data2['date']).dt.year

# Drop any rows that ended up with missing values after the steps above
data2 = data2.dropna()
data2 = data2.reset_index(drop=True)

# Domain label used by the classifier; 'text' mirrors the cleaned abstract
data2["label"] = "RD"
data2[["text", "label"]] = data2[["abstract", "label"]].astype(str)
data2 = data2[["text", "year", "label"]]

In [None]:
# Preview the first two rows of the prepared RD frame
data2.head(2)

In [None]:
!pip install transformers

In [None]:
import transformers
from transformers import AutoTokenizer, AutoConfig
# Tokenizer for the 'bert-base-uncased' checkpoint; it is the tokenizer passed
# to fast_encode when the training and validation texts are encoded.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
from tqdm import tqdm
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=250):
    """Tokenize ``texts`` in chunks and return stacked, fixed-length tensors.

    Parameters
    ----------
    texts : sequence of str
        Texts to encode; must support ``len()`` and slicing.
    tokenizer
        Object exposing ``batch_encode_plus`` that returns a mapping with
        ``'input_ids'`` and ``'attention_mask'`` tensors.
    chunk_size : int, default 256
        Number of texts handed to the tokenizer per call.
    maxlen : int, default 250
        Every sequence is padded or truncated to exactly this many tokens.

    Returns
    -------
    dict
        ``{'input_ids': Tensor, 'attention_mask': Tensor}``, each of shape
        ``(len(texts), maxlen)``.
    """
    input_ids = []
    attention_mask = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i + chunk_size]
        encs = tokenizer.batch_encode_plus(
            text_chunk,
            max_length=maxlen,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids.append(encs['input_ids'])
        attention_mask.append(encs['attention_mask'])

    # torch.cat cannot concatenate an empty list; return correctly shaped
    # empty tensors so callers can still slice and iterate.
    if not input_ids:
        return {
            'input_ids': torch.zeros((0, maxlen), dtype=torch.long),
            'attention_mask': torch.zeros((0, maxlen), dtype=torch.long),
        }

    # No .squeeze(): it would drop the batch dimension when only one text is
    # encoded, turning (1, maxlen) into (maxlen,) and breaking batch slicing.
    return {
        'input_ids': torch.cat(input_ids, dim=0),
        'attention_mask': torch.cat(attention_mask, dim=0)
    }

In [None]:
# Merge the datasets into a single dataframe
data = pd.concat([data1, data2], ignore_index=True)

# Split the data into training and validation sets.
# A fixed seed makes the split reproducible across kernel restarts, and
# stratifying on the label keeps the Patents/RD ratio equal in both splits.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

train_data_text = train_data['text'].to_list()
val_data_text = val_data['text'].to_list()


In [None]:
# Display the merged frame (patent rows followed by RD rows)
data

In [None]:
# Encode the training texts first, then the validation texts, with the shared tokenizer
train_encodings, val_encodings = [
    fast_encode(texts, tokenizer) for texts in (train_data_text, val_data_text)
]

# Load pre-trained BERT with a two-class sequence-classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
# Encode the string class labels as an integer tensor
def labels(data, label_map=None):
    """Convert the ``'label'`` column of ``data`` into a tensor of class ids.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'label'`` column.
    label_map : dict, optional
        Explicit ``label -> id`` mapping. When omitted, the ids are assigned
        in sorted label order, so the mapping is deterministic and identical
        for any two frames that contain the same set of labels (iterating a
        bare ``set`` gives no such guarantee).

    Returns
    -------
    torch.Tensor
        1-D integer tensor with one class id per row of ``data``.

    Raises
    ------
    KeyError
        If ``label_map`` is given and a label in ``data`` is missing from it.
    """
    label_values = data['label'].to_list()
    if label_map is None:
        label_map = {label: i for i, label in enumerate(sorted(set(label_values)))}
    return torch.tensor([label_map[label] for label in label_values], dtype=torch.long)
# Build the label tensors for the training split first, then the validation split
train_labels, val_labels = [labels(split) for split in (train_data, val_data)]

In [None]:
# Load the two datasets
# data1 = pd.read_csv('sample_data.csv', nrows = 4000)
# data1['text'] = data1['patent_title'].astype(str) + ',' + data1['patent_abstract'].astype(str) + ',' + data1['summary_text'].astype(str)

In [None]:
# Set up the optimizer and learning rate scheduler
batch_size = 32
num_epochs = 1
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW — consider switching together with the import cell.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
# scheduler.step() runs once per batch, so the schedule length has to be the
# number of optimizer steps (batches), not the number of training samples.
steps_per_epoch = (len(train_data) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train the model
for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()

    # Reset the per-epoch accumulators
    total_loss = 0
    num_train_batches = 0
    total_mask = []  # currently unused; kept so the name stays defined
    # One list of |p(class 1) - p(class 0)| values per batch, in training-row
    # order (batches are taken sequentially, without shuffling).
    # NOTE(review): these values come from the model while it is still being
    # trained (train mode, weights changing between batches).
    all_diffs = []

    # Train the model on batches of data
    for i in range(0, len(train_data), batch_size):
        # Clear gradients
        optimizer.zero_grad()

        # Move the data to the device
        batch_encodings = {key: tensor[i:i+batch_size].to(device) for key, tensor in train_encodings.items()}
        batch_labels = train_labels[i:i+batch_size].to(device)

        # Forward pass
        outputs = model(**batch_encodings, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        num_train_batches += 1

        # Softmax over the class dimension, then the absolute gap between the
        # two class probabilities for every sample in the batch
        probs = F.softmax(outputs.logits, dim=1)
        probs_np = probs.detach().cpu().numpy()
        diff = abs(probs_np[:, 1] - probs_np[:, 0]).tolist()
        all_diffs.append(diff)

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters
        optimizer.step()
        scheduler.step()

    # outputs.loss is accumulated once per batch, so average over the number
    # of batches rather than the number of samples.
    print(f"Epoch {epoch+1} loss: {total_loss / max(num_train_batches, 1)}")

    # Set the model to evaluation mode
    model.eval()

    # Evaluate the model on the validation set
    with torch.no_grad():
        val_loss = 0
        num_val_batches = 0
        total = 0
        total_correct = 0
        num_correct = 0
        for i in range(0, len(val_data), batch_size):
            # Move the data to the device
            batch_encodings = {key: tensor[i:i+batch_size].to(device) for key, tensor in val_encodings.items()}
            batch_labels = val_labels[i:i+batch_size].to(device)

            # Forward pass
            outputs = model(**batch_encodings, labels=batch_labels)
            loss = outputs.loss
            val_loss += loss.item()
            num_val_batches += 1

            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            total = total + len(predictions)
            num_correct = torch.sum(predictions == batch_labels).item()
            total_correct = total_correct + num_correct

        # Report the validation loss (previously accumulated but never shown)
        # alongside the accuracy; guard against an empty validation set.
        print(f"Validation loss: {val_loss / max(num_val_batches, 1)}")
        print(f"Validation accuracy: {total_correct / total if total else float('nan')}")


In [None]:
"""
In the second phase, the source domain training samples are ranked based on the output from the domain classifier. 
This ranking process identifies which samples in the source domain are most similar to the target domain. 
A subset of the top-ranked data points is then selected from the source domain training set.
"""

In [None]:
import itertools
# Flatten the per-batch lists of differences into a single flat list
flattened = [value for batch_diffs in all_diffs for value in batch_diffs]

In [None]:
# Attach the probability gap to each training row. The values line up
# positionally with train_data because the training loop walks the encodings
# in order, batch by batch, without shuffling.
train_data['diff'] = flattened

In [None]:
# Order the training rows from the smallest to the largest 'diff'
train_data_sorted = train_data.sort_values(by='diff')

# Keep only the rows labelled 'Patents'; reset_index() keeps the previous
# index as a column and renumbers the rows from zero
patent_rows = train_data_sorted.loc[train_data_sorted['label'] == 'Patents'].reset_index()
# Render the resulting dataframe
display(patent_rows)