In [None]:
!pip install tensorflow-datasets --quiet
!pip install transformers --quiet
!pip install pydot --quiet

In [None]:
!pip install datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

# Pretty print
from pprint import pprint

# Datasets load_dataset function
from datasets import load_dataset, load_metric, load_from_disk

In [None]:
from transformers import BertTokenizer, TFBertModel

from transformers import logging
logging.set_verbosity_error()

# Dataset Acquisition

In [None]:

def running_in_colab():
    """
    Report whether this notebook is executing inside Google Colab.

    Detection is done by attempting to import the ``google.colab`` package;
    a successful import is treated as the marker of a Colab runtime.

    Returns:
        bool: True when ``google.colab`` imports successfully, False when the
        import raises ImportError.
    """
    try:
        import google.colab
    except ImportError:
        in_colab = False
    else:
        in_colab = True
    return in_colab

In [None]:
# Mount Google Drive (Colab only) so the cached dataset splits are reachable.
# The mount point is absolute so that it matches the '/content/drive/MyDrive/...'
# data paths used by this notebook regardless of the current working directory;
# a relative "drive" mount point only lines up when the cwd happens to be /content.

if running_in_colab():
    from google.colab import drive

    drive.mount("/content/drive")

Mounted at drive


In [None]:
# Enter path here
# Locations of the cached train / validation splits. The loading cell checks
# both with os.path.exists: when both exist the splits are read from them,
# otherwise the splits are built and then saved to these locations.
train_path = '/content/drive/MyDrive/patent_data/VDB/train'
val_path = '/content/drive/MyDrive/patent_data/VDB/val'

In [None]:

# Integer class id for every value of the raw 'decision' status field.
# NOTE: despite its name, this dict maps decision strings TO integer ids.
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}


def map_decision_to_string(example):
    """
    Replace an example's textual decision with its integer class id.

    Args:
        example: mapping with a 'decision' entry holding one of the keys of
            ``decision_to_str``.

    Returns:
        dict: ``{'decision': <int id>}``, the single-column update for the example.

    Raises:
        KeyError: if the decision value is not a key of ``decision_to_str``.
    """
    decision_id = decision_to_str[example['decision']]
    return {'decision': decision_id}

# Both cached splits must be present to skip the download; if either is
# missing, the splits are rebuilt from the HUPD dataset and written to disk.
if not (os.path.exists(train_path) and os.path.exists(val_path)):
  # Load HUPD with separate filing-date windows for the train and validation splits
  dataset_dict = load_dataset(
      'HUPD/hupd',
      name='all',
      data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
      icpr_label=None,
      train_filing_start_date='2015-01-01',
      train_filing_end_date='2016-12-31',
      val_filing_start_date='2017-01-01',
      val_filing_end_date='2017-12-31',
  )

  # Re-label the decision strings as integer ids, then keep only
  # ids 0/1 (REJECTED / ACCEPTED) whose ipc_label contains 'A61K'.
  train_set = (
      dataset_dict['train']
      .map(map_decision_to_string)
      .filter(lambda data: data["decision"] <= 1)
      .filter(lambda data: 'A61K' in data["ipc_label"])
  )
  val_set = (
      dataset_dict['validation']
      .map(map_decision_to_string)
      .filter(lambda data: data["decision"] <= 1)
      .filter(lambda data: 'A61K' in data["ipc_label"])
  )

  # Persist the filtered splits so later runs take the cached branch
  train_set.save_to_disk(train_path)
  val_set.save_to_disk(val_path)
else:
  print('Train and validation sets are cached to: ')
  pprint(train_path)
  pprint(val_path)
  train_set = load_from_disk(train_path)
  val_set = load_from_disk(val_path)

Train and validation sets are cached to: 
'/content/drive/MyDrive/patent_data/VDB/train'
'/content/drive/MyDrive/patent_data/VDB/val'


In [None]:
# Print info about the sizes of the train and validation sets
# (each split's .shape attribute is interpolated into the message as-is)
print(f'Train dataset size: {train_set.shape}')
print(f'Validation dataset size: {val_set.shape}')


Train dataset size: (14216, 14)
Validation dataset size: (1351, 14)


In [None]:
# Peek at the first three training records (displayed as column name -> list of values)
train_set[:3]

{'patent_number': ['12598047', '13001814', '13144833'],
 'decision': [0, 1, 0],
 'title': ['MULTIPOTENT STEM CELLS AND USES THEREOF',
  'Xanthine Oxidase Inhibitor And Uric Acid Production Inhibitor',
  'ROSACEA TREATMENTS AND KITS FOR PERFORMING THEM'],
 'abstract': ['The invention provides a quiescent stem cell having the capacity to differentiate into ectoderm, mesoderm and endoderm, and which does not express cell surface markers including MHC class I, MHC class II, CD44, CD45, CD13, CD34, CD49c, CD73, CD105 and CD90. The invention further provides a proliferative stem cell, which expresses genes including Oct-4, Nanog, Sox2, GDF3, P16INK4, BMI, Notch, HDAC4, TERT, Rex-1 and TWIST but does not express cell surface markers including MHC class I, MHC class II, CD44, CD45, CD13, CD34, CD49c, CD73, CD105 and CD90. The cells of the invention can be isolated from adult mammals, have embryonic cell characteristics, and can form embryoid bodies. Methods for obtaining the stem cells, as wel

In [None]:
# Restrict both splits to decision ids 0/1 and to records whose ipc_label
# contains 'A61K'. The same two filters are applied when the cache is built,
# so on a cache produced by this notebook this is a safeguard.
train_set, val_set = (
    split
    .filter(lambda data: data["decision"] <= 1)
    .filter(lambda data: 'A61K' in data["ipc_label"])
    for split in (train_set, val_set)
)

# Model inputs are the abstracts; targets are the integer decision ids.
X_train, Y_train = train_set['abstract'], train_set['decision']

# NOTE: the "test" variables are taken from the validation split.
X_test, Y_test = val_set['abstract'], val_set['decision']

In [None]:
# Class balance of the training labels: occurrences of id 0 and id 1 in Y_train
zeros_counts, ones_counts = (Y_train.count(label) for label in (0, 1))

print(f'(Rejected: {zeros_counts})')
print(f'(Accepted: {ones_counts})')

(Rejected: 5812)
(Accepted: 8404)


In [None]:
# Length of every training abstract as measured by len() on the raw entry
# (for text this is a character count, not a BERT token count).
lengths = [len(abstract) for abstract in X_train]

# Longest abstract in the training data
print(max(lengths))

3103


# BERT

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the encoder
checkpoint = 'bert-base-cased'
# Tokenizer used to turn the abstracts into model inputs
bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
# Encoder instance; note the notebook loads a fresh one from the same
# checkpoint again right before building the classification model.
bert_model = TFBertModel.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# BERT Tokenization of training and test data

# num_train_examples = 7000      # set number of train examples - 1500 for realtime demo
# num_test_examples = 5000        # set number of test examples - 500 for realtime demo

MAX_SEQUENCE_LENGTH = 512                # set max_length of the input sequence

# Settings shared by both tokenizer calls: every sequence is truncated or
# padded to exactly MAX_SEQUENCE_LENGTH and returned as TensorFlow tensors.
tokenizer_options = dict(
    max_length=MAX_SEQUENCE_LENGTH,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

x_train = bert_tokenizer(X_train, **tokenizer_options)
x_test = bert_tokenizer(X_test, **tokenizer_options)


In [None]:
# Inspect the tokenizer output for the training abstracts
x_train

{'input_ids': <tf.Tensor: shape=(14216, 512), dtype=int32, numpy=
array([[  101,  1109, 11918, ...,     0,     0,     0],
       [  101,   138,   193, ...,     0,     0,     0],
       [  101, 23287, 10453, ...,     0,     0,     0],
       ...,
       [  101,  1130, 19456, ...,     0,     0,     0],
       [  101,  1188, 23979, ...,     0,     0,     0],
       [  101,  1130, 14850, ...,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(14216, 512), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(14216, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype

In [None]:
# Inspect the token_type_ids tensor of the tokenized training data
x_train.token_type_ids

<tf.Tensor: shape=(14216, 512), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>

In [None]:
# Inspect the attention_mask tensor of the tokenized training data
x_train.attention_mask

<tf.Tensor: shape=(14216, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>

In [None]:
def create_bert_classification_model(bert_model,
                                     max_sequence_length=MAX_SEQUENCE_LENGTH,
                                     hidden_size = 256,
                                     dropout=0.3,
                                     learning_rate=0.00001):
    """
    Build and compile a binary classification model on top of a BERT encoder.

    The feature vector fed to the classification head is the encoder's first
    output sliced at sequence position 0 (``bert_out[0][:, 0, :]``, the [CLS]
    position) -- NOT the pooler output. It passes through two
    Dense(relu) + Dropout stages and a single sigmoid unit.

    Args:
        bert_model: encoder that is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword tensors.
            It is mutated: ``trainable`` is set to True so it is fine-tuned.
        max_sequence_length (int): length of every input sequence; must equal
            the length the tokenized data was padded/truncated to.
        hidden_size (int): number of units in each of the two hidden Dense layers.
        dropout (float): dropout rate applied after each hidden Dense layer.
        learning_rate (float): learning rate for the Adam optimizer.

    Returns:
        tf.keras.Model: compiled model (Adam, binary cross-entropy on
        probabilities, accuracy metric) whose inputs are named
        'input_ids_layer', 'token_type_ids_layer' and 'attention_mask_layer'.
    """
    bert_model.trainable = True

    # Use the max_sequence_length argument (not the module-level constant) so the
    # parameter actually controls the input shape.
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_out = bert_model(input_ids=input_ids,
                          token_type_ids=token_type_ids,
                          attention_mask=attention_mask)

    # First encoder output, position 0 of every sequence (the [CLS] slot).
    # bert_out[1] would be the pooler output, which is intentionally not used.
    cls_token = bert_out[0][:, 0, :]

    dense1 = tf.keras.layers.Dense(hidden_size, activation='relu', name='dense1')(cls_token)

    dropout1 = tf.keras.layers.Dropout(dropout)(dense1)

    dense2 = tf.keras.layers.Dense(hidden_size, activation='relu', name='dense2')(dropout1)

    dropout2 = tf.keras.layers.Dropout(dropout)(dense2)

    # Single sigmoid unit -> probability of class 1
    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(dropout2)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    # The output is already a probability, hence from_logits=False.
    # metrics is passed as a list: a bare string is rejected by newer Keras versions.
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics=['accuracy'])

    return classification_model

In [None]:
#let's get a fresh instance of the bert_model -- good practice
# (create_bert_classification_model sets bert_model.trainable = True, so the
# encoder passed in is the one whose weights get updated during training)
bert_model = TFBertModel.from_pretrained(checkpoint)
bert_classification_model = create_bert_classification_model(bert_model)

In [None]:
# Print the layer-by-layer summary of the assembled classification model
bert_classification_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids_layer (InputLaye  [(None, 512)]                0         []                            
 r)                                                                                               
                                                                                                  
 attention_mask_layer (Inpu  [(None, 512)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 512)]                0         []                            
 tLayer)                                                                                      

In [None]:
# Each Keras input-layer name paired with the tokenizer output that feeds it,
# so the feature dicts line up with the model's named inputs.
layer_to_feature = {
    'input_ids_layer': 'input_ids',
    'token_type_ids_layer': 'token_type_ids',
    'attention_mask_layer': 'attention_mask',
}

# Convert the EagerTensors to NumPy arrays, keyed by input-layer name
x_train_arrays = {
    layer: x_train[feature].numpy() for layer, feature in layer_to_feature.items()
}
x_test_arrays = {
    layer: x_test[feature].numpy() for layer, feature in layer_to_feature.items()
}

batch_size = 4

# Training pipeline: (features, label) pairs, shuffled with a buffer that
# covers the whole training set, then batched.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train_arrays, Y_train))
    .shuffle(buffer_size=len(Y_train))
    .batch(batch_size)
)

# Evaluation pipeline: same structure, batched but not shuffled.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((x_test_arrays, Y_test))
    .batch(batch_size)
)

# Train the model, validating on the held-out split after every epoch
bert_classification_model_history = bert_classification_model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=5,
    verbose=1,
)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
