## Install the `transformers` and `datasets` libraries, then set the base model name

In [3]:
# Hugging Face checkpoint identifier used later when loading the tokenizer.
model_name = 'distilbert-base-uncased'

## Import relevant libraries and dependencies

In [5]:
pip install -U matplotlib

Collecting matplotlib
  Downloading matplotlib-3.9.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.2.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.53.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.4 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.1.2-py3-none-any.whl.metadata (5.1 kB)
Downloading matplotlib-3.9.1-cp312-cp312-macosx_11_0_arm64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB

In [6]:
# Data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint  # pretty print

# Hugging Face: dataset loader plus tokenizer / model / training utilities
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Standard PyTorch DataLoader
from torch.utils.data import DataLoader

# Tokenizer for the checkpoint named by `model_name` (assigned in an earlier cell).
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


Use the `load_dataset` function to load all the patent applications that were filed with the USPTO in January 2016. We set the date ranges of the training and validation sets to January 1-21, 2016 and January 22-31, 2016, respectively.

In [7]:
# Options for the HUPD 'sample' configuration. The filing-date windows define the
# splits: train = 2016-01-01 .. 2016-01-21, validation = 2016-01-22 .. 2016-01-31.
hupd_options = dict(
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

# Downloads (or reuses the cached copy of) the data and builds both splits.
dataset_dict = load_dataset('HUPD/hupd', **hupd_options)

print('Loading is done!')

Loading dataset with config: PatentsConfig(name='sample', version=0.0.0, data_dir='sample', data_files={'train': ['https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather']}, description='Patent data from January 2016, for debugging')


Downloading data: 100%|██████████| 6.67M/6.67M [00:00<00:00, 38.4MB/s]


Using metadata file: /Users/srikanthnaidu/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710


Downloading data: 100%|██████████| 388M/388M [00:05<00:00, 66.4MB/s] 


Reading metadata file: /Users/srikanthnaidu/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710
Filtering train dataset by filing start date: 2016-01-01
Filtering train dataset by filing end date: 2016-01-21
Filtering val dataset by filing start date: 2016-01-22
Filtering val dataset by filing end date: 2016-01-31


Generating train split: 16153 examples [00:07, 2087.05 examples/s]
Generating validation split: 9094 examples [00:04, 1977.43 examples/s]

Loading is done!





In [8]:
# Show the (rows, columns) shape of each split.
dataset_dict.shape

{'train': (16153, 14), 'validation': (9094, 14)}

In [9]:
# Keep a handle on the validation split and print how many records it holds.
# NOTE(review): despite the `_dict` suffix this is a single split taken from
# `dataset_dict` (presumably a `datasets.Dataset` - confirm), not a plain dict.
validation_dict = dataset_dict['validation']
print(len(validation_dict))

9094


In [10]:
# Peek at the first record; slicing returns a dict mapping column name -> list of values.
validation_dict[:1]

{'patent_number': ['13144833'],
 'decision': ['REJECTED'],
 'title': ['ROSACEA TREATMENTS AND KITS FOR PERFORMING THEM'],
 'abstract': ['Regimen for the treatment of rosacea include the application of an anti-redness composition to at least a portion of the cleansed area of skin afflicted with rosacea. The regimen may include the application of one or more of a polymetal complex, a composition containing metronidazole, and/or a protective composition. Kits containing components useful in performing such regimens are also described.'],
 'claims': ['1. A treatment regimen comprising: cleansing at least a portion of an area of skin afflicted with rosacea with an antimicrobial or cleanser; applying an anti-redness composition to at least a portion of the cleansed area; and applying a protective composition to at least a portion of the cleansed, and moisturized area. 2. A treatment regimen as in claim 1 further comprising the step of applying a composition containing metronidazole to at lea

In [11]:
# Column names of the split, in order; iterating the one-row slice (a dict) yields its keys.
keys = list(validation_dict[:1])

In [12]:
# Display the column names collected from the one-row slice.
keys

['patent_number',
 'decision',
 'title',
 'abstract',
 'claims',
 'background',
 'summary',
 'description',
 'cpc_label',
 'ipc_label',
 'filing_date',
 'patent_issue_date',
 'date_published',
 'examiner_id']

In [13]:
# Check the Python type returned when slicing the validation split.
type(validation_dict[:1])

dict

## Save one validation record to a CSV file

In [20]:
import csv

# Write one validation record (index 35) to disk as a header plus a single CSV row.
sample_record = validation_dict[35]

# newline='' is required by the csv module so that it controls line endings itself;
# without it, newlines embedded in quoted fields can be mangled and Windows gets an
# extra blank line after every row. The explicit UTF-8 encoding keeps non-ASCII
# text portable instead of depending on the platform's default encoding.
with open('patent_application_4.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=keys)
    writer.writeheader()
    writer.writerow(sample_record)

In [23]:
# pandas is already imported in the imports cell; repeating the import is harmless.
import pandas as pd

# Read the CSV file written by the previous cell back into a DataFrame.
df = pd.read_csv('patent_application_4.csv')

In [24]:
# Display the DataFrame loaded from the CSV (a single record).
df

Unnamed: 0,patent_number,decision,title,abstract,claims,background,summary,description,cpc_label,ipc_label,filing_date,patent_issue_date,date_published,examiner_id
0,14891246,ACCEPTED,CIRCUITRY AND METHOD FOR REGULATING A CURRENT ...,A circuitry for regulating a current for an el...,1. A circuitry for regulating a current for an...,,,The present disclosure pertains to the field o...,H02M3155,H02M3155,20160122,20170704,20160616,95689.0
