In [2]:
import os, sys, json
from datetime import datetime, timedelta
import requests
import pandas as pd

import config.config as config
import src.data_pull as pull


## Get API Fields to Query 

Read the list of required fields from the CSV and format it for use in the API request.

In [3]:
## Date stamp (YYYYMMDD) embedded in the export file name
today = datetime.now().strftime("%Y%m%d")
## Both paths are resolved against the directory the notebook is run from
cwd = os.getcwd()
## Change this file if needed
req_csv = os.path.join(cwd, 'data/api_fields_20230322.csv')
export_file = os.path.join(cwd, f'data/patent_view_{today}.json')

In [4]:
## Read the required-fields CSV through the project helper, then format it;
## str_format is interpolated into the `f=` parameter of every API URL in this notebook
df_api = pull.read_required_fields(req_csv)
str_format = pull.format_fields_for_api(df_api)

#### Pull a Sample of 30,000 datapoints

In [8]:
## Pulls up to 30,000 patents in total (3 pages * 10,000 patents per page)
## Query matches patents dated on or after the day 365 days before now
query_date = (datetime.now()-timedelta(days=365)).strftime("%Y-%m-%d")
list_dfs = list()
for i in range(3):
    ## Pages are 1-based in the API options, hence i+1
    api_url = f'https://api.patentsview.org/patents/query?q={{"_gte":{{"patent_date":"{query_date}"}}}}&f={str_format}&o={{"page":{i+1},"per_page":10000}}'
    df_ = pull.call_api(api_url)
    ## The other pull loops in this notebook treat an int return as a failed call
    ## (presumably the HTTP status code); keep it out of the list handed to combine_called_dfs
    if isinstance(df_, int):
        print(df_)
        break
    list_dfs.append(df_)
df_raw = pull.combine_called_dfs(list_dfs)
pull.write_json(df_raw, export_file)

### Pull All Data from the last year to now

In [58]:
## Query matches patents dated on or after the day 365 days before now
query_date = (datetime.now()-timedelta(days=365)).strftime("%Y-%m-%d")
list_dfs = list()
i = 0
## Keep requesting pages until the API returns an error code or an empty page
while True:
    ## Pages are 1-based in the API options, hence i+1
    api_url = f'https://api.patentsview.org/patents/query?q={{"_gte":{{"patent_date":"{query_date}"}}}}&f={str_format}&o={{"page":{i+1},"per_page":10000}}'
    df_ = pull.call_api(api_url)
    i += 1
    ## An int return marks a failed call; check it BEFORE appending so the
    ## error code never ends up in the list passed to combine_called_dfs
    if isinstance(df_, int):
        print(df_)
        break
    list_dfs.append(df_)
    ## An empty page means there is nothing left to fetch
    if df_.empty:
        break
df_raw = pull.combine_called_dfs(list_dfs)
pull.write_json(df_raw, export_file)

## Pull Data Between a Specific Date Range 

In [4]:
## Query fragments bounding patent_date (inclusive: _gte / _lte);
## the date-range pull loop combines them under an "_and" clause
start_date = '{"_gte":{"patent_date":"2020-07-01"}}' ## adjust date as needed
end_date = '{"_lte":{"patent_date":"2020-12-31"}}' ## adjust date as needed 

In [6]:
## Pulls every page (10,000 patents per page) whose patent_date falls
## between start_date and end_date, both defined in the cell above
list_dfs = list()
i = 0
## Keep requesting pages until the API returns an error code or an empty page
while True:
    ## Pages are 1-based in the API options, hence i+1
    api_url = f'https://api.patentsview.org/patents/query?q={{"_and":[{start_date},{end_date}]}}&f={str_format}&o={{"page":{i+1},"per_page":10000}}'
    df_ = pull.call_api(api_url)
    i += 1
    ## An int return marks a failed call; check it BEFORE appending so the
    ## error code never ends up in the list passed to combine_called_dfs
    if isinstance(df_, int):
        print(df_)
        break
    list_dfs.append(df_)
    ## An empty page means there is nothing left to fetch
    if df_.empty:
        break

In [8]:
## Show the working directory that the relative data/ paths in this notebook are joined onto
os.getcwd()

'/Users/mckenziequinn/github/Project_Patent_Classification/DataEngineering'

In [7]:
## Merge the per-page results collected by the date-range pull loop
df = pull.combine_called_dfs(list_dfs)
## Write under the working directory's data/ folder like the other exports;
## a leading "/" would point at the filesystem root instead
pull.write_json(df, os.path.join(os.getcwd(), 'data/api_pull_20200701_20201231.json'))

## Get Additional data fields for Specific API Fields 

When querying specific patent numbers, you need to batch the IDs into groups of at most 400 per API call; otherwise the request fails with an error.

In [33]:
## path: CSV that pull.read_specific_patents reads the patent ids from
## export_file: date-stamped output path for the enriched pull
## NOTE(review): export_file ends in .csv but is later handed to pull.write_json — confirm the intended format
path = os.path.join(os.getcwd(),'data/2020-12-31.csv')
export_file = os.path.join(os.getcwd(),f'data/{today}_updated.csv')

In [146]:
## Need to batch ids into groups of 400 due to API limits
patent_ids = pull.read_specific_patents(path)
## presumably a list of id lists with up to 400 entries each — TODO confirm against pull.to_matrix
patent_id = pull.to_matrix(patent_ids,400)

In [148]:
## One request per batch of ids (patent_id holds the groups built by pull.to_matrix)
## instead of one request per patent, so the batching is actually used.
## assumes patent_id is an iterable of id sequences — TODO confirm against pull.to_matrix
list_dfs = list()
for batch in patent_id:
    ## Send the batch as a JSON array of quoted ids; compact separators keep spaces out of the URL
    ids_json = json.dumps([str(p) for p in batch], separators=(',', ':'))
    api_url = f'https://api.patentsview.org/patents/query?q={{"patent_number":{ids_json}}}&f={str_format}&o={{"page":1,"per_page":10000}}'
    df_temp = pull.call_api(api_url)
    ## The paged pull loops in this notebook treat an int return as a failed call;
    ## report it and move on so the error code is not handed to combine_called_dfs
    if isinstance(df_temp, int):
        print(df_temp)
        continue
    list_dfs.append(df_temp)

In [149]:
## Combine the per-request results gathered in list_dfs and write them to export_file
df = pull.combine_called_dfs(list_dfs)
pull.write_json(df, export_file)
