## Importing

In [1]:
from google.cloud import bigquery

from open_patstat.utils.gcp import create_table, load_gcs_file, delete_table
from open_patstat.utils.schema import Schema

## Initializing the Client anf Job Config

In [2]:
# Before running this line, make sure that you have defined the environment variable...
# ..."GOOGLE_APPLICATION_CREDENTIALS" which points to the JSON file containing authentication key
client = bigquery.Client()

In [3]:
# Initializing the Job_config
job_config = bigquery.LoadJobConfig()
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10
job_config.source_format = bigquery.SourceFormat.CSV
dataset_ref = client.dataset('patstat')

In [5]:
Schema().tls206

[SchemaField('person_id', 'INTEGER', 'REQUIRED', None, ()),
 SchemaField('person_name', 'STRING', 'NULLABLE', None, ()),
 SchemaField('person_address', 'STRING', 'NULLABLE', None, ()),
 SchemaField('person_ctry_code', 'STRING', 'NULLABLE', None, ()),
 SchemaField('doc_std_name_id', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('doc_std_name', 'STRING', 'NULLABLE', None, ()),
 SchemaField('psn_id', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('psn_name', 'STRING', 'NULLABLE', None, ()),
 SchemaField('psn_level', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('psn_sector', 'STRING', 'NULLABLE', None, ())]

## Creating and Adding Tables

In [7]:
########## Creating and Loading tables ##########
#################################################

# Tables list to be loaded
#tables_list = ['tls201', 'tls209', 'tls204', 'tls207', 'tls206', 'tls211', 'tls212']
tables_list = ['tls206']
               #'tls202', 'tls203', 'tls205', 'tls210', 
               #'tls214', 'tls215', 'tls216', 'tls222', 
               #'tls223', 'tls224', 'tls226', 'tls227', 
               #'tls229', 'tls230', 
               #'tls231', 'tls801', 'tls803',
               #'tls901', 'tls902', 'tls904', 'tls906' ]

# Google Bucket directory address, which contains all data files
#gs_add = 'gs://patstat_2018g/new/'
gs_add = 'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/'

# Loading the tables in the list
for i, table in enumerate(tables_list):
    # Creating the table
    print('Step {} has started! {} is creating...'.format(i, table))
    create_table(client,
             dataset_id='patstat',
             table_id=table,
             schema=getattr(Schema(),table))
    # Adding files to the table from GCP bucket
    table_ref = dataset_ref.table(table)
    job_config.schema = getattr(Schema(),table)
    #load_gcs_file(client, gs_add+table+'_*.gz', 
    #              table_ref, job_config)
    
    load_job = client.load_table_from_uri(
        source_uris=gs_add+table+'_*.gz',
        destination=table_ref,
        # job_id=job_id,
        job_id_prefix='lgs-',
        job_config=job_config,
    )
    load_job.result()

Step 0 has started! tls206 is creating...


In [7]:
load_job.errors

[{'reason': 'invalid',
  'message': 'Provided Schema does not match Table usptobias:patstat.tls231. Field fee_renewal_year has changed type from DATE to INTEGER'}]

# Creating PAtentsview tables

In [2]:
client = bigquery.Client()
# Initializing the Job_config
job_config = bigquery.LoadJobConfig()
job_config.skip_leading_rows = 1
job_config.max_bad_records = 20000
job_config.source_format = bigquery.SourceFormat.CSV
#job_config.field_delimiter = '\t'
dataset_ref = client.dataset('patentsview')

In [3]:
########## Creating and Loading tables ##########
#################################################

# Tables list to be loaded
#tables_list = ['rawexaminer', 'rawlawyer']
#tables_list = ['patent', 'application']
tables_list = ['mapping']

# Google Bucket directory address, which contains all data files
#gs_add = 'gs://patstat_2018g/new/'
gs_add = 'gs://uspto-data/'

# Loading the tables in the list
for i, table in enumerate(tables_list):
    # Creating the table
    print('Step {} has started! {} is creating...'.format(i, table))
    create_table(client,
             dataset_id='patentsview',
             table_id=table,
             schema=getattr(Schema(),table))
    # Adding files to the table from GCP bucket
    table_ref = dataset_ref.table(table)
    job_config.schema = getattr(Schema(),table)
    #load_gcs_file(client, gs_add+table+'_*.gz', 
    #              table_ref, job_config)
    
    load_job = client.load_table_from_uri(
        source_uris=gs_add+table+'.csv.gz',
        destination=table_ref,
        # job_id=job_id,
        job_id_prefix='lgs-',
        job_config=job_config,
    )
    load_job.result()

Step 0 has started! mapping is creating...


BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 20001; errors: 20001. Please look into the errors[] collection for more details.

In [7]:
load_job.errors

[{'reason': 'invalid',
  'location': 'gs://uspto-data/patent.tsv.gz',
  'message': 'Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 6181207; errors: 201. Please look into the errors[] collection for more details.'},
 {'reason': 'invalid',
  'location': 'gs://uspto-data/patent.tsv.gz',
  'message': 'Error while reading data, error message: CSV table references column position 10, but line starting at position:2411318766 contains only 6 columns.'},
 {'reason': 'invalid',
  'location': 'gs://uspto-data/patent.tsv.gz',
  'message': 'Error while reading data, error message: CSV table references column position 10, but line starting at position:2411318947 contains only 1 columns.'},
 {'reason': 'invalid',
  'location': 'gs://uspto-data/patent.tsv.gz',
  'message': 'Error while reading data, error message: CSV table references column position 10, but line starting at position:2411319002 contains only 1 columns.'},
 {'reason': 'invalid',
  'loca