In [1]:
from google.cloud import bigquery

from open_patstat.utils.gcp import create_table, load_gcs_file, delete_table
from open_patstat.utils.schema import Schema

In [2]:
# BigQuery client shared by every cell below; no project or credentials are
# passed explicitly here.
client = bigquery.Client()

In [3]:
# Reference to the dataset that receives every table loaded in this notebook.
dataset_ref = client.dataset('patstat')

# Load-job settings reused by all load cells below; each of those cells
# assigns job_config.schema before submitting its job.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1   # skip the first row of each source file
job_config.max_bad_records = 10    # tolerate up to 10 bad records per job

# 1. Loading Tables

## 1.1. Table "tls201" (main table)

In [4]:
# Create the main table tls201 with its schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls201',
    schema=Schema().tls201,
)

In [5]:
# Load every file matching tls201_*.gz from the GCS bucket into the main table.
# The destination is 'tls201' (the table created in the previous cell), not the
# sample table 'tls201_s'.
# To load only a sample instead, target 'tls201_s' with
# 'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn_01/tls201_part01.txt'.
# NOTE(review): this URI uses the '..._Autumn_01' folder while the other
# sections use '..._Autumn' -- confirm the bucket layout.
table_ref = dataset_ref.table('tls201')
job_config.schema = Schema().tls201
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn_01/tls201_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-da4c45c0-e673-4ba5-bfff-b103083da844
Job took 9.093069076538086 seconds


## 1.2. Table "tls209_appln_ipc" (IPC codes assigned to each application)

In [7]:
# Create the main table tls209 with its schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls209',
    schema=Schema().tls209,
)

In [8]:
# Load every file matching tls209_*.gz from the GCS bucket into the main table.
job_config.schema = Schema().tls209
table_ref = dataset_ref.table('tls209')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls209_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-88f2b615-3abd-4d3d-93ad-ef855e397bcf
Job took 267.2587592601776 seconds


In [9]:
# Create the sample table tls209_s; it shares the tls209 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls209_s',
    schema=Schema().tls209,
)

In [10]:
# Load the single file tls209_part01.txt from the GCS bucket into the sample table.
job_config.schema = Schema().tls209
table_ref = dataset_ref.table('tls209_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls209_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-97d56603-15b6-4d21-ba28-016a6117f4bc
Job took 3.600205659866333 seconds


## 1.3. Table "tls204" (the priority status of patents)

In [5]:
# Create the main table tls204 with its schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls204',
    schema=Schema().tls204,
)

In [6]:
# Load every file matching tls204_*.gz from the GCS bucket into the main table.
job_config.schema = Schema().tls204
table_ref = dataset_ref.table('tls204')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls204_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-816820f0-9966-4b84-a2ae-25fc42b4fe62
Job took 216.11920499801636 seconds


In [9]:
# Create the sample table tls204_s; it shares the tls204 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls204_s',
    schema=Schema().tls204,
)

In [10]:
# Load the single file tls204_part01.txt from the GCS bucket into the sample table.
job_config.schema = Schema().tls204
table_ref = dataset_ref.table('tls204_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls204_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-506b7785-bd98-4b75-a829-28c88a20187f
Job took 6.117683410644531 seconds


## 1.4. Table "tls207": the correspondence between patent applications and inventors

In [7]:
# Create the main table tls207 with its schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls207',
    schema=Schema().tls207,
)

In [8]:
# Load every file matching tls207_*.gz from the GCS bucket into the main table.
job_config.schema = Schema().tls207
table_ref = dataset_ref.table('tls207')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls207_*.gz',
    table_ref,
    job_config,
)

Starting job lgs-ad379b9e-f584-4829-8ae0-ebed6eb950af
Job took 379.3931813240051 seconds


In [11]:
# Create the sample table tls207_s; it shares the tls207 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls207_s',
    schema=Schema().tls207,
)

In [12]:
# Load the single file tls207_part01.txt from the GCS bucket into the sample table.
job_config.schema = Schema().tls207
table_ref = dataset_ref.table('tls207_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls207_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-15d0c099-f0c0-4706-9e15-af4fcc411147
Job took 5.712496280670166 seconds


## 1.5. Table "tls206": Details on names and addresses of applicants

In [None]:
# Create the main table tls206 with its schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls206',
    schema=Schema().tls206,
)

In [None]:
# Load every file matching tls206_*.gz from the GCS bucket into the main table.
job_config.schema = Schema().tls206
table_ref = dataset_ref.table('tls206')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls206_*.gz',
    table_ref,
    job_config,
)

In [13]:
# Create the sample table tls206_s; it shares the tls206 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls206_s',
    schema=Schema().tls206,
)

In [15]:
# Load the single file tls206_part01.txt from the GCS bucket into the sample table.
job_config.schema = Schema().tls206
table_ref = dataset_ref.table('tls206_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls206_part01.txt',
    table_ref,
    job_config,
)

Starting job lgs-a1667418-f504-487d-94f9-0f9eead2d590
Job took 8.971823930740356 seconds


## 1.6. Table "tls211": 
## Information on the patent offices of destination (publication authorities) of all INPADOC family members, excluding the PCT publication authority (WO)

In [5]:
# Create the main table tls211 with its schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls211',
    schema=Schema().tls211,
)

In [6]:
# Prepare the destination and schema for the tls211 main-table load;
# the load job itself is submitted in the next cell.
job_config.schema = Schema().tls211
table_ref = dataset_ref.table('tls211')

In [7]:
# Submit the tls211 load directly through the client rather than via
# load_gcs_file. No explicit job_id is passed; only the 'lgs-' prefix is set.
load_job = client.load_table_from_uri(
    source_uris='gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls211_*.gz',
    destination=table_ref,
    job_id_prefix='lgs-',
    job_config=job_config,
)

In [8]:
# Wait for the load job submitted in the previous cell to finish.
load_job.result()

<google.cloud.bigquery.job.LoadJob at 0x7f45f5fd0940>

In [None]:
# Create the sample table tls211_s; it shares the tls211 schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls211_s',
    schema=Schema().tls211,
)

In [4]:
# Create the sample table tls211_s; it shares the tls211 schema.
# NOTE(review): this cell repeats the previous one, and its recorded output
# reports that the table already exists -- one of the two cells can be dropped.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls211_s',
    schema=Schema().tls211,
)

THIS TABLE ALREADY EXISTS IN usptobias:patstat


In [None]:
# Load the single file tls211_part01.txt from the GCS bucket into the sample table.
job_config.schema = Schema().tls211
table_ref = dataset_ref.table('tls211_s')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls211_part01.txt',
    table_ref,
    job_config,
)

## 1.7. Table "tls212": Citations

In [None]:
# Create the main table tls212 with its schema.
create_table(
    client,
    dataset_id='patstat',
    table_id='tls212',
    schema=Schema().tls212,
)

In [None]:
# Load every file matching tls212_*.gz from the GCS bucket into the main table
# (the destination here is 'tls212', not a sample table).
job_config.schema = Schema().tls212
table_ref = dataset_ref.table('tls212')
load_gcs_file(
    client,
    'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/tls212_*.gz',
    table_ref,
    job_config,
)

## 1.8. Checking that the data was loaded correctly into the tables

In [None]:
# Check that the main table exists and display its metadata.
# A bare `bq show ...` line is not valid Python inside a code cell, so the
# lookup goes through the BigQuery client instead.
# Shell equivalent (run in a terminal, or prefix with "!" in a notebook cell):
#   bq show <project-id>:patstat.tls201
tls201_table = client.get_table(dataset_ref.table('tls201'))
print('{}: {} rows'.format(tls201_table.full_table_id, tls201_table.num_rows))
tls201_table.schema