## Importing

In [2]:
from google.cloud import bigquery

from open_patstat.utils.gcp import create_table, load_gcs_file, delete_table
from open_patstat.utils.schema import Schema

## Initializing the Client and Job Config

In [4]:
# Creates the BigQuery client used by the cells below.
# Before running this cell, make sure the environment variable
# "GOOGLE_APPLICATION_CREDENTIALS" is defined and points to the JSON file
# containing the authentication key.
client = bigquery.Client()

In [3]:
# Reference to the 'patstat' dataset; table references are derived from it later.
dataset_ref = client.dataset('patstat')

# Load-job configuration shared by the table loads that follow.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV  # source files are CSV
job_config.max_bad_records = 10    # allow up to 10 bad records per load job
job_config.skip_leading_rows = 1   # skip the first row (presumably a header - confirm)

## Creating and Adding Tables

In [None]:
########## Creating and Loading tables ##########
#################################################

# Tables to create in the 'patstat' dataset and then fill from the bucket.
tables_list = ['tls201', 'tls209', 'tls204', 'tls207', 'tls206', 'tls211', 'tls212']

# Google Cloud Storage prefix under which all data files live.
gs_add = 'gs://patstat_2018g/data_PATSTAT_Global_2018_Autumn/'

# A single Schema instance serves every table: each table's schema is exposed
# as an attribute whose name equals the table id.
patstat_schemas = Schema()

for table in tables_list:
    # Resolve the schema once so that table creation and the load job are
    # guaranteed to use the very same schema object.
    table_schema = getattr(patstat_schemas, table)

    # Create the table.
    create_table(client,
                 dataset_id='patstat',
                 table_id=table,
                 schema=table_schema)

    # `job_config` is shared across iterations; only its schema changes per table.
    job_config.schema = table_schema
    table_ref = dataset_ref.table(table)

    # Load every bucket file matching '<table>_*.gz' into the table.
    load_gcs_file(client, gs_add + table + '_*.gz',
                  table_ref, job_config)

In [29]:
# Ad-hoc inspection: list the attribute names available on the load-job config.
dir(job_config)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_del_sub_prop',
 '_fill_from_default',
 '_get_sub_prop',
 '_job_type',
 '_properties',
 '_set_sub_prop',
 'allow_jagged_rows',
 'allow_quoted_newlines',
 'autodetect',
 'clustering_fields',
 'create_disposition',
 'destination_encryption_configuration',
 'destination_table_description',
 'destination_table_friendly_name',
 'encoding',
 'fieldDelimiter',
 'field_delimiter',
 'from_api_repr',
 'ignore_unknown_values',
 'labels',
 'max_bad_records',
 'null_marker',
 'quote_character',
 'schema',
 'schema_update_options',
 'skip_leading_rows',
 'source_format',
 'time_partitioning',
 'to_api_repr',
 'use_avro_logical_types',

In [36]:
# NOTE: this cell rebinds `client`, `job_config` and `dataset_ref`; from here on
# they target the 'results_docdb' dataset with a semicolon-delimited CSV layout.
client = bigquery.Client()

# Load-job configuration for the semicolon-separated, ISO-8859-1 encoded CSV.
job_config = bigquery.LoadJobConfig()
job_config.skip_leading_rows = 1
job_config.max_bad_records = 10
job_config.source_format = bigquery.SourceFormat.CSV
# `field_delimiter` is the LoadJobConfig property for the column separator.
# The camelCase spelling `fieldDelimiter` is not a property of the class:
# assigning it only created an unused instance attribute on older library
# releases and raises AttributeError on newer ones, so it is not set here.
job_config.field_delimiter = ';'
job_config.encoding = 'ISO-8859-1'

dataset_ref = client.dataset('results_docdb')

In [37]:
# Short aliases for the BigQuery type and mode names used in the schema below.
INT = 'INTEGER'
STR = 'STRING'
NULL = 'NULLABLE'
REQ = 'REQUIRED'

# (column name, type, mode) for every column, in schema order.
_schema_columns = [
    ('docdb_family_id', INT, REQ),
    ('appln_id_1', INT, REQ),
    ('appln_kind_1', STR, NULL),
    ('appln_auth_1', STR, NULL),
    ('is_1_pf', INT, NULL),
    ('nb_of_pf_claimed_by1', INT, NULL),
    ('family_size_in_us', INT, NULL),
    ('appln_id_2', INT, NULL),
    ('appln_kind_2', STR, NULL),
    ('appln_auth_2', STR, NULL),
    ('is_2_pf', INT, NULL),
    ('nb_of_pf_claimed_by2', INT, NULL),
    ('family_size_in_auth2', INT, NULL),
    ('exact_twins', STR, NULL),
]

# Everything after `field_type` is passed by keyword: the positional order of
# SchemaField's optional parameters is not stable across library releases, so
# positional `None, ()` could bind to the wrong parameters.
schema = [
    bigquery.SchemaField(name, field_type, mode=mode, description=None, fields=())
    for name, field_type, mode in _schema_columns
]

In [38]:
########## Creating and Loading tables ##########
#################################################

# Names of the tables to create and load.
tables_list = ['familyInformation']

# Bucket location holding the data files.
gs_add = 'gs://family_information/'

# The same schema applies to every table in the list, so set it on the
# shared load configuration once, up front.
job_config.schema = schema

for table in tables_list:
    # Step 1: create the destination table.
    create_table(
        client,
        dataset_id='results_docdb',
        table_id=table,
        schema=schema,
    )

    # Step 2: load '<table>.csv.gz' from the bucket into that table.
    table_ref = dataset_ref.table(table)
    source_uri = '{}{}.csv.gz'.format(gs_add, table)
    load_job = client.load_table_from_uri(
        source_uris=source_uri,
        destination=table_ref,
        job_id_prefix='lgs-',
        job_config=job_config,
    )

    # Block until the load job has finished.
    load_job.result()

In [None]:
# Display the `errors` attribute of the load job bound by the most recent loading loop.
load_job.errors

In [17]:
# IPython help lookup (not plain Python syntax): shows the documentation of `source_format`.
? job_config.source_format