# Exploring BigQuery

- BigQuery metadata, i.e. information about a table
- BigQuery query limits with QueryJobConfig()


## Setup

In [None]:
from google.cloud import bigquery
from google.colab import auth
import pandas as pd

# Sign in the current Colab user; presumably this supplies the credentials
# that the BigQuery client created below relies on (confirm for non-Colab runs).
auth.authenticate_user()

In [None]:
# Google Cloud project passed to the client as its project
billing_project_id = "cool-monolith-286222"

# Single BigQuery client reused by every request in this notebook
client = bigquery.Client(project=billing_project_id)

## Table Metadata Exploration

### List the tables

In [16]:
# Construct a reference to the "Global Biodiversity Information Facility" dataset.
# DatasetReference is used instead of client.dataset(), which is deprecated.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "gbif")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Get all the tables in the dataset
tables = list(client.list_tables(dataset))

# Print names of all tables in the dataset
for table in tables:
  print(table.table_id)

occurrences


In [17]:
# Name of the table printed by the listing cell above
table_id = 'occurrences'
table_id

'occurrences'

### Look at the table schema

In [18]:
# Construct a reference to the "occurrences" table, reusing the table_id
# chosen in the previous cell instead of repeating the literal name
table_ref = dataset.table(table_id)

# API request - fetch the table
table = client.get_table(table_ref)

# See the table's schema - name, field type, mode, description
table.schema

[SchemaField('gbifid', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('datasetkey', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('occurrenceid', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('kingdom', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('phylum', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('class', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('order', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('family', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('genus', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('species', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('infraspecificepithet', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('taxonrank', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('scientificname', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('verbatimscientificname', 'STRING', 'NULLABLE', None, None, (), None)

In [19]:
# Turn each SchemaField into its API dict form, then load the dicts into a
# data frame with one row per schema field
schema_records = [schema_field.to_api_repr() for schema_field in table.schema]
fields = pd.DataFrame(schema_records)
fields.head()


Unnamed: 0,name,type,mode,fields
0,gbifid,STRING,NULLABLE,
1,datasetkey,STRING,NULLABLE,
2,occurrenceid,STRING,NULLABLE,
3,kingdom,STRING,NULLABLE,
4,phylum,STRING,NULLABLE,


In [20]:
# (rows, columns) of the schema data frame - one row per schema field
fields.shape

(50, 4)

In [21]:
# Fetch only the first five rows of the table, then show them transposed
# (one column per row) so the wide table is readable
preview_rows = client.list_rows(table, max_results=5)
preview_rows.to_dataframe().T


Unnamed: 0,0,1,2,3,4
gbifid,3599137728,3172947391,3523468642,3169999285,3693280574
datasetkey,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,4fa7b334-ce0d-4e88-aaae-2e0c138d049e,4fa7b334-ce0d-4e88-aaae-2e0c138d049e
occurrenceid,URN:catalog:CLO:EBIRD:OBS1173704723,URN:catalog:CLO:EBIRD:OBS937842969,URN:catalog:CLO:EBIRD_CAN:OBS1173830660,URN:catalog:CLO:EBIRD:OBS937959988,URN:catalog:CLO:EBIRD:OBS1173990708
kingdom,Animalia,Animalia,Animalia,Animalia,Animalia
phylum,Chordata,Chordata,Chordata,Chordata,Chordata
class,Aves,Aves,Aves,Aves,Aves
order,Passeriformes,Passeriformes,Passeriformes,Passeriformes,Passeriformes
family,Corvidae,Tyrannidae,Icteridae,Mimidae,Hirundinidae
genus,Corvus,Myiarchus,Icterus,Dumetella,Stelgidopteryx
species,Corvus brachyrhynchos,Myiarchus crinitus,Icterus galbula,Dumetella carolinensis,Stelgidopteryx serripennis


##  Add safe config settings

BigQuery's free tier lets you query up to 1 TB of data per month. You can reach this limit quickly if you are not careful, because a query is sized by the data it scans, not by the rows it returns. Luckily, there are ways to assess and limit the amount of data you are querying.

Set constants for sizes

In [22]:
# Decimal (SI) byte sizes used for the query limits below
ONE_MB = 10**6
ONE_GB = 10**3 * ONE_MB

### Sample Queries - Dry Run

You can use a 'dry run' to estimate the size of a query before running it.

In [23]:
project_id = "bigquery-public-data"
dataset_id = "gbif"
table_id = "occurrences"

# Every generated line carries the same eight-space indent
_INDENT = " " * 8

# The two filters that the sample queries share
_US_ONLY = "countrycode ='US'"
_US_MAGNOLIOPSIDA = 'class = "Magnoliopsida" AND ' + _US_ONLY


def _build_query(select, where=None, limit=None):
  """Return one indented SQL string against the occurrences table.

  select: text placed after SELECT.
  where:  optional text placed after WHERE (clause omitted when None).
  limit:  optional row limit (clause omitted when None).
  """
  clauses = [
      f"SELECT {select}",
      f"FROM {project_id}.{dataset_id}.{table_id}",
  ]
  if where is not None:
    clauses.append(f"WHERE {where}")
  if limit is not None:
    clauses.append(f"LIMIT {limit}")
  # Leading newline, one indented line per clause, then a trailing indent
  return "\n" + "".join(f"{_INDENT}{clause}\n" for clause in clauses) + _INDENT


# (select, where, limit) for each sample query, in the order they are sized
query_specs = [
    ("countrycode", _US_MAGNOLIOPSIDA, 10),
    ("countrycode", _US_MAGNOLIOPSIDA, None),
    ("countrycode", _US_ONLY, None),
    ("year", "year = 2012", None),
    ("countrycode, class", _US_MAGNOLIOPSIDA, None),
    ("countrycode, class", _US_ONLY, None),
    ("countrycode, class", None, None),
    ("countrycode", None, None),
    ("count(countrycode)", None, None),
    ("count(1)", None, None),
    ("count(countrycode)", _US_ONLY, None),
    ("count(1)", _US_ONLY, None),
]

queries = [_build_query(*spec) for spec in query_specs]

len(queries)


12

In [24]:
# The dry-run settings are identical for every query, so build the config once.
# use_query_cache=False makes the estimate reflect a real scan rather than a
# cached result, which would be reported as 0 bytes.
dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)

for query in queries:
  # With dry_run=True the job is not executed; only its statistics are populated
  dry_run_query_job = client.query(query, job_config=dry_run_config)
  size = dry_run_query_job.total_bytes_processed
  print(query)
  print(f"{size:_}")
  print()


        SELECT countrycode
        FROM bigquery-public-data.gbif.occurrences
        WHERE class = "Magnoliopsida" AND countrycode ='US'
        LIMIT 10
        
35_002_295_626


        SELECT countrycode
        FROM bigquery-public-data.gbif.occurrences
        WHERE class = "Magnoliopsida" AND countrycode ='US'
        
35_002_295_626


        SELECT countrycode
        FROM bigquery-public-data.gbif.occurrences
        WHERE countrycode ='US'
        
11_685_171_796


        SELECT year
        FROM bigquery-public-data.gbif.occurrences
        WHERE year = 2012
        
22_694_457_936


        SELECT countrycode, class
        FROM bigquery-public-data.gbif.occurrences
        WHERE class = "Magnoliopsida" AND countrycode ='US'
        
35_002_295_626


        SELECT countrycode, class
        FROM bigquery-public-data.gbif.occurrences
        WHERE countrycode ='US'
        
35_002_295_626


        SELECT countrycode, class
        FROM bigquery-public-data.gbif.occurren

### Sample Query 1 - Safe Config

You can also specify a limit for how much data you want to scan.

In [25]:
# Build the four count queries: each aggregate is run once over the whole
# table and once restricted to US records, in that order.
queries = [
    f"""
        SELECT {aggregate} as `total`
        FROM {project_id}.{dataset_id}.{table_id}
        {where_tail}"""
    for aggregate in ("count(1)", "count(countrycode)")
    for where_tail in ("", "WHERE countrycode ='US'\n        ")
]

len(queries)

4

In [32]:
# safe_config needs to be included with every client.query() request
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_GB)

# Use a try...except block to catch when the safe_config parameter prevents a query
for query in queries:
  print(query)
  try:
    df = client.query(query, job_config=safe_config).to_dataframe()
    print(df.head())
  except Exception as err:
    # Catch Exception rather than using a bare except so that interrupting the
    # kernel still stops the loop, and print the error itself so a query
    # blocked by the byte limit can be told apart from any other failure.
    print(f"Query not run (check whether safe_config blocked it): {err}")



        SELECT count(1) as `total`
        FROM bigquery-public-data.gbif.occurrences
        
        total
0  2961438667

        SELECT count(1) as `total`
        FROM bigquery-public-data.gbif.occurrences
        WHERE countrycode ='US'
        
        total
0  1088444124

        SELECT count(countrycode) as `total`
        FROM bigquery-public-data.gbif.occurrences
        
        total
0  2921292949

        SELECT count(countrycode) as `total`
        FROM bigquery-public-data.gbif.occurrences
        WHERE countrycode ='US'
        
        total
0  1088444124


In [33]:
# Show the limit with underscore digit grouping for readability
format(ONE_GB, "_")

'1_000_000_000'

In [34]:
# NOTE(review): the source of 2_669_694_827 is not shown in this notebook -
# presumably a byte count for one of the queries above; confirm and name it.
2_669_694_827 > ONE_GB

True