# BigQuery

> Everything needed to get started with querying from and to BigQuery

In [1]:
#| default_exp bigquery

In [2]:
#| hide
from nbdev.showdoc import *

In [13]:
#| export 
import logging
import pandas as pd
from google.cloud import bigquery

In [14]:
#| export 
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

## Initialize Client with Authentication Scopes

:::{.callout-note}
If you are not using authentication scopes, then calling `create_client()` is equivalent to:

```python
from google.cloud import bigquery
client = bigquery.Client()
```
:::

In [27]:
#| exports
def create_client(
    # key_path: str = None,
    auth_scopes: list = None,  # eg: ["bigquery", "drive"]; `None` or empty -> default client
):
    """Create a BigQuery client with the given auth scopes.

    When `auth_scopes` is `None` or empty, a plain `bigquery.Client()` is
    returned. Otherwise each entry is expanded to
    `https://www.googleapis.com/auth/<entry>`, credentials and project are
    obtained from `google.auth.default` with those scopes, and the client
    is built from them.

    A single scope may also be passed as a bare string (eg: "drive").
    """
    if not auth_scopes:
        logger.debug("Using default BigQuery client")
        return bigquery.Client()

    # A bare string would otherwise be iterated character by character,
    # producing one bogus scope URL per letter.
    if isinstance(auth_scopes, str):
        auth_scopes = [auth_scopes]

    # to pull files from drive or other services, we need to authenticate
    import google.auth

    scope_urls = [
        f"https://www.googleapis.com/auth/{service}" for service in auth_scopes
    ]
    credentials, project = google.auth.default(scope_urls)
    logger.debug("Using BigQuery client with auth scopes: %s", auth_scopes)
    return bigquery.Client(
        credentials=credentials,
        project=project,
    )

`create_client()` is a simple wrapper around `bigquery.Client()` that uses the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to authenticate. This is the recommended way to authenticate with BigQuery.

To set up `GOOGLE_APPLICATION_CREDENTIALS`:
```sh
export GOOGLE_APPLICATION_CREDENTIALS="/path/to/key.json"
```
:::{.callout-note}
Replace `/path/to/key.json` with the path to your key file.
:::
---

## Slot Memory Usage

In [25]:
def memory_usage(
    query:str, # Query
    client: bigquery.Client = None # BigQuery's Client Object
) -> str : # GB(s) required to process this query
    """Estimate how much data a query would process, without running it.

    The query is submitted as a dry run with the query cache disabled, and
    the job's `total_bytes_processed` is converted to GiB (bytes / 2**30),
    rounded to two decimals and returned with a "GB" suffix. This is the
    same number the BigQuery console shows as
    "This query will process _ when run."

    If no `client` is supplied, one is created with `create_client()` and
    used as a context manager for the duration of the dry run.
    """
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)

    if not client:
        # No client supplied: use a temporary one scoped to this call.
        with create_client() as temp_client:
            dry_run_job = temp_client.query(query, job_config=dry_run_config)
    else:
        dry_run_job = client.query(query, job_config=dry_run_config)

    gigabytes = dry_run_job.total_bytes_processed / 2**30
    return f"{round(gigabytes, 2)}GB"

In [26]:
# tests
# Dry-run estimate for a trivial query; no client is passed, so one is created internally.
memory_usage("SELECT 1;")

'0.0GB'

---

## Query to Dataframe

In [10]:
#| exports
def query_to_dataframe( sql: str, bq_client: bigquery.Client=None) -> pd.DataFrame:
    """
    Query -> DataFrame

    Run `sql` on BigQuery and return the result as a pandas DataFrame.

    If `bq_client` is given it is used as-is and left open for the caller.
    Otherwise a temporary `bigquery.Client()` is created and used as a
    context manager for this one query.
    """
    # Log through the module-level logger rather than the root `logging`
    # module, so records carry this module's name and importing code keeps
    # control over logging configuration.
    logger.info("Querying BigQuery: %s", sql)

    if bq_client is not None:
        dataframe = bq_client.query(sql).to_dataframe()
    else:
        logger.debug("Creating new BigQuery client")
        with bigquery.Client() as temp_client:
            dataframe = temp_client.query(sql).to_dataframe()

    logger.info("dataframe.shape: %s", dataframe.shape)

    return dataframe

In [12]:
# tests
# First call passes no client, so the function creates its own.
display(query_to_dataframe("SELECT 1;"))
client = bigquery.Client()
# Second call reuses the explicitly created client.
display(query_to_dataframe("SELECT 1;", client))

Unnamed: 0,f0_
0,1


Unnamed: 0,f0_
0,1


---

In [7]:
#| hide
import nbdev; nbdev.nbdev_export()