#### Accessing BigQuery from Python

* Options
    1. Google BigQuery client library
    2. pandas-gbq
    3. sqlalchemy and pybigquery
    
The Google client library is the most feature-rich option, but it is also more verbose.

* It has faster table writes, since it serializes the data to Parquet, whereas the other libraries serialize to CSV.
* It can write directly to Cloud Storage.

In [13]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage_v1beta1

In [9]:
import time

In [10]:
## Install pybigquery with pip first
import sqlalchemy as sq

In [11]:
import pandas as pd

In [12]:
import os

# Tell the Google client libraries which credentials file and project to use.
# The credentials path is resolved relative to the current user's home
# directory instead of hard-coding one user's absolute path, so the notebook
# also runs under other accounts.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.expanduser(
    '~/.config/gcloud/application_default_credentials.json'
)
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-nigam-starr'

In [46]:
# Benchmark query: every column of the first 10000 rows of the person table.
# Built from adjacent literals; the resulting text (including the leading and
# trailing newlines) is what gets sent to BigQuery.
query = (
    "\n"
    "SELECT * \n"
    "FROM starr_omop_cdm5_deid_20200404.person\n"
    "LIMIT 10000\n"
)

In [47]:
# Testing sqlalchemy
# The URL names only the dialect; no project or dataset is given here, so
# those are presumably picked up from the environment -- TODO confirm.
engine = sq.create_engine('bigquery://')



In [48]:
%%time
df = pd.read_sql(query, engine, chunksize=1000)

CPU times: user 33.9 ms, sys: 6.4 ms, total: 40.3 ms
Wall time: 1.32 s


In [49]:
%%time
t0 = time.time()
for temp in df:
    print(time.time() - t0)
    print(temp.shape)
    t0 = time.time()

0.9370927810668945
(1000, 21)
0.053906917572021484
(1000, 21)
0.05286836624145508
(1000, 21)
0.05265927314758301
(1000, 21)
0.05297541618347168
(1000, 21)
1.0779664516448975
(1000, 21)
0.053075551986694336
(1000, 21)
0.05483222007751465
(1000, 21)
0.052811622619628906
(1000, 21)
0.05293583869934082
(1000, 21)
CPU times: user 672 ms, sys: 10.2 ms, total: 682 ms
Wall time: 2.45 s


In [7]:
# NOTE(review): the recorded output below has an older execution count
# (In [7]) and shows measurement-table columns, so it predates the `person`
# query used above. In top-to-bottom order `df` is the chunk iterator created
# with chunksize=1000 and already consumed by the timing loop, so this call
# presumably fails on a fresh kernel -- confirm and re-run.
df.head()

Unnamed: 0,measurement_id,person_id,measurement_concept_id,measurement_DATE,measurement_DATETIME,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,...,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,value_source_value,trace_id,unit_id,load_table_id
0,932321812,30172122,3016879,2019-05-28,2019-05-28 13:05:00,,44818702,0,,0,...,995861.0,145864253.0,,3397-7,3016879,mAbs,Negative,,measurement.meas.rule.2.shc_order_results,shc_order_results
1,156771269,32541687,3010156,2006-08-07,2006-08-07 11:58:00,,44818702,0,2.6,0,...,982549.0,35862132.0,,30522-7,3010156,mg/L,2.6,,measurement.meas.rule.2.shc_order_results,shc_order_results
2,598921449,31895156,3001526,2016-02-28,2016-02-28 19:14:00,,44818702,0,,0,...,1030703.0,90734259.0,,3299-5,3001526,ug/mL,<2.0,,measurement.meas.rule.2.shc_order_results,shc_order_results
3,454653557,31179254,3011960,2014-05-16,2014-05-16 09:24:00,,44818702,0,5.4,0,...,944841.0,68976691.0,,30934-4,3011960,pg/mL,5.4,,measurement.meas.rule.2.shc_order_results,shc_order_results
4,978020308,32254888,40758990,2019-10-26,2019-10-26 14:13:00,,44818702,0,28360.0,0,...,1024562.0,154978545.0,,55869-2,40758990,mIU/mL,28360,,measurement.meas.rule.2.shc_order_results,shc_order_results


In [8]:
%%time
## Testing pandas-gbq
df= pd.read_gbq(query, dialect='standard')

  progress_bar_type=progress_bar_type,


CPU times: user 9.25 s, sys: 791 ms, total: 10 s
Wall time: 23.5 s


In [18]:
%%time
## Testing pandas-gbq with storage API
df = pd.read_gbq(query, dialect='standard', use_bqstorage_api=True)

TypeError: read_gbq() got an unexpected keyword argument 'chunksize'

In [17]:
## Testing the client library
# References:
#   https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
#   https://cloud.google.com/bigquery/docs/paging-results

# google.auth.default() returns the credentials together with a project id;
# the cloud-platform scope is requested explicitly.
credentials, your_project_id = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])

# Both clients share the same credentials; only the BigQuery client is given
# the project id.
bqclient = bigquery.Client(project=your_project_id, credentials=credentials)
bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient(credentials=credentials)


In [114]:
# Two-column query over the full person table (no LIMIT), used by the paging
# tests in the following cells. Rebinds `query`.
query = (
    "\n"
    "    SELECT person_id, gender_concept_id\n"
    "    FROM starr_omop_cdm5_deid_20200404.person\n"
)

In [102]:
%%time

result = (
    bqclient
    .query(query)
    .result(page_size=100000)
    .to_dataframe_iterable()
)

result_dict = {}
for i, temp in enumerate(result):
    result_dict[i] = temp
    print(temp.shape)

(100000, 2)
(100000, 2)
(100000, 2)
(100000, 2)
(100000, 2)
(100000, 2)
(100000, 2)
(100000, 2)
(100000, 2)
(100000, 2)
CPU times: user 7.08 s, sys: 886 ms, total: 7.97 s
Wall time: 27.4 s


In [118]:
%%time
result = (
    bqclient
    .query(query)
    .result(page_size=1024)
    .to_dataframe_iterable(bqstorage_client=bqstorageclient)
)

combine_every = 100
result_dict = {}
result_list = []
for i, temp in enumerate(result):
    if i == 0:
        print('Num rows: {}'.format(temp.shape[0]))
    result_dict[i] = temp
    if i % combine_every == 0:
        result_list.append(pd.concat(result_dict))
        result_dict = {}
if len(list(result_dict.keys())) > 0:
    result_list.append(pd.concat(result_dict))
result_df = pd.concat(result_list)
print('Num iterations: {}'.format(i))

Num rows: 1024
Num iterations: 2850
CPU times: user 8.14 s, sys: 578 ms, total: 8.72 s
Wall time: 8.39 s


In [122]:
# Preview the combined frame. The two unnamed leading index levels in the
# recorded output are presumably (page number, row within page), because the
# pages were concatenated from a dict keyed by the page counter -- confirm
# against pandas.concat's handling of dict input.
result_df.head()

Unnamed: 0,Unnamed: 1,person_id,gender_concept_id
0,0,30360313,0
0,1,30359483,8507
0,2,30762518,8532
0,3,30629316,0
0,4,32507595,8507


In [119]:
# (rows, columns) of the frame assembled from all pages.
result_df.shape

(2919074, 2)

In [116]:
# Sanity check: page size (1024) times the 2850 shown as 'Num iterations' in
# the recorded output. The product (2918400) is smaller than the recorded
# result_df row count (2919074), which is consistent with 2850 being the last
# zero-based index from enumerate(), i.e. one further, partially filled page
# was also read.
1024 * 2850

2918400

In [104]:
# Stack whatever pages `result_dict` currently holds into one frame.
# NOTE(review): the execution counts (In [104] here, In [102] for the
# page_size=100000 cell) suggest this was run against that cell's
# result_dict; in top-to-bottom order it would instead see what the batching
# loop left behind -- TODO confirm.
temp = pd.concat(result_dict)

In [111]:
# Row-count estimate: number of stored pages times the row count of page 0
# (exact only if every page is as large as the first one).
len(result_dict) * result_dict[0].shape[0]

1000448

In [106]:
# (rows, columns) of the frame built by concatenating result_dict.
temp.shape

(1000000, 2)

In [105]:
# Show the page numbers currently stored in result_dict.
# NOTE(review): this prints every key; len(result_dict) would answer the same
# question without the very long output.
result_dict.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,