# BigQuery Data

Set the GCP Project:

In [1]:
# GCP project used for the BigQuery client and for the dataset created below.
PROJECT_ID = "statmike-mlops"

Create a BigQuery client for the project:

In [2]:
from google.cloud import bigquery
# Shared BigQuery client bound to PROJECT_ID; later cells reuse it as `bq`.
bq = bigquery.Client(project=PROJECT_ID)

List BigQuery datasets in the project:

In [3]:
# Gather the id of every dataset the client can list in its project;
# `datum` is consulted later to decide whether the dataset must be created.
datum = [entry.dataset_id for entry in bq.list_datasets()]
print(datum)

[]


Set Parameters for creating datasets and loading data files to tables:

In [4]:
# Location assigned to the dataset when it is created.
LOCATION = 'us'

# Cloud Storage source: folder URI, file name, and the full object URI.
URI = 'gs://statmike-models/digits/data'
FILE = 'digits.csv'
FILE_URI = '/'.join((URI, FILE))

# Destination dataset and table for the load job.
DATASET_ID = 'digits'
TABLE_ID = 'digits_source'

Create the dataset if missing:

In [5]:
# Create the dataset only when the earlier listing (`datum`) did not find it.
if DATASET_ID not in datum:
    dataset = bigquery.Dataset(bigquery.dataset.DatasetReference(PROJECT_ID, DATASET_ID))
    dataset.location = LOCATION
    # Pass the configured Dataset object rather than the bare DATASET_ID:
    # passing only the id discarded the location assigned above.
    # exists_ok=True keeps the cell re-runnable when `datum` is stale
    # (e.g. the dataset was created after the listing cell ran).
    dataset = bq.create_dataset(dataset, exists_ok=True)

Load data to a table in the dataset:

In [6]:
# Reference the destination table. DatasetReference is constructed directly
# because Client.dataset() is deprecated in google-cloud-bigquery; using the
# client's own project keeps the reference identical to what bq.dataset() built.
dataset_ref = bigquery.DatasetReference(bq.project, DATASET_ID)
table_ref = dataset_ref.table(TABLE_ID)

# CSV load with schema autodetection; WRITE_TRUNCATE replaces any existing
# table contents, so re-running the cell does not append duplicate rows.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    autodetect=True,
    write_disposition="WRITE_TRUNCATE",
)

job = bq.load_table_from_uri(FILE_URI, table_ref, job_config=job_config)
print("Starting job {}".format(job.job_id))
job.result()  # wait for the load job to finish before inspecting the table

# Fetch the table metadata to report what was loaded.
bq_table = bq.get_table(table_ref)
print("Loaded {} rows and {} columns to {}.".format(bq_table.num_rows,len(bq_table.schema),bq_table))

Starting job 51c7edf3-b676-4efe-a4cd-d1bd8f6924ae
Loaded 1797 rows and 66 columns to Table(TableReference(DatasetReference('statmike-mlops', 'digits'), 'digits_source')).


Use the BigQuery magic to review a few records (this uses the BQ storage API):

In [7]:
%%bigquery
-- Preview five rows of the loaded source table (no ORDER BY is specified).
SELECT *
FROM `statmike-mlops.digits.digits_source`
LIMIT 5

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,...,p56,p57,p58,p59,p60,p61,p62,p63,target,target_OE
0,0.0,5.0,16.0,15.0,5.0,0.0,0.0,0.0,0.0,2.0,...,0.0,6.0,16.0,16.0,16.0,16.0,7.0,0.0,2,Even
1,0.0,5.0,16.0,12.0,1.0,0.0,0.0,0.0,0.0,5.0,...,0.0,8.0,16.0,16.0,16.0,16.0,4.0,0.0,2,Even
2,0.0,5.0,15.0,16.0,6.0,0.0,0.0,0.0,0.0,11.0,...,0.0,6.0,16.0,16.0,16.0,13.0,3.0,0.0,2,Even
3,0.0,4.0,15.0,15.0,8.0,0.0,0.0,0.0,0.0,8.0,...,0.0,7.0,14.0,11.0,0.0,0.0,0.0,0.0,2,Even
4,0.0,6.0,16.0,16.0,16.0,15.0,10.0,0.0,0.0,9.0,...,0.0,9.0,16.0,11.0,0.0,0.0,0.0,0.0,5,Odd


Create a prepped version of the data with test/train splits:

In [8]:
%%bigquery
-- Build the prepped table: every source column plus a SPLITS label
-- ('TRAIN' for hash buckets 0-7, 'TEST' for buckets 8-9).
-- The bucket comes from a fingerprint of the row's own content instead of
-- GENERATE_UUID(), so re-running this cell reproduces the same split
-- (rows with identical content always share a split).
-- ABS is applied after MOD so the INT64 minimum fingerprint cannot make ABS fail.
CREATE OR REPLACE TABLE `statmike-mlops.digits.digits_prepped` AS
SELECT src.*,
    CASE WHEN ABS(MOD(FARM_FINGERPRINT(TO_JSON_STRING(src)), 10)) < 8 THEN 'TRAIN' ELSE 'TEST' END AS SPLITS
FROM `statmike-mlops.digits.digits_source` AS src

In [9]:
%%bigquery
-- Row count per split label in the prepped table.
SELECT
    splits,
    COUNT(*) AS Count
FROM `statmike-mlops.digits.digits_prepped`
GROUP BY splits

Unnamed: 0,splits,Count
0,TRAIN,1458
1,TEST,339


Retrieve a subset of the data to a Pandas dataframe:

In [10]:
%%bigquery digits
-- Store the rows whose target is 2 in the `digits` variable named on the magic line.
SELECT *
FROM `statmike-mlops.digits.digits_prepped`
WHERE target = 2

In [11]:
# Show the first rows of `digits`, the result captured by the `%%bigquery digits` cell.
digits.head()

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,...,p57,p58,p59,p60,p61,p62,p63,target,target_OE,SPLITS
0,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2,Even,TRAIN
1,0.0,0.0,0.0,8.0,15.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,12.0,14.0,4.0,0.0,2,Even,TEST
2,0.0,0.0,0.0,0.0,9.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,15.0,9.0,0.0,2,Even,TEST
3,0.0,0.0,0.0,0.0,11.0,15.0,4.0,0.0,0.0,0.0,...,0.0,0.0,1.0,11.0,16.0,12.0,0.0,2,Even,TRAIN
4,0.0,0.0,0.0,5.0,14.0,12.0,2.0,0.0,0.0,0.0,...,0.0,0.0,6.0,12.0,13.0,3.0,0.0,2,Even,TRAIN


# Remove Resources
- delete table `statmike-mlops.digits.digits_prepped`
- delete table `statmike-mlops.digits.digits_source`
- delete dataset `statmike-mlops.digits`

In [6]:
# Tear down in order: both tables first, then the dataset that held them.
# not_found_ok=True lets the cell be re-run after a partial cleanup.
for table_path in (
    'statmike-mlops.digits.digits_prepped',
    'statmike-mlops.digits.digits_source',
):
    bq.delete_table(table_path, not_found_ok=True)

bq.delete_dataset('statmike-mlops.digits', not_found_ok=True)