<a href="https://colab.research.google.com/github/isb-cgc/Community-Notebooks/blob/John-staging/MachineLearning/TensorFlow_BQ_Import.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to train a TensorFlow model and import it into BigQuery ML
Check out other notebooks at our [Community Notebooks Repository](https://github.com/isb-cgc/Community-Notebooks)!

- **Title:** How to build an RNA-seq logistic regression classifier with BigQuery ML
- **Author:** John Phan
- **Created:** 2021-09-16
- **Purpose:** Demonstrate use of TensorFlow to train a deep neural network, then import it into BigQuery ML.
- **URL:** https://github.com/isb-cgc/Community-Notebooks/blob/master/MachineLearning/TensorFlow_BQ_Import.ipynb
- **Note:** This example uses information from the work published by [Bosquet et al.](https://molecular-cancer.biomedcentral.com/articles/10.1186/s12943-016-0548-9)

## Import Dependencies



In [5]:
# Load dependencies
from google.cloud import bigquery
from google.colab import auth

import pandas as pd
from sklearn.preprocessing import StandardScaler


## Authenticate

Before using BigQuery, we need to get authorization for access to BigQuery and the Google Cloud. For more information see ['Quick Start Guide to ISB-CGC'](https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/HowToGetStartedonISB-CGC.html). Alternative authentication methods can be found [here](https://googleapis.dev/python/google-api-core/latest/auth.html)

In [6]:
# if you're using Google Colab, authenticate to gcloud with the following
auth.authenticate_user()

# alternatively, use the gcloud SDK
#!gcloud auth application-default login

# Parameters

In [12]:
# set the google project that will be billed for this notebook's computations
google_project = 'cgc-05-0051' ## CHANGE ME

# bq project for storing ML model
bq_project = 'isb-project-zero' ## CHANGE ME

# bq dataset for storing ML model
bq_dataset = 'jhp_scratch' ## CHANGE ME

# name of temporary table for data
bq_subset_table = 'subset_ov_therapy'

# name of ML model
bq_ml_model = 'tcga_ov_therapy_ml_dnn_import'

# in this example, we'll be using the Ovarian cancer TCGA dataset
cancer_type = 'TCGA-OV'

# genes used for prediction model, taken from Bosquet et al.
gene_list = ['RHOT1','MYO7A','ZBTB10','MATK','ST18','RPS23','GCNT1','DROSHA','NUAK1','CCPG1',\
'PDGFD','KLRAP1','MTAP','RNF13','THBS1','MLX','FAP','TIMP3','PRSS1','SLC7A11',\
'OLFML3','RPS20','MCM5','POLE','STEAP4','LRRC8D','WBP1L','ENTPD5','SYNE1','DPT',\
'COPZ2','TRIO','PDPR']

# clinical data table
clinical_table = 'isb-cgc-bq.TCGA_versioned.clinical_gdc_2019_06'

# RNA seq data table
rnaseq_table = 'isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current'

## BigQuery Client

In [13]:
# Create a client to access the data within BigQuery
client = bigquery.Client(google_project)

## Create a Table with a Subset of the Gene Expression Data

In [18]:
# Construct a query to create a subset table
subset_table_query = """
  BEGIN
  CREATE OR REPLACE TABLE `{bq_project}.{bq_dataset}.{bq_subset_table}` AS
  SELECT * FROM (
    SELECT
      labels.case_barcode as sample,
      labels.data_partition as data_partition,
      labels.response_label AS label,
      ge.gene_name AS gene_name,
      -- Multiple samples may exist per case, take the max value
      MAX(LOG(ge.HTSeq__FPKM_UQ+1)) AS gene_expression
    FROM `{rnaseq_table}` AS ge

    INNER JOIN (
      SELECT
        *
      FROM (
        SELECT
          case_barcode,
          primary_therapy_outcome_success,
          CASE
            -- Complete Reponse    --> label as 1
            -- All other responses --> label as 0
            WHEN primary_therapy_outcome_success = 'Complete Remission/Response' THEN 1
            WHEN (primary_therapy_outcome_success IN (
              'Partial Remission/Response','Progressive Disease','Stable Disease'
            )) THEN 0
          END AS response_label,
          CASE 
            WHEN MOD(ABS(FARM_FINGERPRINT(case_barcode)), 10) < 5 THEN 'training'
            WHEN MOD(ABS(FARM_FINGERPRINT(case_barcode)), 10) >= 8 THEN 'testing'
            ELSE 'validation'
          END AS data_partition
          FROM `{clinical_table}`
          WHERE
            project_short_name = '{cancer_type}'
            AND primary_therapy_outcome_success IS NOT NULL
      )
    ) labels
    ON labels.case_barcode = ge.case_barcode
    WHERE gene_name IN ({genes})
    GROUP BY sample, label, data_partition, gene_name
  )
  PIVOT (
    MAX(gene_expression) FOR gene_name IN ({genes})
  );
  END;
""".format(
  bq_project=bq_project,
  bq_dataset=bq_dataset,
  bq_subset_table=bq_subset_table,
  rnaseq_table=rnaseq_table,
  clinical_table=clinical_table,
  cancer_type=cancer_type,
  genes="".join(["'", "','".join(gene_list), "'"])
)

# Display the query
print(subset_table_query)


  BEGIN
  CREATE OR REPLACE TABLE `isb-project-zero.jhp_scratch.subset_ov_therapy` AS
  SELECT * FROM (
    SELECT
      labels.case_barcode as sample,
      labels.data_partition as data_partition,
      labels.response_label AS label,
      ge.gene_name AS gene_name,
      -- Multiple samples may exist per case, take the max value
      MAX(LOG(ge.HTSeq__FPKM_UQ+1)) AS gene_expression
    FROM `isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current` AS ge

    INNER JOIN (
      SELECT
        *
      FROM (
        SELECT
          case_barcode,
          primary_therapy_outcome_success,
          CASE
            -- Complete Reponse    --> label as 1
            -- All other responses --> label as 0
            WHEN primary_therapy_outcome_success = 'Complete Remission/Response' THEN 1
            WHEN (primary_therapy_outcome_success IN (
              'Partial Remission/Response','Progressive Disease','Stable Disease'
            )) THEN 0
          END AS response_label,
          CASE 
     

In [19]:
# Execute the query
subset_query_result = client.query(subset_table_query).result()

## Preview Data

In [20]:
subset_table_data = client.query(("""
  SELECT
    * --usually not recommended to use *, but in this case, we want to see all of the 33 genes
  FROM `{bq_project}.{bq_dataset}.{bq_subset_table}`
""").format(
    bq_project=bq_project,
    bq_dataset=bq_dataset,
    bq_subset_table=bq_subset_table
)).result().to_dataframe()

print(subset_table_data.info())
subset_table_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 36 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sample          264 non-null    object 
 1   data_partition  264 non-null    object 
 2   label           264 non-null    int64  
 3   RHOT1           264 non-null    float64
 4   MYO7A           264 non-null    float64
 5   ZBTB10          264 non-null    float64
 6   MATK            264 non-null    float64
 7   ST18            264 non-null    float64
 8   RPS23           264 non-null    float64
 9   GCNT1           264 non-null    float64
 10  DROSHA          264 non-null    float64
 11  NUAK1           264 non-null    float64
 12  CCPG1           264 non-null    float64
 13  PDGFD           264 non-null    float64
 14  KLRAP1          264 non-null    float64
 15  MTAP            264 non-null    float64
 16  RNF13           264 non-null    float64
 17  THBS1           264 non-null    flo

Unnamed: 0,sample,data_partition,label,RHOT1,MYO7A,ZBTB10,MATK,ST18,RPS23,GCNT1,DROSHA,NUAK1,CCPG1,PDGFD,KLRAP1,MTAP,RNF13,THBS1,MLX,FAP,TIMP3,PRSS1,SLC7A11,OLFML3,RPS20,MCM5,POLE,STEAP4,LRRC8D,WBP1L,ENTPD5,SYNE1,DPT,COPZ2,TRIO,PDPR
0,TCGA-24-1558,testing,0,11.961503,9.587616,11.860877,8.664065,5.760303,13.301421,10.311867,12.499989,11.352255,10.512449,9.722775,9.317007,10.895527,12.564453,11.925985,12.363178,8.851584,10.974945,11.417588,10.320678,12.726925,15.332607,12.614316,11.334822,9.792002,12.360472,13.140625,11.317403,7.794225,6.000773,10.454949,11.232880,11.484763
1,TCGA-09-0366,testing,0,11.471221,13.083844,11.032946,9.368575,6.514522,14.431774,10.429706,12.246309,11.030046,10.338611,9.908280,10.607853,10.528634,12.408062,11.317887,12.697506,7.927159,9.003331,11.224832,8.847873,13.417340,15.323085,12.559651,11.113749,10.571279,12.652026,12.450844,10.751433,9.187535,7.541934,10.844163,11.636717,11.180792
2,TCGA-09-2048,testing,0,12.149185,10.583810,10.907843,8.157137,4.971881,15.796032,10.267848,11.855467,11.581988,10.684032,11.873592,11.039716,10.076076,12.810521,11.820169,13.153011,8.661241,10.477173,10.627153,10.025140,14.335939,17.019480,12.056855,10.839572,7.646045,12.399880,12.509908,9.737153,8.176625,10.498734,11.550909,10.704064,10.912658
3,TCGA-WR-A838,testing,0,11.869370,9.663104,12.447222,8.401963,4.563516,14.995789,11.376140,12.324806,10.170812,9.973672,10.446829,10.476043,11.283932,11.983236,12.027341,12.043835,7.625511,9.077782,10.972978,10.919711,10.951341,15.627070,11.865495,11.466443,6.941096,12.844375,12.342970,11.301430,8.773457,7.333808,9.243414,11.152946,11.485608
4,TCGA-25-1870,testing,0,11.852857,9.705547,12.112409,8.796984,5.959471,15.050327,10.092732,12.614002,10.906440,10.534284,10.410359,11.059859,10.792500,12.441040,12.428180,12.412475,8.883372,9.650237,11.493565,10.407203,12.442503,16.205360,11.737656,11.057688,7.943384,12.645147,12.940854,10.715806,8.872785,8.474850,10.818460,11.675147,11.915795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,TCGA-OY-A56Q,validation,1,11.659994,9.760316,13.326234,8.112855,5.167509,14.633357,10.658599,12.300204,10.451264,10.291106,9.828109,10.488744,11.399197,12.955278,10.055339,11.544506,6.202885,10.474527,9.958144,10.549357,13.619409,16.361063,12.132793,11.234670,7.019154,12.279743,13.171523,10.925970,7.665943,6.557040,8.917901,11.978215,13.020138
260,TCGA-09-1665,validation,1,11.714189,11.159942,12.345458,8.613600,5.224369,14.438913,10.344938,11.262470,9.813097,11.555447,9.782696,10.364112,10.573735,12.668031,12.699625,12.581383,8.861349,8.220235,12.324373,10.497973,11.453266,17.180440,12.570302,11.495284,8.862564,11.956014,11.692210,10.609660,8.495378,8.618310,11.655796,12.488971,10.152196
261,TCGA-04-1514,validation,1,11.964629,11.062341,11.562130,7.528090,3.997486,15.123933,10.078170,12.621936,11.383891,9.873516,11.223519,10.928764,11.295664,11.722918,12.147000,11.985848,8.460977,10.335974,11.516413,8.879413,12.535974,15.164296,11.927324,11.492937,7.188567,12.423648,12.662514,11.028765,8.608052,7.365841,10.533711,12.836300,12.736526
262,TCGA-61-2098,validation,1,11.980709,10.627622,11.351743,7.926095,4.472007,14.729060,11.283380,12.984888,12.203466,10.100225,10.985075,10.436229,10.126177,12.842831,12.193679,12.471903,7.183368,11.028365,9.978184,9.602543,12.762961,15.420000,12.607360,11.824341,8.872797,12.795017,13.021355,10.497475,9.185307,8.073857,10.881208,12.343773,12.031481


In [21]:
# Prepare the data for TensorFlow modeling
exp_data = subset_table_data.copy()
exp_data.pop('sample') # drop the sample name column
exp_data

# split data into train, test, and val
train_data = exp_data[exp_data['data_partition'] == 'training']
train_data.pop('data_partition')
val_data = exp_data[exp_data['data_partition'] == 'validation']
val_data.pop('data_partition')
test_data = exp_data[exp_data['data_partition'] == 'testing']
test_data.pop('data_partition')

# get labels
data = dict()
data['train_y'] = train_data.pop('label')
data['val_y'] = val_data.pop('label')
data['test_y'] = test_data.pop('label')

# scale the data
scaler = StandardScaler()
data['train_x'] = scaler.fit_transform(train_data)
data['val_x'] = scaler.transform(val_data)
data['test_x'] = scaler.transform(test_data)
data['scaler'] = scaler

In [22]:
# Build DNN model

import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense

input_features = data['train_x'].shape[1]

input = Input(shape=(input_features,), name='input')
x = Dense(16, activation='relu', name='hidden1', kernel_regularizer='l2')(input)
x = Dense(8, activation='relu', name='hidden2', kernel_regularizer='l2')(x)
x = Dense(4, activation='relu', name='hidden3', kernel_regularizer='l2')(x)
output = Dense(1, activation='sigmoid', name='output', kernel_regularizer='l2')(x)

model = Model(inputs=input, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model

model.fit(x=data['train_x'], y=data['train_y'], \
          batch_size=32, epochs=200, verbose=1, validation_data=(data['val_x'], data['val_y']))

In [None]:
## Save Model

model.save('test-dnn-model')
!gsutil cp -r test-dnn-model gs://jhp-scratch/ml-models/

In [69]:
# Construct query to import the model
model_import_query = """
  CREATE OR REPLACE MODEL `{bq_project}.{bq_dataset}.{bq_ml_model}`

  OPTIONS (
    MODEL_TYPE='TENSORFLOW',
    MODEL_PATH='gs://jhp-scratch/ml-models/test-dnn-model/*'
  )
""".format(
    bq_project=bq_project,
    bq_dataset=bq_dataset,
    bq_ml_model=bq_ml_model
)
print(model_import_query)


  CREATE OR REPLACE MODEL `isb-project-zero.jhp_scratch.tcga_ov_therapy_ml_dnn_import`

  OPTIONS (
    MODEL_TYPE='TENSORFLOW',
    MODEL_PATH='gs://jhp-scratch/ml-models/test-dnn-model/*'
  )



In [70]:
# Run the query
client.query(model_import_query)

In [11]:
ml_predict_query = """
  SELECT
    output
  FROM ML.PREDICT (MODEL `{bq_project}.{bq_dataset}.{bq_ml_model}`, 
    (
      SELECT [{gene_list}] AS input
      FROM `{bq_project}.{bq_dataset}.{bq_tmp_table}`
      WHERE data_partition = 'testing' -- Use the testing dataset
    )
  )
""".format(
  bq_project=bq_project,
  bq_dataset=bq_dataset,
  bq_ml_model=bq_ml_model,
  bq_tmp_table=bq_tmp_table,
  gene_list=",".join(gene_list)
)

print(ml_predict_query)


  SELECT
    output
  FROM ML.PREDICT (MODEL `isb-project-zero.jhp_scratch.tcga_ov_therapy_ml_dnn_import`, 
    (
      SELECT [RHOT1,MYO7A,ZBTB10,MATK,ST18,RPS23,GCNT1,DROSHA,NUAK1,CCPG1,PDGFD,KLRAP1,MTAP,RNF13,THBS1,MLX,FAP,TIMP3,PRSS1,SLC7A11,OLFML3,RPS20,MCM5,POLE,STEAP4,LRRC8D,WBP1L,ENTPD5,SYNE1,DPT,COPZ2,TRIO,PDPR] AS input
      FROM `isb-project-zero.jhp_scratch.tmp_data`
      WHERE data_partition = 'testing' -- Use the testing dataset
    )
  )



In [12]:
ml_predict = client.query(ml_predict_query).result().to_dataframe()
ml_predict



Unnamed: 0,output
0,0.374415
1,0.996113
2,0.979482
3,0.851250
4,0.026677
...,...
125,0.920286
126,0.999871
127,0.991652
128,0.937644


In [32]:
predicted_labels = round(ml_predict)
testing_data = exp_data[exp_data['data_partition'] == 'testing']

predicted = predicted_labels['output'].reset_index(drop=True)
orig = testing_data['label'].reset_index(drop=True)

accuracy = 1-sum(abs(orig-predicted))/len(predicted)


In [33]:
accuracy

0.6615384615384615