See more details at https://cloud.google.com/bigquery/docs/bigqueryml-ncaa

In [None]:
!pip install --upgrade google-cloud-bigquery

In [1]:
from google.cloud import bigquery
# Single shared BigQuery client used by every cell below. Constructed with no
# arguments, so project and credentials are whatever the runtime environment provides.
client = bigquery.Client()

In [20]:
# Report which datasets already exist in the client's project.
project = client.project
# list(...) drains the iterator here, so this is the line that issues the API request(s).
datasets = list(client.list_datasets())

if not datasets:
    print('{} project does not contain any datasets.'.format(project))
else:
    print('Datasets in project {}:'.format(project))
    for dataset in datasets:
        # One tab-indented dataset id per line.
        print('\t{}'.format(dataset.dataset_id))

Datasets in project qwiklabs-gcp-4b48b1635e1f4540:
	bqml_test


In [None]:
# Create the `bqml_tutorial` dataset from scratch. Only needed when the dataset
# does not exist yet; the steps below used a dataset created in the web UI instead.
#
# A fully-qualified "project.dataset" string is used instead of the deprecated
# `client.dataset(...)` helper.
dataset = bigquery.Dataset('{}.bqml_tutorial'.format(client.project))
# Keep the dataset in the US location so it is co-located with the public source data.
dataset.location = 'US'
# exists_ok=True makes this cell safe to re-run: an already-existing dataset is
# returned instead of raising a Conflict error.
client.create_dataset(dataset, exists_ok=True)

In [26]:
# attempting to set existing dataset
# want to use bqml_test instead of bqml_tutorial
# do NOT do this - just point to the right dataset/table in the actual CREATE MODEL query below

# dataset = bigquery.Dataset(client.project + '.' + datasets[0].dataset_id)  # this should grab the dataset if it's already there
# dataset.location = 'us'


In [6]:
#get the query from https://bigquery.cloud.google.com/savedquery/1057666841514:77711b21274b4c6485c907483ef2f6fe
#note that the public dataset is only available in US, so the destination table should also be US
#Apparently BQ doesn't support cross-location BQ transfers yet

In [None]:
# query to make a cross of the data, joining on the game key to link the two opposing teams (going from Long data to wide data):
# official: https://bigquery.cloud.google.com/savedquery/1057666841514:9d4ec2ed8a864e7e8d8c3c9a65faa178
# experiment to see if perma-link survives: https://console.cloud.google.com/bigquery?sq=297656834223:1f642e9d233240749ea521b12b2063c1


In [2]:
# Train the model.
# The backticked names in the SQL (`bqml_test.ncaa_model`, `bqml_test.wide_games`)
# must resolve inside the correct project/dataset -- adjust them if yours differ.
#
# What the statement does:
#   * fits a linear regression whose label is `total_three_points_att`;
#   * drops identifier/date columns and the three-point totals from the features;
#   * holds out one game (by game_id) so it can be predicted later.
# NOTE(review): current BigQuery ML docs spell the option `max_iterations`;
# confirm that `max_iteration` is still accepted before relying on it.
sql = """
    CREATE OR REPLACE MODEL `bqml_test.ncaa_model`
    OPTIONS (
        model_type='linear_reg',
        max_iteration=50 ) AS
    SELECT
        * EXCEPT (
            game_id, season, scheduled_date,
            total_three_points_made,
            total_three_points_att),
        total_three_points_att as label
    FROM
        `bqml_test.wide_games`
    WHERE
        # remove the game to predict
        game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83'
"""
training_job = client.query(sql)
# CREATE MODEL yields no rows, so the resulting frame is expected to be empty.
df = training_job.to_dataframe()
print(df)

Empty DataFrame
Columns: []
Index: []


In [4]:
# Inspect the training runs of the model: one row per iteration, as returned by
# ML.TRAINING_INFO for `bqml_test.ncaa_model`.
sql = """
    SELECT
        *
    FROM
        ML.TRAINING_INFO(MODEL `bqml_test.ncaa_model`)
"""
training_info_job = client.query(sql)
df = training_info_job.to_dataframe()
print(df)

   training_run  iteration        loss   eval_loss  duration_ms  learning_rate
0             0          8   54.366519  119.470519         8240           0.05
1             0          7   64.202786  120.194577         9021           0.05
2             0          6   77.925533  145.478604         9130           0.05
3             0          5   97.229483  149.119663        10156           0.05
4             0          4  124.648797  195.916149         8669           0.05
5             0          3  164.101166  208.669329         8875           0.05
6             0          2  222.055832  297.970166         8936           0.05
7             0          1  311.387630  340.900917         9575           0.05
8             0          0  524.885056  587.763086         7120           0.20


In [7]:
# Evaluate the model with ML.EVALUATE.
# `eval_table` is the whole `wide_games` table (no WHERE filter) with the target
# column additionally exposed under the name `label`.
sql = """
    WITH eval_table AS (
        SELECT
            *,
            total_three_points_att AS label
        FROM
            `bqml_test.wide_games` )
    SELECT
        *
    FROM
        ML.EVALUATE(MODEL `bqml_test.ncaa_model`,
            TABLE eval_table)
"""
evaluation_job = client.query(sql)
df = evaluation_job.to_dataframe()
print(df)


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0             6.399684            67.68776                0.055167   

   median_absolute_error  r2_score  explained_variance  
0               5.426127  0.106071            0.435231  


In [9]:
# Prediction: compare the model's output with the actual value for a single game.
# The game selected here (by game_id) is the one that was excluded from the
# training query, i.e. it was kept as a holdout.
#
# `predict` holds the ML.PREDICT output for that game, `truth` holds the real
# `total_three_points_att`; the two are joined on game_id.
sql = """
    WITH game_to_predict AS (
        SELECT
            *
        FROM
            `bqml_test.wide_games`
        WHERE
            game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' )
    SELECT
        truth.game_id AS game_id,
        total_three_points_att,
        predicted_total_three_points_att
    FROM (
        SELECT
            game_id,
            predicted_label AS predicted_total_three_points_att
        FROM
            ML.PREDICT(MODEL `bqml_test.ncaa_model`,
            table game_to_predict) ) AS predict
    JOIN (
        SELECT
            game_id,
            total_three_points_att AS total_three_points_att
        FROM
            game_to_predict) AS truth
    ON
        predict.game_id = truth.game_id
"""
prediction_job = client.query(sql)
df = prediction_job.to_dataframe()
print(df)

                                game_id  total_three_points_att  \
0  f1063e80-23c7-486b-9a5e-faa52beb2d83                      50   

   predicted_total_three_points_att  
0                         40.870294  
