# BigQuery - BQML

Multiclass Logistic Regression with BigQuery

Make a client connection to BigQuery

In [3]:
from google.cloud import bigquery
# Create a BigQuery client. No project or credentials are passed here, so they
# are presumably resolved from the environment — confirm for your setup.
bq = bigquery.Client()

Use BigQuery ML to train a multiclass logistic regression model:

In [5]:
%%bigquery
-- Train (or replace) a logistic regression model using only the TRAIN rows.
-- `target` is the label; `splits` and `target_OE` are dropped from the
-- training input, and every other selected column is passed to training.
-- NOTE(review): no data_split_method is given; the eval_loss values reported
-- by ML.TRAINING_INFO suggest part of these rows is held out for evaluation
-- by default — confirm against the CREATE MODEL documentation.
CREATE OR REPLACE MODEL `statmike-mlops.digits.digits_lr`
OPTIONS (
  model_type = 'LOGISTIC_REG',
  auto_class_weights = TRUE,
  input_label_cols = ['target']
) AS
SELECT
  * EXCEPT (splits, target_OE)
FROM
  `statmike-mlops.digits.digits_prepped`
WHERE
  splits = 'TRAIN'

Review the iterations from training:

In [6]:
%%bigquery
-- Per-iteration training statistics for the model, in iteration order.
SELECT
  *
FROM
  ML.TRAINING_INFO(MODEL `statmike-mlops.digits.digits_lr`)
ORDER BY
  iteration

Unnamed: 0,training_run,iteration,loss,eval_loss,learning_rate,duration_ms
0,0,0,0.195084,0.194153,0.2,5226
1,0,1,0.142491,0.140804,0.4,6938
2,0,2,0.088367,0.087468,0.8,6787
3,0,3,0.052991,0.053895,1.6,6013
4,0,4,0.033716,0.035607,3.2,6559
5,0,5,0.022649,0.024922,6.4,8261
6,0,6,0.016511,0.020521,12.8,6139
7,0,7,0.014517,0.018335,3.2,6267
8,0,8,0.013195,0.017735,6.4,11918
9,0,9,0.011317,0.016401,12.8,8169


Review the model evaluation statistics on the Test/Train splits:

In [39]:
%%bigquery
-- Evaluation metrics for the model, one row per split: each branch evaluates
-- the model on the rows of a single split and tags the result with its name.
SELECT
  'TRAIN' AS SPLIT,
  *
FROM
  ML.EVALUATE(
    MODEL `statmike-mlops.digits.digits_lr`,
    (
      SELECT *
      FROM `statmike-mlops.digits.digits_prepped`
      WHERE splits = 'TRAIN'
    )
  )
UNION ALL
SELECT
  'TEST' AS SPLIT,
  *
FROM
  ML.EVALUATE(
    MODEL `statmike-mlops.digits.digits_lr`,
    (
      SELECT *
      FROM `statmike-mlops.digits.digits_prepped`
      WHERE splits = 'TEST'
    )
  )

Unnamed: 0,SPLIT,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,TRAIN,0.981269,0.981143,0.981158,0.981179,0.117261,0.999693
1,TEST,0.966968,0.968487,0.967033,0.967204,0.126452,0.999802


Review the confusion matrix for each split:

In [9]:
%%bigquery
-- Confusion matrix for the TRAIN split; each output row is keyed by
-- expected_label, with one count column per label.
SELECT
  *
FROM
  ML.CONFUSION_MATRIX(
    MODEL `statmike-mlops.digits.digits_lr`,
    (
      SELECT *
      FROM `statmike-mlops.digits.digits_prepped`
      WHERE splits = 'TRAIN'
    )
  );

Unnamed: 0,expected_label,_0,_1,_2,_3,_4,_5,_6,_7,_8,_9
0,0,146,0,0,0,0,0,0,0,0,0
1,1,0,141,0,0,0,0,1,0,3,1
2,2,0,0,143,0,0,0,0,0,0,0
3,3,0,0,0,137,0,1,0,0,1,0
4,4,0,0,0,0,148,0,0,0,1,0
5,5,0,1,0,0,0,143,1,0,0,1
6,6,0,1,0,0,1,0,138,0,0,0
7,7,0,0,0,0,0,0,0,136,0,1
8,8,0,5,0,0,0,2,0,0,130,0
9,9,0,0,0,1,0,3,0,0,2,144


In [10]:
%%bigquery
-- Confusion matrix for the TEST split; each output row is keyed by
-- expected_label, with one count column per label.
SELECT
  *
FROM
  ML.CONFUSION_MATRIX(
    MODEL `statmike-mlops.digits.digits_lr`,
    (
      SELECT *
      FROM `statmike-mlops.digits.digits_prepped`
      WHERE splits = 'TEST'
    )
  );

Unnamed: 0,expected_label,_0,_1,_2,_3,_4,_5,_6,_7,_8,_9
0,0,32,0,0,0,0,0,0,0,0,0
1,1,0,34,0,0,0,0,0,0,2,0
2,2,0,0,34,0,0,0,0,0,0,0
3,3,1,0,0,41,0,0,0,0,2,0
4,4,0,1,0,0,31,0,0,0,0,0
5,5,0,0,0,0,0,35,0,0,0,1
6,6,0,1,0,0,0,0,40,0,0,0
7,7,0,0,0,0,2,0,0,40,0,0
8,8,0,0,1,0,0,0,0,0,36,0
9,9,0,0,0,0,0,0,0,0,1,29


Create a pandas dataframe with predictions for the full table:

In [40]:
%%bigquery pred
-- Run the model over every row of the prepped table (no split filter) and
-- store the query result in the `pred` variable.
SELECT
  *
FROM
  ML.PREDICT(
    MODEL `statmike-mlops.digits.digits_lr`,
    (
      SELECT *
      FROM `statmike-mlops.digits.digits_prepped`
    )
  )

In [45]:
# List the columns of the prediction result to see which fields are available.
pred.columns

Index(['predicted_target', 'predicted_target_probs', 'p0', 'p1', 'p2', 'p3',
       'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10', 'p11', 'p12', 'p13', 'p14',
       'p15', 'p16', 'p17', 'p18', 'p19', 'p20', 'p21', 'p22', 'p23', 'p24',
       'p25', 'p26', 'p27', 'p28', 'p29', 'p30', 'p31', 'p32', 'p33', 'p34',
       'p35', 'p36', 'p37', 'p38', 'p39', 'p40', 'p41', 'p42', 'p43', 'p44',
       'p45', 'p46', 'p47', 'p48', 'p49', 'p50', 'p51', 'p52', 'p53', 'p54',
       'p55', 'p56', 'p57', 'p58', 'p59', 'p60', 'p61', 'p62', 'p63', 'target',
       'target_OE', 'SPLITS'],
      dtype='object')

In [47]:
# Preview the actual label next to the predicted label, the per-class
# probabilities and the split for the first few rows.
preview_columns = ['target', 'predicted_target', 'predicted_target_probs', 'SPLITS']
pred[preview_columns].head()

Unnamed: 0,target,predicted_target,predicted_target_probs,SPLITS
0,0,0,"[{'label': 0, 'prob': 0.9652141899711171}, {'l...",TRAIN
1,0,0,"[{'label': 0, 'prob': 0.9004698083685381}, {'l...",TRAIN
2,1,1,"[{'label': 1, 'prob': 0.9359740238892645}, {'l...",TRAIN
3,9,9,"[{'label': 9, 'prob': 0.9906638040722163}, {'l...",TRAIN
4,9,8,"[{'label': 8, 'prob': 0.4050868599139842}, {'l...",TRAIN


Review the first row's data and prediction:

In [68]:
import json

# Take the p0..p63 columns of the row labelled 0, serialize them to a JSON
# string of records, then parse that string back so the cell displays a list
# of plain dicts.
first_row_pixels = pred.loc[:0, 'p0':'p63']
record = first_row_pixels.to_json(orient='records')
json.loads(record)

[{'p0': 0.0,
  'p1': 0.0,
  'p2': 0.0,
  'p3': 9.0,
  'p4': 14.0,
  'p5': 6.0,
  'p6': 0.0,
  'p7': 0.0,
  'p8': 0.0,
  'p9': 0.0,
  'p10': 10.0,
  'p11': 13.0,
  'p12': 4.0,
  'p13': 13.0,
  'p14': 2.0,
  'p15': 0.0,
  'p16': 0.0,
  'p17': 2.0,
  'p18': 14.0,
  'p19': 0.0,
  'p20': 0.0,
  'p21': 10.0,
  'p22': 6.0,
  'p23': 0.0,
  'p24': 0.0,
  'p25': 4.0,
  'p26': 9.0,
  'p27': 0.0,
  'p28': 0.0,
  'p29': 6.0,
  'p30': 8.0,
  'p31': 0.0,
  'p32': 0.0,
  'p33': 5.0,
  'p34': 8.0,
  'p35': 0.0,
  'p36': 0.0,
  'p37': 8.0,
  'p38': 7.0,
  'p39': 0.0,
  'p40': 0.0,
  'p41': 2.0,
  'p42': 11.0,
  'p43': 1.0,
  'p44': 0.0,
  'p45': 9.0,
  'p46': 5.0,
  'p47': 0.0,
  'p48': 0.0,
  'p49': 0.0,
  'p50': 6.0,
  'p51': 11.0,
  'p52': 4.0,
  'p53': 13.0,
  'p54': 3.0,
  'p55': 0.0,
  'p56': 0.0,
  'p57': 0.0,
  'p58': 1.0,
  'p59': 11.0,
  'p60': 16.0,
  'p61': 12.0,
  'p62': 0.0,
  'p63': 0.0}]

In [48]:
# Class probabilities for the row labelled 0, ordered by label. The result is
# wrapped in an outer list so the displayed value keeps its nested-list shape.
first_row_probs = pred.predicted_target_probs[0]
[sorted(first_row_probs, key=lambda entry: entry['label'])]

[[{'label': 0, 'prob': 0.9652141899711171},
  {'label': 1, 'prob': 2.4376716160483155e-05},
  {'label': 2, 'prob': 0.002578678260762048},
  {'label': 3, 'prob': 0.0004530015313821015},
  {'label': 4, 'prob': 0.001599293545129465},
  {'label': 5, 'prob': 0.0013267086436507503},
  {'label': 6, 'prob': 0.02363371530789797},
  {'label': 7, 'prob': 0.000965304095139063},
  {'label': 8, 'prob': 0.0007580118000224815},
  {'label': 9, 'prob': 0.0034467201287386355}]]