### Week 6:

Train a multi-class classification model on AutoML.   

https://towardsdatascience.com/the-best-of-both-worlds-calling-auto-ml-from-bigquery-9dfd433a45d6    
https://cloud.google.com/blog/products/ai-machine-learning/use-automl-tables-from-a-jupyter-notebook 


Using the NOAA dataset again:

* The target column should be the "element" column, filtered for the weather types (i.e. WT**)
* The feature columns should be id, and date, and the columns from the `bigquery-public-data.ghcn_d.ghcnd_stations` at least

For the others,they will need to be investigated to see if they have any relevant features:
`bigquery-public-data.ghcn_d.ghcnd_countries`
`bigquery-public-data.ghcn_d.ghcnd_inventory`
`bigquery-public-data.ghcn_d.ghcnd_states`


This time, we are not filtering for just a specific city, i.e. Chicago, because we want to know if there are patterns by location

In [1]:
from app_creds import set_env
set_env()

#from google.cloud import automl
from google.cloud import automl_v1beta1

from google.cloud import bigquery
# Construct a BigQuery client object.
bq_client = bigquery.Client()

## Get the weather-type categorical variables from the weather dataset

https://docs.opendata.aws/noaa-ghcn-pds/readme.html

__WT** = Weather Type where ** has one of the following values:__    

01 = Fog, ice fog, or freezing fog (may include heavy fog)    
02 = Heavy fog or heaving freezing fog (not always distinguished from fog)    
03 = Thunder    
04 = Ice pellets, sleet, snow pellets, or small hail    
05 = Hail (may include small hail)    
06 = Glaze or rime    
07 = Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction    
08 = Smoke or haze    
09 = Blowing or drifting snow    
10 = Tornado, waterspout, or funnel cloud    
11 = High or damaging winds    
12 = Blowing spray    
13 = Mist    
14 = Drizzle    
15 = Freezing drizzl    
16 = Rain (may include freezing rain, drizzle, and freezing drizzle)    
17 = Freezing rain    
18 = Snow, snow pellets, snow grains, or ice crystals    
19 = Unknown source of precipitation    
21 = Ground fog    
22 = Ice fog or freezing fog   

SELECT recordID, groupID    
FROM (    
  SELECT     
    recordID, groupID,     
    RAND() AS rnd, ROW_NUMBER() OVER(PARTITION BY groupID ORDER BY rnd) AS pos    
  FROM yourTable    
)    
WHERE pos <= 100    
ORDER BY groupID, recordID    

In [216]:
get_data_query = """
WITH get_counts AS (
SELECT
  yd.id, COUNT(*) num_rows
FROM
  `bigquery-public-data.ghcn_d.ghcnd_2022` as yd
  JOIN `bigquery-public-data.ghcn_d.ghcnd_stations` sd
  ON yd.id = sd.id
WHERE yd.qflag IS NULL
AND yd.element LIKE 'WT%%'
GROUP BY yd.id
HAVING num_rows > 50)
SELECT
  date, element, state
FROM
(
SELECT
  yd.date, yd.element, sd.state,
  RAND() AS rnd, ROW_NUMBER() OVER(PARTITION BY yd.element) AS pos
FROM
  `bigquery-public-data.ghcn_d.ghcnd_2022` as yd 
  JOIN `bigquery-public-data.ghcn_d.ghcnd_stations` sd
  ON yd.id = sd.id
  -- JOIN get_counts gc ON yd.id = gc.id
WHERE yd.qflag IS NULL
AND yd.element LIKE 'WT%%'
AND yd.element <> 'WT10'
)
WHERE pos <= 1000
--and element <> 'WT07'
"""
_2022_query_job = bq_client.query(get_data_query)

# A dry run query completes immediately, it should give me an estimate of costs
print("This query will process {} bytes.".format(_2022_query_job.total_bytes_processed))

weather_and_state_dataframe = _2022_query_job.to_dataframe()

This query will process 784742510 bytes.


In [217]:
## size of the sample

weather_and_state_dataframe.size

26364

In [218]:
## Categorical requirements for the label: https://cloud.google.com/automl-tables/docs/prepare
# If it is Categorical, it must have at least 2 and no more than 500 distinct values.
print(weather_and_state_dataframe['element'].unique())
print(weather_and_state_dataframe['element'].value_counts())

['WT11' 'WT05' 'WT08' 'WT06' 'WT04' 'WT09' 'WT02' 'WT01' 'WT03']
WT11    1000
WT05    1000
WT08    1000
WT06    1000
WT04    1000
WT02    1000
WT01    1000
WT03    1000
WT09     788
Name: element, dtype: int64


In [220]:
## Make sure there are at least 3 values for each - this gets split into test, train and validation models

for col in weather_and_state_dataframe.columns:
    if col != 'element':
        print(weather_and_state_dataframe[col].value_counts())

2022-02-03    166
2022-02-25    149
2022-02-23    143
2022-02-04    134
2022-02-24    120
             ... 
2022-10-14      1
2022-10-21      1
2022-10-04      1
2022-10-22      1
2022-10-11      1
Name: date, Length: 296, dtype: int64
TX    528
PA    457
MI    391
NY    372
KS    356
AK    337
NE    328
MN    319
CA    314
CO    308
IL    286
MO    279
WI    272
VA    267
AR    231
NC    230
OH    228
OK    181
ME    181
IA    178
IN    174
SD    166
ND    161
MT    159
KY    142
MA    131
LA    124
TN    123
WV    122
WY    118
WA    116
UT    102
NH    102
FL     92
AZ     91
GA     90
SC     87
OR     81
ID     79
VT     75
NM     74
NJ     72
MD     71
MS     59
AL     58
NV     33
RI     15
CT     10
DE      8
HI      8
PR      2
Name: state, dtype: int64


In [221]:
target_column = 'element'

In [223]:
# https://cloud.google.com/bigquery-ml/docs/reference/standard-sql/bigqueryml-syntax-create-automl

create_model_query = f"""
CREATE OR REPLACE MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classesv2`
OPTIONS(MODEL_TYPE = 'automl_classifier', 
BUDGET_HOURS = 1.0, 
OPTIMIZATION_OBJECTIVE = 'MINIMIZE_LOG_LOSS',
INPUT_LABEL_COLS=['{target_column}'])
AS 
{get_data_query}
"""

print(create_model_query)


CREATE OR REPLACE MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classesv2`
OPTIONS(MODEL_TYPE = 'automl_classifier', 
BUDGET_HOURS = 1.0, 
OPTIMIZATION_OBJECTIVE = 'MINIMIZE_LOG_LOSS',
INPUT_LABEL_COLS=['element'])
AS 

WITH get_counts AS (
SELECT
  yd.id, COUNT(*) num_rows
FROM
  `bigquery-public-data.ghcn_d.ghcnd_2022` as yd
  JOIN `bigquery-public-data.ghcn_d.ghcnd_stations` sd
  ON yd.id = sd.id
WHERE yd.qflag IS NULL
AND yd.element LIKE 'WT%%'
GROUP BY yd.id
HAVING num_rows > 50)
SELECT
  date, element, state
FROM
(
SELECT
  yd.date, yd.element, sd.state,
  RAND() AS rnd, ROW_NUMBER() OVER(PARTITION BY yd.element) AS pos
FROM
  `bigquery-public-data.ghcn_d.ghcnd_2022` as yd 
  JOIN `bigquery-public-data.ghcn_d.ghcnd_stations` sd
  ON yd.id = sd.id
  -- JOIN get_counts gc ON yd.id = gc.id
WHERE yd.qflag IS NULL
AND yd.element LIKE 'WT%%'
AND yd.element <> 'WT10'
)
WHERE pos <= 1000
and element <> 'WT07'




In [224]:
create_model = bq_client.query(create_model_query)
create_model

QueryJob<project=msds-434-robords-oct, location=US, id=0f791e88-bf75-4965-b509-473b88c06ed5>

In [236]:
# Get the current state of the query.  We could write a "while .." job here, but that would mean
# we'd be continually querying bq and we might get charged for it. So, run as needed.

get_query_status = f"""
SELECT
job_type, state, start_time, end_time, query, total_bytes_billed/1000000000 as gigabytes_billed,
job_id
FROM
  `region-us`.INFORMATION_SCHEMA.JOBS
WHERE
  job_id = '{create_model.job_id}'
"""

query_state = bq_client.query(get_query_status)
query_state.to_dataframe()

Unnamed: 0,job_type,state,start_time,end_time,query,gigabytes_billed,job_id
0,QUERY,DONE,2022-10-29 15:37:27.500000+00:00,2022-10-29 17:01:53.335000+00:00,\nCREATE OR REPLACE MODEL `msds-434-robords-oc...,4249.343754,0f791e88-bf75-4965-b509-473b88c06ed5


In [237]:
results = create_model.result()
results

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f93750b8990>

## Get Training Info

In [238]:
model_training_info = """
SELECT
  *
FROM
  ML.TRAINING_INFO(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classesv2`)
"""

automl_model_training = bq_client.query(model_training_info)
train_info = automl_model_training.to_dataframe()

In [239]:
train_info

Unnamed: 0,training_run,iteration,loss,eval_loss,learning_rate,duration_ms
0,0,0,0.0,1.621536,,4568400


## Make Predictions

Pass two, manual predictions to Big Query

In [240]:
translate_element_dict = {
    "WT01":"Fog, ice fog, or freezing fog (may include heavy fog)",
    "WT02":"Heavy fog or heaving freezing fog (not always distinguished from fog)",
    "WT03":"Thunder",
    "WT04":"Ice pellets, sleet, snow pellets, or small hail",
    "WT05":"Hail (may include small hail)",
    "WT06":"Glaze or rime",
    "WT07":"Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction",
    "WT08":"Smoke or haze",
    "WT09":"Blowing or drifting snow",
    "WT10":"Tornado, waterspout, or funnel cloud",
    "WT11": "High or damaging winds",
    "WT12":"Blowing spray",
    "WT13":"Mist",
    "WT14":"Drizzle",
    "WT15":"Freezing drizzle",
    "WT16":"Rain (may include freezing rain, drizzle, and freezing drizzle)",
    "WT17":"Freezing rain",
    "WT18":"Snow, snow pellets, snow grains, or ice crystals",
    "WT19":"Unknown source of precipitation",
    "WT21":"Ground fog",
    "WT22":"Ice fog or freezing fog"
}

In [None]:
predictions_query = """
SELECT * FROM ML.PREDICT(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classes`,(
  SELECT 
  'AK' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/10/21') as date
  UNION ALL
  SELECT 
  'AK' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/11/01') as date
))
"""

automl_model_predictions = bq_client.query(predictions_query)
predictions_info = automl_model_predictions.to_dataframe()

In [None]:
predictions_info['element_name'] = predictions_info['predicted_element'].map(translate_element_dict) 
predictions_info

In [None]:
predictions_query = """
SELECT * FROM ML.PREDICT(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classes`,(
  SELECT 
  'VA' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
  UNION ALL
  SELECT 
  'VT' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
))
"""

automl_model_predictions = bq_client.query(predictions_query)
predictions_info_va_vt = automl_model_predictions.to_dataframe()

In [None]:
predictions_info_va_vt['element_name'] = predictions_info_va_vt['predicted_element'].map(translate_element_dict) 
predictions_info_va_vt

In [None]:
predictions_query = """
SELECT * FROM ML.PREDICT(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classesv2`,(
  SELECT 
  'AK' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/10/21') as date
  UNION ALL
  SELECT 
  'AK' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/11/01') as date
))
"""

automl_model_predictions = bq_client.query(predictions_query)
predictions_infov2 = automl_model_predictions.to_dataframe()
predictions_infov2['element_name'] = predictions_infov2['predicted_element'].map(translate_element_dict) 
predictions_infov2

In [None]:
predictions_query = """
SELECT * FROM ML.PREDICT(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classesv2`,(
  SELECT 
  'VA' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
  UNION ALL
  SELECT 
  'VT' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
))
"""

automl_model_predictions = bq_client.query(predictions_query)
predictions_info_va_vtv2 = automl_model_predictions.to_dataframe()
predictions_info_va_vtv2['element_name'] = predictions_info_va_vtv2['predicted_element'].map(translate_element_dict) 
predictions_info_va_vtv2

In [167]:
predictions_info['element_name'] = predictions_info['predicted_element'].map(translate_element_dict) 
predictions_info

Unnamed: 0,predicted_element,predicted_element_probs,state,date,element_name
0,WT01,"[{'label': 'WT01', 'prob': 0.6697785258293152}...",AK,2022-10-21,"Fog, ice fog, or freezing fog (may include hea..."
1,WT01,"[{'label': 'WT01', 'prob': 0.6858705282211304}...",AK,2022-11-01,"Fog, ice fog, or freezing fog (may include hea..."


In [169]:
predictions_query = """
SELECT * FROM ML.PREDICT(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classes`,(
  SELECT 
  'VA' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
  UNION ALL
  SELECT 
  'VT' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
))
"""

automl_model_predictions = bq_client.query(predictions_query)
predictions_info_va_vt = automl_model_predictions.to_dataframe()

In [170]:
predictions_info_va_vt['element_name'] = predictions_info_va_vt['predicted_element'].map(translate_element_dict) 
predictions_info_va_vt

Unnamed: 0,predicted_element,predicted_element_probs,state,date,element_name
0,WT01,"[{'label': 'WT01', 'prob': 0.5749873518943787}...",VA,2022-05-21,"Fog, ice fog, or freezing fog (may include hea..."
1,WT01,"[{'label': 'WT01', 'prob': 0.5905430316925049}...",VT,2022-05-21,"Fog, ice fog, or freezing fog (may include hea..."


In [241]:
predictions_query = """
SELECT * FROM ML.PREDICT(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classesv2`,(
  SELECT 
  'AK' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/10/21') as date
  UNION ALL
  SELECT 
  'AK' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/11/01') as date
))
"""

automl_model_predictions = bq_client.query(predictions_query)
predictions_infov2 = automl_model_predictions.to_dataframe()
predictions_infov2['element_name'] = predictions_infov2['predicted_element'].map(translate_element_dict) 
predictions_infov2

Unnamed: 0,predicted_element,predicted_element_probs,state,date,element_name
0,WT02,"[{'label': 'WT08', 'prob': 0.07884140312671661...",AK,2022-10-21,Heavy fog or heaving freezing fog (not always ...
1,WT09,"[{'label': 'WT08', 'prob': 0.16644008457660675...",AK,2022-11-01,Blowing or drifting snow


In [242]:
predictions_query = """
SELECT * FROM ML.PREDICT(MODEL `msds-434-robords-oct.weather_prediction.automl_weather_classesv2`,(
  SELECT 
  'VA' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
  UNION ALL
  SELECT 
  'VT' AS state,
  PARSE_DATE('%Y/%m/%d',  '2022/05/21') as date
))
"""

automl_model_predictions = bq_client.query(predictions_query)
predictions_info_va_vtv2 = automl_model_predictions.to_dataframe()
predictions_info_va_vtv2['element_name'] = predictions_info_va_vtv2['predicted_element'].map(translate_element_dict) 
predictions_info_va_vtv2

Unnamed: 0,predicted_element,predicted_element_probs,state,date,element_name
0,WT01,"[{'label': 'WT08', 'prob': 0.03322505205869675...",VA,2022-05-21,"Fog, ice fog, or freezing fog (may include hea..."
1,WT02,"[{'label': 'WT08', 'prob': 0.09590507298707962...",VT,2022-05-21,Heavy fog or heaving freezing fog (not always ...
