In [1]:
# Copyright 2024 Forusone(shins777@gmail.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Dataset on Vertex AI from BigQuery and GCS


## Overview
This notebook is a simplified version of the [get_started_bq_datasets](https://colab.sandbox.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/datasets/get_started_bq_datasets.ipynb) notebook from the official Google Cloud samples. Refer to the original notebook for more detailed information about this process. Learn more about [BigQuery datasets](https://cloud.google.com/bigquery/docs/datasets-intro) and [Vertex AI for BigQuery users](https://cloud.google.com/vertex-ai/docs/beginner/bqml).

### Dataset

* The dataset used for this tutorial is the GSOD dataset from [BigQuery public datasets](https://cloud.google.com/bigquery/public-data). In this version of the dataset, you use the fields year, month, and day to predict the mean daily temperature (mean_temp).

## Install Vertex AI SDK

In [1]:
! pip install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-bigquery \
                                 tensorflow \
                                 tensorflow-io \
                                 xgboost \
                                 numpy \
                                 pandas \
                                 pyarrow \
                                 db-dtypes

## Configuration

### Authenticate your notebook environment

In [2]:
import sys
from IPython.display import Markdown, display

PROJECT_ID="ai-hangsik"
LOCATION="us-central1"

# For only colab user, no need this process for Colab Enterprise in Vertex AI.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id=PROJECT_ID)

# set project.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


### Initialize Vertex AI SDK

In [3]:
from google.cloud import aiplatform, bigquery
# Initialize the Vertex AI SDK with the project and region configured above.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
)

### Create a Cloud Storage bucket

In [9]:
# Create a bucket.
BUCKET_URI = f"gs://mlops-{PROJECT_ID}-1207"
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://mlops-ai-hangsik-1207/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-ai-hangsik-1207' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


### Import libraries and define constants

In [4]:
import pandas as pd
import xgboost as xgb
from google.cloud import bigquery

## Create DataSet from BigQuery

### Create BigQuery client


In [5]:
# Fully qualified id (project.dataset.table) of the public GSOD table.
BQ_TABLE = "bigquery-public-data.samples.gsod"

# Same table written as a bq:// URI, the form passed to TabularDataset.create.
IMPORT_FILE = f"bq://{BQ_TABLE}"

In [6]:
# BigQuery client bound to the project configured in PROJECT_ID.
bqclient = bigquery.Client(project=PROJECT_ID)

### Create the dataset on Vertex AI


In [16]:
# API reference:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset#google_cloud_aiplatform_TabularDataset_create

# Register the BigQuery table as a Vertex AI tabular dataset.
# `bq_source` is documented as a single "bq://project.dataset.table" string,
# so the URI is passed directly instead of being wrapped in a list.
dataset = aiplatform.TabularDataset.create(
    display_name="NOAA historical weather data - bq dataset",
    bq_source=IMPORT_FILE,
    # BUCKET_URI[5:] drops the leading "gs://", leaving only the bucket name.
    labels={"user_metadata": BUCKET_URI[5:]},
)

# Column that holds the value to predict (mean daily temperature).
label_column = "mean_temp"

print(dataset.resource_name)

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/721521243942/locations/us-central1/datasets/7610287873593442304/operations/2263486969453477888
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/721521243942/locations/us-central1/datasets/7610287873593442304
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/721521243942/locations/us-central1/datasets/7610287873593442304')


projects/721521243942/locations/us-central1/datasets/7610287873593442304


In [9]:
# Show the Python type of the object returned by TabularDataset.create.
print(type(dataset))

<class 'google.cloud.aiplatform.datasets.tabular_dataset.TabularDataset'>


## Create Dataset from GCS (Google Cloud Storage)

### Copy the dataset to Cloud Storage

Make a copy of the BigQuery table as CSV files in Cloud Storage using the BigQuery `extract` command. Learn more about the [BigQuery command line interface](https://cloud.google.com/bigquery/docs/reference/bq-cli-reference).

#### Export CSV files to GCS

In [11]:
comps = BQ_TABLE.split(".")
BQ_PROJECT_DATASET_TABLE = comps[0] + ":" + comps[1] + "." + comps[2]

! bq --location=us extract --destination_format CSV $BQ_PROJECT_DATASET_TABLE $BUCKET_URI/dataset/csv/mydata*.csv

Waiting on bqjob_r7685f3a55d0c4eed_00000193a39f3684_1 ... (66s) Current status: DONE   
['gs://mlops-ai-hangsik-1207/mydata000000000097.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000098.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000099.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000100.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000101.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000102.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000103.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000104.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000105.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000106.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000107.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000108.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000109.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000110.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000111.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000112.csv', 'gs://mlops-ai-hangsik-1207/mydata000000000113.csv', 'gs://mlop

#### Check exported CSV files list

In [10]:
IMPORT_FILES = ! gsutil ls $BUCKET_URI/dataset/csv/mydata*.csv
print(IMPORT_FILES)

['gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000000.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000001.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000002.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000003.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000004.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000005.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000006.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000007.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000008.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000009.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000010.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000011.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000012.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000013.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000014.csv', 'gs://mlops-ai-hangsik-1

#### Check the contents of a CSV file

In [13]:
EXAMPLE_FILE = IMPORT_FILES[0]
! gsutil cat $EXAMPLE_FILE | head

station_number,wban_number,year,month,day,mean_temp,num_mean_temp_samples,mean_dew_point,num_mean_dew_point_samples,mean_sealevel_pressure,num_mean_sealevel_pressure_samples,mean_station_pressure,num_mean_station_pressure_samples,mean_visibility,num_mean_visibility_samples,mean_wind_speed,num_mean_wind_speed_samples,max_sustained_wind_speed,max_gust_wind_speed,max_temperature,max_temperature_explicit,min_temperature,min_temperature_explicit,total_precipitation,snow_depth,fog,rain,snow,hail,thunder,tornado
39800,99999,1929,12,11,45.5,4,43.5,4,981.4000244140625,4,,,4.3000001907348633,4,19.799999237060547,4,29.899999618530273,,34,false,,,,,false,false,false,false,false,false
37770,99999,1929,12,6,47,4,41.299999237060547,4,993.0999755859375,4,,,4.3000001907348633,4,14.300000190734863,4,18.100000381469727,,45,false,,,,,false,false,false,false,false,false
31590,99999,1929,12,6,45.799999237060547,4,38.299999237060547,4,974.5,4,,,12.399999618530273,4,24.5,4,36.900001525878906,,43,false,,,0,,fa

### Create the dataset on Vertex AI

Learn more about [TabularDataset from CSV files](https://cloud.google.com/vertex-ai/docs/datasets/create-dataset-api#aiplatform_create_dataset_tabular_gcs_sample-python)

In [15]:
# Every exported CSV file is passed as one source URI of the dataset.
gcs_source = IMPORT_FILES

# Report how many files are used plus a short sample; printing the whole list
# floods the cell output.
print(f"gcs_source : {len(gcs_source)} files, first 3: {gcs_source[:3]}")

# API reference:
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset
# https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.TabularDataset#google_cloud_aiplatform_TabularDataset_create

# Register the CSV files in Cloud Storage as a Vertex AI tabular dataset.
dataset = aiplatform.TabularDataset.create(
    display_name="NOAA historical weather data - csv dataset",
    gcs_source=gcs_source,
    # BUCKET_URI[5:] drops the leading "gs://", leaving only the bucket name.
    labels={"user_metadata": BUCKET_URI[5:]},
)

# Column that holds the value to predict (mean daily temperature).
label_column = "mean_temp"

print(dataset.resource_name)

gcs_source : ['gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000000.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000001.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000002.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000003.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000004.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000005.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000006.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000007.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000008.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000009.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000010.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000011.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000012.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000013.csv', 'gs://mlops-ai-hangsik-1207/dataset/csv/mydata000000000014.csv', 'gs://mlops

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/721521243942/locations/us-central1/datasets/377506872036425728/operations/1988767392183877632
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/721521243942/locations/us-central1/datasets/377506872036425728
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/721521243942/locations/us-central1/datasets/377506872036425728')


projects/721521243942/locations/us-central1/datasets/377506872036425728
