In [1]:
# Import pandas and the Vertex AI library
import pandas as pd
from google.cloud import aiplatform

In [2]:
# Configure the Vertex AI SDK session: later SDK calls in this notebook
# use this region unless they override it explicitly.
aiplatform.init(
    location="europe-west1",
)

#### Dataset from BigQuery

In [3]:
# Display name the dataset will carry inside Vertex AI
dataset_display_name = "diabetes_bq"

# BigQuery table that backs the dataset, given as a bq:// URI
bq_source = "bq://cloud4datascience.test_datasets.diabetes"

# Register the BigQuery table as a Vertex AI tabular dataset, then block
# until the creation call has completed.
create_dataset_task = aiplatform.TabularDataset.create(
    bq_source=bq_source, display_name=dataset_display_name
)
create_dataset_task.wait()

Creating TabularDataset
Create TabularDataset backing LRO: projects/268076997885/locations/europe-west1/datasets/8187297831954939904/operations/8840612248016650240
TabularDataset created. Resource name: projects/268076997885/locations/europe-west1/datasets/8187297831954939904
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/268076997885/locations/europe-west1/datasets/8187297831954939904')


In [4]:
# Look the dataset up again by the resource name of the object created
# above; the bare name on the last line makes the notebook display it.
dataset = aiplatform.TabularDataset(
    dataset_name=create_dataset_task.resource_name
)
dataset

<google.cloud.aiplatform.datasets.tabular_dataset.TabularDataset object at 0x7f64c197fbd0> 
resource name: projects/268076997885/locations/europe-west1/datasets/8187297831954939904

#### Dataset from Cloud Storage

In [5]:
# Display name the dataset will carry inside Vertex AI
dataset_display_name = "diabetes_gcs"

# CSV object in Cloud Storage that backs the dataset, given as a gs:// URI
gcs_source = "gs://c4ds-datasets/diabetes.csv"

# Register the CSV file as a Vertex AI tabular dataset, then block until
# the creation call has completed.
create_dataset_task = aiplatform.TabularDataset.create(
    gcs_source=gcs_source, display_name=dataset_display_name
)
create_dataset_task.wait()

Creating TabularDataset
Create TabularDataset backing LRO: projects/268076997885/locations/europe-west1/datasets/9218622146622783488/operations/5296173788159803392
TabularDataset created. Resource name: projects/268076997885/locations/europe-west1/datasets/9218622146622783488
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/268076997885/locations/europe-west1/datasets/9218622146622783488')


In [None]:
# Look the dataset up again by the resource name of the most recently
# created dataset object; the bare name on the last line displays it.
dataset = aiplatform.TabularDataset(
    dataset_name=create_dataset_task.resource_name
)
dataset

#### Dataset from DataFrame

In [6]:
# BigQuery table (bq:// URI) passed as the staging path for the upload
bq_staging_path = 'bq://cloud4datascience.test_datasets.diabetes_df'

# Load the data from a CSV file in the notebook's working directory
df = pd.read_csv('diabetes.csv')

# Display name the dataset will carry inside Vertex AI
dataset_display_name = 'diabetes_df'

# Create the Vertex AI tabular dataset from the in-memory DataFrame,
# then block until the creation call has completed.
create_dataset_task = aiplatform.TabularDataset.create_from_dataframe(
    df_source=df,
    staging_path=bq_staging_path,
    display_name=dataset_display_name,
)
create_dataset_task.wait()

Your DataFrame has 768 rows and AutoML requires 1000 rows to train on tabular data. You can still train a custom model once your dataset has been uploaded to Vertex, but you will not be able to use AutoML for training.
Creating TabularDataset
Create TabularDataset backing LRO: projects/268076997885/locations/europe-west1/datasets/7948607051704303616/operations/1083161889870970880
TabularDataset created. Resource name: projects/268076997885/locations/europe-west1/datasets/7948607051704303616
To use this TabularDataset in another session:
ds = aiplatform.TabularDataset('projects/268076997885/locations/europe-west1/datasets/7948607051704303616')
