In [9]:
import json
import subprocess
import time
import urllib.request

# Make app directory, which will hold the training code, __init__.py and setup.py (as required by python)
!mkdir ./census_training_app
# Make a blank __init__.py (required, but blank because we do not need to import any classes or do other initialization actions in this case)
!touch ./census_training_app/__init__.py


#!gsutil cp ./__init__.py gs://ml-demo-rw/CAIP_train_demo/


mkdir: cannot create directory ‘./census_training_app’: File exists


In [4]:

%%writefile ./census_training_app/train.py
## Train model and upload model to Cloud Storage
## Also, write this code to a file in our app directory
import argparse
import pickle
import pandas as pd

from google.cloud import storage

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer

# Read the destination bucket from the command line. parse_known_args() is used
# so that any additional, unrecognised arguments are ignored instead of rejected.
cli = argparse.ArgumentParser()
cli.add_argument("--bucket-name", help="The bucket name", required=True)

known_args, _unrecognised = cli.parse_known_args()
bucket_name = known_args.bucket_name

# Schema of the census data files: every column in file order, including the
# ones that are not used as features. The final entry is the prediction target.
COLUMNS = (
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income-level",
)

# Subset of COLUMNS holding string categories; these must be converted to
# numerical values before scikit-learn can use them.
CATEGORICAL_COLUMNS = (
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
)

# Cloud Storage client; reused further down to upload the trained model.
storage_client = storage.Client()

# Fetch the census training file from the public samples bucket into the
# current working directory.
census_blob = storage_client.bucket('cloud-samples-data').blob(
    'ml-engine/sklearn/census_data/adult.data'
)
census_blob.download_to_filename('adult.data')

# Read the downloaded census file into a DataFrame; the file has no header row,
# so the column names come from COLUMNS.
with open("./adult.data", "r") as census_file:
    raw_training_data = pd.read_csv(census_file, header=None, names=COLUMNS)

# Strip surrounding whitespace from every categorical value.
for categorical_name in CATEGORICAL_COLUMNS:
    raw_training_data[categorical_name] = raw_training_data[categorical_name].apply(
        lambda value: str(value).strip()
    )

# Features: every column except the target ('income-level'), as a list of rows.
feature_frame = raw_training_data.drop("income-level", axis=1)
train_features = feature_frame.values.tolist()

# Labels: a flat list of booleans, True where the income level is " >50K".
# The target column is not stripped above, hence the leading space in the literal.
is_high_income = raw_training_data["income-level"] == " >50K"
train_labels = is_high_income.values.tolist()

# The census data contains categorical features that must become numerical.
# One small pipeline is built per categorical column; FeatureUnion later
# combines them before the RandomForestClassifier is applied.
categorical_pipelines = []

# For each categorical column:
#   1. SelectKBest(k=1) is given a hand-made scores array with a single 1 at
#      the column's position, so it extracts exactly that column.
#      Example, for 'workclass' (position 1):
#        row    = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married',
#                  'Adm-clerical', 'Not-in-family', 'White', 'Male', 2174, 0,
#                  40, 'United-States']
#        scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
#        result = [['State-gov']]
#   2. A LabelBinarizer, fitted on the extracted column, converts the
#      categorical value to a numerical one.
feature_names = COLUMNS[:-1]
for position, feature_name in enumerate(feature_names):
    if feature_name not in CATEGORICAL_COLUMNS:
        continue

    # Scores array that singles out the current column.
    selector = SelectKBest(k=1)
    selector.scores_ = [int(index == position) for index in range(len(feature_names))]

    # Fit the binarizer on the values of the extracted column.
    binarizer = LabelBinarizer()
    binarizer.fit(selector.transform(train_features))

    # Chain extraction and conversion, then register the pipeline.
    column_steps = [
        ('SKB-{}'.format(position), selector),
        ('LBN-{}'.format(position), binarizer),
    ]
    categorical_pipelines.append(
        ('categorical-{}'.format(position), Pipeline(column_steps))
    )

# Create pipeline to extract the numerical features.
# The selection mask is derived from COLUMNS / CATEGORICAL_COLUMNS rather than
# hard-coded, so it cannot drift out of sync with the schema: every feature
# column that is not categorical scores 1 and SelectKBest keeps exactly those.
numerical_mask = [
    0 if name in CATEGORICAL_COLUMNS else 1 for name in COLUMNS[:-1]
]
skb = SelectKBest(k=sum(numerical_mask))
skb.scores_ = numerical_mask
categorical_pipelines.append(("numerical", skb))

# Combine all the features using FeatureUnion
preprocess = FeatureUnion(categorical_pipelines)

# Create the classifier
classifier = RandomForestClassifier()

# Transform the features and fit the classifier on the transformed features
classifier.fit(preprocess.transform(train_features), train_labels)

# Create the overall model as a single pipeline (preprocessing + classifier)
pipeline = Pipeline([("union", preprocess), ("classifier", classifier)])

# Create the model file
# It is required to name the model file "model.pkl" if you are using pickle
model_filename = "model.pkl"
with open(model_filename, "wb") as model_file:
    pickle.dump(pipeline, model_file)

# Upload the model to Cloud Storage.
# --bucket-name may be a bare bucket ("my-bucket") or a bucket followed by an
# object prefix ("my-bucket/some/dir", optionally starting with "gs://").
# Bucket names cannot contain "/", so everything after the first "/" is used
# as the object prefix instead of being passed to storage_client.bucket().
destination = bucket_name
if destination.startswith("gs://"):
    destination = destination[len("gs://"):]
bucket_id, _, object_prefix = destination.strip("/").partition("/")
blob_name = "/".join(
    part for part in (object_prefix.strip("/"), model_filename) if part
)

bucket = storage_client.bucket(bucket_id)
blob = bucket.blob(blob_name)
blob.upload_from_filename(model_filename)

Writing ./census_training_app/train.py


In [10]:

# Define a timestamped job name
JOB_NAME = "census_training_{}".format(int(time.time()))

# Submit the training job
# AI Platform will automatically create a setup.py file that will install all of the packages you name in your import statements from pipy
!gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir gs://ml-demo-rw/CAIP_train_demo/jobdir \
  --package-path ./census_training_app \
  --module-name train.py \
  --region us-central1 \
  --runtime-version=1.12 \
  --python-version=3.5 \
  --scale-tier BASIC \
  --stream-logs \
  -- \
  --bucket-name ml-demo-rw/CAIP_train_demo/saved_model

Job [census_training_1594237895] submitted successfully.
INFO	2020-07-08 19:51:36 +0000	service		Validating job requirements...
INFO	2020-07-08 19:51:37 +0000	service		Job creation request has been successfully validated.
INFO	2020-07-08 19:51:37 +0000	service		Waiting for job to be provisioned.
INFO	2020-07-08 19:51:37 +0000	service		Job census_training_1594237895 is queued.
INFO	2020-07-08 19:51:39 +0000	service		Waiting for training program to start.
INFO	2020-07-08 19:52:41 +0000	master-replica-0		Running task with arguments: --cluster={"master": ["127.0.0.1:2222"]} --task={"type": "master", "index": 0} --job={  "package_uris": ["gs://ml-demo-rw/CAIP_train_demo/jobdir/packages/d57090bfc3fff4e012f2e5245592d564018058600e748559aaf20d3d58b0ebf5/census_training_app-0.0.0.tar.gz"],  "python_module": "train.py",  "args": ["--bucket-name", "ml-demo-rw/CAIP_train_demo/saved_model"],  "region": "us-central1",  "runtime_version": "1.12",  "job_dir": "gs://ml-demo-rw/CAIP_train_demo/jobdir",  

In [None]:
Your model is now all packaged up, dependencies included... but maybe you have been coding in AI Platform Notebooks.
Maybe you haven't gone through the process of packaging your app, which can be really annoying. In that case, use scheduled notebooks instead.

In [5]:
# Name of the AI Platform model, and a version name made unique by appending
# the current Unix timestamp in whole seconds.
MODEL_NAME = "CensusPredictor"
VERSION_NAME = "census_predictor_" + str(int(time.time()))

In [6]:
## Create the model
!gcloud ai-platform models create $MODEL_NAME --regions us-central1

Created ml engine model [projects/remy-sandbox/models/CensusPredictor].


To take a quick anonymous survey, run:
  $ gcloud survey



In [7]:
## Deploy the model
!gcloud ai-platform versions create $VERSION_NAME \
  --model=$MODEL_NAME \
  --framework=scikit-learn \
  --origin=gs://ml-demo-rw/CAIP_train_demo/ \
  --python-version=3.5 \
  --runtime-version=1.12

[1;31mERROR:[0m (gcloud.ml-engine.versions.create) FAILED_PRECONDITION: Field: version.deployment_uri Error: Deployment directory gs://ml-demo-rw/CAIP_train_demo/ is expected to contain exactly one of: [model.pkl, model.joblib].
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: 'Deployment directory gs://ml-demo-rw/CAIP_train_demo/ is expected
      to contain exactly one of: [model.pkl, model.joblib].'
    field: version.deployment_uri


In [None]:
!PATCH https://ml.googleapis.com/v1/{name=projects/*/models/*/versions/*} \
   { "name" : "CensusPredictor", "requestLoggingConfig": { 
  "samplingPercentage": 100,
  "bigqueryTableName": "remy-sandbox.Mics.response_request_logs"
    }
   } 

In [None]:
# Enable BigQuery request-response logging on the deployed version by calling
# the AI Platform REST method projects.models.versions.patch from Python.
# Uses only the standard library plus the gcloud CLI for an access token.
PROJECT_ID = "remy-sandbox"
version_resource = "projects/{}/models/{}/versions/{}".format(
    PROJECT_ID, MODEL_NAME, VERSION_NAME
)

logging_patch = {
    "requestLoggingConfig": {
        # Fraction of requests to log, from 0 to 1 (1.0 logs every request).
        "samplingPercentage": 1.0,
        "bigqueryTableName": "remy-sandbox.Mics.response_request_logs",
    }
}

# Reuse the credentials of the active gcloud account.
access_token = (
    subprocess.check_output(["gcloud", "auth", "print-access-token"])
    .decode("utf-8")
    .strip()
)

patch_request = urllib.request.Request(
    # updateMask tells the API which field of the version is being updated.
    "https://ml.googleapis.com/v1/{}?updateMask=requestLoggingConfig".format(
        version_resource
    ),
    data=json.dumps(logging_patch).encode("utf-8"),
    headers={
        "Authorization": "Bearer {}".format(access_token),
        "Content-Type": "application/json",
    },
    method="PATCH",
)

# urlopen raises urllib.error.HTTPError on a non-2xx response; on success the
# API's response body is printed.
with urllib.request.urlopen(patch_request) as response:
    print(response.read().decode("utf-8"))

In [None]:
Now set up BigQuery request-response logging.