In [1]:
!gcloud services enable ml.googleapis.com
!gcloud services enable compute.googleapis.com
# !pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper

We initially query the dataset to view all the fields and decide which fields are useful and what kind of prediction we can make.

Of the 23 fields, we decided to cut the dataset down significantly since many fields did not have complete data, and others share the same information (i.e. community area and the latitude/longitude fields).

In [2]:
import bq_helper
from bq_helper import BigQueryHelper


#Displays a table with all the labels
chicago_taxi = bq_helper.BigQueryHelper(active_project="bigquery-public-data", dataset_name="chicago_taxi_trips")
bq_assistant = BigQueryHelper("bigquery-public-data", "chicago_taxi_trips")
bq_assistant.list_tables()
bq_assistant.head("taxi_trips", num_rows=3)
bq_assistant.table_schema("taxi_trips")

Unnamed: 0,name,type,mode,description
0,unique_key,STRING,REQUIRED,Unique identifier for the trip.
1,taxi_id,STRING,REQUIRED,A unique identifier for the taxi.
2,trip_start_timestamp,TIMESTAMP,NULLABLE,"When the trip started, rounded to the nearest ..."
3,trip_end_timestamp,TIMESTAMP,NULLABLE,"When the trip ended, rounded to the nearest 15..."
4,trip_seconds,INTEGER,NULLABLE,Time of the trip in seconds.
5,trip_miles,FLOAT,NULLABLE,Distance of the trip in miles.
6,pickup_census_tract,INTEGER,NULLABLE,The Census Tract where the trip began. For pri...
7,dropoff_census_tract,INTEGER,NULLABLE,The Census Tract where the trip ended. For pri...
8,pickup_community_area,INTEGER,NULLABLE,The Community Area where the trip began.
9,dropoff_community_area,INTEGER,NULLABLE,The Community Area where the trip ended.


After deciding which fields were useful and the use-case of our model, we ran the query to collect and pre-processed the data:

In [3]:
from google.cloud import bigquery
client = bigquery.Client(project='ml-sandbox-1-191918')


dataset_id = 'chicagotaxi'
job_config = bigquery.QueryJobConfig()

client.delete_table('ml-sandbox-1-191918.chicagotaxi.chicago_taxi_processed', not_found_ok=True)
table_ref = client.dataset(dataset_id).table('chicago_taxi_processed')
job_config.destination = table_ref


query = '''SELECT
  IF(payment_type='Cash',1,0) cash,
  EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS day_of_week,
  (((EXTRACT(HOUR from trip_start_timestamp)*3600)+(EXTRACT(MINUTE from trip_start_timestamp)*60)+(EXTRACT(SECOND from trip_start_timestamp)))/86400) as start_time,
  (((EXTRACT(HOUR from trip_end_timestamp)*3600)+(EXTRACT(MINUTE from trip_end_timestamp)*60)+(EXTRACT(SECOND from trip_end_timestamp)))/86400) as end_time,
  EXTRACT(DAYOFYEAR FROM trip_start_timestamp) as day_of_year,
  EXTRACT(MONTH FROM trip_start_timestamp) as month,
  EXTRACT(YEAR FROM trip_start_timestamp) as year,
  trip_miles,
  (pickup_latitude - 41.660136051)/(42.021223593 - 41.660136051) AS standard_pickup_lat,
  (pickup_longitude + 87.913624596)/(-87.531386257 + 87.913624596) AS standard_pickup_long,
  (dropoff_latitude - 41.650221676)/(42.021223593 - 41.650221676 ) AS standard_dropoff_lat,
  (dropoff_longitude + 87.913624596)/(-87.531386257 + 87.913624596) AS standard_dropoff_long
FROM
  `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
  trip_miles > 0
  AND trip_seconds > 0
  AND fare > 0
  AND payment_type in ('Cash', 'Credit Card')
  AND trip_start_timestamp IS NOT NULL
  AND trip_end_timestamp IS NOT NULL
  AND trip_miles IS NOT NULL
  AND pickup_latitude IS NOT NULL
  AND pickup_longitude IS NOT NULL
  AND dropoff_latitude IS NOT NULL
  AND dropoff_longitude IS NOT NULL;
'''

query_job = client.query(query, location='US', job_config=job_config)

query_job.result()  # Waits for the query to finish
print('Query results loaded to table {}'.format(table_ref.path))


Query results loaded to table /projects/ml-sandbox-1-191918/datasets/chicagotaxi/tables/chicago_taxi_processed


The pickup and dropoff latitudes and longitudes were normalized using $\frac{x-x_{min}}{x_{max}-x_{min}}$. These minimum and maximum values were found using the following query:

In [4]:
%%bigquery
SELECT
  MIN(pickup_latitude) as min_pick_lat,
  MAX(pickup_latitude) as max_pick_lat,
  MIN(pickup_longitude) as min_pick_lon,
  MAX(pickup_longitude) as max_pick_lon,
  MIN(dropoff_latitude) as min_drop_lat,
  MAX(dropoff_latitude) as max_drop_lat,
  MIN(dropoff_longitude) as min_drop_lon,
  MAX(dropoff_longitude) as max_drop_lon
FROM
  `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
  trip_miles > 0
  AND trip_seconds > 0
  AND fare > 0
  AND payment_type in ('Cash', 'Credit Card')
  AND trip_start_timestamp IS NOT NULL
  AND trip_end_timestamp IS NOT NULL
  AND trip_miles IS NOT NULL
  AND pickup_latitude IS NOT NULL
  AND pickup_longitude IS NOT NULL
  AND dropoff_latitude IS NOT NULL
  AND dropoff_longitude IS NOT NULL;

Unnamed: 0,min_pick_lat,max_pick_lat,min_pick_lon,max_pick_lon,min_drop_lat,max_drop_lat,min_drop_lon,max_drop_lon
0,41.660136,42.021224,-87.913625,-87.531386,41.650222,42.021224,-87.913625,-87.531386


After pre-processing the data, we ran matrix correlation to check if our problem could be solved simply. We found that there was no direct correlation between payment type and any of the other fields, so we moved on to using a Linear ML Classifier. 

In [5]:
%%bigquery
SELECT
  CORR(cash,
    trip_miles) AS trip_miles_corr,
  CORR(cash,
    standard_pickup_lat) AS pickup_latitude_corr,
  CORR(cash,
    standard_pickup_long) AS pickup_longitude_corr,
  CORR(cash,
    standard_dropoff_lat) AS dropoff_latitude_corr,
  CORR(cash,
    standard_dropoff_long) AS dropoff_longitude_corr,
  CORR(cash,
    start_time) AS dropoff_time_corr,
  CORR(cash,
    year) AS dropoff_year_corr,
  CORR(cash,
    month) AS month_corr,
  CORR(cash,
    day_of_year) AS day_corr,
  CORR(cash,
    day_of_week) AS weekday_corr
FROM
  `ml-sandbox-1-191918.chicagotaxi.chicago_taxi_processed`

Unnamed: 0,trip_miles_corr,pickup_latitude_corr,pickup_longitude_corr,dropoff_latitude_corr,dropoff_longitude_corr,dropoff_time_corr,dropoff_year_corr,month_corr,day_corr,weekday_corr
0,-0.068182,0.011734,0.116265,0.001082,0.057388,-0.010678,-0.141403,0.002425,0.002589,0.022717


We then send the processed BigQuery table to a Google Cloud Storage bucket, where it can be accessed by our model for training:

In [6]:
from google.cloud import bigquery
client = bigquery.Client()
bucket_name = 'chicago-taxi-data-processed'
project = 'ml-sandbox-1-191918'
dataset_id = 'chicagotaxi'
table_id = 'final_taxi_standardized'
destination_uri = 'gs://{}/{}'.format(bucket_name, 'chicago-taxi-*.csv')
dataset_ref = client.dataset(dataset_id, project=project)
table_ref = dataset_ref.table(table_id)

job_config = bigquery.job.ExtractJobConfig(print_header=False)

extract_job = client.extract_table(
    table_ref,
    destination_uri,
    # Location must match that of the source table.
    location='US',
    job_config=job_config)  # API request

extract_job.result()  # Waits for job to complete.

print('Exported {}:{}.{} to {}'.format(
    project, dataset_id, table_id, destination_uri))

Exported ml-sandbox-1-191918:chicagotaxi.final_taxi_standardized to gs://chicago-taxi-data-processed/chicago-taxi-*.csv


Combine files into one for training:

In [7]:
!gsutil compose gs://chicago-taxi-data-processed/chicago-taxi-* gs://chicago-taxi-data-processed/processed-chicago-taxi.csv

Composing gs://chicago-taxi-data-processed/processed-chicago-taxi.csv from 24 component object(s).


In [8]:
%%writefile config.yaml
trainingInput:
  scaleTier: CUSTOM
  masterType: large_model_v100
  args:
    - "--preprocess"
    - "--training_data_path=gs://chicago-taxi-data-processed/half_finaltaxi_encoded.csv"
    - "--validation_split=0.2"
    - "--test_split=0.1"
    - "--model_type=classification"
    - "--max_steps=10000000"
    - "--learning_rate=0.0002"
    - "--eval_steps=1000"
    - "--batch_size=1"
    - "--eval_frequency_secs=100"
    - "--optimizer_type=ftrl"
  region: us-central1
  jobDir: gs://chicago-taxi-data-processed
  masterConfig:
    imageUri: gcr.io/cloud-ml-algos/linear_learner_gpu:latest

Overwriting config.yaml


In [9]:
CONFIG = 'config.yaml'

Execute trainining job:

In [10]:
from random import randrange
JOB_NAME = "tuesday_taxi_" + str(randrange(100000,999999))
BUCKET_NAME = "chicago-taxi-data-processed/" 

In [11]:
# Submit the training job:
!gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir gs://$BUCKET_NAME/tuesday_taxi_2 \
  --package-path ./tuesday_taxi_2 \
  --module-name tuesday_taxi_2/processed_data/training.csv \
  --region us-central1 \
  --config $CONFIG \
  --python-version=3.5 \
  -- \
  --bucket-name $BUCKET_NAME

Job [tuesday_taxi_435866] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe tuesday_taxi_435866

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs tuesday_taxi_435866
jobId: tuesday_taxi_435866
state: QUEUED


Generate a prediction:

In [12]:
DATA_FORMAT="text" # JSON data format
INPUT_PATHS='data.json'
OUTPUT_PATH='gs://$BUCKET_NAME/'
MODEL_NAME = "taxi_checkpoint_1"
VERSION_NAME = "taxi_checkpoint_1"
REGION='us-east1'
#now=$(date +"%Y%m%d_%H%M%S")
#JOB_NAME = "friday_training_{}".format(int(time.time()))
MAX_WORKER_COUNT="20"

In [13]:
%%writefile $INPUT_FILE
{"csv_row":"1,0.135416667,0.135416667,343,12,2018,0.17,41.90602597,-87.67531162,41.90602597,-87.67531162","key" : "dummy-key"} #class 1
{"csv_row":"4,0.90625,0.916666667,58,2,2013,1.7,41.94982935,-87.64396537,41.92907766,-87.64629348","key" : "dummy-key"} #class 0

Overwriting $INPUT_FILE


In [14]:
INPUT_FILE="data.json"

!gcloud ai-platform predict --model $MODEL_NAME --version \
  $VERSION_NAME --json-instances $INPUT_FILE

CLASS_IDS  CLASSES  KEY             LOGISTIC              LOGITS                 PROBABILITIES
[1]        [u'1']   [u'dummy-key']  [0.5945031642913818]  [0.38261300325393677]  [0.40549683570861816, 0.5945031642913818]
[1]        [u'1']   [u'dummy-key']  [0.6245089769363403]  [0.5087311267852783]   [0.37549105286598206, 0.6245089769363403]
