In [None]:
# 
# Build pCVR using GAP bigquery and Tensorflow DNN library 
# Used calibration period for training & holdout period for validation
# Script can be run on Compute Engine of GCP
# The query to extract GA360 features referred to an Auto case by Yiling Liu(yilliu@) 
# 
# By JeeWook Kim
#

<h1> Install libraries

In [1]:
%%bash
pip install --upgrade pip
pip install --upgrade google-api-python-client
pip install --upgrade gcloud
pip install tensorflow
pip install --ignore-installed --upgrade tensorflow==1.9.0

Requirement already up-to-date: pip in /usr/local/lib/python2.7/dist-packages (18.0)
Requirement already up-to-date: google-api-python-client in /usr/local/lib/python2.7/dist-packages (1.7.4)
Requirement already up-to-date: gcloud in /usr/local/lib/python2.7/dist-packages (0.18.3)
Collecting tensorflow==1.9.0
  Using cached https://files.pythonhosted.org/packages/37/ff/97d4542f805ae25bf4b65b6263515584c78bd9a6111ed78ea971eff2946a/tensorflow-1.9.0-cp27-cp27mu-manylinux1_x86_64.whl
Collecting mock>=2.0.0 (from tensorflow==1.9.0)
  Using cached https://files.pythonhosted.org/packages/e6/35/f187bdf23be87092bd0f1200d43d23076cee4d0dec109f195173fd3ebc79/mock-2.0.0-py2.py3-none-any.whl
Collecting grpcio>=1.8.6 (from tensorflow==1.9.0)
  Using cached https://files.pythonhosted.org/packages/ea/96/234eaae211b6b34739018735be33e1616bc3e73c62d842c5189b80a4bd72/grpcio-1.14.0-cp27-cp27mu-manylinux1_x86_64.whl
Collecting termcolor>=1.1.0 (from tensorflow==1.9.0)
Collecting numpy>=1.13.3 (from tensorflow

google-cloud 0.27.0 has requirement google-cloud-core<0.27dev,>=0.26.0, but you'll have google-cloud-core 0.27.1 which is incompatible.
google-cloud 0.27.0 has requirement google-cloud-storage<1.4dev,>=1.3.0, but you'll have google-cloud-storage 1.4.0 which is incompatible.
google-cloud-vision 0.26.0 has requirement google-cloud-core<0.27dev,>=0.26.0, but you'll have google-cloud-core 0.27.1 which is incompatible.
google-cloud-vision 0.26.0 has requirement google-gax<0.16dev,>=0.15.13, but you'll have google-gax 0.12.5 which is incompatible.
google-cloud-logging 1.2.0 has requirement google-cloud-core<0.27dev,>=0.26.0, but you'll have google-cloud-core 0.27.1 which is incompatible.
google-cloud-speech 0.28.0 has requirement google-cloud-core<0.27dev,>=0.26.0, but you'll have google-cloud-core 0.27.1 which is incompatible.
google-cloud-speech 0.28.0 has requirement google-gax<0.16dev,>=0.15.13, but you'll have google-gax 0.12.5 which is incompatible.
google-cloud-videointelligence 0.25.

<h1> Import libraries

In [2]:
import tensorflow as tf
print(tf.__version__)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import urllib
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta

1.9.0


  from . import _ni_label
  from . import _ni_label


<h1> Prepare dataset using GA360

In [3]:
today = datetime.date.today().strftime("%Y%m%d")
begin_date = '20160801'
end_date = '20170801'
calibration_end_date = '20170201'
# Googel Store demo table
gap_table = 'bigquery-public-data.google_analytics_sample.ga_sessions_*'

print('# today: {}'.format(today))
print('# begin_date: {}'.format(begin_date))
print('# end_date: {}'.format(end_date))
print('# calibration_end_date: {}'.format(calibration_end_date))
# query to retrieve GAP exported BigQuery 
sql_train = """ 
          WITH ga_raw AS ( -- hit level data
          SELECT
            date,
            fullVisitorId,
            channelGrouping,
            socialEngagementType,
            visitId,
            visitNumber,
            trafficSource.source,
            trafficSource.medium,
            device.deviceCategory,
            device.browser,
            hits.hitNumber,
            hits.page.pagePath as pagePath,
            hits.type as type,
            hits.eventInfo.eventCategory as eventCategory,
            hits.eventInfo.eventAction as eventAction,
            hits.eventInfo.eventLabel as eventLabel,
            TIMESTAMP_SECONDS(visitStartTime) AS sessionStartTtime, 
            TIMESTAMP_ADD(TIMESTAMP_SECONDS(visitStartTime), INTERVAL hits.time MILLISECOND) AS hitTime,
            CASE WHEN hits.hour IN (5,6,7,8,9,10) THEN 1 ELSE 0 END AS morningVisit,
            CASE WHEN hits.hour IN (11,12,13,14,15,16) THEN 1 ELSE 0 END AS dayVisit,
            CASE WHEN hits.hour IN (17,18,19,20,21,22) THEN 1 ELSE 0 END AS eveningVisit,
            totals.timeOnSite AS timeOnSite,
            totals.bounces AS bounceNumber,
            totals.timeOnScreen	AS timeOnScreen,
            totals.transactions AS transactions,
            IF(REGEXP_CONTAINS(trafficSource.source , 'google') AND REGEXP_CONTAINS(trafficSource.medium , 'cpc'), 1, 0) AS fromPaidSearch,
            IF(REGEXP_CONTAINS(trafficSource.source , 'google') AND REGEXP_CONTAINS(trafficSource.medium , 'organic'), 1, 0) AS fromOrganicSearch
          FROM `"""+gap_table+"""`, unnest(hits) as hits
          WHERE
           (_TABLE_SUFFIX >= '"""+begin_date+"""' AND _TABLE_SUFFIX <= '"""+calibration_end_date+"""') ),
       
        session AS ( -- aggregate hit level to session level
          SELECT fullVisitorId, visitId,
            SUM(if (timeOnSite is null, 0, timeOnSite)) AS timeOnSite,
            SUM(if (bounceNumber is null, 0, bounceNumber)) AS bounceNumber,
            SUM( dayVisit ) AS dayVisit,
            SUM( eveningVisit ) AS eveningVisit ,
            SUM( morningVisit ) AS morningVisit ,
            ANY_VALUE(visitNumber) as visitNumber,

            SUM(if( type = 'PAGE', 1, 0)) as totalPageViews, 
            SUM(if( type = 'EVENT', 1, 0)) as totalEvents, 

            SUM(if(pagePath LIKE '%/apparel%', 1, 0)) as apparelViews,
            SUM(if(pagePath LIKE '%/bags%', 1, 0)) as bagsViews,
            SUM(if(pagePath LIKE '%/drinkware%', 1, 0)) as drinkwareViews,
            SUM(if(pagePath LIKE '%/accessories%', 1, 0)) as accessoriesViews,
            SUM(if(pagePath LIKE '%/office%', 1, 0)) as officeViews,

            IF(SUM(fromPaidSearch) != 0, 1, 0) AS  fromPaidSearch,
            IF(SUM(fromOrganicSearch) != 0, 1, 0) AS  fromOrganicSearch,

            IF(SUM(transactions) is null, False, True) AS hasConverted,

            COUNT(*) as totalInteractions    
          FROM ga_raw
          GROUP BY fullVisitorId , visitId ),
  
       ml_dataset AS ( -- aggregate seesion level data to user level

          SELECT 
            fullVisitorId, 
            MAX(visitNumber) as totalSessions,
            SUM(totalPageViews) as totalPageViews, 
            SUM(totalInteractions) as totalInteractions, 

            SUM(timeOnSite) AS timeOnSite,
            SUM(bounceNumber) AS bounceNumber,
            SUM( dayVisit ) AS dayVisit,
            SUM( eveningVisit ) AS eveningVisit ,
            SUM( morningVisit ) AS morningVisit ,

            SUM(totalEvents) as totalEvents, 

            SUM(apparelViews) as apparelViews,
            SUM(bagsViews) as bagsViews,
            SUM(drinkwareViews)  as drinkwareViews,
            SUM(accessoriesViews) as accessoriesViews,
            SUM(officeViews) as officeViews,

            SUM(fromPaidSearch) AS fromPaidSearch,
            SUM(fromOrganicSearch) AS fromOrganicSearch,
            ANY_VALUE(hasConverted) AS hasConverted

          FROM session
          GROUP BY fullVisitorId)

        select * from ml_dataset; 
"""

print ('# BigQuery SQL - train data')  
print (sql_train)
    
sql_test = """ 
         WITH ga_raw AS (
          SELECT
            date,
            fullVisitorId,
            channelGrouping,
            socialEngagementType,
            visitId,
            visitNumber,
            trafficSource.source,
            trafficSource.medium,
            device.deviceCategory,
            device.browser,
            hits.hitNumber,
            hits.page.pagePath as pagePath,
            hits.type as type,
            hits.eventInfo.eventCategory as eventCategory,
            hits.eventInfo.eventAction as eventAction,
            hits.eventInfo.eventLabel as eventLabel,
            TIMESTAMP_SECONDS(visitStartTime) AS sessionStartTtime, 
            TIMESTAMP_ADD(TIMESTAMP_SECONDS(visitStartTime), INTERVAL hits.time MILLISECOND) AS hitTime,
            CASE WHEN hits.hour IN (5,6,7,8,9,10) THEN 1 ELSE 0 END AS morningVisit,
            CASE WHEN hits.hour IN (11,12,13,14,15,16) THEN 1 ELSE 0 END AS dayVisit,
            CASE WHEN hits.hour IN (17,18,19,20,21,22) THEN 1 ELSE 0 END AS eveningVisit,
            totals.timeOnSite AS timeOnSite,
            totals.bounces AS bounceNumber,
            totals.timeOnScreen	AS timeOnScreen,
            totals.transactions AS transactions,
            IF(REGEXP_CONTAINS(trafficSource.source , 'google') AND REGEXP_CONTAINS(trafficSource.medium , 'cpc'), 1, 0) AS fromPaidSearch,
            IF(REGEXP_CONTAINS(trafficSource.source , 'google') AND REGEXP_CONTAINS(trafficSource.medium , 'organic'), 1, 0) AS fromOrganicSearch
          FROM `"""+gap_table+"""`, unnest(hits) as hits
          WHERE
           (_TABLE_SUFFIX > '"""+calibration_end_date+"""' AND _TABLE_SUFFIX <= '"""+end_date+"""') ),
       
        session AS ( -- aggregate hit level to session level
          SELECT fullVisitorId, visitId,
            SUM(if (timeOnSite is null, 0, timeOnSite)) AS timeOnSite,
            SUM(if (bounceNumber is null, 0, bounceNumber)) AS bounceNumber,
            SUM( dayVisit ) AS dayVisit,
            SUM( eveningVisit ) AS eveningVisit ,
            SUM( morningVisit ) AS morningVisit ,
            ANY_VALUE(visitNumber) as visitNumber,

            SUM(if( type = 'PAGE', 1, 0)) as totalPageViews, 
            SUM(if( type = 'EVENT', 1, 0)) as totalEvents, 

            SUM(if(pagePath LIKE '%/apparel%', 1, 0)) as apparelViews,
            SUM(if(pagePath LIKE '%/bags%', 1, 0)) as bagsViews,
            SUM(if(pagePath LIKE '%/drinkware%', 1, 0)) as drinkwareViews,
            SUM(if(pagePath LIKE '%/accessories%', 1, 0)) as accessoriesViews,
            SUM(if(pagePath LIKE '%/office%', 1, 0)) as officeViews,

            IF(SUM(fromPaidSearch) != 0, 1, 0) AS  fromPaidSearch,
            IF(SUM(fromOrganicSearch) != 0, 1, 0) AS  fromOrganicSearch,

            IF(SUM(transactions) is null, False, True) AS hasConverted,

            COUNT(*) as totalInteractions    
          FROM ga_raw
          GROUP BY fullVisitorId , visitId ),
  
       ml_dataset AS ( -- aggregate seesion level data to user level

          SELECT 
            fullVisitorId, 
            MAX(visitNumber) as totalSessions,
            SUM(totalPageViews) as totalPageViews, 
            SUM(totalInteractions) as totalInteractions, 

            SUM(timeOnSite) AS timeOnSite,
            SUM(bounceNumber) AS bounceNumber,
            SUM( dayVisit ) AS dayVisit,
            SUM( eveningVisit ) AS eveningVisit ,
            SUM( morningVisit ) AS morningVisit ,

            SUM(totalEvents) as totalEvents, 

            SUM(apparelViews) as apparelViews,
            SUM(bagsViews) as bagsViews,
            SUM(drinkwareViews)  as drinkwareViews,
            SUM(accessoriesViews) as accessoriesViews,
            SUM(officeViews) as officeViews,

            SUM(fromPaidSearch) AS fromPaidSearch,
            SUM(fromOrganicSearch) AS fromOrganicSearch,
            ANY_VALUE(hasConverted) AS hasConverted

          FROM session
          GROUP BY fullVisitorId)

        select * from ml_dataset; 
"""

print ('# BigQuery SQL - test data')  
print (sql_test)
    
# execute the query using datalab lib
import google.datalab.bigquery as bq
# Pandas lib to handle table data
import pandas as pd
transaction_query = bq.Query(sql_train)
query_result = transaction_query.execute()
query_data = query_result.result().to_dataframe()
    
print('# query_data # of converted users {}'.format(query_data[query_data['hasConverted']==True].shape[0]))
print(query_data[query_data['hasConverted']==True].head(20))
print('# query_data # of not converted users {}'.format(query_data[query_data['hasConverted']==False].shape[0]))
print(query_data[query_data['hasConverted']==False].head(20))
    
training_set_data = query_data.as_matrix(columns=["totalPageViews", "totalInteractions", "timeOnSite","bounceNumber","dayVisit","eveningVisit","morningVisit","totalEvents","apparelViews","bagsViews","drinkwareViews","accessoriesViews","officeViews","fromPaidSearch","fromOrganicSearch"])
training_set_target = query_data.as_matrix(columns=["hasConverted"])
    
test_transaction_query = bq.Query(sql_test)
test_query_result = test_transaction_query.execute()
test_query_data = test_query_result.result().to_dataframe()
    
test_set_data = test_query_data.as_matrix(columns=["totalPageViews", "totalInteractions", "timeOnSite","bounceNumber","dayVisit","eveningVisit","morningVisit","totalEvents","apparelViews","bagsViews","drinkwareViews","accessoriesViews","officeViews","fromPaidSearch","fromOrganicSearch"])
test_set_target = test_query_data.as_matrix(columns=["hasConverted"])

# today: 20180806
# begin_date: 20160801
# end_date: 20170801
# calibration_end_date: 20170201
# BigQuery SQL - train data
 
          WITH ga_raw AS ( -- hit level data
          SELECT
            date,
            fullVisitorId,
            channelGrouping,
            socialEngagementType,
            visitId,
            visitNumber,
            trafficSource.source,
            trafficSource.medium,
            device.deviceCategory,
            device.browser,
            hits.hitNumber,
            hits.page.pagePath as pagePath,
            hits.type as type,
            hits.eventInfo.eventCategory as eventCategory,
            hits.eventInfo.eventAction as eventAction,
            hits.eventInfo.eventLabel as eventLabel,
            TIMESTAMP_SECONDS(visitStartTime) AS sessionStartTtime, 
            TIMESTAMP_ADD(TIMESTAMP_SECONDS(visitStartTime), INTERVAL hits.time MILLISECOND) AS hitTime,
            CASE WHEN hits.hour IN (5,6,7,8,9,10) THEN 1 ELSE 0 END AS morningVisit,

<h1> Select Model (Tensorflow DNN Classifier)

In [4]:
# Specify that all features have real-value data
feature_columns = [tf.feature_column.numeric_column("x", shape=[15])]

# Build 3 layer DNN with 256, 64, 32 units respectively.
classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                        hidden_units=[256, 64, 32],
                                        n_classes=2,
                                        model_dir="/tmp/pcvr_model")


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd02df69f90>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/pcvr_model', '_train_distribute': None, '_save_summary_steps': 100}


<h1> Train Model

In [5]:
# Define the training inputs
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(training_set_data)},
    y=np.array(training_set_target),
    num_epochs=None,
    shuffle=True)

# Train model.
classifier.train(input_fn=train_input_fn, steps=2000)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/pcvr_model/model.ckpt.
INFO:tensorflow:loss = 12.439629, step = 1
INFO:tensorflow:global_step/sec: 259.071
INFO:tensorflow:loss = 0.0015332294, step = 101 (0.388 sec)
INFO:tensorflow:global_step/sec: 349.458
INFO:tensorflow:loss = 2.0183288e-06, step = 201 (0.288 sec)
INFO:tensorflow:global_step/sec: 297.645
INFO:tensorflow:loss = 6.162442e-05, step = 301 (0.336 sec)
INFO:tensorflow:global_step/sec: 321.222
INFO:tensorflow:loss = 8.220364e-09, step = 401 (0.310 sec)
INFO:tensorflow:global_step/sec: 429.188
INFO:tensorflow:loss = 8.0968616e-19, step = 501 (0.235 sec)
INFO:tensorflow:global_step/sec: 376.054
INFO:tensorflow:loss = 1.1697067e-05, step = 601 (0.265 sec)
INFO:tensorflow:global_step/sec: 451.

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7fd042db3f90>

<h1>Test and Evaluate Model

In [6]:
# Define the test inputs
test_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": np.array(test_set_data)},
    y=np.array(test_set_target),
    num_epochs=1,
    shuffle=False)

# Evaluate accuracy.
accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]

print("\nTest Accuracy: {0:f}\n".format(accuracy_score))



INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-06-08:00:56
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/pcvr_model/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-06-08:01:10
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.9909073, accuracy_baseline = 0.9909073, auc = 0.8131782, auc_precision_recall = 0.51198494, average_loss = 0.0953363, global_step = 2000, label/mean = 0.009092706, loss = 12.201024, precision = 0.0, prediction/mean = 0.07411259, recall = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2000: /tmp/pcvr_model/model.ckpt-2000

Test Accuracy: 0.990907



<h1> Predict

In [7]:
new_input= test_query_data[query_data['hasConverted']==False].as_matrix(columns=["totalPageViews", "totalInteractions", "timeOnSite","bounceNumber","dayVisit","eveningVisit","morningVisit","totalEvents","apparelViews","bagsViews","drinkwareViews","accessoriesViews","officeViews","fromPaidSearch","fromOrganicSearch"])
new_input_key = test_query_data[query_data['hasConverted']==False].as_matrix(columns=["fullVisitorId"])
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": new_input},
    num_epochs=1,
    shuffle=False)

predictions = list(classifier.predict(input_fn=predict_input_fn))

  if __name__ == '__main__':
  from ipykernel import kernelapp as app


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/pcvr_model/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [12]:
p_array = np.empty((0, 1))
for prob in predictions:
  probability = round(prob['probabilities'][1],4)
  new_row = np.array([[probability]])
  p_array = np.vstack((p_array, new_row))

print(p_array.shape)
print(new_input_key.shape)
df = pd.DataFrame(np.hstack((new_input_key, p_array)), columns=['fullVisitorId','probability'])
df.head(100)
# df.sort_values('probability', ascending=False).head(50)


(311993, 1)
(311993, 1)


Unnamed: 0,fullVisitorId,probability
0,5900472703847352092,0
1,0424668908545837304,0
2,5484016666945224510,0
3,6719182032425610749,0
4,6343719132544553875,0
5,5451524334943728267,0
6,0879691200468622129,0
7,5989534016227542303,0
8,1511382723683036364,0
9,3711936160516700696,0
