The first step is to import data from Parse.ly's AWS bucket for the relevant time period into Google BigQuery.  
The key and passphrase for this bucket are with rachit.kinger@jpimedia.co.uk. Additional keys and passphrases can be obtained by reaching out directly to Parse.ly.  
This data is best imported via GCP's GUI. Go to `GCP > Storage > Transfer` and the steps from there should be obvious.  
Remember to specify file filters to download data only for the relevant months.  

Once this data has been downloaded, follow these steps to import it into a single 'large' database. DO NOT use this database for analysis. Only around 30% of this dataset is useful for us. This database will have a column called `action` which mostly takes one of two values, `pageview` or `heartbeat`. The `pageview` rows are what is of interest to us, and they occupy only around 30% of the database.  



In [None]:
%%bash
# NOTE: `%%bash` has to be the first line of the cell; IPython only recognises
# a cell magic when nothing (not even a comment) precedes it.
#
# Create an empty table with the rawdata schema using the BigQuery `bq` CLI.
# The table is partitioned by day on `ts_action`, clustered on `apikey` and
# `action`, and every query against it must include a partition filter.
bq mk -t --schema /home/rachit/gdrive/GCP/bigquery_parsely/bqtable_from_cli/parsely_rawdata_schema.json \
--time_partitioning_type=DAY --time_partitioning_field ts_action \
--require_partition_filter=TRUE --clustering_fields='apikey,action' \
parsely.rawdata

In [None]:
%%bash
# NOTE: `%%bash` has to be the first line of the cell; IPython only recognises
# a cell magic when nothing (not even a comment) precedes it.
#
# Load the raw Parse.ly data (newline-delimited JSON files in the
# gs://parsely bucket) into the empty `parsely.rawdata` table using the
# BigQuery `bq` CLI. Up to 1000 bad records are tolerated and JSON fields that
# are not in the table schema are ignored.
bq load --source_format=NEWLINE_DELIMITED_JSON --max_bad_records=1000 --ignore_unknown_values parsely.rawdata gs://parsely/*

In [1]:
# Authenticate against GCP / BigQuery through an OAuth installed-app flow.
from google_auth_oauthlib import flow

# Toggle `launch_browser` depending on whether this machine has an Internet
# browser. When True, a local server is used as the callback URL in the auth
# flow (recommended). A local server does not work when accessing the
# application remotely, such as over SSH or from a remote Jupyter notebook;
# set it to False in that case.
launch_browser = True

# If you are not Rachit Kinger, build your own oauth client id and key and
# point the path below at your own client secret file.
appflow = flow.InstalledAppFlow.from_client_secrets_file(
    '/home/rachit/gdrive/GCP/oauth_client_key/data-team-rachit-kinger-desktop-apps.json',
    scopes=['https://www.googleapis.com/auth/bigquery'],
)

# Run the authorisation step that suits this environment.
if not launch_browser:
    appflow.run_console()
else:
    appflow.run_local_server()

# Credentials obtained by the flow; used to build the BigQuery client.
credentials = appflow.credentials

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=306244342965-pv1n0vhib5nv2ks66pnj59csdf8cqor0.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=AtIZlfeTclQVuvYzghB6Ym6dYJmFRB&access_type=offline


In [2]:
# Build the BigQuery client for the project, authenticated with the OAuth
# credentials obtained from the auth flow.
from google.cloud import bigquery

project_id = "bigquery-test-165213"
client = bigquery.Client(
    credentials=credentials,
    project=project_id,
)

In [5]:
# Create a new table that keeps only the pageview rows of the raw data
# (heartbeats and other actions are dropped) and only the columns needed
# downstream.
job_config = bigquery.QueryJobConfig()
# set destination table
table_ref = client.dataset('parsely').table('raw_data_for_jan_feb')
job_config.destination = table_ref

# The `ts_action` condition doubles as the partition filter on rawdata.
# Rows whose visitor_site_id is blank or "OPTOUT" are excluded from the
# analysis, so the filter must be NOT IN (it previously used IN, which kept
# only those rows and discarded every identifiable visitor).
sql = '''
SELECT
  apikey,
  event_id,
  flags_is_amp,
  ip_city,
  ip_lat,
  ip_lon,
  metadata_page_type,
  metadata_section,
  metadata_tags,
  metadata_title,
  ref_category,
  ref_domain,
  session,
  session_id,
  session_initial_referrer,
  ts_action,
  ua_device,
  ua_devicetype,
  url_clean,
  visitor,
  visitor_site_id,
  visitor_network_id
FROM
  `bigquery-test-165213.parsely.rawdata`
WHERE
  ts_action < TIMESTAMP("2019-03-01") --FOR Jan, Feb
  AND action = "pageview"
  AND visitor_site_id NOT IN ("", "OPTOUT")
'''

# run the query
query_job = client.query(
    sql,
    # location must match that of the dataset(s) referenced in the query
    # and that of the destination table
    location='EU',
    job_config=job_config)

query_job.result()  # waits for the query to finish
print("Query results loaded to table {}".format(table_ref.path))


Query results loaded to table /projects/bigquery-test-165213/datasets/parsely/tables/raw_data_for_jan_feb


The next set of steps imputes session_ids in order to determine how frequently users visit our sites. The main reason session ids are missing is that AMP pages do not return session data; however, Parse.ly does manage to assign a `visitor_site_id` to the visitors of AMP pages. To calculate the number of sessions a user has had, we assume that consecutive pageviews from the same visitor that are more than 30 minutes apart belong to different sessions. For example, pageviews at 10:00, 10:20 and 11:00 count as two sessions.  

In [9]:
# Build a separate table holding imputed session_ids for the pageviews whose
# session_id is NULL.
#
# How the query works, innermost sub-query first:
#   1. w_prev_ts     - for each visitor_site_id, fetch the previous pageview
#                      timestamp with LAG.
#   2. w_time_diff   - seconds elapsed since that previous pageview.
#   3. w_sess_change - 1 when the row is the visitor's first pageview or when
#                      more than 1800 seconds passed, otherwise 0.
#   4. outer SELECT  - running SUM of session_change per visitor, which becomes
#                      the imputed session_id.
# NOTE(review): imputed ids are counted only over the NULL-session rows of a
# visitor - confirm they cannot clash with that visitor's existing session_ids.
sql = '''
  /*
loyalty analysis
creating table 1
where session_id = NULL
where visitor_site_id != BLANK
This table will create another table which has the same schema as
rawdata_for_loyalty_analysis but will compute session_id for those users who have NULL as session_ids
this table will then be unioned with table 2 where session_id != NULL and visitor_site_id != BLANK

NOTE: All rows where visitor_site_id IN ("", "OPTOUT") will be removed for analysis
*/

SELECT
  * EXCEPT (ts_action, session_change,
    time_diff,
    previous_ts),
  SUM(session_change) OVER (PARTITION BY visitor_site_id ORDER BY ts_action ASC) AS session_id
FROM (
  SELECT
    *,
    CASE
      WHEN time_diff IS NULL THEN 1
      WHEN time_diff > 1800 THEN 1
      ELSE 0
    END AS session_change
  FROM (
    SELECT
      *,
      TIMESTAMP_DIFF(ts_action, previous_ts, SECOND) AS time_diff
    FROM (
      SELECT
        visitor_site_id,
        event_id,
        ts_action,
        LAG(ts_action) OVER (PARTITION BY visitor_site_id ORDER BY ts_action ASC) AS previous_ts
      FROM
        `bigquery-test-165213.parsely.raw_data_for_jan_feb`
      WHERE
        session_id IS NULL) AS w_prev_ts) AS w_time_diff) AS w_sess_change
'''

# Destination table that receives the query result.
table_ref = client.dataset('parsely').table('imputed_session_ids_for_null_sessions')
job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref

# Submit the query. The location must match that of the dataset(s) referenced
# in the query and that of the destination table.
query_job = client.query(sql, job_config=job_config, location='EU')

# Block until the query has finished, then report where the rows landed.
query_job.result()
print("Query results loaded to table {}".format(table_ref.path))

Query results loaded to table /projects/bigquery-test-165213/datasets/parsely/tables/imputed_session_ids_for_null_sessions


In [10]:
# Merge the imputed session ids back into raw_data_for_jan_feb and store the
# result in a new table. Rows that already carry a session_id keep it; rows
# with a NULL session_id take the imputed value matched on event_id.
sql = '''
  /*
Inserting imputed session_ids for null sessions back into
raw data
Using LEFT JOIN on rawdata with imputed rawdata*/

SELECT
  main.* EXCEPT(session_id),
  CASE
    WHEN main.session_id IS NULL THEN imputed.session_id
    ELSE main.session_id
  END AS session_id
FROM (
  SELECT
    *
  FROM
    `bigquery-test-165213.parsely.raw_data_for_jan_feb`) AS main
LEFT JOIN (
  SELECT
    event_id,
    session_id
  FROM
    `bigquery-test-165213.parsely.imputed_session_ids_for_null_sessions`) AS imputed
ON
  main.event_id = imputed.event_id
'''

# Destination table that receives the query result.
table_ref = client.dataset('parsely').table('jan_feb_with_session_ids')
job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref

# Submit the query. The location must match that of the dataset(s) referenced
# in the query and that of the destination table.
query_job = client.query(sql, job_config=job_config, location='EU')

# Block until the query has finished, then report where the rows landed.
query_job.result()
print("Query results loaded to table {}".format(table_ref.path))

Query results loaded to table /projects/bigquery-test-165213/datasets/parsely/tables/jan_feb_with_session_ids


In [None]:
# Determine the monthly loyalty segment for each visitor.
# Option 1: add a new column to the existing table and update its values based on ts_action.
# Option 2: split the Jan & Feb data into two tables (one for Jan, the other for Feb)
# and separately put them back into the loyalty_analysis_dataset.


