In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

bq_client = bigquery.Client()

In [2]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

* **Make data set for Prediction Pipeline**

* Tables:
    * IDs:
        * skt.sm_pred_ns_pv1_pool - sampled fvids
    * GA data:
        * skt.sm_pred_ns_pv1_ga - eligible GA data for non-subscribed fvids
    
[sm = subscription modeling]

NOTE: 
* Data not grouped by anything. For respective features, group by during dev time

In [3]:
start_time = time.time()

query_string = """

CREATE OR REPLACE TABLE `api-project-901373404215.skt.content_articles` AS (

   # Get IAB Tiers for articles. Articles can be classified multiple times with different tier1 values given. 
   # To avoid duplication, partition by naturalid and take categorization by latest timestamp
   
   WITH content_iab AS (
   SELECT * FROM (
        SELECT 
            naturalId AS iab_natid, 
            RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
            categoryName,
            tier1,
            tier2
        FROM `api-project-901373404215.Content.mnet_iab_categories` where tier1 IS NOT NULL
        ) 
    WHERE mostrecent = 1
    ),

    # Get the natid and the corresponding title form the main content table. There are duplicates so partition by naturalid and get latest articles 
    
    content AS (
      SELECT 
        natid AS content_natid,
        title,
        publish_date,
        body
      FROM (
        SELECT DISTINCT
          LOWER(NaturalId) AS natid,
          RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
          title,
          DATETIME(date, "America/New_York") as publish_date,
          body
        FROM
          `api-project-901373404215.Content.content`
        WHERE
          Visible is true
          AND type in (
          'blog',
          'blogslide',
          'magazine')
          )
      WHERE mostrecent = 1
    )
    
    # Join the content iab table with the main content table to get the title with the iab category
      SELECT
        * 
        EXCEPT (iab_natid, mostrecent)
      FROM
        content
      LEFT JOIN
        content_iab
      ON
        LOWER(content_natid) = LOWER(iab_natid)
)
"""

content = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 64.09079957008362 seconds ---


#### --PRED PIPELINE NON-SUBSCRIBERS--
* **Make non-subscriber pool - raw IDs datasets**
    * eligible = random sample i.e. 700k non-sub fvids who visited in Dec'21 and had >1pv
        * In prod = change this to 
            * people visited yest
            * and had >1pv in past 90 days

In [4]:
start_time = time.time()

query_string = """
    CREATE OR REPLACE TABLE `api-project-901373404215.skt.test_sm_pred_ns_pv1_pool` AS
    (
    SELECT
                rand() AS random_num,
                piano_id,
                ga_fullvisitorid
    FROM (
                SELECT
                    ga_fullvisitorid, 
                    MAX(ga_pianoId) AS piano_id,
                    sum(GA_pageViews) AS total_pvs
                FROM
                    `api-project-901373404215.DataMart.v_DataMart_updated`
                WHERE 
                    ga_date BETWEEN '2021-11-01' AND '2022-01-31' 
                    AND ga_fullvisitorid NOT IN 
                        (
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.smpv1_subs_pool`
                        ) 
                    AND ga_fullvisitorid NOT IN 
                        (
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.smpv1_nonsubs_pool`
                        )
                GROUP BY 
                    ga_fullvisitorid
                HAVING 
                    # make sure the fvid never had a piano id choose people w/ >1 pv in given month
                    
                    piano_id IS NULL AND
                    sum(GA_pageViews)>1
    )
    ORDER BY random_num
    LIMIT 700000
    )
"""                  

make_ns_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 41.34809923171997 seconds ---


* Query raw non-subscribers IDs' data (sanity check)

In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.test_sm_pred_ns_pv1_pool`
"""
ns = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(ns.shape)

--- 4.0982255935668945 seconds ---
(700000, 3)


In [5]:
ns.isna().sum()

random_num               0
piano_id            700000
ga_fullvisitorid         0
dtype: int64

* **Pull GA data for these non-subscribers (date filter - users' GA activity between 1-Oct-2021 and 31-Dec-2021) - 90 days**

In [6]:
start_time = time.time()

query_string = """ 
CREATE OR REPLACE TABLE `api-project-901373404215.skt.sm_pred_ns_pv1_ga` AS (

    # get article content + IAB tiers
    WITH content_joined AS (
      SELECT
        *
      FROM
        `api-project-901373404215.skt.content_articles`
    )

    # for sampled non-subscribers - fetch their GA data in past 90 days for preparing features
    SELECT 
            z.piano_id,
            d.GA_fullVisitorId, 
            GA_visitStartTime,
            GA_date, 
            GA_pagePath,             
            -- feature cols below
            GA_dfpNewZone, GA_visitNumber,
            GA_pageViews, GA_scrollDepth, timeOnPage,  
            GA_cmsNaturalId, title, publish_date, 
            GA_deviceOperatingSystem, GA_deviceCategory, GA_deviceBrowser,
            GA_country, GA_referralGroup,
            GA_primaryChannel, GA_primarySection, tier1, tier2
  
    FROM 
        `api-project-901373404215.skt.sm_pred_ns_pv1_pool` z
    INNER JOIN 
        `api-project-901373404215.DataMart.v_DataMart_updated` d
    ON 
        z.ga_fullvisitorid = d.GA_fullVisitorId
    LEFT JOIN
            content_joined cj
        ON d.GA_cmsNaturalId = cj.content_natid
    WHERE 
        ga_date BETWEEN '2021-10-01' AND '2021-12-31'
)
     """

make_nsga_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time)) 

--- 25.215169668197632 seconds ---


* Query non-subscribers' GA data (sanity check)

In [7]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_pred_ns_pv1_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 12.713375329971313 seconds ---


In [8]:
print(nonsubs_data.shape)

(2622566, 22)


In [9]:
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

nonsubs_data.head()

Unique unlimited fullvids:  700000 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
0,,14729520443569715085,1633318496,2021-10-03,/sites/ewanspence/2021/10/02/apple-macbook-pro...,article-amp/standard/default/standard,1,1,0.25,17.0,...,2021-10-02 17:50:48,ios,mobile,safari,united states,content aggregators,innovation,consumer tech,Technology & Computing,Computing
1,,13935526792308338402,1633620569,2021-10-07,/sites/ewanspence/2021/10/05/apple-new-macbook...,article-amp/standard/default/standard,37,1,0.5,0.0,...,2021-10-05 18:32:17,ios,mobile,safari,united states,content aggregators,innovation,consumer tech,Technology & Computing,Computing
2,,11117262807541892618,1633597114,2021-10-07,/sites/ewanspence/2021/10/05/apple-new-macbook...,article-amp/standard/default/standard,6,1,0.25,10.0,...,2021-10-05 18:32:17,ios,mobile,safari,australia,content aggregators,innovation,consumer tech,Technology & Computing,Computing
3,,9772833006757969971,1633574071,2021-10-06,/sites/ewanspence/2021/10/05/apple-new-macbook...,article-amp/standard/default/standard,9,1,0.5,50.0,...,2021-10-05 18:32:17,ios,mobile,safari,canada,content aggregators,innovation,consumer tech,Technology & Computing,Computing
4,,16006610991731860651,1633611985,2021-10-07,/sites/ewanspence/2021/10/05/apple-new-macbook...,article-amp/standard/default/standard,140,1,0.5,0.0,...,2021-10-05 18:32:17,ios,mobile,safari,canada,content aggregators,innovation,consumer tech,Technology & Computing,Computing


In [10]:
nonsubs_data.isna().sum()

piano_id                    2622566
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth                42365
timeOnPage                   243498
GA_cmsNaturalId                   0
title                         16895
publish_date                  16895
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                        459841
tier2                        624108
dtype: int64

In [11]:
print(nonsubs_data.GA_date.min())
print(nonsubs_data.GA_date.max())

2021-10-01
2021-12-31


In [12]:
# testing types of people included in this data
whole_ns = nonsubs_data.groupby('GA_fullVisitorId').agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})

whole_ns["avg_top"] = whole_ns['timeOnPage']/whole_ns['sum_pvs']

In [13]:
whole_ns.describe()

Unnamed: 0,sum_pvs,timeOnPage,avg_top
count,700000.0,700000.0,700000.0
mean,3.75,320.89,92.98
std,3.83,590.14,144.9
min,2.0,0.0,0.0
25%,2.0,40.0,14.5
50%,2.0,128.0,42.17
75%,4.0,355.0,101.33
max,1058.0,157104.0,2820.0


In [14]:
# cut sum(pvs) per person into buckets and calculate % people in each bucket

whole_ns['range'] = pd.cut(whole_ns.sum_pvs, [0, 1, 3 , 5, 464])

print(whole_ns.range.value_counts(normalize=True).sort_index())

whole_ns.range.value_counts().sort_index()

(0, 1]     0.00
(1, 3]     0.70
(3, 5]     0.15
(5, 464]   0.15
Name: range, dtype: float64


(0, 1]           0
(1, 3]      488401
(3, 5]      103933
(5, 464]    107665
Name: range, dtype: int64

* Double checking no subs included in non-subs

In [15]:
nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')

nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')]

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
6945,,7808484505445231019,1640199703,2021-12-22,/sites/natashagural/2021/12/22/view-rare-and-s...,article-amp/standard/subscriber/standard,1589,1,0.00,0.00,...,2021-12-22 09:06:43,ios,mobile,safari (in-app),united states,content aggregators,lifestyle,arts,Fine Art,Fine Art Photography
35904,,1741589448943694498,1640656866,2021-12-27,/sites/charliefink/2021/12/23/mystic-moose-rai...,article-amp/standard/subscriber/standard,45,1,0.00,5.00,...,2021-12-23 13:43:28,ios,mobile,safari,united states,organic search,innovation,consumer tech,Business and Finance,Industries
45088,,16082585704597191084,1640867763,2021-12-30,/sites/terencemoore/2021/12/30/green-bay-packe...,article-amp/standard/subscriber/standard,4,1,0.25,876.00,...,2021-12-30 05:45:00,ios,mobile,safari (in-app),mexico,content aggregators,business,sportsmoney,Sports,American Football
45313,,7808484505445231019,1640540651,2021-12-26,/sites/jamiecartereurope/2021/12/26/in-photos-...,article-amp/standard/subscriber/standard,1590,1,0.00,,...,2021-12-26 05:02:32,ios,mobile,safari (in-app),united states,content aggregators,innovation,science,Science,Space and Astronomy
68209,,13781077858432433716,1639368698,2021-12-12,/sites/gordonkelly/2021/12/11/microsoft-warnin...,article-amp/standard/subscriber/standard,2,1,0.75,53.00,...,2021-12-11 10:23:52,ios,mobile,safari,united states,organic search,innovation,consumer tech,Technology & Computing,Computing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2572691,,18217193590903043181,1639694793,2021-12-16,/sites/paultassi/2021/12/16/the-destiny-2-dawn...,article-amp/standard/subscriber/standard,333,1,0.50,82.00,...,2021-12-16 10:30:20,ios,mobile,safari,united states,organic search,innovation,games,Video Gaming,PC Games
2585168,,12838288850946699703,1640209299,2021-12-22,/sites/natashagural/2021/12/22/view-rare-and-s...,article-amp/standard/subscriber/standard,44,1,0.75,34.00,...,2021-12-22 09:06:43,android,mobile,android webview,united states,content aggregators,lifestyle,arts,Fine Art,Fine Art Photography
2588376,,17662223862124555246,1640323409,2021-12-24,/sites/gordonkelly/2021/12/23/google-chrome-up...,article-amp/standard/subscriber/standard,4,1,0.75,113.00,...,2021-12-23 12:00:37,ios,mobile,safari (in-app),united states,organic social (dark),innovation,consumer tech,Technology & Computing,Computing
2592291,,1887431278445795648,1639969105,2021-12-19,/sites/jimdobson/2021/12/16/this-wealthy-encla...,article-amp/standard/subscriber/standard,4,1,1.00,265.00,...,2021-12-16 11:51:33,android,mobile,chrome,united states,direct,lifestyle,travel,Travel,Travel Type
