In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

# NOTE(review): every query cell visible in this notebook goes through `bqclient`
# (created in the next cell); this `bq_client` is not referenced in any cell shown
# here - confirm nothing else depends on it before removing.
bq_client = bigquery.Client()

In [2]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

# BigQuery client through which the query cells below submit their SQL.
bqclient = bigquery.Client()
# BigQuery Storage read client; handed to `to_dataframe(bqstorage_client=...)` when results are downloaded.
bqstorageclient = bigquery_storage.BigQueryReadClient()

* **Make data set for Prediction Pipeline**

* Tables:
    * IDs:
        * skt.sm_pred_ns_pool - sampled fvids
    * GA data:
        * skt.sm_pred_ns_pool_ga - eligible GA data for non-subscribed fvids
    
[sm = subscription modeling]

NOTE: 
* The data here is not aggregated at any level. Group by the relevant key when building each feature during development.

In [3]:
start_time = time.time()

# Builds (or replaces) skt.content_articles: one row per article natural id carrying its
# title, publish date and body, plus - when a categorisation exists - its IAB category/tiers.
#
# De-duplication note: "latest row per article" previously used RANK() partitioned by the raw
# natural id. RANK() keeps *every* row that ties on the newest timestamp, and the partition key
# was case-sensitive although the output / join key is lower-cased, so duplicate keys could
# still reach the final table. ROW_NUMBER() partitioned by the lower-cased id guarantees exactly
# one surviving row per key (rows tying on timestamp are resolved arbitrarily). With that
# guarantee the former SELECT DISTINCT is a no-op and has been dropped.
query_string = """

CREATE OR REPLACE TABLE `api-project-901373404215.skt.content_articles` AS (

   # Get IAB Tiers for articles. Articles can be classified multiple times with different tier1 values given. 
   # To avoid duplication, keep exactly one categorization per (lower-cased) naturalid: the one with the latest timestamp.
   # ROW_NUMBER (not RANK) so that rows tying on the latest timestamp cannot both survive.
   
   WITH content_iab AS (
   SELECT * FROM (
        SELECT 
            naturalId AS iab_natid, 
            ROW_NUMBER() OVER (PARTITION BY LOWER(naturalId) ORDER BY timestamp DESC) AS mostrecent,
            categoryName,
            tier1,
            tier2
        FROM `api-project-901373404215.Content.mnet_iab_categories` where tier1 IS NOT NULL
        ) 
    WHERE mostrecent = 1
    ),

    # Get the natid and the corresponding title from the main content table.
    # There are duplicates, so keep exactly one (the latest) row per lower-cased naturalid.
    
    content AS (
      SELECT 
        natid AS content_natid,
        title,
        publish_date,
        body
      FROM (
        SELECT
          LOWER(NaturalId) AS natid,
          ROW_NUMBER() OVER (PARTITION BY LOWER(NaturalId) ORDER BY timestamp DESC) AS mostrecent,
          title,
          DATETIME(date, "America/New_York") as publish_date,
          body
        FROM
          `api-project-901373404215.Content.content`
        WHERE
          Visible is true
          AND type in (
          'blog',
          'blogslide',
          'magazine')
          )
      WHERE mostrecent = 1
    )
    
    # Join the content iab table with the main content table to get the title with the iab category
      SELECT
        * 
        EXCEPT (iab_natid, mostrecent)
      FROM
        content
      LEFT JOIN
        content_iab
      ON
        LOWER(content_natid) = LOWER(iab_natid)
)
"""

# The statement is DDL, so `content` is presumably an empty frame; the call is kept so the cell
# still blocks until the job has finished and the name `content` stays defined.
content = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 135.89927911758423 seconds ---


#### --PRED PIPELINE NON-SUBSCRIBERS--
* **Make non-subscriber pool - raw IDs datasets**
    * eligible = random sample i.e. 600k non-sub fvids who visited in Nov'21
    
    * In prod = change this to anyone who visited yesterday
        * No other condition like choose people w/ >1 pv in past 90 days or >4 articles read in a month
            * how to deal with - if they came yesterday but had also come a few days ago? -- to include?

    * STARTS WITH condition keeps only people who came for articles
        * This condition in training data was reviewed and no issues raised

In [5]:
start_time = time.time()

# Builds (or replaces) skt.sm_pred_ns_pool: a random sample of up to 600,000 visitor ids that
#   * have at least one article row (cmsNaturalId starting with "blogandpostid/blog/post/")
#     dated 2021-11-01 .. 2021-11-30,
#   * are not listed in skt.sm_subs_pool, and
#   * show no piano id on those article rows.
#
# Robustness fix: `x NOT IN (subquery)` evaluates to NULL for every row as soon as the subquery
# returns a single NULL, which would silently empty the pool. The subquery now filters NULL ids.
#
# Sampling is done by attaching rand() to each candidate and keeping the 600,000 smallest values,
# so the selected ids differ on every run.
query_string = """
    CREATE OR REPLACE TABLE `api-project-901373404215.skt.sm_pred_ns_pool` AS
    (
    SELECT
                rand() AS random_num,
                piano_id,
                ga_fullvisitorid
    FROM (
                SELECT
                    ga_fullvisitorid, 
                    EXTRACT(MONTH FROM ga_date) AS ga_month,
                    MAX(ga_pianoId) AS piano_id,
                    count(GA_cmsNaturalId) AS num_articles
                FROM
                    `api-project-901373404215.DataMart.v_DataMart_updated`
                WHERE 
                    ga_date BETWEEN '2021-11-01' AND '2021-11-30' AND ga_fullvisitorid NOT IN 
                        (
                        # NULL ids are excluded: one NULL in a NOT IN list makes the predicate NULL for every row
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.sm_subs_pool`
                        WHERE ga_fullvisitorid IS NOT NULL
                        )
                    AND STARTS_WITH(GA_cmsNaturalId, "blogandpostid/blog/post/")
                
                GROUP BY 
                    ga_fullvisitorid, ga_month
                HAVING 
        
                    # piano_id is MAX(ga_pianoId) over the rows kept by the WHERE clause above, so this keeps fvids
                    # with no piano id on their Nov'21 article rows
                    # (No other condition like choose people w/ >1 pv in past 90 days or >4 articles read in a month)
                    piano_id IS NULL
    )
    ORDER BY random_num
    LIMIT 600000
    )
"""                  

# DDL statement: the returned frame carries no sample rows; the call is kept so the cell waits
# for the job to finish and `make_ns_data` stays defined.
make_ns_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 13.960380792617798 seconds ---


* Query raw non-subscribers IDs' data (sanity check)

In [6]:
start_time = time.time()

# Read the freshly built pool table back in full so its size and columns can be checked.
query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_pred_ns_pool`
"""
ns = bqclient.query(query_string).result().to_dataframe(bqstorage_client=bqstorageclient)

# Wall-clock time for submit + download, followed by (rows, columns) of the pool.
print("--- %s seconds ---" % (time.time() - start_time))

print(ns.shape)

--- 5.219186067581177 seconds ---
(600000, 3)


In [7]:
# Null count per column of the sampled pool.
ns.isna().sum()

random_num               0
piano_id            600000
ga_fullvisitorid         0
dtype: int64

* **Pull GA data for these non-subscribers (date filter - users' GA activity between 1-Sep-2021 and 30-Nov-2021) - ~90 days (91 calendar days, inclusive)**

In [12]:
start_time = time.time()

# Builds (or replaces) skt.sm_pred_ns_pool_ga: every DataMart row dated 2021-09-01 .. 2021-11-30
# for the visitor ids in skt.sm_pred_ns_pool, left-joined to skt.content_articles on the
# article natural id so that title / publish_date / tier1 / tier2 ride along where available.
query_string = """ 
CREATE OR REPLACE TABLE `api-project-901373404215.skt.sm_pred_ns_pool_ga` AS (

    # get article content + IAB tiers
    WITH content_joined AS (
      SELECT
        *
      FROM
        `api-project-901373404215.skt.content_articles`
    )

    # for sampled non-subscribers - fetch their GA data in past 90 days for preparing features
    SELECT 
            z.piano_id,
            d.GA_fullVisitorId, 
            GA_visitStartTime,
            GA_date, 
            GA_pagePath,             
            -- feature cols below
            GA_dfpNewZone, GA_visitNumber,
            GA_pageViews, GA_scrollDepth, timeOnPage,  
            GA_cmsNaturalId, title, publish_date, 
            GA_deviceOperatingSystem, GA_deviceCategory, GA_deviceBrowser,
            GA_country, GA_referralGroup,
            GA_primaryChannel, GA_primarySection, tier1, tier2
  
    FROM 
        `api-project-901373404215.skt.sm_pred_ns_pool` z
    INNER JOIN 
        `api-project-901373404215.DataMart.v_DataMart_updated` d
    ON 
        z.ga_fullvisitorid = d.GA_fullVisitorId
    LEFT JOIN
            content_joined cj
        ON d.GA_cmsNaturalId = cj.content_natid
    WHERE 
        ga_date BETWEEN '2021-09-01' AND '2021-11-30'
)
     """

# Submit the DDL and wait for it; the resulting frame is kept under `make_nsga_data`.
make_nsga_data = bqclient.query(query_string).result().to_dataframe(bqstorage_client=bqstorageclient)

print("--- %s seconds ---" % (time.time() - start_time)) 

--- 20.434919595718384 seconds ---


* Query non-subscribers' GA data (sanity check)

In [13]:
start_time = time.time()

# Download the whole GA activity table built for the sampled pool.
query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_pred_ns_pool_ga`
"""

nonsubs_data = bqclient.query(query_string).result().to_dataframe(bqstorage_client=bqstorageclient)

# Wall-clock time for submit + download.
print("--- %s seconds ---" % (time.time() - start_time))

--- 14.060956954956055 seconds ---


In [14]:
# (rows, columns) of the GA activity pulled for the sampled pool.
print(nonsubs_data.shape)

(1196951, 22)


In [15]:
# Count the distinct visitor ids present in the GA pull, then preview its first rows.
# NOTE(review): the printed label says "unlimited" although this frame holds the
# non-subscriber pool - confirm the wording is intended.
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

nonsubs_data.head()

Unique unlimited fullvids:  600000 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
0,,555334623995782819,1637823558,2021-11-25,/sites/krisholt/2021/04/22/dvas-falling-mech-i...,article-amp/standard/default/standard,1,1,0.0,10.0,...,2021-04-22 17:47:19,android,mobile,ye,malaysia,direct,innovation,games,Healthy Living,Wellness
1,,13699065382605936021,1636137963,2021-11-05,/,none,2,1,1.0,13.0,...,NaT,windows,desktop,edge,united states,organic search,home,none,,
2,,7019832430111495626,1636094219,2021-11-05,/sites/jacobmorgan/2014/03/11/every-employee-w...,article-amp/standard/default/standard,1,1,1.0,22.0,...,2014-03-11 00:33:00,windows,desktop,edge,singapore,referral,leadership,none,,
3,,266833496890775900,1638044124,2021-11-27,/sites/sarahlandrum/2017/12/08/the-importance-...,article-amp/standard/default/standard,1,1,0.0,,...,2017-12-08 08:00:00,macintosh,desktop,edge,united states,direct,under 30,none,,
4,,3607138599855816907,1636421851,2021-11-08,/sites/catherineschnaubelt/2018/11/26/4-reason...,article-amp/standard/default/standard,1,1,0.0,,...,2018-11-26 11:15:00,windows,desktop,edge,united states,direct,money,retirement,Personal Finance,Insurance


In [16]:
# Null count per column of the GA pull.
nonsubs_data.isna().sum()

piano_id                    1196951
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth                26821
timeOnPage                   119538
GA_cmsNaturalId                   0
title                         15147
publish_date                  15147
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                        292026
tier2                        356764
dtype: int64

In [17]:
# Earliest and latest GA_date present in the pull (checks the date filter of the build query).
print(nonsubs_data.GA_date.min())
print(nonsubs_data.GA_date.max())

2021-09-01
2021-11-30


In [40]:
# Profile the sampled visitors: total page views and total time-on-page per visitor id,
# plus the ratio of the two.
whole_ns = (
    nonsubs_data
    .groupby('GA_fullVisitorId')
    .agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'})
    .rename(columns={'GA_pageViews': 'sum_pvs'})
    .reset_index()
)

# Average time on page = summed timeOnPage divided by summed page views, per visitor.
whole_ns["avg_top"] = whole_ns['timeOnPage'].div(whole_ns['sum_pvs'])

In [43]:
# Summary statistics of the per-visitor totals.
whole_ns.describe()

Unnamed: 0,sum_pvs,timeOnPage,avg_top
count,600000.0,600000.0,600000.0
mean,1.99,144.64,67.33
std,2.48,346.18,123.41
min,1.0,0.0,0.0
25%,1.0,4.0,3.0
50%,1.0,43.0,29.5
75%,2.0,135.0,78.0
max,464.0,62256.0,3813.0


In [44]:
# Cut sum(pvs) per person into buckets and calculate the share / count of people in each bucket.
#
# The last bucket's upper edge is taken from the data instead of being hard-coded (it used to be
# the literal 464, the maximum seen in one particular run): with a fixed edge, any visitor above
# it falls outside every bin, gets NaN as bucket and silently disappears from both tables below.
lower_edges = [0, 1, 3, 5]
observed_max = whole_ns['sum_pvs'].max()
# pd.cut needs strictly increasing edges, so never let the top edge drop to <= 5
# (also covers an empty frame, where max() is NaN).
if pd.isna(observed_max) or observed_max <= lower_edges[-1]:
    top_edge = lower_edges[-1] + 1
else:
    top_edge = observed_max

whole_ns['range'] = pd.cut(whole_ns['sum_pvs'], lower_edges + [top_edge])

# Share of visitors per bucket (printed), then absolute counts (cell output).
print(whole_ns['range'].value_counts(normalize=True).sort_index())

whole_ns['range'].value_counts().sort_index()

(0, 1]     0.63
(1, 3]     0.25
(3, 5]     0.06
(5, 464]   0.05
Name: range, dtype: float64


(0, 1]      379257
(1, 3]      149459
(3, 5]       38308
(5, 464]     32976
Name: range, dtype: int64

* Double checking no subs included in non-subs

In [18]:
# Replace missing ad-zone values with the literal 'none' so the substring test below
# yields plain booleans instead of NaN.
nonsubs_data['GA_dfpNewZone'] = nonsubs_data['GA_dfpNewZone'].fillna('none')

# Show every row whose ad zone contains '/subscriber/'.
nonsubs_data.loc[nonsubs_data['GA_dfpNewZone'].str.contains('/subscriber/')]

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2


* Curiosity: How many total non-subs eligible people in Nov'21?
    * NOTE - no filtering condition on these people like num articles read >4 in a month

In [27]:
start_time = time.time()

# Size of the full candidate population behind the sampled pool: one row per visitor id (and
# month) with at least one article row dated 2021-11-01 .. 2021-11-30, not listed in
# skt.sm_subs_pool, and with no piano id on those article rows.
#
# Robustness fix: the NOT IN subquery now filters NULL ids - a single NULL returned by it would
# make the NOT IN predicate NULL for every row and the query would return nothing.
sql = """SELECT
                    ga_fullvisitorid, 
                    EXTRACT(MONTH FROM ga_date) AS ga_month,
                    MAX(ga_pianoId) AS piano_id,
                    count(GA_cmsNaturalId) AS num_articles
                FROM
                    `api-project-901373404215.DataMart.v_DataMart_updated`
                WHERE 
                    ga_date BETWEEN '2021-11-01' AND '2021-11-30' AND ga_fullvisitorid NOT IN 
                        (
                        # NULL ids are excluded: one NULL in a NOT IN list makes the predicate NULL for every row
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.sm_subs_pool`
                        WHERE ga_fullvisitorid IS NOT NULL
                        )
                    AND STARTS_WITH(GA_cmsNaturalId, "blogandpostid/blog/post/")
                
                GROUP BY 
                    ga_fullvisitorid, ga_month
                HAVING 
                
                    # piano_id is MAX(ga_pianoId) over the article rows kept above, so this keeps fvids with no
                    # piano id on those rows. (No other condition like choose people w/ >4 articles in a month this year)
                    piano_id IS NULL"""
mod = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(mod.shape)

--- 32.09491539001465 seconds ---
(22875414, 4)


In [28]:
# Number of distinct visitor ids returned by the query.
print(len(mod.ga_fullvisitorid.unique()))

22875414


In [21]:
# Display the result frame (pandas truncates the middle rows).
mod

Unnamed: 0,ga_fullvisitorid,ga_month,piano_id,num_articles
0,12505644519232193020,11,,12
1,16946813056702599095,11,,7
2,9157250631509528459,11,,9
3,5295075884485060310,11,,24
4,7768252609832986157,11,,25
...,...,...,...,...
22875409,6571895178395876043,11,,5
22875410,14744624632989369012,11,,5
22875411,11325386880392830176,11,,5
22875412,944463004474769426,11,,5


In [24]:
# Distinct month values present in the result.
mod.ga_month.unique()

array([11])

In [22]:
# Distribution of the per-visitor article-row counts.
mod.num_articles.describe()

count   22875414.00
mean           1.33
std            2.45
min            1.00
25%            1.00
50%            1.00
75%            1.00
max        10895.00
Name: num_articles, dtype: float64

In [39]:
# Curiosity - same population query without the article (natid) condition, to check how many
# people are returned.
#
# Robustness fix: the NOT IN subquery now filters NULL ids - a single NULL returned by it would
# make the NOT IN predicate NULL for every row and the query would return nothing.

start_time = time.time()

sql = """SELECT
                    ga_fullvisitorid, 
                    MAX(ga_pianoId) AS piano_id
                FROM
                    `api-project-901373404215.DataMart.v_DataMart_updated`
                WHERE 
                    ga_date BETWEEN '2021-11-01' AND '2021-11-30' AND ga_fullvisitorid NOT IN 
                        (
                        # NULL ids are excluded: one NULL in a NOT IN list makes the predicate NULL for every row
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.sm_subs_pool`
                        WHERE ga_fullvisitorid IS NOT NULL
                        )
                GROUP BY 
                    ga_fullvisitorid
                HAVING 
                
                    # piano_id is MAX(ga_pianoId) over ALL Nov'21 rows of the fvid here (no article filter), so this
                    # keeps fvids with no piano id on any of them. (No other condition like choose people w/ >4 articles in a month this year)
                    piano_id IS NULL"""
mod2 = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(mod2.shape)

--- 27.620813608169556 seconds ---
(22802170, 2)


In [None]:
# Something looks wrong here - why does dropping the article filter return fewer people?
# NOTE(review): likely explanation, read off the two queries themselves - `mod` evaluates
# MAX(ga_pianoId) over article rows only (the STARTS_WITH filter runs before the GROUP BY),
# whereas `mod2` evaluates it over every November row of the visitor. A visitor whose piano
# id appears only on non-article rows therefore passes `mod` but fails `mod2`, and a visitor
# with no article rows at all can appear only in `mod2`. Neither set has to contain the
# other, so `mod2` can be the smaller one - TODO confirm with the anti-join cells below.

In [36]:
# Rows of mod2 whose visitor id is absent from mod.
mod2[~mod2.ga_fullvisitorid.isin(mod.ga_fullvisitorid)]

Unnamed: 0,ga_fullvisitorid,ga_month,piano_id,num_articles
3749,11595017592810618595,11,,7
8384,14460914344640155258,11,,6
8916,956093682023474801,11,,15
9350,7660097628767673561,11,,10
13639,4138282054353744728,11,,0
...,...,...,...,...
22780673,4058468634334172371,11,,5
22782844,6791739768361531131,11,,5
22788392,11482729181800333708,11,,5
22791637,16759163950408718857,11,,5


In [37]:
# Rows of mod whose visitor id is absent from mod2.
mod[~mod.ga_fullvisitorid.isin(mod2.ga_fullvisitorid)]

Unnamed: 0,ga_fullvisitorid,ga_month,piano_id,num_articles
8,1543236845303830098,11,,6
97,16718633095006124014,11,,6
242,8795472413954996949,11,,7
243,15321985895840734479,11,,8
363,7832231731272207824,11,,8
...,...,...,...,...
22875258,1434709270350055759,11,,5
22875276,2078855678617372527,11,,5
22875300,525244502298181884,11,,5
22875361,3372769795493462154,11,,5
