In [2]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

bq_client = bigquery.Client()

In [3]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

## Exploring subscriber data for 'unlimited' subscription type 

NOTE: 
* Data not grouped by anything. For respective features, group by during dev time
* Tables:
    * IDs:
        * skt.smpv1_subs_pool - all ACTIVE subs piano IDs + fvids
        * skt.smpv3_nonsubs_pool - sampled fvids w/ pv>1 in last 90 days
    * GA data:
        * skt.smpv1_subs_ga - 12-month GA data for subs
        * skt.smpv3_nonsubs_ga - 12-month GA data for non-subs
    
[sm = subscription modeling]

In [3]:
# same - dont run since updated today 1.12.22
start_time = time.time()

query_string = """

CREATE OR REPLACE TABLE `api-project-901373404215.skt.content_articles` AS (

   # Get IAB Tiers for articles. Articles can be classified multiple times with different tier1 values given. 
   # To avoid duplication, partition by naturalid and take categorization by latest timestamp
   
   WITH content_iab AS (
   SELECT * FROM (
        SELECT 
            naturalId AS iab_natid, 
            RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
            categoryName,
            tier1,
            tier2
        FROM `api-project-901373404215.Content.mnet_iab_categories` where tier1 IS NOT NULL
        ) 
    WHERE mostrecent = 1
    ),

    # Get the natid and the corresponding title form the main content table. There are duplicates so partition by naturalid and get latest articles 
    
    content AS (
      SELECT 
        natid AS content_natid,
        title,
        publish_date,
        body
      FROM (
        SELECT DISTINCT
          LOWER(NaturalId) AS natid,
          RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
          title,
          DATETIME(date, "America/New_York") as publish_date,
          body
        FROM
          `api-project-901373404215.Content.content`
        WHERE
          Visible is true
          AND type in (
          'blog',
          'blogslide',
          'magazine')
          )
      WHERE mostrecent = 1
    )
    
    # Join the content iab table with the main content table to get the title with the iab category
      SELECT
        * 
        EXCEPT (iab_natid, mostrecent)
      FROM
        content
      LEFT JOIN
        content_iab
      ON
        LOWER(content_natid) = LOWER(iab_natid)
)
"""

content = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 143.84403109550476 seconds ---


#### --SUBSCRIBERS--
* **Make subscribers pool - raw IDs dataset**

In [4]:
# same - dont run since updated today 1.12.22

start_time = time.time()

query_string = """

CREATE OR REPLACE TABLE `api-project-901373404215.skt.smpv1_subs_pool` AS

    # get ALL user_id_uids having unlimited type subscription only
    WITH eligible_users AS ( 
      
       SELECT 
           DISTINCT 
               user_id_uid, 
               resource_id_rid, 
               start_date,
               status, 
               subscription_trial_end_date               
    FROM
        `api-project-901373404215.piano.subscriber_details`
    WHERE 
        # Filter for the 'universal' subscriptions only
        resource_id_rid IN UNNEST(['RKPEVDB', 'R8W03AS'])
        
        # filter for active only
               AND status='active'
               AND total__refunded<1
               AND cast(dt_updated as date)=current_date('America/New_York')
        )
    
    # join user_id_uids with GA's pianoIDs and For each pianoID, get their fullvids. De-duplicate.
     SELECT 
         DISTINCT
            ga_pianoId,
            user_id_uid,
            ga_fullvisitorid,
            resource_id_rid,
            start_date,
            status, 
            subscription_trial_end_date
    FROM
        eligible_users
         INNER JOIN 
         `api-project-901373404215.DataMart.v_DataMart_updated` 
    ON 
        LOWER(ga_pianoId) = LOWER(user_id_uid)
"""


make_s_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 32.43524241447449 seconds ---


* Query raw subscribers data (sanity check)

In [5]:
# same - dont run since updated today 1.12.22
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv1_subs_pool`
"""
raw = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(raw.shape)

--- 2.9832119941711426 seconds ---
(101697, 7)


In [6]:
print("Unique unlimited subscribers: ", len(raw.user_id_uid.unique()))
print("Unique unlimited fullvids: ", len(raw.ga_fullvisitorid.unique()), "\n")
print("Duplicates?: ", raw.duplicated().any(), "\n")
raw.head()

Unique unlimited subscribers:  38772
Unique unlimited fullvids:  101503 

Duplicates?:  False 



Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,start_date,status,subscription_trial_end_date
0,pnip6dg0bqj4nn5,PNIp6dG0Bqj4nn5,7374421297923879923,R8W03AS,2020-11-01 12:11:06 -0500,active,
1,pnip6dg0bqj4nn5,PNIp6dG0Bqj4nn5,311538254761535371,R8W03AS,2020-11-01 12:11:06 -0500,active,
2,pnip6dg0bqj4nn5,PNIp6dG0Bqj4nn5,1690950057931379125,R8W03AS,2020-11-01 12:11:06 -0500,active,
3,dhsam1vq5vmikdnk7dujnzmbmzy1,DHsAM1Vq5vMIkDNk7DujnZmBMZy1,4529785030978001679,R8W03AS,2020-11-01 12:28:24 -0500,active,
4,dhsam1vq5vmikdnk7dujnzmbmzy1,DHsAM1Vq5vMIkDNk7DujnZmBMZy1,1896111012057916420,R8W03AS,2020-11-01 12:28:24 -0500,active,


In [7]:
raw_acc = raw[['ga_fullvisitorid', 'status']].drop_duplicates(keep='first')

print(":: Types of account status ::")
print(raw_acc.status.value_counts(), "\n")

:: Types of account status ::
active    101503
Name: status, dtype: int64 



In [8]:
# every piano ID has on average 3 fullvids

pd.DataFrame(raw.groupby('ga_pianoId').ga_fullvisitorid.nunique().describe())

Unnamed: 0,ga_fullvisitorid
count,38772.0
mean,2.62
std,5.84
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,229.0


In [9]:
print("Min subscription start", raw.start_date.min())
print("Max subscription start", raw.start_date.max())

Min subscription start 2020-11-01 12:11:06 -0500
Max subscription start 2022-01-12 01:55:58 -0500


* **Pull GA data for these subscribers (date filter - users' GA activity between 1-jan-2021 and 31-Dec-2021)**

In [10]:
# same - dont run since updated today 1.12.22

start_time = time.time()

query_string = """ 

CREATE OR REPLACE TABLE `api-project-901373404215.skt.smpv1_subs_ga` AS (
  
    # get article content + IAB tiers
    WITH content_joined AS (
      SELECT
        *
      FROM
        `api-project-901373404215.skt.content_articles`
    )
    
    # for all eligible subscribers - fetch their GA data post 1-jan-21 for preparing features
    SELECT 
            z.ga_pianoId,
            user_id_uid,
            resource_id_rid,
            d.GA_fullVisitorId, 
            GA_visitStartTime, 
            GA_date, 
            GA_pagePath, 
            -- feature cols below
            GA_dfpNewZone, GA_visitNumber,
            GA_pageViews, GA_scrollDepth, timeOnPage,  
            GA_cmsNaturalId, title, publish_date, 
            GA_deviceOperatingSystem, GA_deviceCategory, GA_deviceBrowser,
            GA_country, GA_referralGroup,
            GA_primaryChannel, GA_primarySection, tier1, tier2
 
    FROM 
        `api-project-901373404215.skt.smpv1_subs_pool` z
    INNER JOIN 
        `api-project-901373404215.DataMart.v_DataMart_updated` d
    ON 
        z.ga_fullvisitorid = d.GA_fullVisitorId
    LEFT JOIN
            content_joined cj
        ON d.GA_cmsNaturalId = cj.content_natid
    WHERE 
        ga_date BETWEEN '2021-01-01' AND '2021-12-31'
)
     """


make_sga_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 41.07100439071655 seconds ---


* Query subscribers' GA data (sanity check)

In [11]:
# same - dont run since updated today 1.12.22

start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv1_subs_ga`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 23.937049627304077 seconds ---


In [13]:
print(subs_data.shape)

(5251696, 24)


In [14]:
print("Unique unlimited subscribers: ", len(subs_data.user_id_uid.unique()))
print("Unique unlimited fullvids: ", len(subs_data.GA_fullVisitorId.unique()), "\n")

subs_data.head()

Unique unlimited subscribers:  37771
Unique unlimited fullvids:  96895 



Unnamed: 0,ga_pianoId,user_id_uid,resource_id_rid,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
0,pni5iymvaqitjet,PNI5IymvAqitjet,R8W03AS,1275852554082159309,1611241905,2021-01-21,/sites/natashalekwa/,none,49,1,...,NaT,macintosh,desktop,chrome,united states,organic search,business,none,,
1,pniy3dvuiqjbz2l,PNIy3DVUiqjbz2l,R8W03AS,7565625914766383136,1631115741,2021-09-08,/profile/naomi-azrieli/,none,21,1,...,NaT,macintosh,desktop,safari,united states,organic search,none,none,,
2,pnirfsyyxqplyt4,PNIRFSyYXqplyt4,R8W03AS,4033599429359564984,1630894141,2021-09-05,/,none,2,1,...,NaT,ios,mobile,safari,united states,direct,home,none,,
3,pni0j74vvqpat3t,PNI0J74VVqpat3t,R8W03AS,7498059348985416466,1628605204,2021-08-10,/,none,1,1,...,NaT,android,mobile,android webview,united states,direct,home,none,,
4,pni2w63xeqw265z,PNI2W63XEqw265z,R8W03AS,2875977353634377874,1638023293,2021-11-27,/,none,67,1,...,NaT,ios,mobile,safari,united states,organic search,home,none,,


In [15]:
subs_data.isna().sum()

ga_pianoId                        0
user_id_uid                       0
resource_id_rid                   0
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth               729040
timeOnPage                    29347
GA_cmsNaturalId                   0
title                       1789596
publish_date                1789593
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                       2403843
tier2                       2650724
dtype: int64

In [16]:
print(subs_data.GA_date.min())
print(subs_data.GA_date.max())

2021-01-01
2021-12-31


* Why ga_date > 01-jan-21 limit? 
    * before Nov-2020 people didn't even have a chance to subscribe
    * going very far back not helpful because people clear cookies. Forbes site was very diff

NOTE:  ga_date > jan 1,'21 condition - for some fvids may pull some before + all after subscription behavior - obscured. Thats still inline with what we want

#### --NON-SUBSCRIBERS--
* **Make non-subscriber pool - raw IDs datasets**
    * eligible = random sample of 410k non-sub fvids who visited in Dec'21
        * with filtering condition = folks read >3pv in those 30 days

In [3]:
# new setting - sanity check

start_time = time.time()

query_string = """
    CREATE OR REPLACE TABLE `api-project-901373404215.skt.smpv3_nonsubs_pool` AS
    (
    SELECT
                rand() AS random_num,
                piano_id,
                ga_fullvisitorid
    FROM (
                SELECT
                    ga_fullvisitorid, 
                    MAX(ga_pianoId) AS piano_id,
                    sum(GA_pageViews) AS total_pvs
                FROM
                    `api-project-901373404215.DataMart.v_DataMart_updated`
                WHERE 
                    ga_date BETWEEN '2021-12-01' AND '2021-12-31' AND ga_fullvisitorid NOT IN 
                        (
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.smpv1_subs_pool`
                        )
                GROUP BY 
                    ga_fullvisitorid
                HAVING 
                    # make sure the fvid never had a piano id choose people w/ >1 pv in 90 days
                    
                    piano_id IS NULL AND
                    sum(GA_pageViews)>3
    )
    ORDER BY random_num
    LIMIT 410000
    )
"""                  

make_ns_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 26.28161311149597 seconds ---


* Query raw non-subscribers IDs' data (sanity check)

In [4]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv3_nonsubs_pool`
"""
ns = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(ns.shape)

--- 4.526185989379883 seconds ---
(410000, 3)


In [5]:
ns.isna().sum()

random_num               0
piano_id            410000
ga_fullvisitorid         0
dtype: int64

In [6]:
ns_fvids = list(ns.ga_fullvisitorid)
s_fvids = list(raw.ga_fullvisitorid.unique())

print("subscribed fvids: ", len(s_fvids))
print("non-subscribed fvids: ", len(ns_fvids))

print("any intersection between them?: ", list(set(s_fvids) & set(ns_fvids)))

NameError: name 'raw' is not defined

* **Pull GA data for these non-subscribers (date filter - users' GA activity between 1-jan-2021 and 31-Dec-2021)**

In [6]:
start_time = time.time()

query_string = """ 
CREATE OR REPLACE TABLE `api-project-901373404215.skt.smpv3_nonsubs_ga` AS (

    # get article content + IAB tiers
    WITH content_joined AS (
      SELECT
        *
      FROM
        `api-project-901373404215.skt.content_articles`
    )

    # for sampled non-subscribers - fetch their GA data post 1-jan-21 for preparing features
    SELECT 
            z.piano_id,
            d.GA_fullVisitorId, 
            GA_visitStartTime,
            GA_date, 
            GA_pagePath,             
            -- feature cols below
            GA_dfpNewZone, GA_visitNumber,
            GA_pageViews, GA_scrollDepth, timeOnPage,  
            GA_cmsNaturalId, title, publish_date, 
            GA_deviceOperatingSystem, GA_deviceCategory, GA_deviceBrowser,
            GA_country, GA_referralGroup,
            GA_primaryChannel, GA_primarySection, tier1, tier2
  
    FROM 
        `api-project-901373404215.skt.smpv3_nonsubs_pool` z
    INNER JOIN 
        `api-project-901373404215.DataMart.v_DataMart_updated` d
    ON 
        z.ga_fullvisitorid = d.GA_fullVisitorId
    LEFT JOIN
            content_joined cj
        ON d.GA_cmsNaturalId = cj.content_natid
    WHERE 
        ga_date BETWEEN '2021-01-01' AND '2021-12-31'
)
     """

make_nsga_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 62.92223238945007 seconds ---


* Query non-subscribers' GA data (sanity check)

In [7]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv3_nonsubs_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 26.32185411453247 seconds ---


In [23]:
# orig - pv>1 filter over 3 months
# print(nonsubs_data.shape)

(2068379, 22)


In [8]:
print(nonsubs_data.shape)

(6014971, 22)


In [9]:
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

nonsubs_data.head()

Unique unlimited fullvids:  410000 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
0,,10119782771159255955,1640905924,2021-12-30,/sites/paultassi/2021/12/14/the-destiny-2-gras...,article-amp/standard/default/standard,25,1,0.25,6.0,...,2021-12-14 09:03:43,android,mobile,chrome,united kingdom,organic search,innovation,games,Video Gaming,PC Games
1,,16901048999901858838,1616308339,2021-03-21,/sites/paultassi/2021/03/20/the-gap-between-de...,article-amp/standard/default/standard,3,1,0.5,,...,2021-03-20 10:42:15,android,mobile,chrome,united states,organic search,innovation,games,Video Gaming,PC Games
2,,7505902588518132633,1640618100,2021-12-27,/sites/paultassi/2021/12/26/five-destiny-2-ner...,article-amp/standard/default/standard,3,1,0.5,145.0,...,2021-12-26 09:06:46,android,mobile,chrome,united states,direct,innovation,games,Video Gaming,PC Games
3,,5220340637250074386,1640549038,2021-12-26,/sites/paultassi/2021/12/26/five-destiny-2-ner...,article-amp/standard/default/standard,7,1,0.5,73.0,...,2021-12-26 09:06:46,android,mobile,chrome,united kingdom,direct,innovation,games,Video Gaming,PC Games
4,,9527849266102683174,1616016058,2021-03-17,/sites/paultassi/2021/03/17/destiny-2s-plague-...,article-amp/standard/default/standard,29,1,0.75,46.0,...,2021-03-17 09:06:39,android,mobile,chrome,united states,direct,innovation,games,Video Gaming,PC Games


In [10]:
nonsubs_data.isna().sum()

piano_id                    6014971
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     4
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth               107342
timeOnPage                   551540
GA_cmsNaturalId                   4
title                         39687
publish_date                  39687
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 4
GA_primarySection                 4
tier1                        859464
tier2                       1171320
dtype: int64

In [11]:
print(nonsubs_data.GA_date.min())
print(nonsubs_data.GA_date.max())

2021-01-01
2021-12-31


* Double checking no subs included in non-subs

In [12]:
nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')

# Forbes people?
nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')]

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
313,,13499180008097073251,1615280037,2021-03-09,/sites/alisondurkee/2021/03/08/supreme-court-k...,article-amp/topline/subscriber/standard,575,1,1.00,116.00,...,2021-03-08 10:42:04,android,mobile,android webview,united states,content aggregators,business,none,News and Politics,Law
728,,14278918909469134335,1640619464,2021-12-27,/sites/nicholasreimann/2021/12/27/trumps-tumbl...,article-amp/topline/subscriber/standard,9,1,0.25,10.00,...,2021-12-27 09:00:00,ios,mobile,safari,united states,organic search,business,none,News and Politics,Politics
3225,,15703767343430202960,1625672936,2021-07-07,/sites/judykoutsky/2021/01/18/winter-in-the-be...,article-amp/standard/subscriber/standard,89,1,0.50,42.00,...,2021-01-18 12:20:20,ios,mobile,safari,united states,organic search,lifestyle,travel,Travel,Travel Type
3702,,8407182356711146498,1640709151,2021-12-28,/sites/zackfriedman/2021/12/28/student-loans-a...,article-amp/standard/subscriber/standard,5,1,0.50,67.00,...,2021-12-28 08:30:00,ios,mobile,safari,united states,organic search,money,personal finance,Personal Finance,Personal Debt
4086,,6933793746327572126,1640001612,2021-12-20,/sites/adamminsky/2021/12/16/student-loan-forg...,article-amp/standard/subscriber/standard,131,1,0.50,136.00,...,2021-12-16 12:15:42,ios,mobile,safari,united states,organic search,money,personal finance,Personal Finance,Personal Debt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6012980,,12917877582245739886,1610502265,2021-01-12,/sites/davidteich/2021/01/12/business-contract...,article-amp/standard/subscriber/standard,142,1,0.25,94.00,...,2021-01-12 11:17:10,android,mobile,chrome,mexico,content aggregators,innovation,ai,Technology & Computing,Artificial Intelligence
6013584,,4400952167098872274,1628602475,2021-08-10,/sites/christinefletcher/2021/08/10/what-gen-z...,article/standard/subscriber/alx,28,1,0.00,22.00,...,2021-08-10 09:00:00,android,mobile,chrome,united states,organic search,money,retirement,Personal Finance,Financial Planning
6013845,,15179020181479012042,1617695406,2021-04-06,/sites/forbes-personal-shopper/2021/04/05/spri...,article-amp/finds/subscriber/standard,53,1,0.25,6.00,...,2021-04-05 10:58:04,android,mobile,chrome,united states,content aggregators,shopping,none,Shopping,Sales and Promotions
6013882,,4400952167098872274,1628602475,2021-08-10,/sites/steveparrish/2021/04/07/2021-retirement...,article/standard/subscriber/alx,28,1,,19.00,...,2021-04-07 06:41:25,android,mobile,chrome,united states,organic search,money,retirement,Personal Finance,Financial Assistance


In [13]:
suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()
len(suspicious_fvid)

395

In [14]:
print("Piano ID for suspicious fvids?: ", nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].piano_id.unique(), "\n")

# print("suspicious fvids present in subs dataset?: ", raw[raw.ga_fullvisitorid.isin(suspicious_fvid)].shape) # no

Piano ID for suspicious fvids?:  [None] 



In [15]:
# remove these suspicious fvids before eda

print("Before: ", nonsubs_data.shape)

nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]

print("After: ", nonsubs_data.shape)

print("Unqiue non-subs fvids for eda: ", len(nonsubs_data.GA_fullVisitorId.unique()))

Before:  (6014971, 22)
After:  (5938101, 22)
Unqiue non-subs fvids for eda:  409605


* dfpNewZone meaning:

    * Default = not logged in, not subscribed
    * nonsubscriber = logged in but not subscribed
    * advisor = investment advicsor - its a Forbes product

In [32]:
# now all clean

nonsubs_data['subs_from_GA_dfpNewZone'] = nonsubs_data['GA_dfpNewZone'].str.split('/').str[2]

nonsubs_data[['GA_fullVisitorId', 'subs_from_GA_dfpNewZone']].drop_duplicates(keep = 'first').subs_from_GA_dfpNewZone.value_counts()

default          409176
nonsubscriber       221
Name: subs_from_GA_dfpNewZone, dtype: int64

* Curiosity: How many total non-subs eligible people in Oct?
    * NOTE - no filtering condition on these people like num articles read >4 in a month

In [4]:
start_time = time.time()

sql = """SELECT
                    ga_fullvisitorid, 
                    MAX(ga_pianoId) AS piano_id,
                    sum(GA_pageViews) AS total_pvs
                FROM
                    `api-project-901373404215.DataMart.v_DataMart_updated`
                WHERE 
                    ga_date BETWEEN '2021-12-01' AND '2021-12-31' AND ga_fullvisitorid NOT IN 
                        (
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.smpv1_subs_pool`
                        )
                GROUP BY 
                    ga_fullvisitorid
                HAVING 
                    # make sure the fvid never had a piano id and choose people w/ >1 pv in 90 days
                    
                    piano_id IS NULL AND
                    sum(GA_pageViews)>3"""
mod = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(mod.shape) # >2 = 1145381

--- 13.77833104133606 seconds ---
(442308, 3)


In [34]:
mod

Unnamed: 0,ga_fullvisitorid,piano_id,total_pvs
0,15515958497248299200,,177
1,13416693579555678719,,10
2,17458839994691485306,,11
3,2063877349943260580,,12
4,1942758558328170255,,17
...,...,...,...
1145376,575206895022299277,,9
1145377,5307003103751073168,,9
1145378,11297628785445932476,,9
1145379,12023094562567420273,,9


In [36]:
# orig
# mod.total_pvs.describe()

count   15409118.00
mean           2.81
std            6.41
min            2.00
25%            2.00
50%            2.00
75%            3.00
max        23904.00
Name: total_pvs, dtype: float64

In [35]:
mod.total_pvs.describe()

count   1145381.00
mean          3.87
std           5.61
min           3.00
25%           3.00
50%           3.00
75%           4.00
max        5426.00
Name: total_pvs, dtype: float64