In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

bq_client = bigquery.Client()

In [2]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [3]:
def pvs_eda(input_df, cat_col_name, drop_cols=False):
    
    # pivot
    df_cat = pd.pivot_table(
        input_df, 
        values='GA_pageViews',
        columns= cat_col_name,
        index='subscription_status',
        aggfunc='sum')\
        .reset_index()
    
    # drop none and other cols
    if drop_cols==True:
        
        if 'none' in list(df_cat.columns):
            df_cat.drop('none', axis=1, inplace=True)
        
        if 'other' in list(df_cat.columns):
            df_cat.drop('other', axis=1, inplace=True)

    
    # percentage calc below
    df_cat = df_cat.fillna(0)
    df_cat = df_cat.T
    
    new_header = df_cat.iloc[0] # grab the first row for the header
    df_cat = df_cat[1:] # take the data minus the header row
    df_cat.columns = new_header
  
    df_cat['% of non_subscriber pvs'] = (df_cat['non_subscriber'] / df_cat['non_subscriber'].sum()) * 100
    df_cat['% of subscriber pvs'] = (df_cat['subscriber'] / df_cat['subscriber'].sum()) * 100
    
    df_cat.drop(['non_subscriber', 'subscriber'], axis=1, inplace=True)

    df_cat['non_subscriber_rank'] = df_cat['% of non_subscriber pvs'].rank(ascending=False).astype(int)
    df_cat['subscriber_rank'] = df_cat['% of subscriber pvs'].rank(ascending=False).astype(int)

    df_cat = df_cat.sort_values('subscriber_rank')
    
    return df_cat

In [4]:
def top_eda(df):
    '''
    Prep df for eda
    '''
    # remove column = "none" i.e. Tier 1/Tier2/PC/PS was not available
    df.drop("none", axis=1, inplace=True)
    
    # group all subscribers, calculate mean of their (avg. time on page) for each category. Same for Non-subscribers
    df = df.groupby('subscription_status').mean().T
    
    # assign rank
    df['non_subscriber_rank'] = df['non_subscriber'].rank(ascending=False).astype(int)
    df['subscriber_rank'] = df['subscriber'].rank(ascending=False).astype(int)
    df = df.sort_values('subscriber_rank')

    return df

## DATA

In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.subscriber_ga_data`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)
subs_data.rename(columns={'ga_pianoId': 'piano_id'}, inplace=True)

# drop dups
# subs_data.drop_duplicates(keep='first', inplace=True)

print(subs_data.shape)
print("Unique unlimited subscribers: ", len(subs_data.piano_id.unique()))
print("Unique unlimited fullvids: ", len(subs_data.GA_fullVisitorId.unique()), "\n")
print("Duplicates?: ", subs_data.duplicated().any())

subs_data.head()

--- 23.603087186813354 seconds ---
(9746613, 23)
Unique unlimited subscribers:  42678
Unique unlimited fullvids:  101600 

Duplicates?:  True


Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,pni0hkm7oqjzww7,5648315289347757915,1633454424,2021-10-05,/sites/robinryan/2021/10/05/add-a-linkedin-bac...,article/standard/subscriber/alx,44,1,0.0,3.0,...,windows,desktop,chrome,united states,organic search,leadership,careers,,,subscriber
1,pni49gmy7qnaeqe,7460490544181184665,1611958424,2021-01-29,/sites/lizryan/2017/05/15/how-to-handle-a-bait...,article/standard/subscriber/alx,1,1,0.75,35.0,...,macintosh,desktop,safari,united states,organic search,leadership,careers,,,subscriber
2,pniohazwsqkmkqb,3996879997278966106,1612361085,2021-02-03,/sites/jackkelly/2021/02/02/amazons-new-massiv...,article/standard/subscriber/alx,88,1,0.75,560.0,...,windows,desktop,chrome,united states,organic search,leadership,careers,Real Estate,Developmental Sites,subscriber
3,pniptkdboqj82qy,3206243803054980323,1623795177,2021-06-15,/sites/williamarruda/2021/06/13/5-red-flags-to...,article/standard/subscriber/alx,118,1,0.75,51.0,...,windows,desktop,edge,united states,organic search,leadership,careers,Careers,,subscriber
4,pniirhrfiqzfn7f,4623187229507344716,1633469388,2021-10-05,/sites/davidsturt/2018/03/08/10-shocking-workp...,article/standard/subscriber/alx,32,1,1.0,25.0,...,windows,desktop,chrome,united states,organic search,leadership,careers,,,subscriber


In [6]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.nonsubscriber_ga_data`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Duplicates?: ", nonsubs_data.duplicated().any())


print("---Before removing suspicious fvids---")
print(nonsubs_data.shape)
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')
suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()
print("---After removing suspicious fvids---", len(suspicious_fvid))
nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]
print(nonsubs_data.shape)
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")


nonsubs_data.head()

--- 13.846479177474976 seconds ---
Duplicates?:  True
---Before removing suspicious fvids---
(5014346, 23)
Unique unlimited fullvids:  109738 

---After removing suspicious fvids--- 155
(4995517, 23)
Unique unlimited fullvids:  109583 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,,4322305614776686660,1602062780,2020-10-07,/sites/gordonkelly/2020/10/06/apple-iphone-12-...,article/standard/default/standard,224,1,0.25,20.0,...,android,desktop,android webview,canada,content aggregators,innovation,consumer tech,Technology & Computing,Consumer Electronics,non_subscriber
1,,11020952544086358612,1610486452,2021-01-12,/sites/zakdoffman/2021/01/12/if-these-apps-are...,article-amp/standard/default/standard,6,1,0.5,3.0,...,android,mobile,android webview,united states,content aggregators,innovation,cybersecurity,Technology & Computing,Consumer Electronics,non_subscriber
2,,17312517012768210000,1596333111,2020-08-01,/sites/daveywinder/2020/08/01/this-samsung-sec...,article-amp/standard/default/standard,26,1,0.5,45.0,...,android,mobile,android webview,united states,content aggregators,innovation,cybersecurity,Technology & Computing,Consumer Electronics,non_subscriber
3,,1358346986832705004,1603433574,2020-10-23,/sites/gordonkelly/2020/10/22/apple-iphone-13-...,article-amp/standard/default/standard,103,1,0.5,55.0,...,android,mobile,android webview,united kingdom,content aggregators,innovation,consumer tech,Technology & Computing,Consumer Electronics,non_subscriber
4,,7782286270075194416,1603089853,2020-10-19,/sites/johnarcher/2020/10/18/lg-oled48cx-oled-...,article-amp/standard/default/standard,318,1,0.0,0.0,...,android,mobile,android webview,united states,content aggregators,innovation,consumer tech,Technology & Computing,Consumer Electronics,non_subscriber


In [7]:
# any col names mismatch? - no

[x for x in list(subs_data.columns) if x not in list(nonsubs_data.columns)]

[]

In [8]:
df = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df.shape)

# fill na
df.GA_cmsNaturalId = df.GA_cmsNaturalId.fillna('None') 

# short list device OS
shortlisted_os = ["android", "ios", "macintosh", "windows"]

df["deviceOS"] = np.where(df["GA_deviceOperatingSystem"].isin(shortlisted_os), 
                          df["GA_deviceOperatingSystem"], 
                          "other")

df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

Shape:  (14742130, 23)


non_subscriber    109583
subscriber        101600
Name: subscription_status, dtype: int64

In [14]:
df.head()

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status,deviceOS
0,pni0hkm7oqjzww7,5648315289347757915,1633454424,2021-10-05,/sites/robinryan/2021/10/05/add-a-linkedin-bac...,article/standard/subscriber/alx,44,1,0.0,3.0,...,desktop,chrome,united states,organic search,leadership,careers,,,subscriber,windows
1,pni49gmy7qnaeqe,7460490544181184665,1611958424,2021-01-29,/sites/lizryan/2017/05/15/how-to-handle-a-bait...,article/standard/subscriber/alx,1,1,0.75,35.0,...,desktop,safari,united states,organic search,leadership,careers,,,subscriber,macintosh
2,pniohazwsqkmkqb,3996879997278966106,1612361085,2021-02-03,/sites/jackkelly/2021/02/02/amazons-new-massiv...,article/standard/subscriber/alx,88,1,0.75,560.0,...,desktop,chrome,united states,organic search,leadership,careers,Real Estate,Developmental Sites,subscriber,windows
3,pniptkdboqj82qy,3206243803054980323,1623795177,2021-06-15,/sites/williamarruda/2021/06/13/5-red-flags-to...,article/standard/subscriber/alx,118,1,0.75,51.0,...,desktop,edge,united states,organic search,leadership,careers,Careers,,subscriber,windows
4,pniirhrfiqzfn7f,4623187229507344716,1633469388,2021-10-05,/sites/davidsturt/2018/03/08/10-shocking-workp...,article/standard/subscriber/alx,32,1,1.0,25.0,...,desktop,chrome,united states,organic search,leadership,careers,,,subscriber,windows


In [10]:
target_class = df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first')

### User's whole behavior
**i.e., avg. top and sum(pvs) over whole GA history**

In [11]:
whole = df.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})
whole["avg_top"] = whole['timeOnPage']/whole['sum_pvs']

whole

Unnamed: 0,GA_fullVisitorId,subscription_status,sum_pvs,timeOnPage,avg_top
0,10000058813304965608,non_subscriber,200,15870.00,79.35
1,10000251105399450511,non_subscriber,117,10958.00,93.66
2,10000610991313890343,non_subscriber,12,2076.00,173.00
3,1000096511026234346,subscriber,31,1904.00,61.42
4,1000104336364784244,subscriber,92,16635.00,180.82
...,...,...,...,...,...
211178,9999629024408201784,non_subscriber,7,75.00,10.71
211179,999969109989075422,non_subscriber,27,3096.00,114.67
211180,9999757056159711694,non_subscriber,195,13977.00,71.68
211181,9999883233119772718,non_subscriber,11,1628.00,148.00


* Distribution of sum(pvs)
    * Even after limiting non-subs with >5 articles, on a whole -- subs have more pvs than non-subs (except extreme values)

In [12]:
whole.groupby('subscription_status').sum_pvs.describe().T

subscription_status,non_subscriber,subscriber
count,109583.0,101600.0
mean,45.59,95.93
std,483.64,616.55
min,5.0,1.0
25%,8.0,10.0
50%,15.0,24.0
75%,36.0,65.0
max,155102.0,76285.0


* Distribution of avg(top)
    * Subs spend more avg. top than non-subs on a whole

In [13]:
whole.groupby('subscription_status').avg_top.describe().T

subscription_status,non_subscriber,subscriber
count,109583.0,101600.0
mean,92.94,138.33
std,99.84,103.77
min,0.0,0.0
25%,31.4,68.0
50%,66.08,118.27
75%,119.0,181.84
max,5661.83,2497.0


### Users' Unique Pageviews in each session (avg, median)

In [15]:
# user's per pagepath GA data
page = df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath']).agg({'GA_pageViews': 'max', 'GA_scrollDepth': 'max', 'timeOnPage': 'sum'}).reset_index()

# user's per session GA data
session = page.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum', 'GA_scrollDepth': 'mean', 'timeOnPage': 'mean'}).reset_index()

session.rename(columns={'GA_pageViews': 'unique_pageViews', 'timeOnPage': 'top_per_session'}, inplace=True)
session

Unnamed: 0,GA_fullVisitorId,GA_visitStartTime,unique_pageViews,GA_scrollDepth,top_per_session
0,10000058813304965608,1582907879,1,1.00,9.00
1,10000058813304965608,1582991705,4,1.00,517.00
2,10000058813304965608,1583126231,1,1.00,9.00
3,10000058813304965608,1584312177,1,1.00,4.00
4,10000058813304965608,1584746000,1,1.00,8.00
...,...,...,...,...,...
7146908,9999953169198331967,1626093686,1,0.00,0.00
7146909,9999953169198331967,1626266711,1,0.50,49.00
7146910,9999953169198331967,1626612852,1,0.50,14.00
7146911,9999953169198331967,1626697065,1,0.25,10.00


* Average and Median of (unique pageviews in each sessions)

In [16]:
pageViews = session.groupby('GA_fullVisitorId').agg({'unique_pageViews': ['mean', 'median']}).reset_index()

# rename cols
pageViews.columns = [' '.join(col).strip() for col in pageViews.columns.values]
pageViews.rename(columns={'unique_pageViews mean':'unique_pageviews_mean', 'unique_pageViews median': 'unique_pageviews_median'}, inplace=True)

# join target class
pageViews = pd.merge(pageViews, target_class, how="left", on = "GA_fullVisitorId")
pageViews

Unnamed: 0,GA_fullVisitorId,unique_pageviews_mean,unique_pageviews_median,subscription_status
0,10000058813304965608,1.14,1.00,non_subscriber
1,10000251105399450511,1.81,1.00,non_subscriber
2,10000610991313890343,1.00,1.00,non_subscriber
3,1000096511026234346,3.00,2.50,subscriber
4,1000104336364784244,1.67,1.00,subscriber
...,...,...,...,...
211178,9999629024408201784,1.00,1.00,non_subscriber
211179,999969109989075422,1.08,1.00,non_subscriber
211180,9999757056159711694,1.16,1.00,non_subscriber
211181,9999883233119772718,1.00,1.00,non_subscriber


In [17]:
pageViews.groupby('subscription_status').unique_pageviews_mean.describe().T

subscription_status,non_subscriber,subscriber
count,109583.0,101600.0
mean,1.29,2.33
std,1.01,4.45
min,1.0,1.0
25%,1.0,1.33
50%,1.08,1.75
75%,1.21,2.5
max,142.0,356.0


In [18]:
pageViews.groupby('subscription_status').unique_pageviews_median.describe().T

subscription_status,non_subscriber,subscriber
count,109583.0,101600.0
mean,1.18,1.88
std,0.99,4.43
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,2.0
max,142.0,356.0


### Users' Time on Page in each session (avg, median)

In [19]:
timeOnPage = session.groupby('GA_fullVisitorId').agg({'timeOnPage': ['mean', 'median']}).reset_index()

# rename cols
timeOnPage.columns = [' '.join(col).strip() for col in timeOnPage.columns.values]
timeOnPage.rename(columns={'timeOnPage mean':'top_mean', 'timeOnPage median': 'top_median'}, inplace=True)

# join target class
timeOnPage = pd.merge(timeOnPage, target_class, how="left", on = "GA_fullVisitorId")
timeOnPage

KeyError: "Column 'timeOnPage' does not exist!"

* Average and Median of (sum of time on page in each session)

In [19]:
timeOnPage.groupby('subscription_status').top_mean.describe().T

subscription_status,non_subscriber,subscriber
count,109738.0,101600.0
mean,110.69,210.48
std,184.45,321.82
min,0.0,0.0
25%,30.78,85.53
50%,64.27,149.0
75%,123.67,249.65
max,9542.0,33240.0


In [20]:
timeOnPage.groupby('subscription_status').top_median.describe().T

subscription_status,non_subscriber,subscriber
count,109738.0,101600.0
mean,58.56,123.16
std,166.54,286.59
min,0.0,0.0
25%,2.0,36.0
50%,21.0,71.38
75%,56.0,134.0
max,9542.0,33240.0


### Pageviews in referral sources, country, device OS

In [21]:
pvs_eda(df, 'GA_referralGroup')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_referralGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
organic search,25.84,65.84,2,1
referral,1.65,8.49,6,2
direct,4.3,7.69,5,3
newsletter,0.64,7.15,8,4
organic social (dark),4.35,4.82,4,5
organic social (forbes),8.06,3.11,3,6
content aggregators,53.61,2.8,1,7
paid search,0.02,0.07,9,8
paid display,0.0,0.02,11,9
paid web,0.01,0.01,10,10


In [22]:
pvs_eda(df, 'GA_country').drop('(not set)').head(10)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
united states,69.4,89.93,1,1
japan,0.19,1.55,23,2
russia,0.09,1.21,38,3
canada,8.16,1.12,2,4
united kingdom,7.36,1.04,3,5
australia,1.95,0.5,4,6
india,1.41,0.38,5,8
singapore,0.94,0.21,6,9
mexico,0.54,0.16,8,10
germany,0.48,0.16,10,11


In [23]:
pvs_eda(df, 'deviceOS')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
deviceOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
windows,5.59,44.25,3,1
macintosh,3.04,39.31,4,2
ios,48.79,8.25,1,3
android,42.13,7.7,2,4
other,0.45,0.5,5,5


In [24]:
pvs_eda(df, 'deviceOS')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
deviceOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
windows,5.59,44.25,3,1
macintosh,3.04,39.31,4,2
ios,48.79,8.25,1,3
android,42.13,7.7,2,4
other,0.45,0.5,5,5


### Pageviews in various content categories
* Content categories = IAB Tier 1, Tier 2, PC, PS

In [25]:
content = df.copy()

# extract the start of natid string
content["natid_start"] = content.GA_cmsNaturalId.str.split("/").str[0]

print("Shape before: ", content.shape)

# keep only blogs, slides, magazine
content = content[(content['natid_start'].str.contains('blogandpostid')) | (content['natid_start'].str.contains('blogandslideid')) | (content['natid_start'].str.contains('magazine'))]

print("Shape after: ", content.shape)

# get month-year
content.GA_date = pd.to_datetime(content.GA_date)
content["mon_year"] = content['GA_date'].dt.to_period('M')

Shape before:  (14760959, 25)
Shape after:  (11055442, 25)


In [26]:
cat = content.copy()
cat.isna().sum()

piano_id                    4742799
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                814879
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth              1513107
timeOnPage                   405121
GA_cmsNaturalId                   0
title                         56501
publish_date                  56501
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                       2575048
tier2                       3221833
subscription_status               0
deviceOS                          0
natid_start                       0
mon_year                          0
dtype: int64

In [28]:
print("Before - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("Before - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

shortlisted_channel = joblib.load("pri_channel_shortlisted.pkl")
shortlisted_section = joblib.load("pri_section_shortlisted.pkl")

# primary channel
cat["GA_primaryChannel"] = np.where(cat["GA_primaryChannel"].isin(shortlisted_channel), 
                                    cat["GA_primaryChannel"], "other")

# primary section 
cat["GA_primarySection"] = np.where(cat["GA_primarySection"].isin(shortlisted_section), 
                                    cat["GA_primarySection"], "other")

print("After shortlisting - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("After shortlisting - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

# fillna with 0
cat.timeOnPage = cat.timeOnPage.fillna(0)

# replace empty and NULL with "none"
cat.tier1 = cat.tier1.replace(r'^\s*$', "none", regex=True)
cat.tier1 = cat.tier1.fillna("none")

# replace empty and NULL with "none"
cat.tier2 = cat.tier2.replace(r'^\s*$', "none", regex=True)
cat.tier2 = cat.tier2.fillna("none")

print("Unique all T1s: ", len(cat.tier1.unique()))  
print("Unique all T2s: ", len(cat.tier1.unique()))  

Before - unique PC:  35
Before - unique PS:  156 

After shortlisting - unique PC:  24
After shortlisting - unique PS:  51 

Unique all T1s:  31
Unique all T2s:  31


* Pageviews - Tier 1

In [30]:
pvs_eda(cat, 'tier1', drop_cols=True)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business and Finance,17.19,30.98,2,1
Personal Finance,12.53,10.53,3,2
News and Politics,28.48,10.41,1,3
Technology & Computing,9.85,8.75,4,4
Medical Health,7.2,7.22,6,5
Video Gaming,7.34,3.84,5,6
Travel,2.16,3.08,7,7
Careers,1.05,2.42,12,8
Sports,1.83,2.18,8,9
Food & Drink,0.74,2.16,17,10


* Pageviews - Tier 2

In [31]:
pvs_eda(cat, 'tier2', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business,4.96,18.31,5,1
Industries,4.69,8.43,7,2
Economy,8.65,6.27,2,3
Computing,5.35,6.2,4,4
Politics,20.52,6.16,1,5
Diseases and Conditions,5.35,5.54,3,6
Personal Investing,4.15,5.04,8,7
Personal Debt,3.85,2.14,11,8
Video Game Genres,4.03,1.95,9,9
Travel Type,1.28,1.93,15,10


* Pageviews - Primary Channel

In [35]:
pvs_eda(cat, 'GA_primaryChannel', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primaryChannel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
business,30.9,22.37,1,1
leadership,9.73,22.26,4,2
innovation,25.55,19.77,2,3
money,21.45,16.33,3,4
lifestyle,5.53,8.37,5,5
small business,1.02,4.0,7,6
billionaires,1.67,1.98,6,7
real estate,0.28,0.93,13,8
consumer,0.97,0.8,8,9
shopping,0.77,0.75,9,10


In [20]:
# games
pvs_eda(cat, 'GA_primaryChannel', drop_cols=True).head(20)

NameError: name 'cat' is not defined

* Pageviews - Primary Section

In [53]:
pvs_eda(cat, 'GA_primarySection', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primarySection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
careers,3.28,6.56,11,1
leadership strategy,3.47,5.9,9,2
games,10.85,5.8,1,3
travel,3.48,4.74,8,4
markets,6.51,4.65,4,5
forbeswomen,2.1,4.59,16,6
personal finance,10.64,4.21,2,7
hollywood & entertainment,3.03,3.98,13,8
crypto & blockchain,5.55,3.88,5,9
entrepreneurs,1.35,3.85,21,10


### Avg. TOP in various content categories
* Content categories = IAB Tier 1, Tier 2

In [54]:
t1 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier1',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t1.GA_fullVisitorId)

# calc avg. top
t1_top = t1["timeOnPage"]/t1["GA_pageViews"]
t1_top = t1_top.fillna(0)

t1_top["GA_fullVisitorId"] = fvids
t1_top = pd.merge(t1_top, target_class, how="left", on="GA_fullVisitorId")

* Average time on page - Tier 1

In [58]:
top_eda(t1_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'})

subscription_status,non_subscriber avg. top,subscriber avg. top,non_subscriber_rank,subscriber_rank
Business and Finance,56.54,129.02,1,1
Personal Finance,43.94,83.6,3,2
Technology & Computing,35.33,71.38,4,3
Medical Health,32.6,67.43,5,4
News and Politics,45.96,66.41,2,5
Careers,11.83,39.0,9,6
Travel,18.57,37.14,6,7
Shopping,10.98,32.61,13,8
Style & Fashion,8.83,31.28,16,9
Sports,13.4,29.33,8,10


* Average time on page - Tier 2

In [61]:
t2 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier2',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t2.GA_fullVisitorId)

# calc avg. top
t2_top = t2["timeOnPage"]/t2["GA_pageViews"]
t2_top = t2_top.fillna(0)

t2_top["GA_fullVisitorId"] = fvids
t2_top = pd.merge(t2_top, target_class, how="left", on="GA_fullVisitorId")

top_eda(t2_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'}).head(15)

subscription_status,non_subscriber avg. top,subscriber avg. top,non_subscriber_rank,subscriber_rank
Business,33.18,105.75,3,1
Industries,31.0,81.2,4,2
Computing,28.62,59.54,5,3
Economy,33.5,59.38,2,4
Personal Investing,25.03,56.94,7,5
Diseases and Conditions,25.82,56.39,6,6
Politics,37.92,43.18,1,7
Law,23.33,27.5,8,8
Personal Debt,18.23,26.79,10,9
Travel Type,12.9,26.29,13,10


### Avg. article views in each month

In [74]:
user_per_mon = pd.DataFrame(content.groupby(['GA_fullVisitorId', 'subscription_status', 'mon_year']).GA_cmsNaturalId.count()).reset_index().rename(columns=
                                                                                                                                                   {'GA_cmsNaturalId':'total_article_views'})
user_per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,total_article_views
0,10000058813304965608,non_subscriber,2020-02,6
1,10000058813304965608,non_subscriber,2020-03,14
2,10000058813304965608,non_subscriber,2020-04,13
3,10000058813304965608,non_subscriber,2020-05,27
4,10000058813304965608,non_subscriber,2020-06,15
...,...,...,...,...
1038067,9999883233119772718,non_subscriber,2020-01,1
1038068,9999883233119772718,non_subscriber,2020-10,2
1038069,9999883233119772718,non_subscriber,2021-01,1
1038070,9999883233119772718,non_subscriber,2021-05,6


* Avg(articles per month)

In [76]:
per_mon =  pd.DataFrame(user_per_mon.groupby(['GA_fullVisitorId', 'subscription_status']).total_article_views.mean()).reset_index()

per_mon.GA_cmsNaturalId = per_mon.total_article_views.round()
per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,total_article_views
0,10000058813304965608,non_subscriber,9.52
1,10000251105399450511,non_subscriber,8.92
2,10000610991313890343,non_subscriber,2.40
3,1000096511026234346,subscriber,6.00
4,1000104336364784244,subscriber,3.81
...,...,...,...
203780,9999629024408201784,non_subscriber,2.33
203781,999969109989075422,non_subscriber,2.70
203782,9999757056159711694,non_subscriber,11.47
203783,9999883233119772718,non_subscriber,2.20


In [77]:
per_mon.groupby('subscription_status').total_article_views.describe().T 

subscription_status,non_subscriber,subscriber
count,109738.0,94047.0
mean,6.44,13.69
std,9.54,30.8
min,1.39,1.0
25%,3.5,4.5
50%,5.0,8.0
75%,7.0,15.0
max,2064.8,3740.0


In [73]:
# example non-subs
user_per_mon[user_per_mon.GA_fullVisitorId=='10000058813304965608']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
0,10000058813304965608,non_subscriber,2020-02,6
1,10000058813304965608,non_subscriber,2020-03,14
2,10000058813304965608,non_subscriber,2020-04,13
3,10000058813304965608,non_subscriber,2020-05,27
4,10000058813304965608,non_subscriber,2020-06,15
5,10000058813304965608,non_subscriber,2020-07,9
6,10000058813304965608,non_subscriber,2020-08,1
7,10000058813304965608,non_subscriber,2020-09,13
8,10000058813304965608,non_subscriber,2020-10,23
9,10000058813304965608,non_subscriber,2020-11,13


In [69]:
# example subs
user_per_mon[user_per_mon.GA_fullVisitorId=='3857395123229566996']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
491055,3857395123229566996,subscriber,2020-03,2
491056,3857395123229566996,subscriber,2020-04,10
491057,3857395123229566996,subscriber,2020-05,5
491058,3857395123229566996,subscriber,2020-06,12
491059,3857395123229566996,subscriber,2020-07,8
491060,3857395123229566996,subscriber,2020-08,9
491061,3857395123229566996,subscriber,2020-09,1
491062,3857395123229566996,subscriber,2020-10,10
491063,3857395123229566996,subscriber,2020-11,8
491064,3857395123229566996,subscriber,2020-12,17


In [70]:
# example subs
user_per_mon[user_per_mon.GA_fullVisitorId=='5277846215104667271']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
627088,5277846215104667271,subscriber,2020-03,3
627089,5277846215104667271,subscriber,2020-04,38
627090,5277846215104667271,subscriber,2020-05,34
627091,5277846215104667271,subscriber,2020-06,96
627092,5277846215104667271,subscriber,2020-07,38
627093,5277846215104667271,subscriber,2020-08,69
627094,5277846215104667271,subscriber,2020-09,42
627095,5277846215104667271,subscriber,2020-10,55
627096,5277846215104667271,subscriber,2020-11,73
627097,5277846215104667271,subscriber,2020-12,109


In [None]:
# joining with C-levels for curiosity

start_time = time.time()

sql = """
    SELECT 
        *
      FROM (
        SELECT 
            DISTINCT *,
            RANK() OVER (PARTITION BY GA_fullVisitorId ORDER BY date DESC) AS mostrecent,
        FROM
            `api-project-901373404215.lookalike.zoom_info_c_level`
          )
      WHERE 
          mostrecent = 1
"""

clevels = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time)) #12.45

In [None]:
clevels