In [3]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

bq_client = bigquery.Client()

In [1]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [2]:
def pvs_eda(input_df, cat_col_name, drop_cols=False):
    
    # pivot
    df_cat = pd.pivot_table(
        input_df, 
        values='GA_pageViews',
        columns= cat_col_name,
        index='subscription_status',
        aggfunc='sum')\
        .reset_index()
    
    # drop none and other cols
    if drop_cols==True:
        
        if 'none' in list(df_cat.columns):
            df_cat.drop('none', axis=1, inplace=True)
        
        if 'other' in list(df_cat.columns):
            df_cat.drop('other', axis=1, inplace=True)

    
    # percentage calc below
    df_cat = df_cat.fillna(0)
    df_cat = df_cat.T
    
    new_header = df_cat.iloc[0] # grab the first row for the header
    df_cat = df_cat[1:] # take the data minus the header row
    df_cat.columns = new_header
  
    df_cat['% of non_subscriber pvs'] = (df_cat['non_subscriber'] / df_cat['non_subscriber'].sum()) * 100
    df_cat['% of subscriber pvs'] = (df_cat['subscriber'] / df_cat['subscriber'].sum()) * 100
    
    df_cat.drop(['non_subscriber', 'subscriber'], axis=1, inplace=True)

    df_cat['non_subscriber_rank'] = df_cat['% of non_subscriber pvs'].rank(ascending=False).astype(int)
    df_cat['subscriber_rank'] = df_cat['% of subscriber pvs'].rank(ascending=False).astype(int)

    df_cat = df_cat.sort_values('subscriber_rank')
    
    return df_cat

In [4]:
def top_eda(df):
    '''
    Prep df for eda
    '''
    # remove column = "none" i.e. Tier 1/Tier2/PC/PS was not available
    df.drop("none", axis=1, inplace=True)
    
    # group all subscribers, calculate mean of their (avg. time on page) for each category. Same for Non-subscribers
    df = df.groupby('subscription_status').mean().T
    
    # assign rank
    df['non_subscriber_rank'] = df['non_subscriber'].rank(ascending=False).astype(int)
    df['subscriber_rank'] = df['subscriber'].rank(ascending=False).astype(int)
    df = df.sort_values('subscriber_rank')

    return df

## DATA

In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.subscriber_ga_data`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)
subs_data.rename(columns={'ga_pianoId': 'piano_id'}, inplace=True)

# drop dups
subs_data.drop_duplicates(keep='first', inplace=True)
print("Duplicates?: ", subs_data.duplicated().any())


print(subs_data.shape)
print("Unique unlimited subscribers: ", len(subs_data.piano_id.unique()))
print("Unique unlimited fullvids: ", len(subs_data.GA_fullVisitorId.unique()), "\n")

subs_data.head()

--- 24.56462812423706 seconds ---
(6588071, 23)
Unique unlimited subscribers:  42678
Unique unlimited fullvids:  101600 

Duplicates?:  False


Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,pniuojdbxqsjc3h,6969734970922600880,1629730389,2021-08-23,/sites/jackkelly/2021/08/15/the-remote-trend-o...,article/standard/subscriber/alx,61,1,1.0,33.0,...,windows,desktop,chrome,united states,organic search,leadership,careers,Careers,Job Search,subscriber
1,pnicghdflqmfx7t,6375914690944396875,1609814826,2021-01-04,/sites/kwamechristian/2021/01/04/how-to-handle...,article/standard/subscriber/alx,3,1,0.25,783.0,...,windows,desktop,chrome,united states,organic search,leadership,careers,News and Politics,Politics,subscriber
2,pniy7r900qn3sir,6642711071460721994,1610927979,2021-01-17,/sites/nicolebendaly/2021/01/03/three-things-t...,article/standard/subscriber/alx,8,1,0.75,306.0,...,windows,desktop,edge,united states,organic search,leadership,careers,Business and Finance,Business,subscriber
3,pnihyef2cqjnevq,781170581791183074,1617029229,2021-03-29,/sites/danabrownlee/2021/03/28/increasingly-co...,article/standard/subscriber/alx,1007,1,0.5,8.0,...,windows,desktop,chrome,united states,organic search,leadership,careers,Business and Finance,Industries,subscriber
4,pniagcmbwqjw92n,3644467403674640393,1632406065,2021-09-23,/sites/kathycaprino/2021/09/20/6-key-ways-lead...,article/standard/subscriber/alx,63,1,0.75,107.0,...,macintosh,desktop,chrome,united states,newsletter,leadership,careers,Business and Finance,Business,subscriber


In [6]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.oct_ns_ga_data`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

# drop dups
nonsubs_data.drop_duplicates(keep='first', inplace=True)
print("Duplicates?: ", nonsubs_data.duplicated().any(), "\n")


print("---Before removing suspicious fvids---")
print(nonsubs_data.shape)
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')
suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()
print("---After removing suspicious fvids---", len(suspicious_fvid))
nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]
print(nonsubs_data.shape)
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")


nonsubs_data.head()

--- 11.949727773666382 seconds ---
Duplicates?:  False 

---Before removing suspicious fvids---
(4028728, 23)
Unique unlimited fullvids:  110000 

---After removing suspicious fvids--- 62
(4014189, 23)
Unique unlimited fullvids:  109938 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,,4883999870040518240,1599485310,2020-09-07,/sites/robertberger/2020/09/06/when-300-unempl...,article-amp/advisor/default/standard,97,1,0.75,141.0,...,android,mobile,android webview,united states,content aggregators,money,personal finance,Personal Finance,Financial Assistance,non_subscriber
1,,11827268133006795236,1588572447,2020-05-04,/sites/andrewsolender/2020/05/03/support-in-co...,article-amp/topline/default/standard,40,1,0.25,22.0,...,android,mobile,android webview,united states,content aggregators,business,policy,Personal Finance,Financial Assistance,non_subscriber
2,,8704629655117643411,1595016270,2020-07-17,/sites/ebauer/2020/07/17/the-social-security-t...,article-amp/standard/default/standard,43,1,,107.0,...,android,mobile,android webview,united states,content aggregators,money,retirement,Personal Finance,Financial Assistance,non_subscriber
3,,4959561173344172366,1595700290,2020-07-25,/sites/zackfriedman/2020/07/24/unemployment-be...,article-amp/standard/default/standard,11,1,0.0,,...,android,desktop,android webview,united states,content aggregators,money,personal finance,Personal Finance,Financial Assistance,non_subscriber
4,,1225431135904108999,1596797370,2020-08-07,/sites/zackfriedman/2020/07/22/second-stimulus...,article-amp/standard/default/standard,10,1,0.0,0.0,...,android,mobile,android webview,united states,content aggregators,money,personal finance,Personal Finance,Financial Assistance,non_subscriber


In [7]:
# any col names mismatch? - no

[x for x in list(subs_data.columns) if x not in list(nonsubs_data.columns)]

[]

In [8]:
df = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df.shape)

# fill na
df.GA_cmsNaturalId = df.GA_cmsNaturalId.fillna('None') 

# short list device OS
shortlisted_os = ["android", "ios", "macintosh", "windows"]

df["deviceOS"] = np.where(df["GA_deviceOperatingSystem"].isin(shortlisted_os), 
                          df["GA_deviceOperatingSystem"], 
                          "other")

df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

Shape:  (10602260, 23)


non_subscriber    109938
subscriber        101600
Name: subscription_status, dtype: int64

In [9]:
df.head()

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status,deviceOS
0,pniuojdbxqsjc3h,6969734970922600880,1629730389,2021-08-23,/sites/jackkelly/2021/08/15/the-remote-trend-o...,article/standard/subscriber/alx,61,1,1.0,33.0,...,desktop,chrome,united states,organic search,leadership,careers,Careers,Job Search,subscriber,windows
1,pnicghdflqmfx7t,6375914690944396875,1609814826,2021-01-04,/sites/kwamechristian/2021/01/04/how-to-handle...,article/standard/subscriber/alx,3,1,0.25,783.0,...,desktop,chrome,united states,organic search,leadership,careers,News and Politics,Politics,subscriber,windows
2,pniy7r900qn3sir,6642711071460721994,1610927979,2021-01-17,/sites/nicolebendaly/2021/01/03/three-things-t...,article/standard/subscriber/alx,8,1,0.75,306.0,...,desktop,edge,united states,organic search,leadership,careers,Business and Finance,Business,subscriber,windows
3,pnihyef2cqjnevq,781170581791183074,1617029229,2021-03-29,/sites/danabrownlee/2021/03/28/increasingly-co...,article/standard/subscriber/alx,1007,1,0.5,8.0,...,desktop,chrome,united states,organic search,leadership,careers,Business and Finance,Industries,subscriber,windows
4,pniagcmbwqjw92n,3644467403674640393,1632406065,2021-09-23,/sites/kathycaprino/2021/09/20/6-key-ways-lead...,article/standard/subscriber/alx,63,1,0.75,107.0,...,desktop,chrome,united states,newsletter,leadership,careers,Business and Finance,Business,subscriber,macintosh


In [10]:
target_class = df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first')

### User's whole behavior
**i.e., avg. top and sum(pvs) over whole GA history**

In [11]:
whole = df.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})
whole["avg_top"] = whole['timeOnPage']/whole['sum_pvs']

whole

Unnamed: 0,GA_fullVisitorId,subscription_status,sum_pvs,timeOnPage,avg_top
0,10000049855779198375,non_subscriber,3,244.00,81.33
1,10000168040775107380,non_subscriber,35,2153.00,61.51
2,10000181072003986570,non_subscriber,11,392.00,35.64
3,10000796870131167689,non_subscriber,6,629.00,104.83
4,1000096511026234346,subscriber,25,1781.00,71.24
...,...,...,...,...,...
211533,9999523824842481160,non_subscriber,13,1194.00,91.85
211534,9999640204191429572,non_subscriber,9,81.00,9.00
211535,9999943173698081042,non_subscriber,17,2529.00,148.76
211536,9999987496363756221,non_subscriber,43,5256.00,122.23


* Distribution of sum(pvs)
    * Even after limiting non-subs with >5 articles, on a whole -- subs have more pvs than non-subs (except extreme values)

In [12]:
whole.groupby('subscription_status').sum_pvs.describe().T

subscription_status,non_subscriber,subscriber
count,109938.0,101600.0
mean,36.51,64.84
std,96.72,246.4
min,1.0,1.0
25%,6.0,7.0
50%,11.0,18.0
75%,24.0,46.0
max,3797.0,22371.0


* Distribution of avg(top)
    * Subs spend more avg. top than non-subs on a whole

In [14]:
whole.groupby('subscription_status').avg_top.describe().T

subscription_status,non_subscriber,subscriber
count,109938.0,101600.0
mean,84.95,138.57
std,77.21,105.15
min,0.0,0.0
25%,35.41,68.0
50%,66.14,117.9
75%,109.93,181.85
max,1261.6,2497.0


### Users' Unique Pageviews in each session (avg, median)

In [15]:
# user's per pagepath GA data
page = df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath']).agg({'GA_pageViews': 'max', 'GA_scrollDepth': 'max', 'timeOnPage': 'sum'}).reset_index()

# user's per session GA data
session = page.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum', 'GA_scrollDepth': 'mean', 'timeOnPage': 'mean'}).reset_index()

session.rename(columns={'GA_pageViews': 'unique_pageViews', 'timeOnPage': 'top_per_session'}, inplace=True)
session

Unnamed: 0,GA_fullVisitorId,GA_visitStartTime,unique_pageViews,GA_scrollDepth,top_per_session
0,10000049855779198375,1633875633,1,0.50,176.00
1,10000049855779198375,1634090973,1,0.50,68.00
2,10000049855779198375,1634130400,1,0.00,0.00
3,10000168040775107380,1628320080,1,0.75,12.00
4,10000168040775107380,1628441132,1,0.25,22.00
...,...,...,...,...,...
6952743,9999987504359326751,1634442610,1,0.50,48.00
6952744,9999987504359326751,1634898404,1,0.50,159.00
6952745,9999987504359326751,1635315690,1,0.75,173.00
6952746,9999987504359326751,1635637982,1,0.75,329.00


* Average and Median of (unique pageviews in each sessions)

In [16]:
pageViews = session.groupby('GA_fullVisitorId').agg({'unique_pageViews': ['mean', 'median']}).reset_index()

# rename cols
pageViews.columns = [' '.join(col).strip() for col in pageViews.columns.values]
pageViews.rename(columns={'unique_pageViews mean':'unique_pageviews_mean', 'unique_pageViews median': 'unique_pageviews_median'}, inplace=True)

# join target class
pageViews = pd.merge(pageViews, target_class, how="left", on = "GA_fullVisitorId")
pageViews

Unnamed: 0,GA_fullVisitorId,unique_pageviews_mean,unique_pageviews_median,subscription_status
0,10000049855779198375,1.00,1.00,non_subscriber
1,10000168040775107380,1.06,1.00,non_subscriber
2,10000181072003986570,1.00,1.00,non_subscriber
3,10000796870131167689,1.00,1.00,non_subscriber
4,1000096511026234346,3.00,2.50,subscriber
...,...,...,...,...
211533,9999523824842481160,1.00,1.00,non_subscriber
211534,9999640204191429572,1.00,1.00,non_subscriber
211535,9999943173698081042,1.07,1.00,non_subscriber
211536,9999987496363756221,1.07,1.00,non_subscriber


In [19]:
pageViews.groupby('subscription_status').unique_pageviews_mean.describe().T

subscription_status,non_subscriber,subscriber
count,109938.0,101600.0
mean,1.09,2.33
std,0.36,4.45
min,1.0,1.0
25%,1.0,1.33
50%,1.0,1.75
75%,1.09,2.5
max,39.0,356.0


In [20]:
pageViews.groupby('subscription_status').unique_pageviews_median.describe().T

subscription_status,non_subscriber,subscriber
count,109938.0,101600.0
mean,1.03,1.88
std,0.35,4.43
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,2.0
max,39.0,356.0


### Users' Time on Page in each session (avg, median)

In [22]:
timeOnPage = session.groupby('GA_fullVisitorId').agg({'top_per_session': ['mean', 'median']}).reset_index()

# rename cols
timeOnPage.columns = [' '.join(col).strip() for col in timeOnPage.columns.values]
timeOnPage.rename(columns={'top_per_session mean':'top_mean', 'top_per_session median': 'top_median'}, inplace=True)

# join target class
timeOnPage = pd.merge(timeOnPage, target_class, how="left", on = "GA_fullVisitorId")
timeOnPage

Unnamed: 0,GA_fullVisitorId,top_mean,top_median,subscription_status
0,10000049855779198375,81.33,68.00,non_subscriber
1,10000168040775107380,61.31,0.00,non_subscriber
2,10000181072003986570,35.64,17.00,non_subscriber
3,10000796870131167689,104.83,5.00,non_subscriber
4,1000096511026234346,106.76,49.72,subscriber
...,...,...,...,...
211533,9999523824842481160,91.85,1.00,non_subscriber
211534,9999640204191429572,9.00,0.00,non_subscriber
211535,9999943173698081042,140.17,10.00,non_subscriber
211536,9999987496363756221,110.76,79.25,non_subscriber


* Average and Median of (sum of time on page in each session)

In [23]:
timeOnPage.groupby('subscription_status').top_mean.describe().T

subscription_status,non_subscriber,subscriber
count,109938.0,101600.0
mean,79.78,143.7
std,73.55,126.58
min,0.0,0.0
25%,33.67,66.92
50%,61.79,115.57
75%,102.28,183.72
max,1266.42,4994.0


In [24]:
timeOnPage.groupby('subscription_status').top_median.describe().T

subscription_status,non_subscriber,subscriber
count,109938.0,101600.0
mean,40.73,84.77
std,61.29,111.33
min,0.0,0.0
25%,5.0,28.17
50%,24.0,56.29
75%,52.5,99.5
max,1320.0,4994.0


### Pageviews in referral sources, country, device OS

In [25]:
pvs_eda(df, 'GA_referralGroup')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_referralGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
organic search,29.7,66.1,2,1
newsletter,0.01,7.95,8,2
referral,0.39,7.76,5,3
direct,9.28,6.82,3,4
organic social (dark),0.77,4.9,4,5
content aggregators,59.72,3.24,1,6
organic social (forbes),0.09,3.13,6,7
paid search,0.03,0.07,7,8
paid display,0.0,0.01,10,9
paid web,0.0,0.01,9,10


In [26]:
pvs_eda(df, 'GA_country').drop('(not set)').head(10)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
united states,66.09,91.29,1,1
canada,12.06,1.22,2,2
united kingdom,6.1,1.2,3,3
japan,0.17,0.71,24,4
australia,3.38,0.51,4,5
india,1.23,0.42,6,7
singapore,1.51,0.23,5,8
israel,0.12,0.19,32,9
dominican republic,0.06,0.18,44,10
hong kong,0.34,0.18,14,11


In [27]:
pvs_eda(df, 'deviceOS')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
deviceOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
windows,0.02,43.57,5,1
macintosh,0.21,39.92,3,2
ios,57.28,8.2,1,3
android,42.45,7.82,2,4
other,0.03,0.49,4,5


In [28]:
pvs_eda(df, 'deviceOS')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
deviceOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
windows,0.02,43.57,5,1
macintosh,0.21,39.92,3,2
ios,57.28,8.2,1,3
android,42.45,7.82,2,4
other,0.03,0.49,4,5


### Pageviews in various content categories
* Content categories = IAB Tier 1, Tier 2, PC, PS

In [29]:
content = df.copy()

# extract the start of natid string
content["natid_start"] = content.GA_cmsNaturalId.str.split("/").str[0]

print("Shape before: ", content.shape)

# keep only blogs, slides, magazine
content = content[(content['natid_start'].str.contains('blogandpostid')) | (content['natid_start'].str.contains('blogandslideid')) | (content['natid_start'].str.contains('magazine'))]

print("Shape after: ", content.shape)

# get month-year
content.GA_date = pd.to_datetime(content.GA_date)
content["mon_year"] = content['GA_date'].dt.to_period('M')

Shape before:  (10602260, 25)
Shape after:  (8661507, 25)


In [30]:
cat = content.copy()
cat.isna().sum()

piano_id                    3991421
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                235688
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth              1081180
timeOnPage                   402819
GA_cmsNaturalId                   0
title                         31771
publish_date                  31771
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                       2311390
tier2                       2772598
subscription_status               0
deviceOS                          0
natid_start                       0
mon_year                          0
dtype: int64

In [32]:
print("Before - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("Before - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

shortlisted_channel = joblib.load("pri_channel_shortlisted.pkl")
shortlisted_section = joblib.load("pri_section_shortlisted.pkl")

# primary channel
cat["GA_primaryChannel"] = np.where(cat["GA_primaryChannel"].isin(shortlisted_channel), 
                                    cat["GA_primaryChannel"], "other")

# primary section 
cat["GA_primarySection"] = np.where(cat["GA_primarySection"].isin(shortlisted_section), 
                                    cat["GA_primarySection"], "other")

print("After shortlisting - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("After shortlisting - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

# fillna with 0
cat.timeOnPage = cat.timeOnPage.fillna(0)

# replace empty and NULL with "none"
cat.tier1 = cat.tier1.replace(r'^\s*$', "none", regex=True)
cat.tier1 = cat.tier1.fillna("none")

# replace empty and NULL with "none"
cat.tier2 = cat.tier2.replace(r'^\s*$', "none", regex=True)
cat.tier2 = cat.tier2.fillna("none")

print("Unique all T1s: ", len(cat.tier1.unique()))  
print("Unique all T2s: ", len(cat.tier1.unique()))  

Before - unique PC:  35
Before - unique PS:  156 

After shortlisting - unique PC:  24
After shortlisting - unique PS:  51 

Unique all T1s:  31
Unique all T2s:  31


In [33]:
cat.shape

(8661507, 26)

* Pageviews - Tier 1

In [34]:
pvs_eda(cat, 'tier1', drop_cols=True)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business and Finance,17.03,31.08,1,1
Personal Finance,15.01,10.51,3,2
News and Politics,11.95,10.31,5,3
Technology & Computing,14.42,8.62,4,4
Medical Health,5.82,7.33,6,5
Video Gaming,16.18,3.67,2,6
Travel,2.19,3.11,7,7
Careers,1.18,2.49,12,8
Food & Drink,0.82,2.21,16,9
Shopping,1.02,2.19,14,10


* Pageviews - Tier 2

In [35]:
pvs_eda(cat, 'tier2', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business,4.41,18.58,9,1
Industries,4.04,8.55,11,2
Computing,8.18,6.12,3,3
Politics,7.77,6.11,4,4
Economy,9.52,5.98,1,5
Diseases and Conditions,4.11,5.62,10,6
Personal Investing,5.7,4.96,8,7
Personal Debt,6.4,2.12,7,8
Travel Type,1.16,1.93,17,9
Video Game Genres,6.4,1.83,6,10


* Pageviews - Primary Channel

In [36]:
pvs_eda(cat, 'GA_primaryChannel', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primaryChannel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
leadership,8.48,22.52,4,1
business,16.96,22.28,3,2
innovation,38.03,19.8,1,3
money,24.02,16.04,2,4
lifestyle,6.01,8.47,5,5
small business,0.91,4.11,7,6
billionaires,0.85,2.0,8,7
real estate,0.25,0.99,14,8
consumer,1.29,0.78,6,9
shopping,0.53,0.77,12,10


* Pageviews - Primary Section

In [37]:
pvs_eda(cat, 'GA_primarySection', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primarySection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
careers,2.54,6.62,13,1
leadership strategy,2.68,5.93,11,2
games,17.85,5.65,1,3
travel,2.98,4.73,8,4
forbeswomen,1.35,4.69,18,5
markets,3.9,4.51,6,6
personal finance,8.92,4.21,4,7
entrepreneurs,1.3,3.94,19,8
hollywood & entertainment,2.94,3.91,9,9
crypto & blockchain,9.04,3.56,3,10


### Avg. TOP in various content categories
* Content categories = IAB Tier 1, Tier 2

In [38]:
t1 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier1',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t1.GA_fullVisitorId)

# calc avg. top
t1_top = t1["timeOnPage"]/t1["GA_pageViews"]
t1_top = t1_top.fillna(0)

t1_top["GA_fullVisitorId"] = fvids
t1_top = pd.merge(t1_top, target_class, how="left", on="GA_fullVisitorId")

* Average time on page - Tier 1

In [39]:
top_eda(t1_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'})

subscription_status,non_subscriber avg. top,subscriber avg. top,non_subscriber_rank,subscriber_rank
Business and Finance,45.3,127.88,1,1
Personal Finance,41.52,83.13,2,2
Technology & Computing,34.72,70.92,3,3
Medical Health,22.6,67.06,6,4
News and Politics,24.12,66.05,5,5
Careers,10.51,38.82,11,6
Travel,13.2,36.92,7,7
Shopping,8.27,32.34,15,8
Style & Fashion,7.18,31.12,17,9
Sports,9.74,29.15,12,10


* Average time on page - Tier 2

In [40]:
t2 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier2',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t2.GA_fullVisitorId)

# calc avg. top
t2_top = t2["timeOnPage"]/t2["GA_pageViews"]
t2_top = t2_top.fillna(0)

t2_top["GA_fullVisitorId"] = fvids
t2_top = pd.merge(t2_top, target_class, how="left", on="GA_fullVisitorId")

top_eda(t2_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'}).head(15)

MemoryError: Unable to allocate 66.1 MiB for an array with shape (8661507,) and data type int64

### Avg. article views in each month

In [74]:
user_per_mon = pd.DataFrame(content.groupby(['GA_fullVisitorId', 'subscription_status', 'mon_year']).GA_cmsNaturalId.count()).reset_index().rename(columns=
                                                                                                                                                   {'GA_cmsNaturalId':'total_article_views'})
user_per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,total_article_views
0,10000058813304965608,non_subscriber,2020-02,6
1,10000058813304965608,non_subscriber,2020-03,14
2,10000058813304965608,non_subscriber,2020-04,13
3,10000058813304965608,non_subscriber,2020-05,27
4,10000058813304965608,non_subscriber,2020-06,15
...,...,...,...,...
1038067,9999883233119772718,non_subscriber,2020-01,1
1038068,9999883233119772718,non_subscriber,2020-10,2
1038069,9999883233119772718,non_subscriber,2021-01,1
1038070,9999883233119772718,non_subscriber,2021-05,6


* Avg(articles per month)

In [76]:
per_mon =  pd.DataFrame(user_per_mon.groupby(['GA_fullVisitorId', 'subscription_status']).total_article_views.mean()).reset_index()

per_mon.GA_cmsNaturalId = per_mon.total_article_views.round()
per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,total_article_views
0,10000058813304965608,non_subscriber,9.52
1,10000251105399450511,non_subscriber,8.92
2,10000610991313890343,non_subscriber,2.40
3,1000096511026234346,subscriber,6.00
4,1000104336364784244,subscriber,3.81
...,...,...,...
203780,9999629024408201784,non_subscriber,2.33
203781,999969109989075422,non_subscriber,2.70
203782,9999757056159711694,non_subscriber,11.47
203783,9999883233119772718,non_subscriber,2.20


In [77]:
per_mon.groupby('subscription_status').total_article_views.describe().T 

subscription_status,non_subscriber,subscriber
count,109738.0,94047.0
mean,6.44,13.69
std,9.54,30.8
min,1.39,1.0
25%,3.5,4.5
50%,5.0,8.0
75%,7.0,15.0
max,2064.8,3740.0


In [73]:
# example non-subs
user_per_mon[user_per_mon.GA_fullVisitorId=='10000058813304965608']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
0,10000058813304965608,non_subscriber,2020-02,6
1,10000058813304965608,non_subscriber,2020-03,14
2,10000058813304965608,non_subscriber,2020-04,13
3,10000058813304965608,non_subscriber,2020-05,27
4,10000058813304965608,non_subscriber,2020-06,15
5,10000058813304965608,non_subscriber,2020-07,9
6,10000058813304965608,non_subscriber,2020-08,1
7,10000058813304965608,non_subscriber,2020-09,13
8,10000058813304965608,non_subscriber,2020-10,23
9,10000058813304965608,non_subscriber,2020-11,13


In [69]:
# example subs
user_per_mon[user_per_mon.GA_fullVisitorId=='3857395123229566996']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
491055,3857395123229566996,subscriber,2020-03,2
491056,3857395123229566996,subscriber,2020-04,10
491057,3857395123229566996,subscriber,2020-05,5
491058,3857395123229566996,subscriber,2020-06,12
491059,3857395123229566996,subscriber,2020-07,8
491060,3857395123229566996,subscriber,2020-08,9
491061,3857395123229566996,subscriber,2020-09,1
491062,3857395123229566996,subscriber,2020-10,10
491063,3857395123229566996,subscriber,2020-11,8
491064,3857395123229566996,subscriber,2020-12,17


In [70]:
# example subs
user_per_mon[user_per_mon.GA_fullVisitorId=='5277846215104667271']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
627088,5277846215104667271,subscriber,2020-03,3
627089,5277846215104667271,subscriber,2020-04,38
627090,5277846215104667271,subscriber,2020-05,34
627091,5277846215104667271,subscriber,2020-06,96
627092,5277846215104667271,subscriber,2020-07,38
627093,5277846215104667271,subscriber,2020-08,69
627094,5277846215104667271,subscriber,2020-09,42
627095,5277846215104667271,subscriber,2020-10,55
627096,5277846215104667271,subscriber,2020-11,73
627097,5277846215104667271,subscriber,2020-12,109


In [None]:
# joining with C-levels for curiosity

start_time = time.time()

sql = """
    SELECT 
        *
      FROM (
        SELECT 
            DISTINCT *,
            RANK() OVER (PARTITION BY GA_fullVisitorId ORDER BY date DESC) AS mostrecent,
        FROM
            `api-project-901373404215.lookalike.zoom_info_c_level`
          )
      WHERE 
          mostrecent = 1
"""

clevels = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time)) #12.45

In [None]:
clevels