In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

bq_client = bigquery.Client()

In [2]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [3]:
def pvs_eda(input_df, cat_col_name, drop_cols=False):
    
    # pivot
    df_cat = pd.pivot_table(
        input_df, 
        values='GA_pageViews',
        columns= cat_col_name,
        index='subscription_status',
        aggfunc='sum')\
        .reset_index()
    
    # drop none and other cols
    if drop_cols==True:
        
        if 'none' in list(df_cat.columns):
            df_cat.drop('none', axis=1, inplace=True)
        
        if 'other' in list(df_cat.columns):
            df_cat.drop('other', axis=1, inplace=True)

    
    # percentage calc below
    df_cat = df_cat.fillna(0)
    df_cat = df_cat.T
    
    new_header = df_cat.iloc[0] # grab the first row for the header
    df_cat = df_cat[1:] # take the data minus the header row
    df_cat.columns = new_header
  
    df_cat['% of non_subscriber pvs'] = (df_cat['non_subscriber'] / df_cat['non_subscriber'].sum()) * 100
    df_cat['% of subscriber pvs'] = (df_cat['subscriber'] / df_cat['subscriber'].sum()) * 100
    
    df_cat.drop(['non_subscriber', 'subscriber'], axis=1, inplace=True)

    df_cat['non_subscriber_rank'] = df_cat['% of non_subscriber pvs'].rank(ascending=False).astype(int)
    df_cat['subscriber_rank'] = df_cat['% of subscriber pvs'].rank(ascending=False).astype(int)

    df_cat = df_cat.sort_values('subscriber_rank')
    
    return df_cat

In [4]:
def top_eda(df):
    '''
    Prep df for eda
    '''
    # remove column = "none" i.e. Tier 1/Tier2/PC/PS was not available
    df.drop("none", axis=1, inplace=True)
    
    # group all subscribers, calculate mean of their (avg. time on page) for each category. Same for Non-subscribers
    df = df.groupby('subscription_status').mean().T
    
    # assign rank
    df['non_subscriber_rank'] = df['non_subscriber'].rank(ascending=False).astype(int)
    df['subscriber_rank'] = df['subscriber'].rank(ascending=False).astype(int)
    df = df.sort_values('subscriber_rank')

    return df

## DATA

In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.subscriber_ga_data`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)
subs_data.rename(columns={'ga_pianoId': 'piano_id'}, inplace=True)

print(subs_data.shape)
print("Unique unlimited subscribers: ", len(subs_data.piano_id.unique()))
print("Unique unlimited fullvids: ", len(subs_data.GA_fullVisitorId.unique()), "\n")

subs_data.head()

--- 17.195920944213867 seconds ---
(9746613, 23)
Unique unlimited subscribers:  42678
Unique unlimited fullvids:  101600 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,pnijq8uahqt1sr8,2734608745588682044,1632679824,2021-09-26,/sites/kathycaprino/2021/09/20/6-key-ways-lead...,article/standard/subscriber/alx,147,1,,5.0,...,windows,desktop,edge,united states,newsletter,leadership,careers,Business and Finance,Business,subscriber
1,pniptkdboqj82qy,3206243803054980323,1632683536,2021-09-26,/sites/jackkelly/2021/09/26/the-future-of-hybr...,article/standard/subscriber/alx,279,1,1.0,35.0,...,windows,desktop,edge,united states,organic search,leadership,careers,Technology & Computing,Computing,subscriber
2,pnijvkirpqja131,6647027608060961839,1624973889,2021-06-29,/sites/lucianapaulise/2021/06/29/some-52-of-em...,article/standard/subscriber/alx,535,1,0.5,13.0,...,macintosh,desktop,chrome,united states,organic search,leadership,careers,Careers,Remote Working,subscriber
3,pni6ssai9qjzzxh,4724604073189409071,1620156871,2021-05-04,/sites/jackkelly/2021/04/21/general-motors-the...,article/standard/subscriber/alx,87,1,0.5,611.0,...,macintosh,desktop,chrome,united states,organic search,leadership,careers,Business and Finance,Industries,subscriber
4,pniuojdbxqsjc3h,6969734970922600880,1631108312,2021-09-08,/sites/markcperna/2021/08/10/4-strategies-for-...,article/standard/subscriber/alx,67,1,1.0,116.0,...,windows,desktop,chrome,united states,organic search,leadership,careers,Business and Finance,Business,subscriber


In [6]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.all_elig_ns_ga_data`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("---Before removing suspicious fvids---")
print(nonsubs_data.shape)
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')
suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()
print("---After removing suspicious fvids---", len(suspicious_fvid))
nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]
print(nonsubs_data.shape)
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")


nonsubs_data.head()

--- 27.787716150283813 seconds ---
---Before removing suspicious fvids---
(15944870, 23)
Unique unlimited fullvids:  402533 

---After removing suspicious fvids--- 226
(15819504, 23)
Unique unlimited fullvids:  402307 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,,5787597410793801006,1573219646,2019-11-08,/sites/forbesdigitalcovers/2019/11/08/jim-simo...,none,123,1,1.0,399.0,...,ios,mobile,safari (in-app),united states,content aggregators,industry,none,,,non_subscriber
1,,3376429588497101262,1576659613,2019-12-18,/sites/peterjreilly/2019/12/17/100b-in-mormon-...,none,890,1,0.75,750.0,...,ios,mobile,safari (in-app),united states,content aggregators,money,taxes,,,non_subscriber
2,,2789422896167405547,1565434487,2019-08-10,/sites/daveywinder/2019/08/10/apples-iphone-fa...,none,128,1,0.75,208.0,...,ios,mobile,safari (in-app),canada,content aggregators,innovation,cybersecurity,,,non_subscriber
3,,7508104229272451575,1548639598,2019-01-27,/sites/davidphelan/2019/01/27/airpods-2-reveal...,none,160,1,,,...,ios,mobile,safari (in-app),united states,content aggregators,innovation,consumer tech,,,non_subscriber
4,,6220182084376833655,1534964621,2018-08-22,/sites/mitsubishiheavyindustries/2018/06/29/se...,none,78,1,,,...,ios,mobile,safari (in-app),canada,content aggregators,business,none,,,non_subscriber


In [7]:
# any col names mismatch? - no

[x for x in list(subs_data.columns) if x not in list(nonsubs_data.columns)]

[]

In [8]:
df = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df.shape)

# fill na
df.GA_cmsNaturalId = df.GA_cmsNaturalId.fillna('None') 

# short list device OS
shortlisted_os = ["android", "ios", "macintosh", "windows"]

df["deviceOS"] = np.where(df["GA_deviceOperatingSystem"].isin(shortlisted_os), 
                          df["GA_deviceOperatingSystem"], 
                          "other")

df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

Shape:  (25566117, 23)


non_subscriber    402307
subscriber        101600
Name: subscription_status, dtype: int64

In [9]:
df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts(normalize=True)

non_subscriber   0.80
subscriber       0.20
Name: subscription_status, dtype: float64

In [10]:
df.head()

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status,deviceOS
0,pnijq8uahqt1sr8,2734608745588682044,1632679824,2021-09-26,/sites/kathycaprino/2021/09/20/6-key-ways-lead...,article/standard/subscriber/alx,147,1,,5.0,...,desktop,edge,united states,newsletter,leadership,careers,Business and Finance,Business,subscriber,windows
1,pniptkdboqj82qy,3206243803054980323,1632683536,2021-09-26,/sites/jackkelly/2021/09/26/the-future-of-hybr...,article/standard/subscriber/alx,279,1,1.0,35.0,...,desktop,edge,united states,organic search,leadership,careers,Technology & Computing,Computing,subscriber,windows
2,pnijvkirpqja131,6647027608060961839,1624973889,2021-06-29,/sites/lucianapaulise/2021/06/29/some-52-of-em...,article/standard/subscriber/alx,535,1,0.5,13.0,...,desktop,chrome,united states,organic search,leadership,careers,Careers,Remote Working,subscriber,macintosh
3,pni6ssai9qjzzxh,4724604073189409071,1620156871,2021-05-04,/sites/jackkelly/2021/04/21/general-motors-the...,article/standard/subscriber/alx,87,1,0.5,611.0,...,desktop,chrome,united states,organic search,leadership,careers,Business and Finance,Industries,subscriber,macintosh
4,pniuojdbxqsjc3h,6969734970922600880,1631108312,2021-09-08,/sites/markcperna/2021/08/10/4-strategies-for-...,article/standard/subscriber/alx,67,1,1.0,116.0,...,desktop,chrome,united states,organic search,leadership,careers,Business and Finance,Business,subscriber,windows


In [11]:
target_class = df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first')

### User's whole behavior
**i.e., avg. top and sum(pvs) over whole GA history**

In [12]:
whole = df.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})
whole["avg_top"] = whole['timeOnPage']/whole['sum_pvs']

whole

Unnamed: 0,GA_fullVisitorId,subscription_status,sum_pvs,timeOnPage,avg_top
0,10000049855779198375,non_subscriber,6,664.00,110.67
1,10000110541771159873,non_subscriber,34,3281.00,96.50
2,10000113274638268984,non_subscriber,6,666.00,111.00
3,10000130814584242881,non_subscriber,94,2892.00,30.77
4,10000151329276278902,non_subscriber,5,419.00,83.80
...,...,...,...,...,...
503902,9999886779021615610,non_subscriber,10,657.00,65.70
503903,9999921172181296837,non_subscriber,22,1503.00,68.32
503904,9999943173698081042,non_subscriber,26,3410.00,131.15
503905,9999987496363756221,non_subscriber,43,5256.00,122.23


* Distribution of sum(pvs)
    * Even after limiting non-subs with >5 articles, on a whole -- subs have more pvs than non-subs

In [14]:
whole.groupby('subscription_status').sum_pvs.describe().T

subscription_status,non_subscriber,subscriber
count,402307.0,101600.0
mean,39.32,95.93
std,103.0,616.55
min,5.0,1.0
25%,7.0,10.0
50%,12.0,24.0
75%,27.0,65.0
max,7709.0,76285.0


* Distribution of avg(top)
    * Subs spend more avg. top than non-subs on a whole

In [15]:
whole.groupby('subscription_status').avg_top.describe().T

subscription_status,non_subscriber,subscriber
count,402307.0,101600.0
mean,92.61,138.33
std,86.06,103.77
min,0.0,0.0
25%,36.81,68.0
50%,70.13,118.27
75%,119.7,181.84
max,1261.6,2497.0


### Users' Unique Pageviews in each session (avg, median)

In [16]:
# user's per pagepath GA data
page = df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath']).agg({'GA_pageViews': 'max', 'GA_scrollDepth': 'max', 'timeOnPage': 'sum'}).reset_index()

# user's per session GA data
session = page.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum', 'GA_scrollDepth': 'mean', 'timeOnPage': 'mean'}).reset_index()

session.rename(columns={'GA_pageViews': 'unique_pageViews', 'timeOnPage': 'top_per_session'}, inplace=True)
session 
#3mins

Unnamed: 0,GA_fullVisitorId,GA_visitStartTime,unique_pageViews,GA_scrollDepth,top_per_session
0,10000049855779198375,1633875633,1,0.50,528.00
1,10000049855779198375,1634090973,1,0.50,136.00
2,10000049855779198375,1634130400,1,0.00,0.00
3,10000110541771159873,1626322032,1,0.25,22.00
4,10000110541771159873,1626926009,1,0.50,57.00
...,...,...,...,...,...
16793979,9999987504359326751,1634442610,1,0.50,48.00
16793980,9999987504359326751,1634898404,1,0.50,159.00
16793981,9999987504359326751,1635315690,1,0.75,173.00
16793982,9999987504359326751,1635637982,1,0.75,658.00


In [17]:
pageViews = session.groupby('GA_fullVisitorId').agg({'unique_pageViews': ['mean', 'median']}).reset_index()

# rename cols
pageViews.columns = [' '.join(col).strip() for col in pageViews.columns.values]
pageViews.rename(columns={'unique_pageViews mean':'unique_pageviews_mean', 'unique_pageViews median': 'unique_pageviews_median'}, inplace=True)

# join target class
pageViews = pd.merge(pageViews, target_class, how="left", on = "GA_fullVisitorId")
pageViews

Unnamed: 0,GA_fullVisitorId,unique_pageviews_mean,unique_pageviews_median,subscription_status
0,10000049855779198375,1.00,1.00,non_subscriber
1,10000110541771159873,1.04,1.00,non_subscriber
2,10000113274638268984,1.00,1.00,non_subscriber
3,10000130814584242881,31.00,31.00,non_subscriber
4,10000151329276278902,1.00,1.00,non_subscriber
...,...,...,...,...
503902,9999886779021615610,1.00,1.00,non_subscriber
503903,9999921172181296837,1.00,1.00,non_subscriber
503904,9999943173698081042,1.07,1.00,non_subscriber
503905,9999987496363756221,1.07,1.00,non_subscriber


* Average and Median of (unique pageviews in each sessions)

    * Avg. pv per session is more for subs
    * Median pv per session is 1 for the most part as expected w/ subs median being slighlty higher

In [18]:
pageViews.groupby('subscription_status').unique_pageviews_mean.describe().T

subscription_status,non_subscriber,subscriber
count,402307.0,101600.0
mean,1.09,2.33
std,0.33,4.45
min,1.0,1.0
25%,1.0,1.33
50%,1.0,1.75
75%,1.1,2.5
max,39.0,356.0


In [19]:
pageViews.groupby('subscription_status').unique_pageviews_median.describe().T

subscription_status,non_subscriber,subscriber
count,402307.0,101600.0
mean,1.03,1.88
std,0.32,4.43
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,2.0
max,39.0,356.0


### Users' Time on Page in each session (avg, median)

In [20]:
timeOnPage = session.groupby('GA_fullVisitorId').agg({'top_per_session': ['mean', 'median']}).reset_index()

# rename cols
timeOnPage.columns = [' '.join(col).strip() for col in timeOnPage.columns.values]
timeOnPage.rename(columns={'top_per_session mean':'top_mean', 'top_per_session median': 'top_median'}, inplace=True)

# join target class
timeOnPage = pd.merge(timeOnPage, target_class, how="left", on = "GA_fullVisitorId")
timeOnPage

Unnamed: 0,GA_fullVisitorId,top_mean,top_median,subscription_status
0,10000049855779198375,221.33,136.00,non_subscriber
1,10000110541771159873,109.48,5.50,non_subscriber
2,10000113274638268984,333.00,333.00,non_subscriber
3,10000130814584242881,23.70,23.70,non_subscriber
4,10000151329276278902,83.80,0.00,non_subscriber
...,...,...,...,...
503902,9999886779021615610,73.00,65.00,non_subscriber
503903,9999921172181296837,79.11,13.00,non_subscriber
503904,9999943173698081042,170.60,22.00,non_subscriber
503905,9999987496363756221,110.76,79.25,non_subscriber


* Average and Median of (sum of time on page in each session)

    * Both avg and median distributions for subscribers are higher than non-subs

In [21]:
timeOnPage.groupby('subscription_status').top_mean.describe().T

subscription_status,non_subscriber,subscriber
count,402307.0,101600.0
mean,114.12,210.48
std,181.76,321.82
min,0.0,0.0
25%,37.9,85.53
50%,71.46,149.0
75%,126.07,249.65
max,8484.0,33240.0


In [23]:
timeOnPage.groupby('subscription_status').top_median.describe().T

subscription_status,non_subscriber,subscriber
count,402307.0,101600.0
mean,40.71,84.77
std,60.48,111.33
min,0.0,0.0
25%,5.0,28.17
50%,24.5,56.29
75%,52.5,99.5
max,1518.0,4994.0


### Pageviews in referral sources, country, device OS

* Subs coming more from organic search. 
* Non-subs more from content aggregators

In [23]:
pvs_eda(df, 'GA_referralGroup') #makes sense - flipboard, newsbreak

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_referralGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
organic search,30.43,65.84,2,1
referral,0.38,8.49,5,2
direct,9.36,7.69,3,3
newsletter,0.01,7.15,8,4
organic social (dark),0.8,4.82,4,5
organic social (forbes),0.09,3.11,6,6
content aggregators,58.89,2.8,1,7
paid search,0.03,0.07,7,8
paid display,0.0,0.02,10,9
paid web,0.0,0.01,9,10


* Subs highest in US. 
* Non-subs presence is in other countries also

In [24]:
pvs_eda(df, 'GA_country').drop('(not set)').head(10)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
united states,66.22,89.93,1,1
japan,0.21,1.55,23,2
russia,0.03,1.21,61,3
canada,11.54,1.12,2,4
united kingdom,6.04,1.04,3,5
australia,3.36,0.5,4,6
india,1.25,0.38,6,8
singapore,1.46,0.21,5,9
mexico,0.24,0.16,20,10
germany,0.53,0.16,8,11


* Subs pvs highest from desktop 
* Non-subs pvs highest from phone

In [25]:
pvs_eda(df, 'deviceOS') #content aggre is an app plus we filters for articles

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
deviceOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
windows,0.03,44.25,4,1
macintosh,0.23,39.31,3,2
ios,56.58,8.25,1,3
android,43.14,7.7,2,4
other,0.02,0.5,5,5


### Pageviews in various content categories
* Content categories = IAB Tier 1, Tier 2, PC (shortlisted), PS (shortlisted)

In [26]:
content = df.copy()

# extract the start of natid string
content["natid_start"] = content.GA_cmsNaturalId.str.split("/").str[0]

print("Shape before: ", content.shape)

# keep only blogs, slides, magazine
content = content[(content['natid_start'].str.contains('blogandpostid')) | (content['natid_start'].str.contains('blogandslideid')) | (content['natid_start'].str.contains('magazine'))]

print("Shape after: ", content.shape)

# get month-year
content.GA_date = pd.to_datetime(content.GA_date)
content["mon_year"] = content['GA_date'].dt.to_period('M')

Shape before:  (25566117, 25)
Shape after:  (22023999, 25)


In [27]:
cat = content.copy()
cat.isna().sum()

piano_id                    15711356
GA_fullVisitorId                   0
GA_visitStartTime                  0
GA_date                            0
GA_pagePath                        0
GA_dfpNewZone                 316792
GA_visitNumber                     0
GA_pageViews                       0
GA_scrollDepth               1666615
timeOnPage                   1367773
GA_cmsNaturalId                    0
title                          68595
publish_date                   68595
GA_deviceOperatingSystem           0
GA_deviceCategory                  0
GA_deviceBrowser                   0
GA_country                         0
GA_referralGroup                   0
GA_primaryChannel                  0
GA_primarySection                  0
tier1                        6214922
tier2                        7273897
subscription_status                0
deviceOS                           0
natid_start                        0
mon_year                           0
dtype: int64

In [28]:
print("Before - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("Before - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

shortlisted_channel = joblib.load("pri_channel_shortlisted.pkl")
shortlisted_section = joblib.load("pri_section_shortlisted.pkl")

# primary channel
cat["GA_primaryChannel"] = np.where(cat["GA_primaryChannel"].isin(shortlisted_channel), 
                                    cat["GA_primaryChannel"], "other")

# primary section 
cat["GA_primarySection"] = np.where(cat["GA_primarySection"].isin(shortlisted_section), 
                                    cat["GA_primarySection"], "other")

print("After shortlisting - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("After shortlisting - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

# fillna with 0
cat.timeOnPage = cat.timeOnPage.fillna(0)

# replace empty and NULL with "none"
cat.tier1 = cat.tier1.replace(r'^\s*$', "none", regex=True)
cat.tier1 = cat.tier1.fillna("none")

# replace empty and NULL with "none"
cat.tier2 = cat.tier2.replace(r'^\s*$', "none", regex=True)
cat.tier2 = cat.tier2.fillna("none")

print("Unique all T1s: ", len(cat.tier1.unique()))  
print("Unique all T2s: ", len(cat.tier1.unique()))  

Before - unique PC:  35
Before - unique PS:  159 

After shortlisting - unique PC:  24
After shortlisting - unique PS:  51 

Unique all T1s:  31
Unique all T2s:  31


In [30]:
cat.shape

(22023999, 26)

* Pageviews - Tier 1

    * Subs more likely to read
        * Busi & Fin, Personal Fin, Careers, Medical Health
    * Non-subs more likely to read
        * Busi & Fin, Video Gaming, Science, News & Politics

In [31]:
pvs_eda(cat, 'tier1', drop_cols=True)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business and Finance,17.17,30.98,1,1
Personal Finance,15.07,10.53,3,2
News and Politics,11.77,10.41,5,3
Technology & Computing,14.38,8.75,4,4
Medical Health,5.77,7.22,6,5
Video Gaming,16.13,3.84,2,6
Travel,2.16,3.08,7,7
Careers,1.19,2.42,12,8
Sports,1.99,2.18,8,9
Food & Drink,0.84,2.16,16,10


* Pageviews - Tier 2

    * Subs more likely to read
        * Busi, Industries, Economy, Diseases and Conditions
    * Non-subs more likely to read
        * Economy, Games, Consumer Electronics, Computing, Politics

In [32]:
pvs_eda(cat, 'tier2', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business,4.51,18.31,9,1
Industries,4.03,8.43,11,2
Economy,9.57,6.27,1,3
Computing,8.17,6.2,3,4
Politics,7.62,6.16,4,5
Diseases and Conditions,4.04,5.54,10,6
Personal Investing,5.75,5.04,8,7
Personal Debt,6.38,2.14,7,8
Video Game Genres,6.41,1.95,6,9
Travel Type,1.14,1.93,17,10


* Pageviews - Primary Channel

    * Subs more likely to read
        * Busi, leadership, real estate	, small business, lifestyle??
    * Non-subs more likely to read
        * innovation, money, Consumer

In [33]:
pvs_eda(cat, 'GA_primaryChannel', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primaryChannel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
business,16.88,22.37,3,1
leadership,8.36,22.26,4,2
innovation,37.99,19.77,1,3
money,24.29,16.33,2,4
lifestyle,6.02,8.37,5,5
small business,0.95,4.0,7,6
billionaires,0.84,1.98,8,7
real estate,0.26,0.93,14,8
consumer,1.23,0.8,6,9
shopping,0.56,0.75,12,10


* Pageviews - Primary Section

    * Subs more likely to read
        * careers, forbeswomen, leadership strategy, entrepreneurs, travel??
    * Non-subs more likely to read
        * games, crypto & blockchain, consumer tech

In [34]:
pvs_eda(cat, 'GA_primarySection', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primarySection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
careers,2.5,6.56,13,1
leadership strategy,2.67,5.9,11,2
games,17.87,5.8,1,3
travel,2.93,4.74,9,4
markets,3.85,4.65,6,5
forbeswomen,1.34,4.59,18,6
personal finance,8.95,4.21,4,7
hollywood & entertainment,2.95,3.98,8,8
crypto & blockchain,9.32,3.88,3,9
entrepreneurs,1.3,3.85,19,10


### Avg. TOP in various content categories
* Content categories = IAB Tier 1, Tier 2

In [35]:
t1 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier1',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t1.GA_fullVisitorId)

# calc avg. top
t1_top = t1["timeOnPage"]/t1["GA_pageViews"]
t1_top = t1_top.fillna(0)

t1_top["GA_fullVisitorId"] = fvids
t1_top = pd.merge(t1_top, target_class, how="left", on="GA_fullVisitorId")

* Average time on page - Tier 1

    * Subs spending more time on
        * Busi & Fin, Personal Fin, News & Politics, Careers
    * Non-subs spending more time on
        * Video Gaming

In [36]:
top_eda(t1_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'})

subscription_status,non_subscriber avg. top,subscriber avg. top,non_subscriber_rank,subscriber_rank
Business and Finance,46.65,129.02,1,1
Personal Finance,43.04,83.6,2,2
Technology & Computing,36.14,71.38,3,3
Medical Health,23.09,67.43,6,4
News and Politics,24.71,66.41,5,5
Careers,10.53,39.0,11,6
Travel,13.2,37.14,8,7
Shopping,8.35,32.61,15,8
Style & Fashion,7.42,31.28,17,9
Sports,10.22,29.33,12,10


* Average time on page - Tier 2

    * Subs spending more time on
        * Business, Industries
    * Non-subs spending more time on
        * Computing, Economy

In [37]:
t2 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier2',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t2.GA_fullVisitorId)

# calc avg. top
t2_top = t2["timeOnPage"]/t2["GA_pageViews"]
t2_top = t2_top.fillna(0)

t2_top["GA_fullVisitorId"] = fvids
t2_top = pd.merge(t2_top, target_class, how="left", on="GA_fullVisitorId")

top_eda(t2_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'}).head(15)

subscription_status,non_subscriber avg. top,subscriber avg. top,non_subscriber_rank,subscriber_rank
Business,25.95,105.75,3,1
Industries,22.68,81.2,4,2
Computing,29.96,59.54,1,3
Economy,26.22,59.38,2,4
Personal Investing,22.06,56.94,5,5
Diseases and Conditions,15.91,56.39,10,6
Politics,16.88,43.18,9,7
Law,10.89,27.5,13,8
Personal Debt,21.21,26.79,6,9
Travel Type,8.37,26.29,16,10


### Avg. article views in each month

In [49]:
user_per_mon = pd.DataFrame(content.groupby(['GA_fullVisitorId', 'subscription_status', 'mon_year']).GA_cmsNaturalId.count()).reset_index().rename(columns=
                                                                                                                                                   {'GA_cmsNaturalId':'total_article_views'})
user_per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,total_article_views
0,10000049855779198375,non_subscriber,2021-10,6
1,10000110541771159873,non_subscriber,2021-07,6
2,10000110541771159873,non_subscriber,2021-08,14
3,10000110541771159873,non_subscriber,2021-09,3
4,10000110541771159873,non_subscriber,2021-10,7
...,...,...,...,...
2874097,9999987496363756221,non_subscriber,2021-04,1
2874098,9999987496363756221,non_subscriber,2021-08,4
2874099,9999987496363756221,non_subscriber,2021-09,2
2874100,9999987496363756221,non_subscriber,2021-10,6


In [50]:
per_mon =  pd.DataFrame(user_per_mon.groupby(['GA_fullVisitorId', 'subscription_status']).total_article_views.mean()).reset_index() 

per_mon.GA_cmsNaturalId = per_mon.total_article_views.round() 
per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,total_article_views
0,10000049855779198375,non_subscriber,6.00
1,10000110541771159873,non_subscriber,6.80
2,10000113274638268984,non_subscriber,6.00
3,10000130814584242881,non_subscriber,10.00
4,10000151329276278902,non_subscriber,5.00
...,...,...,...
496349,9999886779021615610,non_subscriber,3.33
496350,9999921172181296837,non_subscriber,2.75
496351,9999943173698081042,non_subscriber,4.33
496352,9999987496363756221,non_subscriber,3.91


* Avg(actual articles per month)
    * Subs on whole have been reading more articles in a month than non-subs

In [51]:
per_mon.groupby('subscription_status').total_article_views.describe().T 

subscription_status,non_subscriber,subscriber
count,402307.0,94047.0
mean,5.04,13.69
std,3.62,30.8
min,1.29,1.0
25%,3.0,4.5
50%,4.33,8.0
75%,6.0,15.0
max,297.5,3740.0


In [73]:
# example non-subs
user_per_mon[user_per_mon.GA_fullVisitorId=='10000058813304965608']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
0,10000058813304965608,non_subscriber,2020-02,6
1,10000058813304965608,non_subscriber,2020-03,14
2,10000058813304965608,non_subscriber,2020-04,13
3,10000058813304965608,non_subscriber,2020-05,27
4,10000058813304965608,non_subscriber,2020-06,15
5,10000058813304965608,non_subscriber,2020-07,9
6,10000058813304965608,non_subscriber,2020-08,1
7,10000058813304965608,non_subscriber,2020-09,13
8,10000058813304965608,non_subscriber,2020-10,23
9,10000058813304965608,non_subscriber,2020-11,13


In [69]:
# example subs
user_per_mon[user_per_mon.GA_fullVisitorId=='3857395123229566996']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
491055,3857395123229566996,subscriber,2020-03,2
491056,3857395123229566996,subscriber,2020-04,10
491057,3857395123229566996,subscriber,2020-05,5
491058,3857395123229566996,subscriber,2020-06,12
491059,3857395123229566996,subscriber,2020-07,8
491060,3857395123229566996,subscriber,2020-08,9
491061,3857395123229566996,subscriber,2020-09,1
491062,3857395123229566996,subscriber,2020-10,10
491063,3857395123229566996,subscriber,2020-11,8
491064,3857395123229566996,subscriber,2020-12,17


In [70]:
# example subs
user_per_mon[user_per_mon.GA_fullVisitorId=='5277846215104667271']

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,GA_cmsNaturalId
627088,5277846215104667271,subscriber,2020-03,3
627089,5277846215104667271,subscriber,2020-04,38
627090,5277846215104667271,subscriber,2020-05,34
627091,5277846215104667271,subscriber,2020-06,96
627092,5277846215104667271,subscriber,2020-07,38
627093,5277846215104667271,subscriber,2020-08,69
627094,5277846215104667271,subscriber,2020-09,42
627095,5277846215104667271,subscriber,2020-10,55
627096,5277846215104667271,subscriber,2020-11,73
627097,5277846215104667271,subscriber,2020-12,109


In [None]:
# next steps - data settings: no article limit data - make 1 data. ga_date greater than jan-2021
# send tables - fullvid for subs + nonsubs - 1 single EDA file
# feature selection amd model v1 before dec 9

In [None]:
# tried to question data - 2 ques but this may be inline with expectation. review?

In [None]:
# finalize preliminary EDA

In [None]:
# joining with C-levels for curiosity

start_time = time.time()

sql = """
    SELECT 
        *
      FROM (
        SELECT 
            DISTINCT *,
            RANK() OVER (PARTITION BY GA_fullVisitorId ORDER BY date DESC) AS mostrecent,
        FROM
            `api-project-901373404215.lookalike.zoom_info_c_level`
          )
      WHERE 
          mostrecent = 1
"""

clevels = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time)) #12.45

In [None]:
clevels