In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

bq_client = bigquery.Client()

In [2]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [3]:
def pvs_eda(input_df, cat_col_name, drop_cols=False):
    
    # pivot
    df_cat = pd.pivot_table(
        input_df, 
        values='GA_pageViews',
        columns= cat_col_name,
        index='subscription_status',
        aggfunc='sum')\
        .reset_index()
    
    # drop none and other cols
    if drop_cols==True:
        
        if 'none' in list(df_cat.columns):
            df_cat.drop('none', axis=1, inplace=True)
        
        if 'other' in list(df_cat.columns):
            df_cat.drop('other', axis=1, inplace=True)

    
    # percentage calc below
    df_cat = df_cat.fillna(0)
    df_cat = df_cat.T
    
    new_header = df_cat.iloc[0] # grab the first row for the header
    df_cat = df_cat[1:] # take the data minus the header row
    df_cat.columns = new_header
  
    df_cat['% of non_subscriber pvs'] = (df_cat['non_subscriber'] / df_cat['non_subscriber'].sum()) * 100
    df_cat['% of subscriber pvs'] = (df_cat['subscriber'] / df_cat['subscriber'].sum()) * 100
    
    df_cat.drop(['non_subscriber', 'subscriber'], axis=1, inplace=True)

    df_cat['non_subscriber_rank'] = df_cat['% of non_subscriber pvs'].rank(ascending=False).astype(int)
    df_cat['subscriber_rank'] = df_cat['% of subscriber pvs'].rank(ascending=False).astype(int)

    df_cat = df_cat.sort_values('subscriber_rank')
    
    return df_cat

In [4]:
def top_eda(df):
    '''
    Prep df for eda
    '''
    # remove column = "none" i.e. Tier 1/Tier2/PC/PS was not available
    df.drop("none", axis=1, inplace=True)
    
    # group all subscribers, calculate mean of their (avg. time on page) for each category. Same for Non-subscribers
    df = df.groupby('subscription_status').mean().T
    
    # assign rank
    df['non_subscriber_rank'] = df['non_subscriber'].rank(ascending=False).astype(int)
    df['subscriber_rank'] = df['subscriber'].rank(ascending=False).astype(int)
    df = df.sort_values('subscriber_rank')

    return df

## DATA

In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_subs_ga`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)
subs_data.rename(columns={'ga_pianoId': 'piano_id'}, inplace=True)

print(subs_data.shape)
print("Unique unlimited subscribers: ", len(subs_data.piano_id.unique()))
print("Unique unlimited fullvids: ", len(subs_data.GA_fullVisitorId.unique()), "\n")

subs_data.head()

--- 18.09546947479248 seconds ---
(6636071, 23)
Unique unlimited subscribers:  41716
Unique unlimited fullvids:  99096 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,pni6dibvvqpayzs,3174777614220968107,1619791110,2021-04-30,/sites/jefftaylor/,none,14,1,0.5,14.0,...,macintosh,desktop,chrome,united states,organic search,innovation,none,,,subscriber
1,pninjgwtnqj8to8,7616931455160605539,1611070104,2021-01-19,/sites/jeffsteele/,none,90,1,0.0,235.0,...,windows,desktop,chrome,puerto rico,organic search,real estate,none,,,subscriber
2,pnidl2xoeqz0wr2,1121210302312468443,1626184986,2021-07-13,/sites/forbesagencycouncil/2017/12/06/the-sing...,article/standard/default/standard,12,1,0.5,452.0,...,windows,desktop,edge,united states,organic search,leadership,none,,,subscriber
3,pnixmwdpeqjjb7b,7219142532098579810,1621515712,2021-05-20,/sites/sap/2021/05/12/how-hybrid-remote-work-i...,article/standard/default/standard,2,1,0.0,62.0,...,ios,mobile,safari,united states,newsletter,innovation,none,,,subscriber
4,pniluxtn9qn8sos,7600114442027281591,1634087240,2021-10-12,/,none,6,1,0.2,106.0,...,windows,desktop,chrome,united states,direct,home,none,,,subscriber


In [6]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_nonsubs_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Before removing suspicious fvids:", nonsubs_data.shape)
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

# it was noted that some suspicious fvids have no pianoID but their dfpZone has 'subscriber' in it - remove them

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')
suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()

print("After removing ", len(suspicious_fvid), "suspicious fvids:", nonsubs_data.shape)
nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]

print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")

nonsubs_data.head()

--- 10.969189882278442 seconds ---
Before removing suspicious fvids: (1231590, 23)
Unique unlimited fullvids:  410000 

After removing  12 suspicious fvids: (1231590, 23)
Unique unlimited fullvids:  409988 



Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status
0,,8738220177769013662,1630523553,2021-09-01,/companies/loom/,none,4,1,0.25,4.0,...,windows,desktop,edge,united states,organic search,none,none,,,non_subscriber
1,,7907834872961320899,1632520131,2021-09-24,/top-wealth-advisors/,none,1,1,1.0,3.0,...,windows,desktop,edge,united states,organic search,advisor,america's top wealth advisors 2021,,,non_subscriber
2,,5408030440582556413,1634478845,2021-10-17,/sites/joshuadines/,none,1,1,0.25,7.0,...,windows,desktop,edge,united states,direct,consumer,sportsmoney,,,non_subscriber
3,,5783491991586390106,1634088981,2021-10-12,/sites/arielcohen/,none,2,1,0.25,4.0,...,windows,desktop,edge,united states,organic social (dark),business,energy,,,non_subscriber
4,,13276614681697164378,1633348653,2021-10-04,/sites/petercohan/2012/02/19/jurassic-park-how...,article-amp/standard/default/standard,1,1,0.5,176.0,...,windows,desktop,edge,united states,referral,money,markets,,,non_subscriber


In [7]:
# any col names mismatch? - no

[x for x in list(subs_data.columns) if x not in list(nonsubs_data.columns)]

[]

In [8]:
df = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df.shape)

# fill na
df.GA_cmsNaturalId = df.GA_cmsNaturalId.fillna('None') 

# short list device OS
shortlisted_os = ["android", "ios", "macintosh", "windows"]

df["deviceOS"] = np.where(df["GA_deviceOperatingSystem"].isin(shortlisted_os), 
                          df["GA_deviceOperatingSystem"], 
                          "other")

df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts()

Shape:  (7866937, 23)


non_subscriber    409988
subscriber         99096
Name: subscription_status, dtype: int64

In [9]:
df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts(normalize=True)

non_subscriber   0.81
subscriber       0.19
Name: subscription_status, dtype: float64

In [10]:
df.head()

Unnamed: 0,piano_id,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,GA_scrollDepth,timeOnPage,...,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,subscription_status,deviceOS
0,pni6dibvvqpayzs,3174777614220968107,1619791110,2021-04-30,/sites/jefftaylor/,none,14,1,0.5,14.0,...,desktop,chrome,united states,organic search,innovation,none,,,subscriber,macintosh
1,pninjgwtnqj8to8,7616931455160605539,1611070104,2021-01-19,/sites/jeffsteele/,none,90,1,0.0,235.0,...,desktop,chrome,puerto rico,organic search,real estate,none,,,subscriber,windows
2,pnidl2xoeqz0wr2,1121210302312468443,1626184986,2021-07-13,/sites/forbesagencycouncil/2017/12/06/the-sing...,article/standard/default/standard,12,1,0.5,452.0,...,desktop,edge,united states,organic search,leadership,none,,,subscriber,windows
3,pnixmwdpeqjjb7b,7219142532098579810,1621515712,2021-05-20,/sites/sap/2021/05/12/how-hybrid-remote-work-i...,article/standard/default/standard,2,1,0.0,62.0,...,mobile,safari,united states,newsletter,innovation,none,,,subscriber,ios
4,pniluxtn9qn8sos,7600114442027281591,1634087240,2021-10-12,/,none,6,1,0.2,106.0,...,desktop,chrome,united states,direct,home,none,,,subscriber,windows


In [11]:
target_class = df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first')

## EDA

### User's whole behavior
**i.e., avg. top and sum(pvs) over whole GA history**

In [12]:
whole = df.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})
whole["avg_top"] = whole['timeOnPage']/whole['sum_pvs']

whole

Unnamed: 0,GA_fullVisitorId,subscription_status,sum_pvs,timeOnPage,avg_top
0,1000000948120006249,non_subscriber,1,521.00,521.00
1,10000078774942201969,non_subscriber,5,228.00,45.60
2,10000093321793562421,non_subscriber,1,0.00,0.00
3,10000127776222822445,non_subscriber,1,9.00,9.00
4,10000206856643135086,non_subscriber,5,406.00,81.20
...,...,...,...,...,...
509079,9999428991944010204,subscriber,22,3780.00,171.82
509080,9999453033634739955,non_subscriber,1,97.00,97.00
509081,9999617506304669367,non_subscriber,1,52.00,52.00
509082,9999621821965685242,non_subscriber,1,12.00,12.00


* Distribution of sum(pvs)
    * Subs typically have more pvs than non-subs

In [13]:
whole.groupby('subscription_status').sum_pvs.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,3.0,66.97
std,6.81,364.58
min,1.0,1.0
25%,1.0,9.0
50%,1.0,22.0
75%,3.0,56.0
max,2242.0,44770.0


* Distribution of avg(top)
    * Subs spend more avg. top than non-subs on a whole

In [19]:
whole.groupby('subscription_status').avg_top.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,72.66,135.43
std,122.61,105.12
min,0.0,0.0
25%,7.0,65.25
50%,36.5,113.94
75%,88.0,177.65
max,3364.0,2497.0


### Users' Unique Pageviews in each session (avg, median)

In [59]:
# user's per pagepath GA data
page = df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath']).agg({'GA_pageViews': 'max', 'GA_scrollDepth': 'max', 'timeOnPage': 'sum'}).reset_index()

# user's per session GA data
session = page.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum', 'GA_scrollDepth': 'mean', 'timeOnPage': 'mean'}).reset_index()

session.rename(columns={'GA_pageViews': 'unique_pageViews', 'timeOnPage': 'top_per_session'}, inplace=True)
session

Unnamed: 0,GA_fullVisitorId,GA_visitStartTime,unique_pageViews,GA_scrollDepth,top_per_session
0,1000000948120006249,1635456756,1,0.25,521.00
1,10000078774942201969,1629090740,1,0.25,15.00
2,10000078774942201969,1632746707,2,1.00,91.50
3,10000078774942201969,1634413951,1,0.75,30.00
4,10000093321793562421,1635454123,1,0.00,0.00
...,...,...,...,...,...
3296967,9999428991944010204,1631618760,1,0.25,22.00
3296968,9999453033634739955,1633461706,1,0.75,97.00
3296969,9999617506304669367,1633783846,1,0.50,52.00
3296970,9999621821965685242,1633456653,1,0.50,12.00


In [60]:
pageViews = session.groupby('GA_fullVisitorId').agg({'unique_pageViews': ['mean', 'median']}).reset_index()

# rename cols
pageViews.columns = [' '.join(col).strip() for col in pageViews.columns.values]
pageViews.rename(columns={'unique_pageViews mean':'unique_pageviews_mean', 'unique_pageViews median': 'unique_pageviews_median'}, inplace=True)

# join target class
pageViews = pd.merge(pageViews, target_class, how="left", on = "GA_fullVisitorId")
pageViews

Unnamed: 0,GA_fullVisitorId,unique_pageviews_mean,unique_pageviews_median,subscription_status
0,1000000948120006249,1.00,1.00,non_subscriber
1,10000078774942201969,1.33,1.00,non_subscriber
2,10000093321793562421,1.00,1.00,non_subscriber
3,10000127776222822445,1.00,1.00,non_subscriber
4,10000206856643135086,1.00,1.00,non_subscriber
...,...,...,...,...
509079,9999428991944010204,1.19,1.00,subscriber
509080,9999453033634739955,1.00,1.00,non_subscriber
509081,9999617506304669367,1.00,1.00,non_subscriber
509082,9999621821965685242,1.00,1.00,non_subscriber


* Average and Median of (unique pageviews in each sessions)

    * Avg. pv per session is more for subs
    * Median pv per session is 1 for the most part as expected w/ subs median being slighlty higher

In [17]:
pageViews.groupby('subscription_status').unique_pageviews_mean.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,1.02,2.3
std,0.2,4.45
min,1.0,1.0
25%,1.0,1.33
50%,1.0,1.73
75%,1.0,2.47
max,58.0,356.0


In [18]:
pageViews.groupby('subscription_status').unique_pageviews_median.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,1.01,1.88
std,0.19,4.44
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,2.0
max,58.0,356.0


### Users' Time on Page in each session (avg, median)

In [20]:
timeOnPage = session.groupby('GA_fullVisitorId').agg({'top_per_session': ['mean', 'median']}).reset_index()

# rename cols
timeOnPage.columns = [' '.join(col).strip() for col in timeOnPage.columns.values]
timeOnPage.rename(columns={'top_per_session mean':'top_mean', 'top_per_session median': 'top_median'}, inplace=True)

# join target class
timeOnPage = pd.merge(timeOnPage, target_class, how="left", on = "GA_fullVisitorId")
timeOnPage

Unnamed: 0,GA_fullVisitorId,top_mean,top_median,subscription_status
0,1000000948120006249,521.00,521.00,non_subscriber
1,10000078774942201969,45.50,30.00,non_subscriber
2,10000093321793562421,0.00,0.00,non_subscriber
3,10000127776222822445,9.00,9.00,non_subscriber
4,10000206856643135086,81.20,32.00,non_subscriber
...,...,...,...,...
509079,9999428991944010204,157.59,73.25,subscriber
509080,9999453033634739955,97.00,97.00,non_subscriber
509081,9999617506304669367,52.00,52.00,non_subscriber
509082,9999621821965685242,12.00,12.00,non_subscriber


* Average and Median of (sum of time on page in each session)

    * Both avg and median distributions for subscribers are higher than non-subs

In [21]:
timeOnPage.groupby('subscription_status').top_mean.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,80.19,205.58
std,153.09,325.75
min,0.0,0.0
25%,7.0,82.12
50%,37.0,143.3
75%,90.83,243.17
max,4444.0,33517.0


In [22]:
timeOnPage.groupby('subscription_status').top_median.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,70.39,122.39
std,149.16,291.95
min,0.0,0.0
25%,3.0,34.0
50%,29.0,70.0
75%,76.0,132.0
max,4444.0,33517.0


### Pageviews in referral sources, country, device OS

* Subs coming more from organic search, referral, newsletter, social
* Non-subs more from content aggregators and direct in addition to organic search

In [23]:
pvs_eda(df, 'GA_referralGroup')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_referralGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
organic search,54.53,65.92,1,1
direct,21.64,8.72,2,2
newsletter,0.02,8.62,8,3
referral,0.4,7.74,5,4
organic social (dark),2.86,4.31,4,5
organic social (forbes),0.34,3.02,6,6
content aggregators,20.15,1.57,3,7
paid search,0.05,0.09,7,8
paid display,0.0,0.01,11,9
paid web,0.01,0.01,9,10


* Subs highest in US. 
* Non-subs presence is in other countries also

In [24]:
pvs_eda(df, 'GA_country').drop('(not set)').head(10)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
united states,68.6,88.76,1,1
russia,0.05,1.78,52,2
canada,8.18,1.32,2,3
united kingdom,6.31,1.2,3,4
japan,0.13,0.82,28,5
australia,3.65,0.67,4,6
india,2.56,0.47,5,8
singapore,0.86,0.25,6,9
germany,0.46,0.21,8,10
mexico,0.22,0.19,18,11


* Subs pvs highest from desktop 
* Non-subs pvs highest from phone

In [25]:
pvs_eda(df, 'deviceOS')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
deviceOS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
windows,0.09,43.09,4,1
macintosh,0.35,40.59,3,2
ios,45.46,8.68,2,3
android,54.05,7.09,1,4
other,0.05,0.55,5,5


### Pageviews in various content categories
* Content categories = IAB Tier 1, Tier 2, PC (shortlisted), PS (shortlisted)
* NOTE: during modeling also - calculate percentage of pvs in each category

In [21]:
content = df.copy()

# extract the start of natid string
content["natid_start"] = content.GA_cmsNaturalId.str.split("/").str[0]

print("Shape before: ", content.shape)

# keep only blogs, slides, magazine
content = content[(content['natid_start'].str.contains('blogandpostid')) | (content['natid_start'].str.contains('blogandslideid')) | (content['natid_start'].str.contains('magazine'))]

print("Shape after: ", content.shape)

# get month-year
content.GA_date = pd.to_datetime(content.GA_date)
content["mon_year"] = content['GA_date'].dt.to_period('M')

Shape before:  (7866937, 25)
Shape after:  (5493926, 25)


In [22]:
cat = content.copy()
cat.isna().sum()

piano_id                    1211957
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth               846409
timeOnPage                   137018
GA_cmsNaturalId                   0
title                         32799
publish_date                  32799
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                       1034572
tier2                       1393428
subscription_status               0
deviceOS                          0
natid_start                       0
mon_year                          0
dtype: int64

In [23]:
print("Before - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("Before - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

shortlisted_channel = joblib.load("pri_channel_shortlisted.pkl")
shortlisted_section = joblib.load("pri_section_shortlisted.pkl")

# primary channel
cat["GA_primaryChannel"] = np.where(cat["GA_primaryChannel"].isin(shortlisted_channel), 
                                    cat["GA_primaryChannel"], "other")

# primary section 
cat["GA_primarySection"] = np.where(cat["GA_primarySection"].isin(shortlisted_section), 
                                    cat["GA_primarySection"], "other")

print("After shortlisting - unique PC: ", len(cat.GA_primaryChannel.unique()))
print("After shortlisting - unique PS: ", len(cat.GA_primarySection.unique()), "\n")

# fillna with 0
cat.timeOnPage = cat.timeOnPage.fillna(0)

# replace empty and NULL with "none"
cat.tier1 = cat.tier1.replace(r'^\s*$', "none", regex=True)
cat.tier1 = cat.tier1.fillna("none")

# replace empty and NULL with "none"
cat.tier2 = cat.tier2.replace(r'^\s*$', "none", regex=True)
cat.tier2 = cat.tier2.fillna("none")

print("Unique all T1s: ", len(cat.tier1.unique()))  
print("Unique all T2s: ", len(cat.tier1.unique()))  

Before - unique PC:  32
Before - unique PS:  120 

After shortlisting - unique PC:  24
After shortlisting - unique PS:  50 

Unique all T1s:  31
Unique all T2s:  31


* Pageviews - Tier 1

    * Subs more likely to read
        * Busi & Fin, Personal Fin, News & Politics, Careers, Travel
    * Non-subs more likely to read
        * Tech, Personal Finance, Video Gaming, Science, Television 

In [25]:
pvs_eda(cat, 'tier1', drop_cols=True)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business and Finance,14.51,32.62,3,1
Personal Finance,15.59,10.55,2,2
News and Politics,8.74,9.37,5,3
Technology & Computing,18.27,8.62,1,4
Medical Health,8.5,6.18,6,5
Video Gaming,9.9,3.33,4,6
Travel,2.89,3.18,7,7
Careers,1.03,2.6,15,8
Sports,2.44,2.37,10,9
Food & Drink,0.88,2.2,18,10


* Pageviews - Tier 2

    * Subs more likely to read
        * Busi, Industries, Economy, Diseases, Personal Investing
    * Non-subs more likely to read
        * Economy, Games, Computing, Personal Debt, Consumer Electronics

In [26]:
pvs_eda(cat, 'tier2', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
tier2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business,4.62,19.32,8,1
Industries,3.0,8.65,11,2
Economy,7.85,6.69,3,3
Computing,11.62,6.09,1,4
Personal Investing,4.7,5.37,7,5
Politics,5.04,5.3,5,6
Diseases and Conditions,3.67,4.11,9,7
Travel Type,1.46,2.06,16,8
Vaccines,2.87,2.05,12,9
Personal Debt,8.09,1.96,2,10


* Pageviews - Primary Channel

    * Subs more likely to read
        * Busi, leadership, real estate, small business, billionaires, lifestyle
    * Non-subs more likely to read
        * Busi, innovation, money

In [27]:
pvs_eda(cat, 'GA_primaryChannel', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primaryChannel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
leadership,5.15,23.24,5,1
business,18.49,22.94,3,2
innovation,39.66,18.52,1,3
money,24.31,16.49,2,4
lifestyle,7.62,8.12,4,5
small business,1.11,4.48,6,6
billionaires,0.73,2.16,8,7
real estate,0.62,1.32,9,8
shopping,0.43,0.58,10,9
asia,0.81,0.57,7,10


* Pageviews - Primary Section

    * Subs more likely to read
        * careers, forbeswomen, leadership strategy, entrepreneurs, travel??
    * Non-subs more likely to read
        * games, crypto & blockchain, personal finance, consumer tech

In [28]:
pvs_eda(cat, 'GA_primarySection', drop_cols=True).head(20)

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_primarySection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
careers,1.71,7.12,14,1
leadership strategy,1.11,6.35,18,2
games,14.83,5.22,1,3
crypto & blockchain,9.84,5.0,4,4
forbeswomen,1.24,4.93,17,5
travel,4.0,4.6,8,6
markets,2.4,4.24,11,7
hollywood & entertainment,3.57,4.1,9,8
personal finance,11.41,3.87,2,9
investing,1.89,3.79,13,10


### Avg. TOP in various content categories
* Content categories = IAB Tier 1, Tier 2

In [34]:
t1 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier1',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t1.GA_fullVisitorId)

# calc avg. top
t1_top = t1["timeOnPage"]/t1["GA_pageViews"]
t1_top = t1_top.fillna(0)

t1_top["GA_fullVisitorId"] = fvids
t1_top = pd.merge(t1_top, target_class, how="left", on="GA_fullVisitorId")

* Average time on page - Tier 1

    * Subs spending more time on
        * Busi & Fin, Personal Fin, News & Politics, Careers
    * Non-subs spending more time on
        * Video Gaming

In [35]:
top_eda(t1_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'})

subscription_status,non_subscriber avg. top,subscriber avg. top,non_subscriber_rank,subscriber_rank
Business and Finance,15.38,124.7,1,1
Personal Finance,14.24,78.29,2,2
Technology & Computing,14.21,66.85,3,3
News and Politics,7.0,60.41,5,4
Medical Health,9.88,60.07,4,5
Careers,1.61,35.53,15,6
Travel,3.64,33.46,7,7
Style & Fashion,1.88,27.92,13,8
Sports,2.94,26.5,11,9
Shopping,1.4,26.38,16,10


* Average time on page - Tier 2

    * Subs spending more time on
        * Business, Industries
    * Non-subs spending more time on
        * Computing, Economy, Personal Debt

In [29]:
t2 = cat.pivot_table(index=['GA_fullVisitorId'], 
                     columns='tier2',
                     values=['timeOnPage', 'GA_pageViews'], 
                     aggfunc='sum', 
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t2.GA_fullVisitorId)

# calc avg. top
t2_top = t2["timeOnPage"]/t2["GA_pageViews"]
t2_top = t2_top.fillna(0)

t2_top["GA_fullVisitorId"] = fvids
t2_top = pd.merge(t2_top, target_class, how="left", on="GA_fullVisitorId")

top_eda(t2_top).rename(columns={'non_subscriber': 'non_subscriber avg. top', 'subscriber': 'subscriber avg. top'}).head(15)

subscription_status,non_subscriber avg. top,subscriber avg. top,non_subscriber_rank,subscriber_rank
Business,7.72,100.84,2,1
Industries,4.24,75.79,7,2
Computing,9.7,54.85,1,3
Economy,5.69,54.83,5,4
Personal Investing,4.19,52.46,8,5
Diseases and Conditions,4.9,48.25,6,6
Politics,3.54,37.99,11,7
Vaccines,3.65,24.47,10,8
Travel Type,1.74,23.52,15,9
Consumer Electronics,6.38,22.75,4,10


### Avg. article views in each month

In [41]:
user_per_mon = pd.DataFrame(content.groupby(['GA_fullVisitorId', 'subscription_status', 'mon_year']).GA_pageViews.sum()).reset_index().rename(columns=
                                                                                                                                                   {'GA_pageViews':'total_article_views'})
user_per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,mon_year,total_article_views
0,1000000948120006249,non_subscriber,2021-10,1
1,10000078774942201969,non_subscriber,2021-08,1
2,10000078774942201969,non_subscriber,2021-09,2
3,10000078774942201969,non_subscriber,2021-10,2
4,10000093321793562421,non_subscriber,2021-10,1
...,...,...,...,...
1018327,9999428991944010204,subscriber,2021-09,3
1018328,9999453033634739955,non_subscriber,2021-10,1
1018329,9999617506304669367,non_subscriber,2021-10,1
1018330,9999621821965685242,non_subscriber,2021-10,1


In [42]:
per_mon =  pd.DataFrame(user_per_mon.groupby(['GA_fullVisitorId', 'subscription_status']).total_article_views.mean()).reset_index() 

per_mon.GA_cmsNaturalId = per_mon.total_article_views.round() 
per_mon

Unnamed: 0,GA_fullVisitorId,subscription_status,total_article_views
0,1000000948120006249,non_subscriber,1.00
1,10000078774942201969,non_subscriber,1.67
2,10000093321793562421,non_subscriber,1.00
3,10000127776222822445,non_subscriber,1.00
4,10000206856643135086,non_subscriber,1.25
...,...,...,...
501296,9999428991944010204,subscriber,3.50
501297,9999453033634739955,non_subscriber,1.00
501298,9999617506304669367,non_subscriber,1.00
501299,9999621821965685242,non_subscriber,1.00


* Avg(actual articles per month)
    * Subs on whole have been reading more articles in a month than non-subs

In [43]:
per_mon.groupby('subscription_status').total_article_views.describe().T 

subscription_status,non_subscriber,subscriber
count,409988.0,91313.0
mean,1.37,13.46
std,0.82,25.88
min,1.0,1.0
25%,1.0,4.33
50%,1.0,8.0
75%,1.5,14.8
max,89.67,2849.55


### Bounce rate

In [44]:
print(session.shape)
print(len(session.GA_fullVisitorId.unique()))

(3296972, 5)
509084


In [45]:
def b_rate(g):
    '''for each fvid: calculate percentage of sessions comprising only 1PV'''
    
    # count sessions w/ pv = 1
    sessions_w_1pv = g[g['unique_pageViews']==1].shape[0]
    
    # count total sessions
    total_sessions = g.shape[0]
    
    # calculate ratio
    return (sessions_w_1pv)/total_sessions

In [47]:
br = pd.DataFrame(session.groupby('GA_fullVisitorId').apply(lambda x: b_rate(x))).reset_index().rename(columns={0:'bounce_rate'}) # takes 3mins 10.07

In [48]:
br = pd.merge(br, target_class, how="left", on="GA_fullVisitorId")
br

Unnamed: 0,GA_fullVisitorId,bounce_rate,subscription_status
0,1000000948120006249,1.00,non_subscriber
1,10000078774942201969,0.67,non_subscriber
2,10000093321793562421,1.00,non_subscriber
3,10000127776222822445,1.00,non_subscriber
4,10000206856643135086,1.00,non_subscriber
...,...,...,...
509079,9999428991944010204,0.81,subscriber
509080,9999453033634739955,1.00,non_subscriber
509081,9999617506304669367,1.00,non_subscriber
509082,9999621821965685242,1.00,non_subscriber


* Non-subs have higher bounce rate than subs

In [49]:
br.groupby('subscription_status').bounce_rate.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,0.98,0.55
std,0.1,0.3
min,0.0,0.0
25%,1.0,0.37
50%,1.0,0.6
75%,1.0,0.77
max,1.0,1.0


### Content views rate

In [61]:
natid_page_map = df[['GA_pagePath', 'GA_cmsNaturalId', 'publish_date']].sort_values('publish_date', ascending=False).drop_duplicates('GA_pagePath')

In [62]:
page = pd.merge(page, 
                natid_page_map, 
                how="left", 
                on="GA_pagePath")
page

Unnamed: 0,GA_fullVisitorId,GA_visitStartTime,GA_pagePath,GA_pageViews,GA_scrollDepth,timeOnPage,GA_cmsNaturalId,publish_date
0,1000000948120006249,1635456756,/sites/abigailabesamis/2021/10/22/this-lazy-pe...,1,0.25,521.00,blogandpostid/blog/post/6190-6172f0890a6329000...,2021-10-22 13:27:54
1,10000078774942201969,1629090740,/sites/williamhaseltine/2021/08/10/it-is-time-...,1,0.25,15.00,blogandpostid/blog/post/5566-61126dd6dd43d6000...,2021-08-10 08:55:52
2,10000078774942201969,1632746707,/sites/deloitte/2021/08/03/to-mitigate-risk-in...,1,,31.00,blogandpostid/blog/post/3585-610964dfb4025a000...,2021-08-03 12:30:19
3,10000078774942201969,1632746707,/sites/ellevate/2013/10/18/what-to-expect-duri...,1,1.00,152.00,blogandpostid/blog/post/1386-2670,2013-10-18 11:26:00
4,10000078774942201969,1634413951,/sites/forbes-personal-shopper/2021/09/21/dagn...,1,0.75,30.00,blogandpostid/blog/post/4983-6148df6746a14f000...,2021-09-21 13:14:04
...,...,...,...,...,...,...,...,...
5400712,9999428991944010204,1631618760,/sites/leahcampbell/2021/06/26/moderna-or-pfiz...,1,0.25,22.00,blogandpostid/blog/post/50736-60d79acc0157d100...,2021-06-26 17:26:44
5400713,9999453033634739955,1633461706,/sites/japan/2020/10/30/powered-by-entrepreneu...,1,0.75,97.00,blogandpostid/blog/post/5436-5f9c3ef5d05362000...,2020-10-30 12:41:37
5400714,9999617506304669367,1633783846,/sites/zackfriedman/2021/10/09/why-student-loa...,1,0.50,52.00,blogandpostid/blog/post/4804-615e63fbd2392d000...,2021-10-09 08:30:00
5400715,9999621821965685242,1633456653,/sites/paultassi/2021/10/01/cdpr-warns-investo...,1,0.50,12.00,blogandpostid/blog/post/1174-615706f3accd12000...,2021-10-01 09:06:06


In [70]:
def c_views_rate(g):
    '''for each fvid: calculate percentage PVs that are actually views on content pages; vs non-content pages such as the home page, channel/section landing pages, author pages, etc.'''
    
    # sum pvs on actual content for user
    content_sum_pv = g[g.GA_cmsNaturalId.str.contains("blogandpostid|blogandslideid|galleryid|video")].GA_pageViews.sum()
    
    # sum all pvs for user
    total_pv = g.GA_pageViews.sum()
    
    # calculate ratio
    return (content_sum_pv)/total_pv

In [72]:
cvr = pd.DataFrame(page.groupby('GA_fullVisitorId').apply(lambda x: c_views_rate(x))).reset_index().rename(columns={0:'content_views_rate'}) # takes 5mins

In [71]:
cvr = pd.merge(cvr, target_class, how="left", on="GA_fullVisitorId")
cvr

* Non-subs have higher content views rate i.e., they mostly come for just the content. Subs roam around on the website

In [78]:
cvr.groupby('subscription_status').content_views_rate.describe().T

subscription_status,non_subscriber,subscriber
count,409988.0,99096.0
mean,0.99,0.7
std,0.05,0.29
min,0.01,0.0
25%,1.0,0.55
50%,1.0,0.78
75%,1.0,0.94
max,1.0,1.0


In [82]:
# both these metrics (bounce_rate and content_views_rate) have slight positive correlation

pd.merge(cvr, br, how="inner", on = "GA_fullVisitorId").corr()

Unnamed: 0,content_views_rate,bounce_rate
content_views_rate,1.0,0.62
bounce_rate,0.62,1.0
