## Description:

This notebook contains:
* Reading in the raw input data from SQL (BigQuery) tables. An exhaustive list of the features contained in this raw input data is given below.

In [1]:
'''Helper'''
import pandas as pd
from functools import reduce
import numpy as np
import joblib
import datetime
import time
import re

'''GCS Utils'''
from gcs_utils import *

'''Display'''
import warnings
warnings.filterwarnings('ignore') 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

'''BQ'''
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

# BigQuery client used by the query cells below (constructed with no explicit project/credentials)
bqclient = bigquery.Client()
# BigQuery Storage read client, passed as `bqstorage_client` to `to_dataframe` in the query cells below
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [2]:
def convert_time(time):
    """Format a Unix epoch timestamp (in seconds) as a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    The conversion is pinned to UTC instead of the kernel's local timezone,
    because the timestamp cell later in this notebook localizes the parsed
    result with `tz_localize('UTC')`. Without the explicit `tz`, the output
    would be shifted by the machine's UTC offset on any non-UTC kernel.

    Parameters
    ----------
    time : int or float
        Seconds since the Unix epoch. (The parameter shadows the `time`
        module inside this function only; the name is kept for callers.)

    Returns
    -------
    str
        The UTC wall-clock time formatted as '%Y-%m-%d %H:%M:%S'.
    """
    utc_moment = datetime.datetime.fromtimestamp(time, tz=datetime.timezone.utc)
    return utc_moment.strftime('%Y-%m-%d %H:%M:%S')

### Data from SQL Tables

In [3]:
# Load every subscriber row from BigQuery and time the download.
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_subs_ga`
"""

subs_query_job = bqclient.query(query_string)
subs_data = subs_query_job.result().to_dataframe(bqstorage_client=bqstorageclient)

print("--- %s seconds ---" % (time.time() - start_time))

# Label the rows, drop the id columns that are not needed downstream and
# rename the Piano id column.
subs_data = (
    subs_data
    .assign(subscription_status="subscriber")
    .drop(columns=['user_id_uid', 'resource_id_rid'])
    .rename(columns={'ga_pianoId': 'piano_id'})
)

print("Shape: ", subs_data.shape)

--- 34.20732355117798 seconds ---
Shape:  (6556736, 23)


In [4]:
# Load every non-subscriber row from BigQuery and time the download.
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_nonsubs_ga`
"""

nonsubs_query_job = bqclient.query(query_string)
nonsubs_data = nonsubs_query_job.result().to_dataframe(bqstorage_client=bqstorageclient)

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Before:", nonsubs_data.shape)

# Some fvids have no pianoID but a dfpZone containing 'subscriber'.
# Treat them as suspicious and remove all of their rows.
nonsubs_data["GA_dfpNewZone"] = nonsubs_data["GA_dfpNewZone"].fillna('none')

in_subscriber_zone = nonsubs_data["GA_dfpNewZone"].str.contains('/subscriber/')
suspicious_fvid = nonsubs_data.loc[in_subscriber_zone, "GA_fullVisitorId"].unique()

is_suspicious = nonsubs_data["GA_fullVisitorId"].isin(suspicious_fvid)
nonsubs_data = nonsubs_data[~is_suspicious]

print("After removing ",
      len(suspicious_fvid),
      "suspicious fvids:", nonsubs_data.shape)

--- 15.373432159423828 seconds ---
Before: (1192679, 23)
After removing  10 suspicious fvids: (1192343, 23)


In [6]:
# Stack subscriber and non-subscriber rows into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own row labels, so the combined frame would carry
# duplicate index values and any later label-based alignment would be ambiguous.
df = pd.concat([subs_data, nonsubs_data], ignore_index=True)

print("Shape: ", df.shape, "\n")

print("Total fvids in df: ", len(df.GA_fullVisitorId.unique()), "\n")

# one row per (fvid, status) pair -> number of visitors in each class
print(df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts(), "\n")

Shape:  (7749079, 23) 

Total fvids in df:  507937 

non_subscriber    409990
subscriber         97947
Name: subscription_status, dtype: int64 



In [7]:
# missing-value count per column, before imputation
df.isnull().sum()

piano_id                    1192343
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth               866698
timeOnPage                   142767
GA_cmsNaturalId                   0
title                       2366498
publish_date                2366497
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                       3342473
tier2                       3696489
subscription_status               0
dtype: int64

In [9]:
# Sanity check: total pageviews and total time on page per visitor,
# then the pageview distribution for each subscription status.
whole = (
    df.groupby(['GA_fullVisitorId', 'subscription_status'])
    .agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'})
    .reset_index()
    .rename(columns={'GA_pageViews': 'sum_pvs'})
)
whole["avg_top"] = whole['timeOnPage'] / whole['sum_pvs']

pvs_by_status = whole.groupby('subscription_status')['sum_pvs'].describe()
# drop the first summary row (count) and show statuses as columns
pvs_by_status.T.iloc[1:]

subscription_status,non_subscriber,subscriber
mean,2.91,66.94
std,5.34,358.19
min,1.0,1.0
25%,1.0,9.0
50%,1.0,22.0
75%,3.0,56.0
max,492.0,46250.0


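* **Imputation**
    * Missing time on page → 0
    * Missing natural ID (`GA_cmsNaturalId`) → `none`
    * Missing or empty Tier 1 / Tier 2 → `other`
    * Primary channel (PC), primary section (PS), country and device OS are restricted to shortlists; every other value → `other`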

In [7]:
# missing natural ids become the literal 'none'
df["GA_cmsNaturalId"] = df["GA_cmsNaturalId"].fillna('none')

# missing time on page counts as zero
df["timeOnPage"] = df["timeOnPage"].fillna(0)

In [8]:
# Shortlist of primary channels kept as their own category; any value not in
# this list is mapped to "other" in the imputation cell below.
# NOTE - 'none' is not included, so it is also mapped to "other".
shortlisted_channel = ['business', 'leadership', 'money', 'innovation', 'lifestyle', 'home', 
                       'billionaires', 'small business', 'consumer', 'shopping', 'industry', 'investing', 
                       'tech', 'entrepreneurs', 'newsletters', 'asia', 'opinion', 'real estate', 'lists', 
                       'forbes finds', 'under 30', 'advisor', 'video']

# Shortlist of primary sections kept as their own category; any value not in
# this list is mapped to "other" in the imputation cell below.
# NOTE - 'none' is not included, so it is also mapped to "other".
shortlisted_section = ['careers', 'personal finance', 'markets', 'forbeswomen', 'leadership strategy', 'healthcare', 'travel', 'sportsmoney', 'retail', 'entrepreneurs', 'science', 
                        'taxes', 'policy', 'consumer tech', 'investing', 'retirement', 'education', 'cmo network', 'real estate', 'hollywood & entertainment', 'cybersecurity', 
                        'aerospace & defense', 'diversity & inclusion', 'energy', 'food & drink', 'enterprise & cloud', 'enterprise tech', 'transportation', 'crypto & blockchain', 'games', 
                        'money & politics', 'media', 'fintech', 'venture capital', 'forbeslife', 'vices', 'manufacturing', 'small business strategy', 'hedge funds & private equity', 
                        'arts', 'ai', 'cio network', 'cars & bikes', 'banking & insurance', 'cfo network', 'spirits', 'cloud', 'dining', 'confirmation', 'wealth management']

# Device operating systems kept as their own category (all lowercase, compared with exact match via isin)
shortlisted_os = ["android", "ios", "macintosh", "windows"]

# Countries kept as their own category (all lowercase, compared with exact match via isin)
shortlisted_country = ['united states', 'russia', 'canada', 'united kingdom', 'japan', 'australia', 'india', 'singapore', 'germany', 'philippines']

In [9]:
# Tier labels: blank / whitespace-only strings and NULLs both become "other"
for tier_col in ("tier1", "tier2"):
    df[tier_col] = df[tier_col].replace(r'^\s*$', "other", regex=True).fillna("other")

# Shortlisted columns: keep values on the shortlist, collapse the rest into "other"
shortlist_by_column = {
    "GA_primaryChannel": shortlisted_channel,
    "GA_primarySection": shortlisted_section,
    "GA_deviceOperatingSystem": shortlisted_os,
    "GA_country": shortlisted_country,
}
for col_name, allowed_values in shortlist_by_column.items():
    df[col_name] = np.where(df[col_name].isin(allowed_values), df[col_name], "other")

In [10]:
# missing-value count per column, after imputation
df.isnull().sum()

piano_id                    1192343
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth               866698
timeOnPage                        0
GA_cmsNaturalId                   0
title                       2366498
publish_date                2366497
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                             0
tier2                             0
subscription_status               0
dtype: int64

In [11]:
# Set the target aside: one row per distinct (visitor, subscription status) pair
fvid_status_pairs = df[['GA_fullVisitorId', 'subscription_status']]
target_class = fvid_status_pairs.drop_duplicates(keep='first')

### Training Data - Features

**Numerical Features**
1. User metrics
    * unique pageviews per session (avg, median)
    * timeOnPage per session (avg, median) <br><br>
2. Session metrics
    * bounceRate  <br><br>
3. Content metrics
    * contentViewsRate (contentViews / pageViews) <br>

**Categorical Features**
1. Content categories 
    * Sum pvs - Tier1, Tier2, Primary Channel, Primary Section
    * Avg. top - Tier1 <br><br>
2. Timestamp features: 

    * Weekday vs Weekend 
        * Sum pvs & avg. top
    * Business Hours vs Non-Business Hours *(Differentiated by US and other countries)*
        * Sum pvs & avg. top
    * Day of week
        * avg. top
    * Day of month
        * avg. top
    * Hourly *(Differentiated by US and other countries)*
        * avg. top
    * Minute *(Differentiated by US and other countries)*
        * avg. top
    * Month — **TODO:** check with Rob; may not be available at prediction time<br><br>
3. Device OS - Sum pvs
4. Referral source - Sum pvs
5. Countries - Sum pvs<br><br>

NOTE: Keeping sum as the aggregation metric for pvs in categorical features for now so that during modeling different treatments can be tried like Percentage of pvs or avg. pvs or one-hot encoding
<br>

**Extra features for v2:**
* Avg. monthly article views 
* Avg. top per person - Tier2
* Timestamp features: (Percentage of pvs) -- 
    * Hourly
    * Day of week
    * Month
    * Day of month
    * Minute  

In [13]:
# Page level: one row per (visitor, session, page path) with the max pageview
# value and the summed time on page for that page.
page = (
    df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath'])
    .agg({'GA_pageViews': 'max', 'timeOnPage': 'sum'})
    .reset_index()
)

# Session level: one row per (visitor, session) with summed page-level
# pageviews and the mean page-level time on page.
session = (
    page.groupby(['GA_fullVisitorId', 'GA_visitStartTime'])
    .agg(session_pvs=('GA_pageViews', 'sum'), session_top=('timeOnPage', 'mean'))
    .reset_index()
)

* **Feature 1: Users' Unique Pageviews in each session (avg, median)**

In [14]:
# mean and median of session pageviews for each visitor
pvs = (
    session.groupby('GA_fullVisitorId')['session_pvs']
    .agg(['mean', 'median'])
    .add_prefix('session_pvs_')   # -> session_pvs_mean, session_pvs_median
    .reset_index()
)
pvs

Unnamed: 0,GA_fullVisitorId,session_pvs_mean,session_pvs_median
0,10000145548747950113,1.00,1.00
1,10000150578546110250,1.00,1.00
2,10000154461756058018,1.00,1.00
3,10000244735129548321,1.00,1.00
4,10000256049462665870,1.00,1.00
...,...,...,...
507932,9999935617354687741,1.33,1.00
507933,9999969986945834961,1.00,1.00
507934,9999974732591335515,1.00,1.00
507935,9999975861633509300,1.00,1.00


* **Feature 2:  Users' Time on Page in each session (avg, median)**

In [15]:
# mean and median of session time on page for each visitor
top = (
    session.groupby('GA_fullVisitorId')['session_top']
    .agg(['mean', 'median'])
    .add_prefix('session_top_')   # -> session_top_mean, session_top_median
    .reset_index()
)

top

Unnamed: 0,GA_fullVisitorId,session_top_mean,session_top_median
0,10000145548747950113,454.00,454.00
1,10000150578546110250,14.50,12.00
2,10000154461756058018,52.00,52.00
3,10000244735129548321,32.00,32.00
4,10000256049462665870,15.00,15.00
...,...,...,...
507932,9999935617354687741,44.83,63.50
507933,9999969986945834961,262.33,143.00
507934,9999974732591335515,42.00,42.00
507935,9999975861633509300,60.00,60.00


In [16]:
def sum_pvs(input_df, cat_col_name):
    """Sum pageviews per visitor, spread into one column per category value.

    Parameters
    ----------
    input_df : DataFrame
        Must contain 'GA_fullVisitorId', 'GA_pageViews' and `cat_col_name`.
    cat_col_name : str
        Column whose distinct values become the output columns.

    Returns
    -------
    DataFrame
        One row per visitor: 'GA_fullVisitorId' followed by one column per
        category holding the summed 'GA_pageViews'. Visitor/category
        combinations absent from `input_df` are filled with 0.
    """
    per_user = input_df.pivot_table(
        index='GA_fullVisitorId',
        columns=cat_col_name,
        values='GA_pageViews',
        aggfunc='sum',
    )

    # move the visitor id from the index into a regular column
    wide = per_user.reset_index()

    # combinations with no rows come out of the pivot as NaN
    return wide.fillna(0)

In [17]:
def calc_top(input_df, cat_col_name):
    """Average time on page per pageview, per visitor and category value.

    For every visitor and every distinct value of `cat_col_name`, the summed
    'timeOnPage' is divided by the summed 'GA_pageViews'.

    Parameters
    ----------
    input_df : DataFrame
        Must contain 'GA_fullVisitorId', 'timeOnPage', 'GA_pageViews' and
        `cat_col_name`.
    cat_col_name : str
        Column whose distinct values become the output columns.

    Returns
    -------
    DataFrame
        One row per visitor on a 0..n-1 index: one column per category value
        followed by a trailing 'GA_fullVisitorId' column. A visitor/category
        combination with no rows, or with zero summed pageviews, gets 0.
    """
    # pivot on user: per-visitor sums, columns are (measure, category) pairs
    totals = pd.pivot_table(
        input_df,
        values=['timeOnPage', 'GA_pageViews'],
        columns=cat_col_name,
        index='GA_fullVisitorId',
        aggfunc='sum')

    # calc avg. top (frames align on visitor and on category)
    df_top = totals["timeOnPage"] / totals["GA_pageViews"]

    # time > 0 over 0 pageviews yields +/-inf, which fillna alone would let
    # through; treat it like a missing combination
    df_top = df_top.replace([np.inf, -np.inf], np.nan).fillna(0)

    # positional index, with the visitor ids appended as the last column
    df_top = df_top.reset_index(drop=True)
    df_top["GA_fullVisitorId"] = totals.index.to_numpy()

    return df_top

* **Feature 3: Referral sources - sum pvs**

In [18]:
# per user, summed pageviews for each referral group; feature columns get an 'rf_' prefix
ref = sum_pvs(df, 'GA_referralGroup').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else 'rf_' + label
)

ref

GA_referralGroup,GA_fullVisitorId,rf_content aggregators,rf_direct,rf_fbia,rf_newsletter,rf_organic search,rf_organic social (dark),rf_organic social (forbes),rf_paid display,rf_paid search,rf_paid social (dark),rf_paid social (forbes),rf_paid web,rf_referral
0,10000145548747950113,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,0.00,3.00,0.00,0.00,3.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 4: Country - sum pvs**

In [19]:
# top countries for subscribers and for non-subscribers, kept as separate lists
subs_top_ct = ['united states', 'russia', 'canada', 'united kingdom', 'japan']
nonsubs_top_ct = ['australia', 'india', 'singapore', 'germany', 'philippines']

# subscriber countries first, then non-subscriber countries
shortlisted_countries = [*subs_top_ct, *nonsubs_top_ct]

In [20]:
# per user, summed pageviews for each country; feature columns get a 'ct_' prefix
country = sum_pvs(df, 'GA_country').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else 'ct_' + label
)
country

GA_country,GA_fullVisitorId,ct_australia,ct_canada,ct_germany,ct_india,ct_japan,ct_other,ct_philippines,ct_russia,ct_singapore,ct_united kingdom,ct_united states
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4.00
507933,9999969986945834961,0.00,6.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


* **Feature 5: Device OS - sum pvs**

In [21]:
# per user, summed pageviews for each device OS; feature columns get a 'dos_' prefix
device_os = sum_pvs(df, 'GA_deviceOperatingSystem').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else 'dos_' + label
)

device_os

GA_deviceOperatingSystem,GA_fullVisitorId,dos_android,dos_ios,dos_macintosh,dos_other,dos_windows
0,10000145548747950113,0.00,1.00,0.00,0.00,0.00
1,10000150578546110250,0.00,4.00,0.00,0.00,0.00
2,10000154461756058018,1.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,1.00,0.00,0.00,0.00
4,10000256049462665870,0.00,1.00,0.00,0.00,0.00
...,...,...,...,...,...,...
507932,9999935617354687741,4.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,6.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,1.00,0.00,0.00,0.00
507935,9999975861633509300,1.00,0.00,0.00,0.00,0.00


### Content categories
* Content categories = IAB Tier 1, Tier 2, PC (shortlisted), PS (shortlisted)

* **Feature 6: Tier 1 - sum pvs**

In [22]:
print("Unique Tier 1: ", len(df['tier1'].unique()))

# per user, summed pageviews for each tier1 value
# (missing/empty tier1 was mapped to "other" during imputation, so it appears as its own column)
t1 = sum_pvs(df, 'tier1').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f't1_{label}_pvs'
)
t1

Unique Tier 1:  31


tier1,GA_fullVisitorId,t1_Automotive_pvs,t1_Books and Literature_pvs,t1_Business and Finance_pvs,t1_Careers_pvs,t1_Content Channel_pvs,t1_Education_pvs,t1_Events and Attractions_pvs,t1_Family and Relationships_pvs,t1_Fine Art_pvs,...,t1_Religion & Spirituality_pvs,t1_Science_pvs,t1_Shopping_pvs,t1_Sports_pvs,t1_Style & Fashion_pvs,t1_Technology & Computing_pvs,t1_Television_pvs,t1_Travel_pvs,t1_Video Gaming_pvs,t1_other_pvs
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,3.00,0.00,0.00,0.00,1.00
507933,9999969986945834961,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,2.00,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


* **Feature 7: Tier 2 - sum pvs**

In [23]:
print("Unique Tier 2: ", len(df['tier2'].unique()))

# per user, summed pageviews for each tier2 value
# (missing/empty tier2 was mapped to "other" during imputation, so it appears as its own column)
t2 = sum_pvs(df, 'tier2').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f't2_{label}_pvs'
)
t2

Unique Tier 2:  323


tier2,GA_fullVisitorId,t2_Action and Adventure Movies_pvs,t2_Adult Contemporary Music_pvs,t2_Adult Education_pvs,t2_Alcoholic Beverages_pvs,t2_Alternative Music_pvs,t2_American Football_pvs,t2_Amusement and Theme Parks_pvs,t2_Animation Movies_pvs,t2_Animation TV_pvs,...,t2_Women's Fashion_pvs,t2_Women's Health_pvs,t2_Workshops and Classes_pvs,t2_World Cuisines_pvs,t2_World Movies_pvs,t2_Wrestling_pvs,t2_Young Adult Literature_pvs,t2_Zoos & Aquariums_pvs,t2_eSports_pvs,t2_other_pvs
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
507933,9999969986945834961,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00


* **Feature 8: Prim Channel- sum pvs**

In [24]:
# per user, summed pageviews for each (shortlisted) primary channel
pc = sum_pvs(df, 'GA_primaryChannel').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f'pc_{label}_pvs'
)
pc

GA_primaryChannel,GA_fullVisitorId,pc_advisor_pvs,pc_asia_pvs,pc_billionaires_pvs,pc_business_pvs,pc_consumer_pvs,pc_entrepreneurs_pvs,pc_forbes finds_pvs,pc_home_pvs,pc_industry_pvs,...,pc_money_pvs,pc_newsletters_pvs,pc_opinion_pvs,pc_other_pvs,pc_real estate_pvs,pc_shopping_pvs,pc_small business_pvs,pc_tech_pvs,pc_under 30_pvs,pc_video_pvs
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,3.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 9: Prim Section - sum pvs**

In [25]:
# per user, summed pageviews for each (shortlisted) primary section
ps = sum_pvs(df, 'GA_primarySection').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f'ps_{label}_pvs'
)
ps

GA_primarySection,GA_fullVisitorId,ps_aerospace & defense_pvs,ps_ai_pvs,ps_arts_pvs,ps_banking & insurance_pvs,ps_careers_pvs,ps_cars & bikes_pvs,ps_cfo network_pvs,ps_cio network_pvs,ps_cloud_pvs,...,ps_science_pvs,ps_small business strategy_pvs,ps_spirits_pvs,ps_sportsmoney_pvs,ps_taxes_pvs,ps_transportation_pvs,ps_travel_pvs,ps_venture capital_pvs,ps_vices_pvs,ps_wealth management_pvs
0,10000145548747950113,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000150578546110250,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
2,10000154461756058018,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,10000244735129548321,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,10000256049462665870,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,9999935617354687741,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507933,9999969986945834961,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507934,9999974732591335515,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
507935,9999975861633509300,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 10: Tier 1 - Avg. time on page**

In [26]:
# per user, average time on page for each tier1 value; columns become 't1_<tier>_top'
t1_top = calc_top(df, 'tier1').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f't1_{label}_top'
)

t1_top

tier1,t1_Automotive_top,t1_Books and Literature_top,t1_Business and Finance_top,t1_Careers_top,t1_Content Channel_top,t1_Education_top,t1_Events and Attractions_top,t1_Family and Relationships_top,t1_Fine Art_top,t1_Food & Drink_top,...,t1_Science_top,t1_Shopping_top,t1_Sports_top,t1_Style & Fashion_top,t1_Technology & Computing_top,t1_Television_top,t1_Travel_top,t1_Video Gaming_top,t1_other_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,454.00,0.00,0.00,0.00,10000145548747950113
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,34.00,0.00,10000150578546110250
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,15.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,66.00,0.00,0.00,0.00,0.00,9999935617354687741
507933,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,263.50,0.00,0.00,0.00,261.75,0.00,0.00,0.00,0.00,9999969986945834961
507934,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,42.00,0.00,0.00,0.00,0.00,9999974732591335515
507935,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,60.00,9999975861633509300


* **Feature 11: Bounce rate** 

In [27]:
def b_rate(g):
    '''Return the fraction of rows in `g` whose `session_pvs` equals 1.

    `g` holds the session rows of one fvid, so the result is that visitor's
    share of sessions made up of a single pageview.
    '''
    is_single_pv = g['session_pvs'] == 1

    # plain Python ints, so the division below is ordinary float division
    n_single_pv = int(is_single_pv.sum())
    n_sessions = len(g)

    return n_single_pv / n_sessions

# Bounce rate per visitor = share of that visitor's sessions with exactly one pageview.
# The mean of the boolean "session_pvs == 1" flag within each visitor is that
# share, and it is computed in one vectorized pass instead of calling a Python
# function once per visitor through groupby.apply.
br = (
    session['session_pvs'].eq(1)
    .groupby(session['GA_fullVisitorId'])
    .mean()
    .rename('bounce_rate')
    .reset_index()
)
br

Unnamed: 0,GA_fullVisitorId,bounce_rate
0,10000145548747950113,1.00
1,10000150578546110250,1.00
2,10000154461756058018,1.00
3,10000244735129548321,1.00
4,10000256049462665870,1.00
...,...,...
507932,9999935617354687741,0.67
507933,9999969986945834961,1.00
507934,9999974732591335515,1.00
507935,9999975861633509300,1.00


* **Feature 12: Content views rate**

In [28]:
# One natid per page path: sort by publish_date descending and keep the first
# row seen for each path.
natid_page_map = (
    df[['GA_pagePath', 'GA_cmsNaturalId', 'publish_date']]
    .sort_values('publish_date', ascending=False)
    .drop_duplicates('GA_pagePath')
)

# join page path with their natids
page = page.merge(natid_page_map, how="left", on="GA_pagePath")

In [29]:
def c_views_rate(g):
    '''For each fvid: calculate the fraction of PVs that are views on content pages,
    as opposed to non-content pages such as the home page, channel/section
    landing pages, author pages, etc.

    `g` holds the page rows of one fvid and must expose `GA_cmsNaturalId` and
    `GA_pageViews`. A row counts as content when its natid matches one of the
    content id patterns below. Rows with a missing natid are treated as
    non-content (`na=False`); without that, a missing value would make the
    boolean mask invalid and raise.
    '''

    # rows whose natid marks them as actual content
    is_content = g.GA_cmsNaturalId.str.contains("blogandpostid|blogandslideid|galleryid|video", na=False)

    # sum pvs on actual content for user
    content_sum_pv = g[is_content].GA_pageViews.sum()

    # sum all pvs for user
    total_pv = g.GA_pageViews.sum()

    # calculate ratio
    return (content_sum_pv)/total_pv


# Content views rate per visitor = pageviews on content pages / all pageviews.
# Computed with two vectorized groupby sums instead of calling a Python
# function once per visitor through groupby.apply.
is_content_page = page['GA_cmsNaturalId'].str.contains(
    "blogandpostid|blogandslideid|galleryid|video", na=False  # missing natid -> not content
)
content_pvs_per_user = (
    page['GA_pageViews'].where(is_content_page, 0)   # zero out non-content rows
    .groupby(page['GA_fullVisitorId'])
    .sum()
)
total_pvs_per_user = page['GA_pageViews'].groupby(page['GA_fullVisitorId']).sum()

cvr = (
    (content_pvs_per_user / total_pvs_per_user)
    .rename('content_views_rate')
    .reset_index()
)
cvr

Unnamed: 0,GA_fullVisitorId,content_views_rate
0,10000145548747950113,1.00
1,10000150578546110250,1.00
2,10000154461756058018,1.00
3,10000244735129548321,1.00
4,10000256049462665870,1.00
...,...,...
507932,9999935617354687741,1.00
507933,9999969986945834961,1.00
507934,9999974732591335515,1.00
507935,9999975861633509300,1.00


### Timestamp features

In [30]:
# ---- calendar features, derived from GA_date ----
df['GA_date'] = pd.to_datetime(df['GA_date'], errors='coerce')

df['dayofweek'] = df.GA_date.dt.day_name()

week_dict = {True: "weekday", False: "weekend"}

# dt.dayofweek is 0 (Monday) .. 6 (Sunday); `// 5` equals 1 only for Saturday/Sunday
df['weekday'] = ((df.GA_date.dt.dayofweek) // 5 != 1).astype("category")
df['weekday'] = df['weekday'].map(week_dict)

df['day'] = df.GA_date.dt.day
df['month'] = df.GA_date.dt.month

# ---- clock features, derived from the session start time ----
# GA_visitStartTime is read as epoch seconds. to_datetime(unit='s') yields naive
# UTC timestamps directly, so the result no longer depends on the kernel's
# local timezone and avoids formatting/re-parsing a string for every row.
# floor('s') keeps whole-second resolution.
df['session_time'] = pd.to_datetime(df['GA_visitStartTime'], unit='s', errors='coerce').dt.floor('s')

df['est_time'] = df['session_time'].dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

# for US, take hour/minute from the US/Eastern conversion. Else take them from UTC (GMT)
is_us = df['GA_country'] == "united states"

df['hour'] = np.where(is_us,
                      df.est_time.dt.hour,
                      df.session_time.dt.hour)

df['minute'] = np.where(is_us,
                        df.est_time.dt.minute,
                        df.session_time.dt.minute)

# business hours = 08:00 up to (not including) 18:00 on the hour computed above
df['business_hours'] = np.where((df['hour'] >= 8) & (df['hour'] < 18), 'business_hours', 'non_business_hours')

* **Feature 13:  Weekend/Weekday - Sum pvs**

In [31]:
# per user, sum pvs in categories
wk_df = sum_pvs(df, 'weekday')

wk_df.columns = wk_df.columns.map(lambda x : str(x) +'_pvs' if x !='GA_fullVisitorId' else x)
wk_df

weekday,GA_fullVisitorId,weekend_pvs,weekday_pvs
0,10000145548747950113,0,1
1,10000150578546110250,0,4
2,10000154461756058018,1,0
3,10000244735129548321,0,1
4,10000256049462665870,0,1
...,...,...,...
507932,9999935617354687741,3,1
507933,9999969986945834961,0,6
507934,9999974732591335515,0,1
507935,9999975861633509300,0,1


* **Feature 14:  Weekend/Weekday - Avg. top**

In [32]:
# per user, avg. time on page in the weekday / weekend categories
wk_top = calc_top(df, 'weekday')

# suffix every category column with '_top'; the visitor id column keeps its name
wk_top.columns = wk_top.columns.map(lambda x : str(x) + '_top' if x !='GA_fullVisitorId' else x)

wk_top

weekday,weekend_top,weekday_top,GA_fullVisitorId
0,0.00,454.00,10000145548747950113
1,0.00,14.50,10000150578546110250
2,52.00,0.00,10000154461756058018
3,0.00,32.00,10000244735129548321
4,0.00,15.00,10000256049462665870
...,...,...,...
507932,42.33,71.00,9999935617354687741
507933,0.00,262.33,9999969986945834961
507934,0.00,42.00,9999974732591335515
507935,0.00,60.00,9999975861633509300


* **Feature 15: Busi vs Non-Busi hours - Sum pvs**

In [33]:
# per user, sum pvs in categories
busi_df = sum_pvs(df, 'business_hours')

busi_df.columns = busi_df.columns.map(lambda x : str(x) +'_pvs' if x !='GA_fullVisitorId' else x)
busi_df

business_hours,GA_fullVisitorId,business_hours_pvs,non_business_hours_pvs
0,10000145548747950113,0.00,1.00
1,10000150578546110250,1.00,3.00
2,10000154461756058018,1.00,0.00
3,10000244735129548321,0.00,1.00
4,10000256049462665870,0.00,1.00
...,...,...,...
507932,9999935617354687741,0.00,4.00
507933,9999969986945834961,3.00,3.00
507934,9999974732591335515,1.00,0.00
507935,9999975861633509300,0.00,1.00


* **Feature 16: Busi vs Non-Busi hours - Avg. top**

In [34]:
# per user, average time on page inside vs. outside business hours
busi_top = calc_top(df, 'business_hours').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f'{label}_top'
)

busi_top

business_hours,business_hours_top,non_business_hours_top,GA_fullVisitorId
0,0.00,454.00,10000145548747950113
1,2.00,18.67,10000150578546110250
2,52.00,0.00,10000154461756058018
3,0.00,32.00,10000244735129548321
4,0.00,15.00,10000256049462665870
...,...,...,...
507932,0.00,49.50,9999935617354687741
507933,175.67,349.00,9999969986945834961
507934,42.00,0.00,9999974732591335515
507935,0.00,60.00,9999975861633509300


* **Feature 17:  Day of week - Avg. top**

In [35]:
# per user, average time on page for each day name
dow_top = calc_top(df, 'dayofweek').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f'{label}_top'
)

dow_top

dayofweek,Friday_top,Monday_top,Saturday_top,Sunday_top,Thursday_top,Tuesday_top,Wednesday_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,454.00,0.00,10000145548747950113
1,0.00,17.00,0.00,0.00,12.00,0.00,0.00,10000150578546110250
2,0.00,0.00,52.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,0.00,32.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,15.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...
507932,0.00,0.00,0.00,42.33,71.00,0.00,0.00,9999935617354687741
507933,365.67,0.00,0.00,0.00,238.50,0.00,0.00,9999969986945834961
507934,0.00,42.00,0.00,0.00,0.00,0.00,0.00,9999974732591335515
507935,0.00,0.00,0.00,0.00,0.00,60.00,0.00,9999975861633509300


* **Feature 18: Day of month - Avg. top**

In [36]:
# per user, average time on page for each day of the month
dom_top = calc_top(df, 'day').rename(
    columns=lambda label: label if label == 'GA_fullVisitorId' else f'day_of_mon_{label}_top'
)

dom_top

day,day_of_mon_1_top,day_of_mon_2_top,day_of_mon_3_top,day_of_mon_4_top,day_of_mon_5_top,day_of_mon_6_top,day_of_mon_7_top,day_of_mon_8_top,day_of_mon_9_top,day_of_mon_10_top,...,day_of_mon_23_top,day_of_mon_24_top,day_of_mon_25_top,day_of_mon_26_top,day_of_mon_27_top,day_of_mon_28_top,day_of_mon_29_top,day_of_mon_30_top,day_of_mon_31_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,454.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000145548747950113
1,0.00,0.00,0.00,34.00,0.00,0.00,0.00,12.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000150578546110250
2,0.00,52.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,32.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,15.00,0.00,0.00,0.00,0.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,0.00,0.00,63.50,71.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999935617354687741
507933,0.00,0.00,0.00,0.00,365.67,0.00,0.00,0.00,12.00,0.00,...,0.00,465.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999969986945834961
507934,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999974732591335515
507935,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999975861633509300


* **Feature 16: Hourly - Avg. top**
    * For example, when country = US, hour 9 means 9 EST; for non-US countries, hour 9 means 9 GMT

In [37]:
# Per-visitor figures from calc_top, one column per hour of the day.
# Hour columns become 'hour_<hour>_top'; the visitor-id key keeps its name.
hour_top = calc_top(df, 'hour').rename(
    columns=lambda col: col if col == 'GA_fullVisitorId' else 'hour_' + str(col) + '_top'
)

hour_top

hour,hour_0_top,hour_1_top,hour_2_top,hour_3_top,hour_4_top,hour_5_top,hour_6_top,hour_7_top,hour_8_top,hour_9_top,...,hour_15_top,hour_16_top,hour_17_top,hour_18_top,hour_19_top,hour_20_top,hour_21_top,hour_22_top,hour_23_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,454.00,0.00,0.00,0.00,0.00,10000145548747950113
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,22.00,0.00,34.00,0.00,10000150578546110250
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,52.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,0.00,0.00,32.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,15.00,0.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,71.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,63.50,0.00,0.00,9999935617354687741
507933,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,465.00,0.00,0.00,12.00,811.00,224.00,0.00,9999969986945834961
507934,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,42.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999974732591335515
507935,60.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999975861633509300


* **Feature 17: Minute -  avg. top**

In [38]:
# Per-visitor figures from calc_top, one column per minute of the hour.
# Minute columns become 'minute_<minute>_top'; the visitor-id key keeps its name.
minute_top = calc_top(df, 'minute').rename(
    columns=lambda col: col if col == 'GA_fullVisitorId' else 'minute_' + str(col) + '_top'
)

minute_top

minute,minute_0_top,minute_1_top,minute_2_top,minute_3_top,minute_4_top,minute_5_top,minute_6_top,minute_7_top,minute_8_top,minute_9_top,...,minute_51_top,minute_52_top,minute_53_top,minute_54_top,minute_55_top,minute_56_top,minute_57_top,minute_58_top,minute_59_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000145548747950113
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,2.00,0.00,0.00,10000150578546110250
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000154461756058018
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000244735129548321
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000256049462665870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507932,0.00,0.00,0.00,0.00,0.00,71.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999935617354687741
507933,0.00,0.00,0.00,0.00,0.00,0.00,0.00,12.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999969986945834961
507934,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,42.00,0.00,0.00,0.00,0.00,9999974732591335515
507935,0.00,0.00,60.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999975861633509300


In [39]:
# # FEATURE (disabled): Per month - sum pvs ---- these features may not be present at prediction time? - discuss
# # NOTE(review): the heading says "sum pvs", but the disabled code below calls calc_top and
# # suffixes the columns with '_top', like the other "Avg. top" features - confirm which was intended
    
# # TODO: ask rob - how much history of user to consider during prediction

# month_top = calc_top(df, 'month')

# # rename
# month_top.columns = month_top.columns.map(lambda x : 'mon_' + str(x) + '_top' if x !='GA_fullVisitorId' else x)

# month_top

### Combine all features in 1 dataframe

In [86]:
# every per-user feature frame built above, each keyed on GA_fullVisitorId
inner_join_list = [
    pvs, top, br, cvr, ref, country, device_os,
    t1, t2, pc, ps, t1_top,
    wk_df, wk_top, busi_df, busi_top,
    dow_top, dom_top, hour_top, minute_top,
    target_class,
]

# fold the frames together left to right; the inner join keeps only
# visitors that appear in every single frame
final_df = inner_join_list[0]
for feature_frame in inner_join_list[1:]:
    final_df = final_df.merge(feature_frame, on='GA_fullVisitorId', how='inner')

final_df.shape

(507937, 627)

In [87]:
# clean column names - lower-case snake_case made of letters, digits and underscores only

col_names = []
for raw_name in final_df.columns:
    # spell out '&' and drop possessive "'s" before stripping punctuation
    spelled_out = raw_name.replace("&", "and").replace("'s", "")
    # collapse each run of other characters into one space, then trim the ends
    spaced = re.sub('[^A-Za-z0-9]+', ' ', spelled_out).strip()
    col_names.append(spaced.replace(" ", "_").lower())

final_df.columns = col_names

In [88]:
# distinct per-column null counts; a lone 0 means no column contains nulls
null_counts_per_col = final_df.isna().sum()
print("Any nulls?: ", null_counts_per_col.unique())

final_df.tail()

Any nulls?:  [0]


Unnamed: 0,ga_fullvisitorid,session_pvs_mean,session_pvs_median,session_top_mean,session_top_median,bounce_rate,content_views_rate,rf_content_aggregators,rf_direct,rf_fbia,...,minute_51_top,minute_52_top,minute_53_top,minute_54_top,minute_55_top,minute_56_top,minute_57_top,minute_58_top,minute_59_top,subscription_status
507932,9999935617354687741,1.33,1.0,44.83,63.5,0.67,1.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,non_subscriber
507933,9999969986945834961,1.0,1.0,262.33,143.0,1.0,1.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,non_subscriber
507934,9999974732591335515,1.0,1.0,42.0,42.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,non_subscriber
507935,9999975861633509300,1.0,1.0,60.0,60.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,non_subscriber
507936,9999976993988464070,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,non_subscriber


In [89]:
# the inner joins must not have dropped anyone: both counts should match
n_input_users = len(df['GA_fullVisitorId'].unique())
print("Total users in input df:", n_input_users)

n_train_users = len(final_df['ga_fullvisitorid'].unique())
print("Total users in train df:", n_train_users)

Total users in input df: 507937
Total users in train df: 507937


In [90]:
# GCS destination for the training set
BUCKET_NAME = 'bi-subscription-modeling'
BUCKET_FOLDER = 'train-val-data'

# push the combined feature frame to the bucket as .csv
# (upload_csv_file comes from the gcs_utils star import)
file_name = 'training_data_11192021.csv'
upload_csv_file(final_df, BUCKET_NAME, BUCKET_FOLDER, file_name)

File uploaded to gs://bi-subscription-modeling/train-val-data/training_data_11192021.csv


In [91]:
# push the same frame to the bucket as .pkl
# (upload_pkl_file comes from the gcs_utils star import)
file_name = 'training_data_11192021.pkl'
upload_pkl_file(final_df, BUCKET_NAME, BUCKET_FOLDER, file_name)

File uploaded to gs://bi-subscription-modeling/train-val-data/training_data_11192021.pkl


* --- Cross checking ---

In [92]:
# sanity check: the pickle in the working directory loads back with the expected
# shape and no nulls
# NOTE(review): presumably the upload helper leaves this local copy behind - confirm

check_pkl = pd.read_pickle('training_data_11192021.pkl')

pkl_null_counts = check_pkl.isna().sum()
print(check_pkl.shape)
print(pkl_null_counts.unique())

(507937, 627)
[0]


In [93]:
# sanity check: the csv in the working directory loads back with the expected
# shape and no nulls

# read visitor ids as strings: values such as 10000145548747950113 do not fit in int64
csv_dtypes = {'ga_fullvisitorid': object}
chk_csv = pd.read_csv('training_data_11192021.csv', dtype=csv_dtypes)

csv_null_counts = chk_csv.isna().sum()
print(chk_csv.shape)
print(csv_null_counts.unique())

(507937, 627)
[0]


In [84]:
# just checking col names: bucket the cleaned final_df columns by feature family.
# All selectors work on the cleaned (lower-case, snake_case) column names.

t1_pv_cols =        [col for col in final_df.columns if ('t1_' in col) and ('_pvs' in col)]
t1_top_cols =       [col for col in final_df.columns if ('t1_' in col) and ('_top' in col)]
t2_pv_cols =        [col for col in final_df.columns if 't2_' in col]
pc_cols =           [col for col in final_df.columns if 'pc_' in col]
ps_cols =           [col for col in final_df.columns if 'ps_' in col]
deviceos_cols =     [col for col in final_df.columns if 'dos_' in col]
referral_cols =     [col for col in final_df.columns if 'rf_' in col]
country_cols =      [col for col in final_df.columns if 'ct_' in col]

wk_pv_cols =        [col for col in final_df.columns if ('week' in col) and ('_pvs' in col)]
wk_top_cols =       [col for col in final_df.columns if ('week' in col) and ('_top' in col)]
busi_pv_cols =      [col for col in final_df.columns if ('business_hours' in col) and ('_pvs' in col)]
busi_top_cols =     [col for col in final_df.columns if ('business_hours' in col) and ('_top' in col)]
dom_cols =          [col for col in final_df.columns if 'day_of_mon_' in col]
hour_cols =         [col for col in final_df.columns if 'hour_' in col]
minute_cols =       [col for col in final_df.columns if 'minute_' in col]

# The column-cleaning step lower-cases every name, so the weekday columns are
# 'friday_top', ... in final_df. The previous capitalised spellings
# ('Friday_top', ...) matched no real column.
dow_cols =          ['friday_top', 'monday_top', 'saturday_top', 'sunday_top',
                     'thursday_top', 'tuesday_top', 'wednesday_top']


# remove mis-assigned ones: the substring tests above also catch tier columns
# whose names merely contain 'pc_' / 'ps_' (e.g. '..._relationships_pvs').
# list.remove raises ValueError if a name is absent, which flags upstream renames.
pc_cols.remove('t2_pc_games_pvs')
ps_cols.remove('t1_family_and_relationships_pvs')
ps_cols.remove('t1_family_and_relationships_top')
ps_cols.remove('t2_apprenticeships_pvs')
ps_cols.remove('t2_celebrity_relationships_pvs')
ps_cols.remove('t2_workshops_and_classes_pvs')


categorical_cols = (t1_pv_cols +  t1_top_cols + t2_pv_cols + pc_cols + ps_cols + deviceos_cols + referral_cols + country_cols +
                    wk_pv_cols + wk_top_cols + busi_pv_cols + busi_top_cols + dom_cols + hour_cols + minute_cols  + dow_cols
                   )

print(len(categorical_cols))

numerical_cols = ['session_pvs_mean', 'session_pvs_median', 'session_top_mean', 'session_top_median', 'bounce_rate', 'content_views_rate']

print(len(numerical_cols))

619
6


### Curiosity
* From our subscriber pool, check how many are active

In [10]:
# wall-clock timer for the BigQuery round trip
start_time = time.time()

# Active 'universal' subscriptions from Piano, joined to GA visitor ids.
# The filter on current_date makes the result depend on the day the cell is run.
query_string = """
  WITH eligible_users AS ( 
      
       SELECT 
           DISTINCT 
               user_id_uid, 
               resource_id_rid, 
               start_date,
               status, 
               subscription_trial_end_date
              
    FROM
        `api-project-901373404215.piano.subscriber_details`
    WHERE 
        # Filter for the 'universal' subscriptions only
        resource_id_rid IN UNNEST(['RKPEVDB', 'R8W03AS'])
         # filter for active only
               AND status='active'
               AND total__refunded<1
               AND cast(dt_updated as date)=current_date('America/New_York')
        )
    
    # join user_id_uids with GA's pianoIDs and For each pianoID, get their fullvids. De-duplicate.
     SELECT 
         DISTINCT
            ga_pianoId,
            user_id_uid,
            ga_fullvisitorid,
            resource_id_rid,
            start_date,
            status, 
            subscription_trial_end_date
    FROM
        eligible_users
         INNER JOIN 
         `api-project-901373404215.DataMart.v_DataMart_updated` 
    ON 
        LOWER(ga_pianoId) = LOWER(user_id_uid)
"""

# run the query, wait for completion, and download the rows through the
# BigQuery Storage read client
query_job = bqclient.query(query_string)
raw = query_job.result().to_dataframe(bqstorage_client=bqstorageclient)

elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

print(raw.shape)

raw.head()

--- 17.220924615859985 seconds ---
(85945, 7)


Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,start_date,status,subscription_trial_end_date
0,pnijex9plqjpcu7,PNIjeX9PLqjpcu7,1376004787634939856,R8W03AS,2020-11-12 16:27:33 -0500,active,
1,pnixoandhqs7cc3,PNIxoaNDhqs7cc3,5227854419298338848,R8W03AS,2021-04-26 23:02:21 -0400,active,
2,pnicve24eqqhno9,PNIcVe24eqqhno9,4988943432533366714,R8W03AS,2021-03-24 15:35:14 -0400,active,
3,pni3nsy8sql6zp5,PNI3NSy8Sql6zp5,2679702063997694573,R8W03AS,2020-12-11 15:32:41 -0500,active,
4,pni3jwt4tqsncus,PNI3JWt4tqsncus,2763434062124537469,R8W03AS,2021-09-02 22:49:15 -0400,active,2021-10-02 22:49:15 -0400


In [20]:
# one row per distinct visitor id seen in the subscriber training data
subscriber_ids = subs_data.loc[:, ['GA_fullVisitorId']]
subs_pool = subscriber_ids.drop_duplicates()

print("Our total subs pool", subs_pool.shape)

Our total subs pool (97947, 1)


In [24]:
# keep the pool members that also appear in today's active-subscriber pull
active_status = raw[['ga_fullvisitorid', 'status']]
check = subs_pool.merge(active_status,
                        how="inner",
                        left_on="GA_fullVisitorId",
                        right_on="ga_fullvisitorid")

n_active = len(check.ga_fullvisitorid.unique())
print("Total active subscribers among subs pool", n_active)

# share of the pool that is currently active (a 0-1 fraction, rounded to 2 dp)
n_pool = len(subs_pool.GA_fullVisitorId.unique())
print("% of active subs", round(n_active/n_pool, 2))

Total active subscribers among subs pool 79691
% of active subs 0.81


### Extra features
* May be used in v2

In [None]:
# Candidate v2 features. Each section builds one per-visitor frame keyed on
# GA_fullVisitorId. Only the last frame is displayed; the bare mid-cell
# expressions of the earlier draft displayed nothing and were dropped.

# --- Tier 2 - Avg. time on page ---

# per visitor and tier2 value: summed time on page and summed page views
t2_df = df.pivot_table(index=['GA_fullVisitorId'],
                     columns='tier2',
                     values=['timeOnPage', 'GA_pageViews'],
                     aggfunc='sum',
                     fill_value=0).reset_index()
# set aside fvids
fvids = list(t2_df.GA_fullVisitorId)
# calc avg. top = summed time / summed page views, column by column
t2_top = t2_df["timeOnPage"]/t2_df["GA_pageViews"]
# 0/0 (tier2 never visited) yields NaN -> treat as 0
# NOTE(review): time > 0 with 0 page views would yield inf and is not handled - confirm it cannot occur
t2_top = t2_top.fillna(0)
t2_top["GA_fullVisitorId"] = fvids
# rename
t2_top.columns = t2_top.columns.map(lambda x : 't2_'+ str(x) + '_top' if x !='GA_fullVisitorId' else x)

# --- Day of week - Percentage of pvs ---
dow = calc_perc_pvs(df, 'dayofweek')
dow.columns = dow.columns.map(lambda x : 'day_'+str(x)+'_pvs' if x !='GA_fullVisitorId' else x)

# --- Hourly - Percentage of pvs ---
hour = calc_perc_pvs(df, 'hour')
hour.columns = hour.columns.map(lambda x : 'hour_'+str(x)+'_pvs' if x !='GA_fullVisitorId' else x)

# --- Day of month - Percentage of pvs ---
day_of_mon = calc_perc_pvs(df, 'day')
day_of_mon.columns = day_of_mon.columns.map(lambda x : 'day_of_mon_'+str(x)+'_pvs' if x !='GA_fullVisitorId' else x)

# --- Minute of hour - Percentage of pvs ---
minute = calc_perc_pvs(df, 'minute')
# '_pvs' suffix added so these follow the same naming scheme as the other
# percentage-of-pvs features and stay distinguishable from 'minute_<m>_top'
minute.columns = minute.columns.map(lambda x : 'minute_'+str(x)+'_pvs' if x !='GA_fullVisitorId' else x)

# --- Month of year -Percentage of pvs ---
mon = calc_perc_pvs(df, 'month')
mon.columns = mon.columns.map(lambda x : 'mon_'+str(x)+'_pvs' if x !='GA_fullVisitorId' else x)
mon

* **Feature: Avg. Monthly article views**
    * Minor Concern: 
        * During prediction, suppose Today's date: 2021-11-18. And person XYZ is eligible
        
        * If we are considering past 90 days history of XYZ from today's date, SQL will pull data from 2021-8-18 till 2021-11-18
        * This way, all of the user's pvs for Sep and Oct are captured. Example: sum(pvs in whole Sep) = say 10 and sum(pvs in whole Oct) = say 10 will be computed
        * But for Aug, pvs are not counted for the whole month, only from 2021-08-18 till 2021-08-31 = say 1 (which under-represents that month). Suppose the user's actual total pvs in Aug = 5
        * So the monthly avg pvs for the user, (10+10+1)/3, comes down to 7 instead of the actual (10+10+5)/3 = 8.3

In [None]:
# Avg. monthly article views per visitor.
content = df.copy()

# extract the start of natid string (the part before the first '/')
content["natid_start"] = content.GA_cmsNaturalId.str.split("/").str[0]

print("Shape before: ", content.shape)

# keep only blogs, slides, magazine data
# na=False: rows without a natural id count as "not an article" instead of
# putting NaN into the boolean mask
is_article = content['natid_start'].str.contains('blogandpostid|blogandslideid|magazine', na=False)
# explicit copy so the column assignments below write to an independent frame
content = content[is_article].copy()

print("Shape after: ", content.shape)

# get month-year
content["GA_date"] = pd.to_datetime(content["GA_date"])
content["mon_year"] = content['GA_date'].dt.to_period('M')

# for each user calc - how many pageviews read each month they came

user_per_mon = (
    content.groupby(['GA_fullVisitorId', 'mon_year'])['GA_pageViews']
    .sum()
    .reset_index()
    .rename(columns={'GA_pageViews': 'total_article_views'})
)

# average above df at user level. Interpret: user "9999935617354687741" reads on avg 2 articles per month

per_mon = (
    user_per_mon.groupby('GA_fullVisitorId')['total_article_views']
    .mean()
    .reset_index()
)

# Round the monthly average to whole articles. The earlier draft assigned the
# rounded values to `per_mon.GA_cmsNaturalId`, which is not a column of per_mon,
# so pandas stored them as a plain attribute and the column stayed unrounded.
per_mon["total_article_views"] = per_mon["total_article_views"].round()

per_mon


# note - there are fewer fvids here because some users who visited didn't read actual article content, only other pages