## Description:

This file contains:
* Reading in the raw input data from SQL (BigQuery) tables. An exhaustive list of the features contained in this raw input data is given below.

In [1]:
'''Helper'''
import pandas as pd
from functools import reduce
import numpy as np
import joblib
import datetime
import time
import re

'''GCS Utils'''
from gcs_utils import *

'''Display'''
import warnings
warnings.filterwarnings('ignore') 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

'''BQ'''
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

# BigQuery client used to submit the SQL queries in the cells below
bqclient = bigquery.Client()
# BigQuery Storage read client, handed to to_dataframe() when downloading query results
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [2]:
def convert_time(time):
    """Format a POSIX timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    Parameters
    ----------
    time : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    str
        The UTC wall-clock time, formatted as '%Y-%m-%d %H:%M:%S'.

    Notes
    -----
    The conversion is pinned to UTC on purpose: the notebook later labels the
    parsed value as UTC (tz_localize('UTC')), so using the machine's local
    time zone here would shift every hour-based feature on any kernel that
    is not itself running in UTC.
    """
    utc_moment = datetime.datetime.fromtimestamp(time, tz=datetime.timezone.utc)
    return utc_moment.strftime('%Y-%m-%d %H:%M:%S')

### Data from SQL Tables

In [3]:
# Download every row of the subscriber GA table, timing the round trip
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv1_subs_ga`
"""

query_job = bqclient.query(query_string)
subs_data = query_job.result().to_dataframe(bqstorage_client=bqstorageclient)

print("--- %s seconds ---" % (time.time() - start_time))

# every row coming from this table is labelled as a subscriber
subs_data["subscription_status"] = "subscriber"

# remove the two id columns that are not used downstream, then rename the piano id
subs_data = subs_data.drop(columns=['user_id_uid', 'resource_id_rid'])
subs_data = subs_data.rename(columns={'ga_pianoId': 'piano_id'})

print("Shape: ", subs_data.shape)

--- 10.005239963531494 seconds ---
Shape:  (5251696, 23)


In [4]:
# Download every row of the non-subscriber GA table, timing the round trip
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv3_nonsubs_ga`
"""

query_job = bqclient.query(query_string)
nonsubs_data = query_job.result().to_dataframe(bqstorage_client=bqstorageclient)

print("--- %s seconds ---" % (time.time() - start_time))

# every row coming from this table is labelled as a non-subscriber
nonsubs_data["subscription_status"] = "non_subscriber"

print("Before:", nonsubs_data.shape)

# Some visitors have no pianoID although their dfpZone contains 'subscriber'.
# They are treated as suspicious and ALL of their rows are removed.
nonsubs_data["GA_dfpNewZone"] = nonsubs_data["GA_dfpNewZone"].fillna('none')

in_subscriber_zone = nonsubs_data["GA_dfpNewZone"].str.contains('/subscriber/')
suspicious_fvid = nonsubs_data.loc[in_subscriber_zone, "GA_fullVisitorId"].unique()

is_suspicious = nonsubs_data["GA_fullVisitorId"].isin(suspicious_fvid)
nonsubs_data = nonsubs_data[~is_suspicious]

print("After removing ", len(suspicious_fvid), "suspicious fvids:", nonsubs_data.shape)

--- 11.994099617004395 seconds ---
Before: (6014971, 23)
After removing  395 suspicious fvids: (5938101, 23)


In [5]:
# Stack subscriber and non-subscriber rows into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the two
# inputs' row labels are kept as-is, so the combined frame carries repeated
# labels, which makes any later label-based alignment ambiguous.
df = pd.concat([subs_data, nonsubs_data], ignore_index=True)

print("Shape: ", df.shape, "\n")

print("Total fvids in df: ", len(df.GA_fullVisitorId.unique()), "\n")

# one row per (visitor, status) pair, then count visitors in each class
print(df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts(), "\n")

Shape:  (11189797, 23) 

Total fvids in df:  506500 

non_subscriber    409605
subscriber         96895
Name: subscription_status, dtype: int64 



In [6]:
# number of missing values in each column of the combined frame (before imputation)
df.isna().sum()

piano_id                    5938101
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth               836011
timeOnPage                   579132
GA_cmsNaturalId                   4
title                       1828555
publish_date                1828552
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 4
GA_primarySection                 4
tier1                       3252305
tier2                       3806583
subscription_status               0
dtype: int64

In [8]:
# sanity check - >3pv in dec '21

# per visitor (and class): total page views and total time on page
whole = (
    df.groupby(['GA_fullVisitorId', 'subscription_status'])
    .agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'})
    .reset_index()
    .rename(columns={'GA_pageViews': 'sum_pvs'})
)

# average time on page per page view for each visitor
whole["avg_top"] = whole['timeOnPage'] / whole['sum_pvs']

# distribution of total page views per class; the first describe() row (count) is dropped
(
    whole.groupby('subscription_status')
    .sum_pvs
    .describe()
    .T
    .iloc[1:]
)

subscription_status,non_subscriber,subscriber
mean,14.5,54.2
std,22.84,179.55
min,4.0,1.0
25%,5.0,7.0
50%,8.0,19.0
75%,16.0,47.0
max,5261.0,18024.0


* **Imputation**
    * time on page = 0
    * natid = none
    * Tier1, Tier2 missing/empty = other
    * Primary Channel (PC), Primary Section (PS), Country and Device OS: only shortlisted values are kept; all remaining values = other

In [9]:
# missing natural ids become the literal string 'none'
df["GA_cmsNaturalId"] = df["GA_cmsNaturalId"].fillna('none')

# missing time on page is counted as 0
df["timeOnPage"] = df["timeOnPage"].fillna(0)

In [10]:
# Primary channels that keep their own category. NOTE - 'none' is deliberately not
# in this list, so it is bucketed as "other" together with every unlisted channel.
shortlisted_channel = ['business', 'leadership', 'money', 'innovation', 'lifestyle', 'home', 
                       'billionaires', 'small business', 'consumer', 'shopping', 'industry', 'investing', 
                       'tech', 'entrepreneurs', 'newsletters', 'asia', 'opinion', 'real estate', 'lists', 
                       'forbes finds', 'under 30', 'advisor', 'video']

# Primary sections that keep their own category. NOTE - 'none' is deliberately not
# in this list, so it is bucketed as "other" together with every unlisted section.
shortlisted_section = ['careers', 'personal finance', 'markets', 'forbeswomen', 'leadership strategy', 'healthcare', 'travel', 'sportsmoney', 'retail', 'entrepreneurs', 'science', 
                        'taxes', 'policy', 'consumer tech', 'investing', 'retirement', 'education', 'cmo network', 'real estate', 'hollywood & entertainment', 'cybersecurity', 
                        'aerospace & defense', 'diversity & inclusion', 'energy', 'food & drink', 'enterprise & cloud', 'enterprise tech', 'transportation', 'crypto & blockchain', 'games', 
                        'money & politics', 'media', 'fintech', 'venture capital', 'forbeslife', 'vices', 'manufacturing', 'small business strategy', 'hedge funds & private equity', 
                        'arts', 'ai', 'cio network', 'cars & bikes', 'banking & insurance', 'cfo network', 'spirits', 'cloud', 'dining', 'confirmation', 'wealth management']

# Device operating systems that keep their own category; all others become "other"
shortlisted_os = ["android", "ios", "macintosh", "windows"]

# Countries that keep their own category; all others become "other"
shortlisted_country = ['united states', 'russia', 'canada', 'united kingdom', 'japan', 'australia', 'india', 'singapore', 'germany', 'philippines']

In [11]:
# Tier columns: blank / whitespace-only strings and NULLs both become "other"
for tier_col in ("tier1", "tier2"):
    df[tier_col] = df[tier_col].replace(r'^\s*$', "other", regex=True).fillna("other")

# For each of these columns keep only its shortlisted values;
# anything else (NULLs included) is bucketed as "other"
shortlist_by_column = {
    "GA_primaryChannel": shortlisted_channel,
    "GA_primarySection": shortlisted_section,
    "GA_deviceOperatingSystem": shortlisted_os,
    "GA_country": shortlisted_country,
}

for col_name, allowed_values in shortlist_by_column.items():
    df[col_name] = np.where(df[col_name].isin(allowed_values), df[col_name], "other")

In [12]:
# after imputation: number of missing values remaining in each column

df.isna().sum()

piano_id                    5938101
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                     0
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth               836011
timeOnPage                        0
GA_cmsNaturalId                   0
title                       1828555
publish_date                1828552
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                             0
tier2                             0
subscription_status               0
dtype: int64

In [13]:
# setting aside target class: one row per distinct (visitor, subscription status) pair

label_columns = ['GA_fullVisitorId', 'subscription_status']
target_class = df.loc[:, label_columns].drop_duplicates()

### Training Data - Features

**Numerical Features**
1. User metrics
    * unique pageviews per session (avg, median)
    * timeOnPage per session (avg, median) <br><br>
2. Session metrics
    * bounceRate  <br><br>
3. Content metrics
    * contentViewsRate (contentViews / pageViews) <br>

**Categorical Features**
1. Content categories 
    * Sum pvs - Tier1, Tier2, Primary Channel, Primary Section
    * Avg. top - Tier1 <br><br>
2. Timestamp features: 

    * Weekday vs Weekend 
        * Sum pvs & avg. top
    * Business Hours vs Non-Business Hours *(Differentiated by US and other countries)*
        * Sum pvs & avg. top
    * Day of week
        * avg. top
    * Day of month
        * avg. top
    * Hourly *(Differentiated by US and other countries)*
        * avg. top
    * Minute *(Differentiated by US and other countries)* -- NOT DOING
        * avg. top
    * Month?? -- check with rob: may not be available at prediction time -- NOT DOING<br><br>
3. Device OS - Sum pvs
4. Referral source - Sum pvs
5. Countries - Sum pvs<br><br>

NOTE: For now, sum is kept as the aggregation metric for pvs in categorical features, so that different treatments (e.g. percentage of pvs, avg. pvs, or one-hot encoding) can be tried during modeling.
<br>

**Extra features for v2:**
* Avg. monthly article views 
* Avg. top per person - Tier2
* Timestamp features: (Percentage of pvs) -- 
    * Hourly
    * Day of week
    * Month
    * Day of month
    * Minute  

In [14]:
# Page level: one row per (visitor, session start, page path),
# keeping the max page-view value and the summed time on page
page = (
    df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath'])
    .agg({'GA_pageViews': 'max', 'timeOnPage': 'sum'})
    .reset_index()
)

# Session level: one row per (visitor, session start),
# with page views summed over pages and time on page averaged over pages
session = (
    page.groupby(['GA_fullVisitorId', 'GA_visitStartTime'])
    .agg(session_pvs=('GA_pageViews', 'sum'), session_top=('timeOnPage', 'mean'))
    .reset_index()
)

* **Feature 1: Users' Unique Pageviews in each session (avg, median)**

In [15]:
# mean and median of the per-session page views, one row per visitor;
# named aggregation yields the final column names directly
pvs = (
    session.groupby('GA_fullVisitorId')
    .agg(
        session_pvs_mean=('session_pvs', 'mean'),
        session_pvs_median=('session_pvs', 'median'),
    )
    .reset_index()
)
pvs

Unnamed: 0,GA_fullVisitorId,session_pvs_mean,session_pvs_median
0,10000016509294234976,1.00,1.00
1,10000106390586747955,1.00,1.00
2,10000168040775107380,1.04,1.00
3,1000020157047685619,1.13,1.00
4,1000023912379635020,1.00,1.00
...,...,...,...
506495,999959015144227694,1.00,1.00
506496,999963101136943011,1.27,1.00
506497,9999887933023878246,1.00,1.00
506498,9999891587854811928,1.00,1.00


In [16]:
# sanity check -- orig

# pd.merge(pvs, target_class, how="left", on="GA_fullVisitorId").groupby('subscription_status').describe().T

* **Feature 2:  Users' Time on Page in each session (avg, median)**

In [17]:
# mean and median of the per-session time on page, one row per visitor;
# named aggregation yields the final column names directly
top = (
    session.groupby('GA_fullVisitorId')
    .agg(
        session_top_mean=('session_top', 'mean'),
        session_top_median=('session_top', 'median'),
    )
    .reset_index()
)

top

Unnamed: 0,GA_fullVisitorId,session_top_mean,session_top_median
0,10000016509294234976,137.80,81.00
1,10000106390586747955,9.75,0.00
2,10000168040775107380,66.90,0.00
3,1000020157047685619,93.57,43.00
4,1000023912379635020,88.50,48.00
...,...,...,...
506495,999959015144227694,146.00,96.00
506496,999963101136943011,82.16,7.00
506497,9999887933023878246,18.90,20.50
506498,9999891587854811928,131.57,26.00


In [18]:
def sum_pvs(input_df, cat_col_name):
    """Total page views per visitor, spread across the values of one categorical column.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain 'GA_fullVisitorId', 'GA_pageViews' and `cat_col_name`.
    cat_col_name : str
        Column whose distinct values become the output columns.

    Returns
    -------
    pandas.DataFrame
        One row per visitor: 'GA_fullVisitorId' followed by one column per
        category holding the summed 'GA_pageViews'. Visitor/category
        combinations with no rows are reported as 0.
    """
    per_visitor = input_df.pivot_table(
        index='GA_fullVisitorId',
        columns=cat_col_name,
        values='GA_pageViews',
        aggfunc='sum',
    )

    # bring the visitor id back as a regular column and zero-fill the gaps
    return per_visitor.reset_index().fillna(0)

In [19]:
def calc_top(input_df, cat_col_name):
    """Average time on page per page view for each visitor, split by one categorical column.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain 'GA_fullVisitorId', 'GA_pageViews', 'timeOnPage'
        and `cat_col_name`.
    cat_col_name : str
        Column whose distinct values become the output columns.

    Returns
    -------
    pandas.DataFrame
        One row per visitor: one column per category holding
        sum(timeOnPage) / sum(GA_pageViews), followed by 'GA_fullVisitorId'
        as the LAST column. Any combination where the average is undefined
        (no rows for the visitor in that category, or zero summed page views)
        is reported as 0.
    """
    # per visitor and category: summed time on page and summed page views
    totals = pd.pivot_table(
        input_df,
        values=['timeOnPage', 'GA_pageViews'],
        columns=cat_col_name,
        index='GA_fullVisitorId',
        aggfunc='sum')

    # set aside fvids (pivot index order) so they can be re-attached after the division
    fvids = list(totals.index)

    # calc avg. top; positional index so the id column can be appended by position
    df_top = (totals["timeOnPage"] / totals["GA_pageViews"]).reset_index(drop=True)

    # A positive time with zero page views divides to +/-inf, which fillna alone
    # would let through; treat it like every other undefined average.
    df_top = df_top.replace([np.inf, -np.inf], np.nan).fillna(0)

    # join fvids
    df_top["GA_fullVisitorId"] = fvids

    return df_top

* **Feature 3: Referral sources - sum pvs**

In [20]:
# per visitor: summed page views in each referral group
ref = sum_pvs(df, 'GA_referralGroup')

# prefix every referral-group column with 'rf_'; the visitor-id key keeps its name
# NOTE(review): the displayed output includes a column built from a JavaScript error
# string ("typeerror: cannot read property 'match' of null"), i.e. GA_referralGroup
# is not cleaned before pivoting - consider bucketing such values upstream.
ref = ref.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 'rf_' + col)

ref

GA_referralGroup,GA_fullVisitorId,rf_content aggregators,rf_direct,rf_fbia,rf_newsletter,rf_organic search,rf_organic social (dark),rf_organic social (forbes),rf_paid display,rf_paid search,rf_paid social (dark),rf_paid social (forbes),rf_paid web,rf_push notification,rf_referral,rf_typeerror: cannot read property 'match' of null
0,10000016509294234976,0.00,0.00,0.00,0.00,5.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000106390586747955,0.00,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000168040775107380,0.00,0.00,0.00,0.00,61.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,1000020157047685619,46.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,1000023912379635020,0.00,0.00,0.00,0.00,6.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,999959015144227694,6.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506496,999963101136943011,0.00,0.00,0.00,0.00,22.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506497,9999887933023878246,10.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506498,9999891587854811928,0.00,0.00,0.00,0.00,7.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 4: Country - sum pvs**

In [21]:
# countries listed for subscribers and for non-subscribers respectively
subs_top_ct = ['united states', 'russia', 'canada', 'united kingdom', 'japan']
nonsubs_top_ct = ['australia', 'india', 'singapore', 'germany', 'philippines']

# combined shortlist: subscriber countries first, then non-subscriber countries
shortlisted_countries = [*subs_top_ct, *nonsubs_top_ct]

In [22]:
# per visitor: summed page views in each (already shortlisted) country
country = sum_pvs(df, 'GA_country')

# prefix every country column with 'ct_'; the visitor-id key keeps its name
country = country.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 'ct_' + col)
country

GA_country,GA_fullVisitorId,ct_australia,ct_canada,ct_germany,ct_india,ct_japan,ct_other,ct_philippines,ct_russia,ct_singapore,ct_united kingdom,ct_united states
0,10000016509294234976,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,5.00
1,10000106390586747955,0.00,0.00,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00
2,10000168040775107380,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,61.00
3,1000020157047685619,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,46.00
4,1000023912379635020,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...
506495,999959015144227694,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,6.00
506496,999963101136943011,0.00,0.00,0.00,0.00,0.00,18.00,0.00,0.00,0.00,0.00,4.00
506497,9999887933023878246,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10.00
506498,9999891587854811928,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,7.00


* **Feature 5: Device OS - sum pvs**

In [23]:
# per visitor: summed page views on each (already shortlisted) device operating system

device_os = sum_pvs(df, 'GA_deviceOperatingSystem')

# prefix every OS column with 'dos_'; the visitor-id key keeps its name
device_os = device_os.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 'dos_' + col)

device_os

GA_deviceOperatingSystem,GA_fullVisitorId,dos_android,dos_ios,dos_macintosh,dos_other,dos_windows
0,10000016509294234976,0.00,5.00,0.00,0.00,0.00
1,10000106390586747955,0.00,4.00,0.00,0.00,0.00
2,10000168040775107380,61.00,0.00,0.00,0.00,0.00
3,1000020157047685619,46.00,0.00,0.00,0.00,0.00
4,1000023912379635020,0.00,6.00,0.00,0.00,0.00
...,...,...,...,...,...,...
506495,999959015144227694,0.00,6.00,0.00,0.00,0.00
506496,999963101136943011,0.00,22.00,0.00,0.00,0.00
506497,9999887933023878246,10.00,0.00,0.00,0.00,0.00
506498,9999891587854811928,0.00,7.00,0.00,0.00,0.00


### Content categories
* Content categories = IAB Tier 1, Tier 2, PC (shortlisted), PS (shortlisted)

* **Feature 6: Tier 1 - sum pvs**

In [24]:
print("Unique Tier 1: ", df["tier1"].nunique(dropna=False))

# per visitor: summed page views in each tier1 category
# (NULL / blank tier1 values were mapped to "other" during imputation)
t1 = sum_pvs(df, 'tier1')

# wrap every tier1 column as 't1_<name>_pvs'; the visitor-id key keeps its name
t1 = t1.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 't1_' + str(col) + '_pvs')
t1

Unique Tier 1:  31


tier1,GA_fullVisitorId,t1_Automotive_pvs,t1_Books and Literature_pvs,t1_Business and Finance_pvs,t1_Careers_pvs,t1_Content Channel_pvs,t1_Education_pvs,t1_Events and Attractions_pvs,t1_Family and Relationships_pvs,t1_Fine Art_pvs,...,t1_Religion & Spirituality_pvs,t1_Science_pvs,t1_Shopping_pvs,t1_Sports_pvs,t1_Style & Fashion_pvs,t1_Technology & Computing_pvs,t1_Television_pvs,t1_Travel_pvs,t1_Video Gaming_pvs,t1_other_pvs
0,10000016509294234976,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,4.00,0.00,0.00,0.00,1.00
1,10000106390586747955,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4.00
2,10000168040775107380,0.00,0.00,24.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,22.00
3,1000020157047685619,1.00,0.00,3.00,2.00,0.00,0.00,0.00,0.00,0.00,...,0.00,2.00,6.00,0.00,1.00,15.00,0.00,0.00,0.00,2.00
4,1000023912379635020,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,5.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,999959015144227694,0.00,0.00,3.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,2.00,0.00,0.00,0.00,1.00
506496,999963101136943011,0.00,0.00,5.00,2.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.00,5.00
506497,9999887933023878246,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,7.00,0.00,0.00,0.00,1.00
506498,9999891587854811928,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00


* **Feature 7: Tier 2 - sum pvs**

In [25]:
print("Unique Tier 2: ", df["tier2"].nunique(dropna=False))

# per visitor: summed page views in each tier2 category
# (NULL / blank tier2 values were mapped to "other" during imputation)
t2 = sum_pvs(df, 'tier2')

# wrap every tier2 column as 't2_<name>_pvs'; the visitor-id key keeps its name
t2 = t2.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 't2_' + str(col) + '_pvs')
t2

Unique Tier 2:  324


tier2,GA_fullVisitorId,t2_Action and Adventure Movies_pvs,t2_Adult Contemporary Music_pvs,t2_Adult Education_pvs,t2_Alcoholic Beverages_pvs,t2_Alternative Music_pvs,t2_American Football_pvs,t2_Amusement and Theme Parks_pvs,t2_Animation Movies_pvs,t2_Animation TV_pvs,...,t2_Women's Fashion_pvs,t2_Women's Health_pvs,t2_Workshops and Classes_pvs,t2_World Cuisines_pvs,t2_World Movies_pvs,t2_Wrestling_pvs,t2_Young Adult Literature_pvs,t2_Zoos & Aquariums_pvs,t2_eSports_pvs,t2_other_pvs
0,10000016509294234976,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
1,10000106390586747955,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4.00
2,10000168040775107380,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,22.00
3,1000020157047685619,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4.00
4,1000023912379635020,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,5.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,999959015144227694,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
506496,999963101136943011,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,5.00
506497,9999887933023878246,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00
506498,9999891587854811928,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 8: Prim Channel- sum pvs**

In [26]:
# per visitor: summed page views in each (already shortlisted) primary channel
pc = sum_pvs(df, 'GA_primaryChannel')

# wrap every channel column as 'pc_<name>_pvs'; the visitor-id key keeps its name
pc = pc.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 'pc_' + str(col) + '_pvs')
pc

GA_primaryChannel,GA_fullVisitorId,pc_advisor_pvs,pc_asia_pvs,pc_billionaires_pvs,pc_business_pvs,pc_consumer_pvs,pc_entrepreneurs_pvs,pc_forbes finds_pvs,pc_home_pvs,pc_industry_pvs,...,pc_money_pvs,pc_newsletters_pvs,pc_opinion_pvs,pc_other_pvs,pc_real estate_pvs,pc_shopping_pvs,pc_small business_pvs,pc_tech_pvs,pc_under 30_pvs,pc_video_pvs
0,10000016509294234976,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000106390586747955,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000168040775107380,0.00,0.00,0.00,3.00,0.00,0.00,0.00,0.00,0.00,...,58.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,1000020157047685619,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00,0.00,...,6.00,0.00,0.00,0.00,1.00,8.00,0.00,0.00,0.00,0.00
4,1000023912379635020,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,999959015144227694,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,3.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506496,999963101136943011,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,...,17.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506497,9999887933023878246,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506498,9999891587854811928,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 9: Prim Section - sum pvs**

In [27]:
# per visitor: summed page views in each (already shortlisted) primary section
ps = sum_pvs(df, 'GA_primarySection')

# wrap every section column as 'ps_<name>_pvs'; the visitor-id key keeps its name
ps = ps.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 'ps_' + str(col) + '_pvs')
ps

GA_primarySection,GA_fullVisitorId,ps_aerospace & defense_pvs,ps_ai_pvs,ps_arts_pvs,ps_banking & insurance_pvs,ps_careers_pvs,ps_cars & bikes_pvs,ps_cfo network_pvs,ps_cio network_pvs,ps_cloud_pvs,...,ps_science_pvs,ps_small business strategy_pvs,ps_spirits_pvs,ps_sportsmoney_pvs,ps_taxes_pvs,ps_transportation_pvs,ps_travel_pvs,ps_venture capital_pvs,ps_vices_pvs,ps_wealth management_pvs
0,10000016509294234976,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,10000106390586747955,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,10000168040775107380,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,1000020157047685619,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,4.00,0.00,0.00,0.00,1.00,0.00,1.00,1.00,0.00,0.00
4,1000023912379635020,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,5.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,999959015144227694,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506496,999963101136943011,0.00,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
506497,9999887933023878246,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00
506498,9999891587854811928,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


* **Feature 10: Tier 1 - Avg. time on page**

In [28]:
# per visitor: average time on page in each tier1 category
t1_top = calc_top(df, 'tier1')

# wrap every tier1 column as 't1_<name>_top'; the visitor-id key keeps its name
t1_top = t1_top.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else 't1_' + str(col) + '_top')

t1_top

tier1,t1_Automotive_top,t1_Books and Literature_top,t1_Business and Finance_top,t1_Careers_top,t1_Content Channel_top,t1_Education_top,t1_Events and Attractions_top,t1_Family and Relationships_top,t1_Fine Art_top,t1_Food & Drink_top,...,t1_Science_top,t1_Shopping_top,t1_Sports_top,t1_Style & Fashion_top,t1_Technology & Computing_top,t1_Television_top,t1_Travel_top,t1_Video Gaming_top,t1_other_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,152.00,0.00,0.00,0.00,81.00,10000016509294234976
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9.75,10000106390586747955
2,0.00,0.00,16.96,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,61.50,10000168040775107380
3,2.00,0.00,306.67,57.00,0.00,0.00,0.00,0.00,0.00,29.00,...,18.50,64.83,0.00,26.00,82.80,0.00,0.00,0.00,106.50,1000020157047685619
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,36.00,0.00,0.00,0.00,0.00,0.00,0.00,63.60,1000023912379635020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,0.00,0.00,64.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,196.00,0.00,0.00,0.00,0.00,999959015144227694
506496,0.00,0.00,3.40,682.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,130.00,47.80,999963101136943011
506497,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,24.00,0.00,23.57,0.00,0.00,0.00,0.00,9999887933023878246
506498,88.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,607.00,0.00,0.00,0.00,9999891587854811928


* **Feature 11: Bounce rate** 

In [29]:
def b_rate(g):
    """Bounce rate of one visitor: the fraction of sessions that have exactly one page view.

    Parameters
    ----------
    g : pandas.DataFrame
        The sessions of a single visitor; must contain a 'session_pvs' column.

    Returns
    -------
    float
        (number of rows with session_pvs == 1) / (total number of rows).
    """
    is_single_pv = g['session_pvs'] == 1
    bounced_sessions = int(is_single_pv.sum())
    return bounced_sessions / len(g)

# one bounce rate per visitor; b_rate is called once per visitor group (takes 3mins)
bounce_by_visitor = session.groupby('GA_fullVisitorId').apply(b_rate)
br = pd.DataFrame(bounce_by_visitor).reset_index().rename(columns={0: 'bounce_rate'})
br

Unnamed: 0,GA_fullVisitorId,bounce_rate
0,10000016509294234976,1.00
1,10000106390586747955,1.00
2,10000168040775107380,0.96
3,1000020157047685619,0.87
4,1000023912379635020,1.00
...,...,...
506495,999959015144227694,1.00
506496,999963101136943011,0.80
506497,9999887933023878246,1.00
506498,9999891587854811928,1.00


* **Feature 12: Content views rate**

In [30]:
# one (natural id, publish date) per page path: where a path has several rows,
# the one with the most recent publish_date is kept
natid_page_map = (
    df[['GA_pagePath', 'GA_cmsNaturalId', 'publish_date']]
    .sort_values('publish_date', ascending=False)
    .drop_duplicates('GA_pagePath')
)

# attach the natural id (and publish date) to every page-level row
page = page.merge(natid_page_map, how="left", on="GA_pagePath")

In [31]:
def c_views_rate(g):
    """Content views rate of one visitor: the share of page views that land on content pages.

    A row counts as content when its 'GA_cmsNaturalId' contains one of
    "blogandpostid", "blogandslideid", "galleryid" or "video"; everything else
    (home page, channel/section landing pages, author pages, etc.) is non-content.

    Parameters
    ----------
    g : pandas.DataFrame
        Page-level rows of a single visitor; must contain 'GA_cmsNaturalId'
        and 'GA_pageViews'.

    Returns
    -------
    float
        sum(GA_pageViews on content rows) / sum(GA_pageViews on all rows).
    """
    # na=False: a missing natural id is treated as non-content. Without it the
    # mask would contain NaN and boolean indexing would raise.
    is_content = g.GA_cmsNaturalId.str.contains(
        "blogandpostid|blogandslideid|galleryid|video", na=False
    )

    # sum pvs on actual content for user
    content_sum_pv = g.loc[is_content, 'GA_pageViews'].sum()

    # sum all pvs for user
    total_pv = g.GA_pageViews.sum()

    # calculate ratio
    return content_sum_pv / total_pv


# one content views rate per visitor; c_views_rate is called once per visitor group (takes 5mins)
content_rate_by_visitor = page.groupby('GA_fullVisitorId').apply(c_views_rate)
cvr = pd.DataFrame(content_rate_by_visitor).reset_index().rename(columns={0: 'content_views_rate'})
cvr

Unnamed: 0,GA_fullVisitorId,content_views_rate
0,10000016509294234976,1.00
1,10000106390586747955,1.00
2,10000168040775107380,1.00
3,1000020157047685619,1.00
4,1000023912379635020,1.00
...,...,...
506495,999959015144227694,1.00
506496,999963101136943011,1.00
506497,9999887933023878246,1.00
506498,9999891587854811928,1.00


### Timestamp features

In [32]:
# ---- calendar features derived from the GA date ----
df['GA_date'] = pd.to_datetime(df['GA_date'], errors='coerce')

df['dayofweek'] = df.GA_date.dt.day_name()

week_dict = {True: "weekday", False: "weekend"}

# dt.dayofweek runs 0 (Monday) .. 6 (Sunday); integer division by 5 is 1 only for Sat/Sun
df['weekday'] = ((df.GA_date.dt.dayofweek) // 5 != 1).astype("category")
df['weekday'] = df['weekday'].map(week_dict)

df['day'] = df.GA_date.dt.day
df['month'] = df.GA_date.dt.month

# ---- time-of-day features derived from the session start ----
# GA_visitStartTime is handled as POSIX seconds. It is converted in one
# vectorised call that interprets the epoch value as UTC by construction.
# The previous per-row fromtimestamp() -> string -> to_datetime round trip
# produced the kernel's LOCAL wall-clock time, which was then labelled as UTC
# below, so hour/minute were shifted on any machine not running in UTC.
df['session_time'] = pd.to_datetime(df['GA_visitStartTime'], unit='s', errors='coerce')

df['est_time'] = df['session_time'].dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

# for US, calculate hour from US/Eastern conversion. Else calculate hour from GMT
is_us = df['GA_country'] == "united states"

df['hour'] = np.where(is_us,
                      df.est_time.dt.hour,
                      df.session_time.dt.hour)

df['minute'] = np.where(is_us,
                        df.est_time.dt.minute,
                        df.session_time.dt.minute)

# business hours are 08:00 (inclusive) to 18:00 (exclusive) on the hour computed above
df['business_hours'] = np.where((df['hour'] >= 8) & (df['hour'] < 18), 'business_hours', 'non_business_hours')

* **Feature 13:  Weekend/Weekday - Sum pvs**

In [33]:
# per visitor: summed page views on weekdays vs weekends
wk_df = sum_pvs(df, 'weekday')

# append '_pvs' to every category column; the visitor-id key keeps its name
wk_df = wk_df.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else str(col) + '_pvs')
wk_df

weekday,GA_fullVisitorId,weekend_pvs,weekday_pvs
0,10000016509294234976,3,2
1,10000106390586747955,0,4
2,10000168040775107380,19,42
3,1000020157047685619,18,28
4,1000023912379635020,1,5
...,...,...,...
506495,999959015144227694,2,4
506496,999963101136943011,5,17
506497,9999887933023878246,5,5
506498,9999891587854811928,1,6


* **Feature 14:  Weekend/Weekday - Avg. top**

In [34]:
# per visitor: average time on page on weekdays vs weekends
wk_top = calc_top(df, 'weekday')

# append '_top' to every category column; the visitor-id key keeps its name
wk_top = wk_top.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else str(col) + '_top')

wk_top

weekday,weekend_top,weekday_top,GA_fullVisitorId
0,165.67,96.00,10000016509294234976
1,0.00,9.75,10000106390586747955
2,47.58,67.19,10000168040775107380
3,100.50,117.32,1000020157047685619
4,0.00,70.80,1000023912379635020
...,...,...,...
506495,0.00,146.00,999959015144227694
506496,41.60,105.47,999963101136943011
506497,21.40,16.40,9999887933023878246
506498,607.00,52.33,9999891587854811928


* **Feature 15: Busi vs Non-Busi hours - Sum pvs**

In [35]:
# per visitor: summed page views inside vs outside business hours
busi_df = sum_pvs(df, 'business_hours')

# append '_pvs' to every category column; the visitor-id key keeps its name
busi_df = busi_df.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else str(col) + '_pvs')
busi_df

business_hours,GA_fullVisitorId,business_hours_pvs,non_business_hours_pvs
0,10000016509294234976,3.00,2.00
1,10000106390586747955,2.00,2.00
2,10000168040775107380,29.00,32.00
3,1000020157047685619,13.00,33.00
4,1000023912379635020,2.00,4.00
...,...,...,...
506495,999959015144227694,3.00,3.00
506496,999963101136943011,7.00,15.00
506497,9999887933023878246,5.00,5.00
506498,9999891587854811928,2.00,5.00


* **Feature 16: Busi vs Non-Busi hours - Avg. top**

In [36]:
# per visitor: average time on page inside vs outside business hours
busi_top = calc_top(df, 'business_hours')

# append '_top' to every category column; the visitor-id key keeps its name
busi_top = busi_top.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else str(col) + '_top')

busi_top

business_hours,business_hours_top,non_business_hours_top,GA_fullVisitorId
0,72.00,236.50,10000016509294234976
1,0.00,19.50,10000106390586747955
2,74.34,49.06,10000168040775107380
3,115.15,109.00,1000020157047685619
4,18.00,79.50,1000023912379635020
...,...,...,...
506495,64.00,130.67,999959015144227694
506496,40.43,114.53,999963101136943011
506497,18.80,19.00,9999887933023878246
506498,371.00,35.80,9999891587854811928


* **Feature 14:  Day of week - Avg. top**

In [37]:
# per visitor: average time on page for each day name (Monday .. Sunday)
dow_top = calc_top(df, 'dayofweek')

# append '_top' to every day column; the visitor-id key keeps its name
dow_top = dow_top.rename(columns=lambda col: col if col == 'GA_fullVisitorId' else str(col) + '_top')

dow_top

dayofweek,Friday_top,Monday_top,Saturday_top,Sunday_top,Thursday_top,Tuesday_top,Wednesday_top,GA_fullVisitorId
0,0.00,64.00,0.00,165.67,128.00,0.00,0.00,10000016509294234976
1,0.00,0.00,0.00,0.00,0.00,19.50,0.00,10000106390586747955
2,15.00,6.83,19.78,72.60,17.00,238.10,15.33,10000168040775107380
3,65.00,45.00,36.80,125.00,241.67,49.60,71.40,1000020157047685619
4,79.50,0.00,0.00,0.00,36.00,0.00,0.00,1000023912379635020
...,...,...,...,...,...,...,...,...
506495,196.00,0.00,0.00,0.00,96.00,0.00,0.00,999959015144227694
506496,4.00,26.50,41.60,0.00,53.40,282.40,0.00,999963101136943011
506497,0.00,0.00,0.00,35.67,28.00,8.50,18.50,9999887933023878246
506498,88.00,0.00,607.00,0.00,45.20,0.00,0.00,9999891587854811928


* **Feature 15: Day of month - Avg. top**

In [38]:
# Avg. top feature per day of the month (calc_top is defined elsewhere in
# this notebook); the output has one column per day number.
dom_top = calc_top(df, 'day')

# Wrap each day number as 'day_of_mon_<n>_top'; the visitor id (join key)
# keeps its name.
dom_top.columns = dom_top.columns.map(
    lambda label: label if label == 'GA_fullVisitorId' else 'day_of_mon_' + str(label) + '_top'
)

dom_top

day,day_of_mon_1_top,day_of_mon_2_top,day_of_mon_3_top,day_of_mon_4_top,day_of_mon_5_top,day_of_mon_6_top,day_of_mon_7_top,day_of_mon_8_top,day_of_mon_9_top,day_of_mon_10_top,...,day_of_mon_23_top,day_of_mon_24_top,day_of_mon_25_top,day_of_mon_26_top,day_of_mon_27_top,day_of_mon_28_top,day_of_mon_29_top,day_of_mon_30_top,day_of_mon_31_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,409.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,7.00,0.00,0.00,64.00,128.00,0.00,10000016509294234976
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,10000106390586747955
2,0.00,891.00,0.00,0.00,0.00,0.00,6.00,33.75,29.00,38.00,...,20.00,5.25,0.00,56.18,20.00,0.00,0.00,0.00,0.00,10000168040775107380
3,30.50,71.00,0.00,17.50,96.00,26.00,0.00,93.00,123.00,0.00,...,0.00,57.00,93.00,47.00,0.00,0.00,14.50,56.50,0.00,1000020157047685619
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,36.00,0.00,0.00,0.00,0.00,0.00,0.00,1000023912379635020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,0.00,0.00,196.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,96.00,0.00,0.00,0.00,0.00,0.00,0.00,999959015144227694
506496,0.00,130.00,4.00,41.60,0.00,456.33,0.00,0.00,21.50,0.00,...,9.00,0.00,0.00,0.00,0.00,7.00,0.00,0.00,0.00,999963101136943011
506497,0.00,0.00,0.00,0.00,26.00,0.00,0.00,0.00,28.00,57.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999887933023878246
506498,0.00,0.00,0.00,0.00,0.00,0.00,20.00,0.00,51.50,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9999891587854811928


* **Feature 16: Hourly - Avg. top**
    * Hours are not on a single clock: for example, when country = US, hour 9 means 09:00 EST, while for all non-US countries hour 9 means 09:00 GMT

In [39]:
# Avg. top feature per hour of the day (calc_top is defined elsewhere in
# this notebook); the output has one column per hour, 0-23.
hour_top = calc_top(df, 'hour')

# Wrap each hour number as 'hour_<n>_top'; the visitor id (join key) keeps its name.
hour_top.columns = hour_top.columns.map(
    lambda label: label if label == 'GA_fullVisitorId' else 'hour_' + str(label) + '_top'
)

hour_top

hour,hour_0_top,hour_1_top,hour_2_top,hour_3_top,hour_4_top,hour_5_top,hour_6_top,hour_7_top,hour_8_top,hour_9_top,...,hour_15_top,hour_16_top,hour_17_top,hour_18_top,hour_19_top,hour_20_top,hour_21_top,hour_22_top,hour_23_top,GA_fullVisitorId
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,128.00,0.00,...,7.00,0.00,0.00,64.00,0.00,409.00,0.00,0.00,0.00,10000016509294234976
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,39.00,0.00,10000106390586747955
2,80.86,1.00,0.00,12.00,0.00,0.00,34.00,20.00,1.00,10.86,...,2.00,0.00,0.00,0.00,0.00,52.20,2.00,164.33,15.62,10000168040775107380
3,47.67,71.00,0.00,21.00,0.00,0.00,0.00,0.00,376.50,123.00,...,20.00,200.00,16.00,135.00,0.00,93.00,44.17,43.00,211.67,1000020157047685619
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,36.00,0.00,0.00,0.00,0.00,129.00,0.00,30.00,1000023912379635020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506495,0.00,196.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,999959015144227694
506496,0.00,0.00,0.00,0.00,0.00,21.50,0.00,243.83,7.00,0.00,...,0.00,0.00,0.00,0.00,1.33,0.00,57.00,0.00,94.00,999963101136943011
506497,0.00,0.00,14.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,24.00,0.00,26.00,17.00,9999887933023878246
506498,20.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,135.00,0.00,...,0.00,0.00,0.00,26.00,22.00,88.00,0.00,23.00,0.00,9999891587854811928


* **Feature 17: Minute - Avg. top**

In [40]:
# WONT DO

# minute_top = calc_top(df, 'minute')

# # rename
# minute_top.columns = minute_top.columns.map(lambda x : 'minute_' + str(x) + '_top' if x !='GA_fullVisitorId' else x)

# minute_top

In [41]:
# # FEATURE: Per month - avg. top ---- these features may not be present at prediction time? - discuss
    
# # ask rob - how much history of user to consider during prediction

# month_top = calc_top(df, 'month')

# # rename
# month_top.columns = month_top.columns.map(lambda x : 'mon_' + str(x) + '_top' if x !='GA_fullVisitorId' else x)

# month_top

### Combine all features in 1 dataframe

In [42]:
# Per-visitor feature frames (plus the target) to combine into one table.
inner_join_list = [
    pvs, top, br, cvr, ref, country, device_os,
    t1, t2, pc, ps, t1_top,
    wk_df, wk_top, busi_df, busi_top,
    dow_top, dom_top, hour_top,
    target_class,
]


def _join_on_visitor_id(left, right):
    """Inner-join two frames on the 'GA_fullVisitorId' key."""
    return pd.merge(left, right, on='GA_fullVisitorId', how="inner")


# Fold the list left to right; with inner joins only visitors present in
# every frame are kept.
final_df = reduce(_join_on_visitor_id, inner_join_list)

final_df.shape

(506500, 570)

In [43]:
# Normalise column names to lowercase snake_case: after cleaning, a name
# contains only letters, digits and underscores.
col_names = []
for raw_name in final_df.columns:
    # Spell out '&' and drop possessive "'s" before stripping punctuation.
    spelled_out = raw_name.replace("&", "and").replace("'s", "")
    # Collapse every run of other characters (including '_') into one space,
    # trim the ends, then turn the remaining spaces back into underscores.
    spaced = re.sub('[^A-Za-z0-9]+', ' ', spelled_out).strip()
    col_names.append(spaced.replace(" ", "_").lower())

final_df.columns = col_names

In [44]:
# Count missing values per column and show the distinct counts;
# a lone [0] means no column has any nulls.
null_counts_per_column = final_df.isna().sum()
print("Any nulls?: ", null_counts_per_column.unique())

final_df.tail()

Any nulls?:  [0]


Unnamed: 0,ga_fullvisitorid,session_pvs_mean,session_pvs_median,session_top_mean,session_top_median,bounce_rate,content_views_rate,rf_content_aggregators,rf_direct,rf_fbia,...,hour_15_top,hour_16_top,hour_17_top,hour_18_top,hour_19_top,hour_20_top,hour_21_top,hour_22_top,hour_23_top,subscription_status
506495,999959015144227694,1.0,1.0,146.0,96.0,1.0,1.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,non_subscriber
506496,999963101136943011,1.27,1.0,82.16,7.0,0.8,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.33,0.0,57.0,0.0,94.0,non_subscriber
506497,9999887933023878246,1.0,1.0,18.9,20.5,1.0,1.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,24.0,0.0,26.0,17.0,non_subscriber
506498,9999891587854811928,1.0,1.0,131.57,26.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,26.0,22.0,88.0,0.0,23.0,0.0,non_subscriber
506499,9999996821801111495,1.0,1.0,193.67,132.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,445.0,0.0,0.0,2.0,132.0,0.0,0.0,non_subscriber


In [45]:
# Compare distinct visitor counts before and after feature building; the
# features were combined with inner joins, so a lower second number would
# mean visitors were dropped along the way.
n_users_input = len(df['GA_fullVisitorId'].unique())
print("Total users in input df:", n_users_input)

n_users_train = len(final_df['ga_fullvisitorid'].unique())
print("Total users in train df:", n_users_train)

Total users in input df: 506500
Total users in train df: 506500


* **SANITY CHECK - >3pv filter in dec '21**

In [46]:
# Summary statistics of the per-session page-view features, one column per
# subscription status.
# NOTE(review): .iloc[1:] removes only the very first row (the 'count' row of
# session_pvs_mean); the 'count' row of session_pvs_median is still shown.
(
    final_df
    .groupby('subscription_status')[['session_pvs_mean', 'session_pvs_median']]
    .describe()
    .transpose()
    .iloc[1:]
)

Unnamed: 0,subscription_status,non_subscriber,subscriber
session_pvs_mean,mean,1.07,2.27
session_pvs_mean,std,0.24,4.64
session_pvs_mean,min,1.0,1.0
session_pvs_mean,25%,1.0,1.29
session_pvs_mean,50%,1.0,1.67
session_pvs_mean,75%,1.05,2.38
session_pvs_mean,max,20.83,356.0
session_pvs_median,count,409605.0,96895.0
session_pvs_median,mean,1.03,1.85
session_pvs_median,std,0.22,4.63


In [47]:
# Summary statistics of session_top_mean, one column per subscription status;
# .iloc[1:] skips the leading 'count' row.
(
    final_df
    .groupby('subscription_status')['session_top_mean']
    .describe()
    .transpose()
    .iloc[1:]
)

subscription_status,non_subscriber,subscriber
mean,126.16,165.58
std,231.59,157.38
min,0.0,0.0
25%,26.5,75.93
50%,61.0,129.6
75%,129.19,206.61
max,7840.0,7119.0


In [1]:
# pvs_eda(df, 'GA_referralGroup')

In [None]:
# NOTE(review): this cell has no execution count or output; the same call
# appears again further down with its output. pvs_eda is defined elsewhere.
pvs_eda(df, 'GA_deviceOperatingSystem')

* **SANITY CHECK - >1pv filter in dec '21**

In [44]:
# Summary statistics of the per-session page-view features, one column per
# subscription status.
# NOTE(review): .iloc[1:] removes only the very first row (the 'count' row of
# session_pvs_mean); the 'count' row of session_pvs_median is still shown.
(
    final_df
    .groupby('subscription_status')[['session_pvs_mean', 'session_pvs_median']]
    .describe()
    .transpose()
    .iloc[1:]
)

Unnamed: 0,subscription_status,non_subscriber,subscriber
session_pvs_mean,mean,1.05,2.27
session_pvs_mean,std,0.21,4.64
session_pvs_mean,min,1.0,1.0
session_pvs_mean,25%,1.0,1.29
session_pvs_mean,50%,1.0,1.67
session_pvs_mean,75%,1.0,2.38
session_pvs_mean,max,17.0,356.0
session_pvs_median,count,409924.0,96895.0
session_pvs_median,mean,1.04,1.85
session_pvs_median,std,0.2,4.63


In [45]:
# Summary statistics of session_top_mean, one column per subscription status;
# .iloc[1:] skips the leading 'count' row.
(
    final_df
    .groupby('subscription_status')['session_top_mean']
    .describe()
    .transpose()
    .iloc[1:]
)

subscription_status,non_subscriber,subscriber
mean,131.36,165.58
std,255.68,157.38
min,0.0,0.0
25%,18.22,75.93
50%,50.5,129.6
75%,120.0,206.61
max,4791.0,7119.0


In [46]:
# Mean bounce_rate per subscription status, shown as a one-column table.
(
    final_df
    .groupby('subscription_status')['bounce_rate']
    .mean()
    .to_frame()
    .rename(columns={'bounce_rate': 'avg. bounce_rate'})
)

Unnamed: 0_level_0,avg. bounce_rate
subscription_status,Unnamed: 1_level_1
non_subscriber,0.95
subscriber,0.57


In [47]:
# Mean content_views_rate per subscription status, shown as a one-column table.
(
    final_df
    .groupby('subscription_status')['content_views_rate']
    .mean()
    .to_frame()
    .rename(columns={'content_views_rate': 'avg. content_views_rate'})
)

Unnamed: 0_level_0,avg. content_views_rate
subscription_status,Unnamed: 1_level_1
non_subscriber,1.0
subscriber,0.71


In [49]:
# pvs_eda is defined elsewhere; per the output below it lists, for each
# GA_referralGroup value, the % of page views within each subscription status
# plus a rank column per status.
pvs_eda(df, 'GA_referralGroup')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_referralGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
organic search,50.43,66.06,1,1
newsletter,0.02,8.89,8,2
direct,15.91,8.11,3,3
referral,0.43,7.67,5,4
organic social (dark),1.28,4.32,4,5
organic social (forbes),0.12,2.9,6,6
content aggregators,31.74,1.84,2,7
push notification,0.0,0.1,10,8
paid search,0.05,0.09,7,9
paid web,0.01,0.01,9,10


In [50]:
# pvs_eda is defined elsewhere; per the output below it lists, for each
# GA_deviceOperatingSystem value, the % of page views within each subscription
# status plus a rank column per status.
pvs_eda(df, 'GA_deviceOperatingSystem')

subscription_status,% of non_subscriber pvs,% of subscriber pvs,non_subscriber_rank,subscriber_rank
GA_deviceOperatingSystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
windows,0.05,42.16,4,1
macintosh,0.25,40.89,3,2
ios,42.08,9.09,2,3
android,57.59,7.36,1,4
other,0.03,0.49,5,5


In [51]:
# Destination for the training/validation data; the logged output shows
# gs://<BUCKET_NAME>/<BUCKET_FOLDER>/<file_name>.
BUCKET_NAME = 'bi-subscription-modeling'
BUCKET_FOLDER = 'train-val-data/pv3-filter-train-val'

# Write the combined feature table as a .csv
# (upload_csv_file is presumably provided by gcs_utils - TODO confirm).
file_name = 'pv3_training_data_01062022.csv'
upload_csv_file(final_df, BUCKET_NAME, BUCKET_FOLDER, file_name)

File uploaded to gs://bi-subscription-modeling/train-val-data/pv3-filter-train-val/pv3_training_data_01062022.csv


In [52]:
# Write the same feature table as a .pkl to the same bucket folder; relies on
# BUCKET_NAME / BUCKET_FOLDER set in the .csv upload cell
# (upload_pkl_file is presumably provided by gcs_utils - TODO confirm).
file_name = 'pv3_training_data_01062022.pkl'
upload_pkl_file(final_df, BUCKET_NAME, BUCKET_FOLDER, file_name)

File uploaded to gs://bi-subscription-modeling/train-val-data/pv3-filter-train-val/pv3_training_data_01062022.pkl


In [53]:
# Group the cleaned final_df column names by feature family and count them
# (sanity check on the naming conventions).
# Matching is by substring, so a pattern can also hit columns from another
# family; those known collisions are removed explicitly below.

t1_pv_cols =        [col for col in final_df.columns if 't1_' in col and '_pvs' in col]
t1_top_cols =       [col for col in final_df.columns if 't1_' in col and '_top' in col]
t2_pv_cols =        [col for col in final_df.columns if 't2_' in col]
pc_cols =           [col for col in final_df.columns if 'pc_' in col]
ps_cols =           [col for col in final_df.columns if 'ps_' in col]
deviceos_cols =     [col for col in final_df.columns if 'dos_' in col]
referral_cols =     [col for col in final_df.columns if 'rf_' in col]
country_cols =      [col for col in final_df.columns if 'ct_' in col]

wk_pv_cols =        [col for col in final_df.columns if 'week' in col and '_pvs' in col]
wk_top_cols =       [col for col in final_df.columns if 'week' in col and '_top' in col]
busi_pv_cols =      [col for col in final_df.columns if 'business_hours' in col and '_pvs' in col]
busi_top_cols =     [col for col in final_df.columns if 'business_hours' in col and '_top' in col]
dom_cols =          [col for col in final_df.columns if 'day_of_mon_' in col]
hour_cols =         [col for col in final_df.columns if 'hour_' in col]
# Stays empty while the minute-level feature remains disabled.
minute_cols =       [col for col in final_df.columns if 'minute_' in col]
# The column-name cleaning step lowercases every column, so the weekday
# columns are 'friday_top', ... in final_df (not 'Friday_top').
dow_cols =          [day + '_top' for day in ('friday', 'monday', 'saturday', 'sunday',
                                              'thursday', 'tuesday', 'wednesday')]


# remove mis-assigned ones: t1/t2 columns that merely contain 'pc_' or 'ps_'
# as a substring (e.g. '...relationships_pvs').
# list.remove raises ValueError if a name is absent, which flags naming drift.
pc_cols.remove('t2_pc_games_pvs')
for stray_col in ('t1_family_and_relationships_pvs',
                  't1_family_and_relationships_top',
                  't2_apprenticeships_pvs',
                  't2_celebrity_relationships_pvs',
                  't2_workshops_and_classes_pvs'):
    ps_cols.remove(stray_col)


categorical_cols = (t1_pv_cols +  t1_top_cols + t2_pv_cols + pc_cols + ps_cols + deviceos_cols + referral_cols + country_cols +
                    wk_pv_cols + wk_top_cols + busi_pv_cols + busi_top_cols + dom_cols + hour_cols + minute_cols  + dow_cols
                   )

# Every listed name must be a real final_df column; only the hard-coded
# weekday names can violate this, the other lists are derived from final_df.
existing_cols = set(final_df.columns)
missing_cols = [col for col in categorical_cols if col not in existing_cols]
assert not missing_cols, "Columns not found in final_df: %s" % missing_cols

print(len(categorical_cols))

numerical_cols = ['session_pvs_mean', 'session_pvs_median', 'session_top_mean', 'session_top_median', 'bounce_rate', 'content_views_rate']

print(len(numerical_cols))

562
6
