In [1]:
'''Helper'''
import pandas as pd
import numpy as np
import time

'''Display'''
import warnings
warnings.filterwarnings('ignore') 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

'''BQ'''
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [2]:
# ----- GET SUBS DATA -----
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_subs_ga`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)

subs_data.rename(columns={'ga_pianoId': 
                          'piano_id'}, 
                 inplace=True)

print("Shape: ", subs_data.shape)

# ----- GET NON-SUBS DATA -----

start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_nonsubs_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Before:", nonsubs_data.shape)

# some suspicious fvids have no pianoID but their dfpZone has 'subscriber' in it - remove them

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')

suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()

nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]

print("After removing ", 
      len(suspicious_fvid), 
      "suspicious fvids:", nonsubs_data.shape)

--- 10.618674993515015 seconds ---
Shape:  (6556736, 23)
--- 4.014449119567871 seconds ---
Before: (1192679, 23)
After removing  10 suspicious fvids: (1192343, 23)


In [3]:
df = pd.concat([subs_data, nonsubs_data])

# fill NAs
df.GA_cmsNaturalId = df.GA_cmsNaturalId.fillna('none') 
df.timeOnPage = df.timeOnPage.fillna(0)

print("Total fvids in df: ", len(df.GA_fullVisitorId.unique()), "\n")

print(df[['GA_fullVisitorId', 'subscription_status']].drop_duplicates(keep='first').subscription_status.value_counts(), "\n")

Total fvids in df:  507937 

non_subscriber    409990
subscriber         97947
Name: subscription_status, dtype: int64 



In [4]:
# keep only non-subs
raw_df = df[df.subscription_status=="non_subscriber"]

len(raw_df.GA_fullVisitorId.unique())

409990

### All non-subscribers from raw data - sum(pvs) per person

* **From our total non-subscriber pool, check distribution of people having more than 1 pvs and other buckets**

In [5]:
whole_ns = raw_df.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})

whole_ns["avg_top"] = whole_ns['timeOnPage']/whole_ns['sum_pvs']

In [6]:
whole_ns.describe()

Unnamed: 0,sum_pvs,timeOnPage,avg_top
count,409990.0,409990.0,409990.0
mean,2.91,235.32,72.66
std,5.34,619.08,123.51
min,1.0,0.0,0.0
25%,1.0,9.0,7.0
50%,1.0,59.0,36.0
75%,3.0,201.0,87.86
max,492.0,84268.0,2941.0


In [7]:
# cut sum(pvs) per person into buckets and calculate % people in each bucket

whole_ns['range'] = pd.cut(whole_ns.sum_pvs, [0, 1, 3 , 5, 429])

print(whole_ns.range.value_counts(normalize=True).sort_index())

whole_ns.range.value_counts().sort_index()

(0, 1]     0.57
(1, 3]     0.24
(3, 5]     0.08
(5, 429]   0.11
Name: range, dtype: float64


(0, 1]      234324
(1, 3]       96629
(3, 5]       32140
(5, 429]     46896
Name: range, dtype: int64

* Make sure all kinds of non-sub behavior captured in our 3 train, val, test sets

In [None]:
BUCKET_NAME = 'bi-subscription-modeling'
BUCKET_FOLDER = 'train-val-data'
file_name = 'training_data_11192021.csv'

download_file(BUCKET_NAME, 
              BUCKET_FOLDER, 
              file_name) 

In [8]:
from sklearn.model_selection import train_test_split
SEED = 42

# read training data

df = pd.read_csv('training_data_11192021.csv', dtype={'ga_fullvisitorid': object})

print(df.shape)
print(df.subscription_status.value_counts(), "\n")

X = df.iloc[:, :-1]
y = df["subscription_status"]

# split the data in training and remaining dataset

X_train, X_rem, y_train, y_rem = train_test_split(X,
                                                  y, 
                                                  train_size=0.7, 
                                                  stratify = y,
                                                  random_state = SEED 
                                                 )


# make test and val sets
X_val, X_test, y_val, y_test = train_test_split(X_rem,
                                                y_rem, 
                                                test_size=0.5,
                                                stratify = y_rem,
                                                random_state = SEED 
                                                )

print("--- Train set ---", X_train.shape, y_train.shape, round(X_train.shape[0]/df.shape[0], 2))

print("--- Val set ---", X_val.shape, y_val.shape, round(X_val.shape[0]/df.shape[0], 2))

print("--- Test set ---", X_test.shape, y_test.shape, round(X_test.shape[0]/df.shape[0], 2))

train_set = X_train.join(y_train)
val_set = X_val.join(y_val)
test_set = X_test.join(y_test)

(507937, 627)
non_subscriber    409990
subscriber         97947
Name: subscription_status, dtype: int64 

--- Train set --- (355555, 626) (355555,) 0.7
--- Val set --- (76191, 626) (76191,) 0.15
--- Test set --- (76191, 626) (76191,) 0.15


### Train set non-subscribers - sum(pvs) per person

* **From train set's non-subscriber pool, check distribution of people having more than 1 pvs and other buckets**

In [9]:
# keep only non-subs
train_set_ns = train_set[train_set.subscription_status=="non_subscriber"]

print(train_set_ns.shape)
print(train_set_ns.subscription_status.unique(), "\n")

zz1 = pd.merge(train_set_ns, whole_ns, how="left", left_on = "ga_fullvisitorid", right_on="GA_fullVisitorId")

zz1['cut_train_sum_pv'] = pd.cut(zz1.sum_pvs, [0, 1, 3 , 5, 380])

print(zz1['cut_train_sum_pv'].value_counts(normalize=True).sort_index(), "\n")

zz1['cut_train_sum_pv'].value_counts().sort_index()

(286992, 627)
['non_subscriber'] 

(0, 1]     0.57
(1, 3]     0.24
(3, 5]     0.08
(5, 380]   0.11
Name: cut_train_sum_pv, dtype: float64 



(0, 1]      164041
(1, 3]       67648
(3, 5]       22452
(5, 380]     32851
Name: cut_train_sum_pv, dtype: int64

### Val set non-subscribers - sum(pvs) per person

* **From val set's non-subscriber pool, check distribution of people having more than 1 pvs and other buckets**

In [10]:
# keep only non-subs
val_set_ns = val_set[val_set.subscription_status=="non_subscriber"]

print(val_set_ns.shape)
print(val_set_ns.subscription_status.unique(), "\n")

zz2 = pd.merge(val_set_ns, whole_ns, how="left", left_on = "ga_fullvisitorid", right_on="GA_fullVisitorId")

zz2['cut_val_sum_pv'] = pd.cut(zz2.sum_pvs, [0, 1, 3 , 5, 176])

print(zz2['cut_val_sum_pv'].value_counts(normalize=True).sort_index(), "\n")

zz2['cut_val_sum_pv'].value_counts().sort_index()

(61499, 627)
['non_subscriber'] 

(0, 1]     0.57
(1, 3]     0.23
(3, 5]     0.08
(5, 176]   0.11
Name: cut_val_sum_pv, dtype: float64 



(0, 1]      35186
(1, 3]      14419
(3, 5]       4826
(5, 176]     7068
Name: cut_val_sum_pv, dtype: int64

In [1]:
4826+7068

11894

### Test set non-subscribers - sum(pvs) per person

* **From test set's non-subscriber pool, check distribution of people having more than 1 pvs and other buckets**

In [11]:
# keep only non-subs
test_set_ns = test_set[test_set.subscription_status=="non_subscriber"]

print(test_set_ns.shape)
print(test_set_ns.subscription_status.unique(), "\n")

zz3 = pd.merge(test_set_ns, whole_ns, how="left", left_on = "ga_fullvisitorid", right_on="GA_fullVisitorId")

zz3['cut_test_sum_pv'] = pd.cut(zz3.sum_pvs, [0, 1, 3 , 5, 492])

print(zz3['cut_test_sum_pv'].value_counts(normalize=True).sort_index(), "\n")

zz3['cut_test_sum_pv'].value_counts().sort_index()

(61499, 627)
['non_subscriber'] 

(0, 1]     0.57
(1, 3]     0.24
(3, 5]     0.08
(5, 492]   0.11
Name: cut_val_sum_pv, dtype: float64 



(0, 1]      35097
(1, 3]      14562
(3, 5]       4862
(5, 492]     6978
Name: cut_val_sum_pv, dtype: int64

### All non-subscribers from raw data - (pvs per session) per person

* **From our total non-subscriber pool, check distribution of people in various buckets of pvs. per session**

In [12]:
# user's per pagepath GA data
page = raw_df.groupby(['GA_fullVisitorId', 'GA_visitStartTime', 'GA_pagePath']).agg({'GA_pageViews': 'max', 'GA_scrollDepth': 'max', 'timeOnPage': 'sum'}).reset_index()

# user's per session GA data
session = page.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum', 'GA_scrollDepth': 'mean', 'timeOnPage': 'mean'}).reset_index()

session.rename(columns={'GA_pageViews': 'unique_pageViews', 'timeOnPage': 'top_per_session'}, inplace=True)

# --- per session pvs (mean,  median) ---
pageViews = session.groupby('GA_fullVisitorId').agg({'unique_pageViews': ['mean', 'median']}).reset_index()

# rename cols
pageViews.columns = [' '.join(col).strip() for col in pageViews.columns.values]
pageViews.rename(columns={'unique_pageViews mean':'unique_pageviews_mean', 'unique_pageViews median': 'unique_pageviews_median'}, inplace=True)

# --- per session top (mean,  median) ---
timeOnPage = session.groupby('GA_fullVisitorId').agg({'top_per_session': ['mean', 'median']}).reset_index()

# rename cols
timeOnPage.columns = [' '.join(col).strip() for col in timeOnPage.columns.values]
timeOnPage.rename(columns={'top_per_session mean':'top_mean', 'top_per_session median': 'top_median'}, inplace=True)

In [13]:
pageViews['cut_unique_pageviews_mean'] = pd.cut(pageViews.unique_pageviews_mean, [0, 1, 3 , 5, 39])

print(pageViews['cut_unique_pageviews_mean'].value_counts(normalize=True), "\n")

pageViews['cut_unique_pageviews_mean'].value_counts()

(0, 1]    0.95
(1, 3]    0.05
(3, 5]    0.00
(5, 39]   0.00
Name: cut_unique_pageviews_mean, dtype: float64 



(0, 1]     387875
(1, 3]      22019
(3, 5]         62
(5, 39]        34
Name: cut_unique_pageviews_mean, dtype: int64

### Train set non-subscribers - (pvs per session) per person

* **From train set's non-subscriber pool, check distribution of people in various buckets of pvs. per session**

In [14]:
train_set_ns['cut_session_pvs_mean'] = pd.cut(train_set_ns.session_pvs_mean, [0, 1, 3 , 5, 39])

print(train_set_ns['cut_session_pvs_mean'].value_counts(normalize=True), "\n")

train_set_ns['cut_session_pvs_mean'].value_counts()

(0, 1]    0.95
(1, 3]    0.05
(3, 5]    0.00
(5, 39]   0.00
Name: cut_session_pvs_mean, dtype: float64 



(0, 1]     271433
(1, 3]      15494
(3, 5]         43
(5, 39]        22
Name: cut_session_pvs_mean, dtype: int64

### Val set non-subscribers - (pvs per session) per person

* **From val set's non-subscriber pool, check distribution of people in various buckets of pvs. per session**

In [15]:
val_set_ns['cut_session_pvs_mean'] = pd.cut(val_set_ns.session_pvs_mean, [0, 1, 3 , 5, 39])

print(val_set_ns['cut_session_pvs_mean'].value_counts(normalize=True))
val_set_ns['cut_session_pvs_mean'].value_counts()

(0, 1]    0.95
(1, 3]    0.05
(3, 5]    0.00
(5, 39]   0.00
Name: cut_session_pvs_mean, dtype: float64


(0, 1]     58175
(1, 3]      3305
(3, 5]        13
(5, 39]        6
Name: cut_session_pvs_mean, dtype: int64

### Test set non-subscribers - (pvs per session) per person

* **From test set's non-subscriber pool, check distribution of people in various buckets of pvs. per session**

In [16]:
test_set_ns['cut_session_pvs_mean'] = pd.cut(test_set_ns.session_pvs_mean, [0, 1, 3 , 5, 39])

print(test_set_ns['cut_session_pvs_mean'].value_counts(normalize=True))

test_set_ns['cut_session_pvs_mean'].value_counts()

(0, 1]    0.95
(1, 3]    0.05
(3, 5]    0.00
(5, 39]   0.00
Name: cut_session_pvs_mean, dtype: float64


(0, 1]     58267
(1, 3]      3220
(3, 5]         6
(5, 39]        6
Name: cut_session_pvs_mean, dtype: int64