In [2]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
import joblib
import warnings
warnings.filterwarnings('ignore') 

import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

In [3]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_subs_ga`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)

subs_data.rename(columns={'ga_pianoId': 
                          'piano_id'}, 
                 inplace=True)

print("Shape: ", subs_data.shape)

start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_nonsubs_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Before:", nonsubs_data.shape)

# some suspicious fvids have no pianoID but their dfpZone has 'subscriber' in it - remove them

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')

suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()

nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]

print("After removing ", 
      len(suspicious_fvid), 
      "suspicious fvids:", nonsubs_data.shape)

df_nofilter = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df_nofilter.shape, "\n")

print("Total fvids in df: ", len(df_nofilter.GA_fullVisitorId.unique()), "\n")

--- 10.571234703063965 seconds ---
Shape:  (6556736, 23)
--- 4.581924200057983 seconds ---
Before: (1192679, 23)
After removing  10 suspicious fvids: (1192343, 23)
Shape:  (7749079, 23) 

Total fvids in df:  507937 



In [4]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv1_subs_ga`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)

subs_data.rename(columns={'ga_pianoId': 
                          'piano_id'}, 
                 inplace=True)

print("Shape: ", subs_data.shape)

start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv1_nonsubs_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Before:", nonsubs_data.shape)

# some suspicious fvids have no pianoID but their dfpZone has 'subscriber' in it - remove them

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')

suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()

nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]

print("After removing ", 
      len(suspicious_fvid), 
      "suspicious fvids:", nonsubs_data.shape)

df_pv1_filter = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df_pv1_filter.shape, "\n")

print("Total fvids in df: ", len(df_pv1_filter.GA_fullVisitorId.unique()), "\n")

--- 9.33493185043335 seconds ---
Shape:  (5251696, 23)
--- 4.405194520950317 seconds ---
Before: (2478658, 23)
After removing  76 suspicious fvids: (2474748, 23)
Shape:  (7726444, 23) 

Total fvids in df:  506819 



In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv1_subs_ga`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

subs_data["subscription_status"] = "subscriber"

# drop unnecessary col & rename
subs_data.drop(['user_id_uid', 'resource_id_rid'], axis=1, inplace=True)

subs_data.rename(columns={'ga_pianoId': 
                          'piano_id'}, 
                 inplace=True)

print("Shape: ", subs_data.shape)

start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv3_nonsubs_ga`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

nonsubs_data["subscription_status"] = "non_subscriber"

print("Before:", nonsubs_data.shape)

# some suspicious fvids have no pianoID but their dfpZone has 'subscriber' in it - remove them

nonsubs_data.GA_dfpNewZone = nonsubs_data.GA_dfpNewZone.fillna('none')

suspicious_fvid = nonsubs_data[nonsubs_data.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()

nonsubs_data = nonsubs_data[~nonsubs_data.GA_fullVisitorId.isin(suspicious_fvid)]

print("After removing ", 
      len(suspicious_fvid), 
      "suspicious fvids:", nonsubs_data.shape)

df_pv3_filter = pd.concat([subs_data, nonsubs_data])

print("Shape: ", df_pv3_filter.shape, "\n")

print("Total fvids in df: ", len(df_pv3_filter.GA_fullVisitorId.unique()), "\n")

--- 9.556484937667847 seconds ---
Shape:  (5251696, 23)
--- 9.451661109924316 seconds ---
Before: (6014971, 23)
After removing  395 suspicious fvids: (5938101, 23)
Shape:  (11189797, 23) 

Total fvids in df:  506500 



In [6]:
def pvs_eda(input_df, cat_col_name, drop_cols=False):
    
    # pivot
    df_cat = pd.pivot_table(
        input_df, 
        values='GA_pageViews',
        columns= cat_col_name,
        index='subscription_status',
        aggfunc='sum')\
        .reset_index()
    
    # drop none and other cols
    if drop_cols==True:
        
        if 'none' in list(df_cat.columns):
            df_cat.drop('none', axis=1, inplace=True)
        
        if 'other' in list(df_cat.columns):
            df_cat.drop('other', axis=1, inplace=True)

    
    # percentage calc below
    df_cat = df_cat.fillna(0)
    df_cat = df_cat.T
    
    new_header = df_cat.iloc[0] # grab the first row for the header
    df_cat = df_cat[1:] # take the data minus the header row
    df_cat.columns = new_header
  
    df_cat['% of non_subscriber pvs'] = (df_cat['non_subscriber'] / df_cat['non_subscriber'].sum()) * 100
    df_cat['% of subscriber pvs'] = (df_cat['subscriber'] / df_cat['subscriber'].sum()) * 100
    
    df_cat.drop(['non_subscriber', 'subscriber'], axis=1, inplace=True)

    df_cat['non_subscriber_rank'] = df_cat['% of non_subscriber pvs'].rank(ascending=False).astype(int)
    df_cat['subscriber_rank'] = df_cat['% of subscriber pvs'].rank(ascending=False).astype(int)

    df_cat = df_cat.sort_values('subscriber_rank')
    
    return df_cat


# df_nofilter = pd.read_csv("training_data_11192021.csv")
# df_pv1_filter = pd.read_csv("pv1_training_data_01062022.csv")
# df_pv3_filter = pd.read_csv("pv3_training_data_01062022.csv")

In [None]:
# feature distributions overlap more as we progress

* Orig - pvs per person

In [9]:
whole_1 = df_nofilter.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})
whole_1["avg_top"] = whole_1['timeOnPage']/whole_1['sum_pvs']
whole_1.groupby('subscription_status').sum_pvs.describe().T.iloc[1:]

subscription_status,non_subscriber,subscriber
mean,2.91,66.94
std,5.34,358.19
min,1.0,1.0
25%,1.0,9.0
50%,1.0,22.0
75%,3.0,56.0
max,492.0,46250.0


* PV filter >1 - pvs per person

In [7]:
whole_2 = df_pv1_filter.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})
whole_2["avg_top"] = whole_2['timeOnPage']/whole_2['sum_pvs']
whole_2.groupby('subscription_status').sum_pvs.describe().T.iloc[1:]

subscription_status,non_subscriber,subscriber
mean,6.04,54.2
std,10.06,179.55
min,2.0,1.0
25%,2.0,7.0
50%,3.0,19.0
75%,6.0,47.0
max,1098.0,18024.0


* PV filter >3 - pvs per person

In [8]:
whole_3 = df_pv3_filter.groupby(['GA_fullVisitorId', 'subscription_status']).agg({'GA_pageViews': 'sum', 'timeOnPage': 'sum'}).reset_index().rename(columns={'GA_pageViews': 'sum_pvs'})
whole_3["avg_top"] = whole_3['timeOnPage']/whole_3['sum_pvs']
whole_3.groupby('subscription_status').sum_pvs.describe().T.iloc[1:]

subscription_status,non_subscriber,subscriber
mean,14.5,54.2
std,22.84,179.55
min,4.0,1.0
25%,5.0,7.0
50%,8.0,19.0
75%,16.0,47.0
max,5261.0,18024.0


* Orig - avg top per person

In [15]:
whole_1.groupby('subscription_status').avg_top.describe().T.iloc[1:]

subscription_status,non_subscriber,subscriber
mean,72.66,135.96
std,123.51,107.22
min,0.0,0.0
25%,7.0,65.11
50%,36.0,113.85
75%,87.86,178.19
max,2941.0,2713.0


* PV filter >1 - avg top per person

In [16]:
whole_2.groupby('subscription_status').avg_top.describe().T.iloc[1:]

subscription_status,non_subscriber,subscriber
mean,92.85,137.01
std,138.22,106.19
min,0.0,0.0
25%,17.2,66.82
50%,46.77,114.8
75%,104.5,178.82
max,2434.0,3537.5


In [17]:
whole_3.groupby('subscription_status').avg_top.describe().T.iloc[1:]

subscription_status,non_subscriber,subscriber
mean,92.0,137.01
std,104.69,106.19
min,0.0,0.0
25%,25.5,66.82
50%,58.15,114.8
75%,116.57,178.82
max,1449.0,3537.5


* Non-subs pool = Those who came in Oct'21 with no pv filter
    * Features from their GA history = Jan '21 to Oct '21

In [5]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_nonsubs_ga`
"""

ns_nofilter = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print("Shape: ", ns_nofilter.shape)
print("Unique non-sub fullvids: ", len(ns_nofilter.GA_fullVisitorId.unique()), "\n")
print("Duplicates?: ", ns_nofilter.duplicated().any(), "\n")

--- 5.28026556968689 seconds ---
Shape:  (1192679, 22)
Unique non-sub fullvids:  410000 

Duplicates?:  True 



In [10]:
suspicious_fvid_nofilter = ns_nofilter[ns_nofilter.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()
len(suspicious_fvid_nofilter)

10

* Non-subs pool = Those who came in Dec'21 with >1 pv filter
    * Features from their GA history = Jan '21 to Dec '21

In [7]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv1_nonsubs_ga`
"""

ns_pv1_filter = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print("Shape: ", ns_pv1_filter.shape)
print("Unique non-sub fullvids: ", len(ns_pv1_filter.GA_fullVisitorId.unique()), "\n")
print("Duplicates?: ", ns_pv1_filter.duplicated().any(), "\n")

--- 3.7816431522369385 seconds ---
Shape:  (2478658, 22)
Unique non-sub fullvids:  410000 

Duplicates?:  True 



In [12]:
suspicious_fvid_pv1filter = ns_pv1_filter[ns_pv1_filter.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()
len(suspicious_fvid_pv1filter)

76

* Non-subs pool = Those who came in Dec'21 with >3 pv filter
    * Features from their GA history = Jan '21 to Dec '21

In [8]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.smpv3_nonsubs_ga`
"""

ns_pv3_filter = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print("Shape: ", ns_pv3_filter.shape)
print("Unique non-sub fullvids: ", len(ns_pv3_filter.GA_fullVisitorId.unique()), "\n")
print("Duplicates?: ", ns_pv3_filter.duplicated().any(), "\n")

--- 8.812063455581665 seconds ---
Shape:  (6014971, 22)
Unique non-sub fullvids:  410000 

Duplicates?:  True 



In [18]:
ns_pv3_filter.GA_dfpNewZone = ns_pv3_filter.GA_dfpNewZone.fillna('none')

suspicious_fvid_pv3filter = ns_pv3_filter[ns_pv3_filter.GA_dfpNewZone.str.contains('/subscriber/')].GA_fullVisitorId.unique()
len(suspicious_fvid_pv3filter)

395

* FVID intersection between 3 tables

In [25]:
no_pv1 = pd.merge(ns_nofilter[['GA_fullVisitorId']].drop_duplicates(keep='first'), 
                  ns_pv1_filter[['GA_fullVisitorId']].drop_duplicates(keep='first'), on=['GA_fullVisitorId'], how='inner')
no_pv1.shape

(865, 1)

In [26]:
no_pv3 = pd.merge(ns_nofilter[['GA_fullVisitorId']].drop_duplicates(keep='first'), 
                  ns_pv3_filter[['GA_fullVisitorId']].drop_duplicates(keep='first'), on=['GA_fullVisitorId'], how='inner')
no_pv3.shape

(1601, 1)

In [27]:
pv1_pv3 = pd.merge(ns_nofilter[['GA_fullVisitorId']].drop_duplicates(keep='first'), 
                  ns_pv3_filter[['GA_fullVisitorId']].drop_duplicates(keep='first'), on=['GA_fullVisitorId'], how='inner')
pv1_pv3.shape

(1601, 1)

* Suspicious FVID intersection between 3 tables

In [28]:
len([x for x in suspicious_fvid_nofilter if x in suspicious_fvid_pv1filter])

0

In [29]:
len([x for x in suspicious_fvid_nofilter if x in suspicious_fvid_pv3filter])

0

In [30]:
len([x for x in suspicious_fvid_pv1filter if x in suspicious_fvid_pv3filter])

39

* Check

In [41]:
# non-subs pool orig
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_nonsubs_pool`
"""

sm_nonsubs_pool = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))
print("Shape: ", sm_nonsubs_pool.shape)


start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.sm_pred_ns_pool`
"""

# for prediction - non-subs pool orig
sm_pred_ns_pool = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print("Shape: ", sm_pred_ns_pool.shape)

--- 4.3037474155426025 seconds ---
Shape:  (410000, 3)
--- 5.466241836547852 seconds ---
Shape:  (600000, 3)


In [44]:
pd.merge(sm_nonsubs_pool[['ga_fullvisitorid']], sm_pred_ns_pool[['ga_fullvisitorid']], how="inner", on="ga_fullvisitorid")

Unnamed: 0,ga_fullvisitorid
0,5992610480279103811
1,12483903402323082477
2,1861921945347904623
3,9799352427386252145
4,17197990784528439405
...,...
1178,17252594532856133119
1179,11102421430835610168
1180,16042947245132483460
1181,1036755021071626715
