In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
from google.cloud import bigquery
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import joblib
import warnings
warnings.filterwarnings('ignore') 
import re

bq_client = bigquery.Client()

In [2]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage
import time

bqclient = bigquery.Client()
bqstorageclient = bigquery_storage.BigQueryReadClient()

## Exploring subscriber data for 'unlimited' subscription type 

* **Make data set for EDA**
    * Fetching GA data
    
        * SUBSCRIBERS:
            * Subscriber id > fullvid > session > pagePaths > GA data for pagePaths <BR><BR>
        * NON-SUBSCRIBERS:
            * fullvid > session > pagePaths > GA data for pagePaths

NOTE: 
* Data not grouped by anything. For respective features, group by during dev time
* Data not filtered by any date range - decide apporpriate date filtering during eda
    
#### --SUBSCRIBERS--
* **Make subscribers raw IDs dataset**

In [3]:
start_time = time.time()

query_string = """

CREATE OR REPLACE TABLE `api-project-901373404215.skt.raw_subscriber` AS

    # get ALL user_id_uids having unlimited type subscription only
    WITH eligible_users AS ( 
      
       SELECT 
           DISTINCT 
               user_id_uid, 
               resource_id_rid, 
               start_date,
               status, 
               subscription_trial_end_date
               
               # filter for active only
               --AND status='active'
               --AND total__refunded<1
               --AND cast(dt_updated as date)=current_date('America/New_York')
    FROM
        `api-project-901373404215.piano.subscriber_details`
    WHERE 
        # Filter for the 'universal' subscriptions only
        resource_id_rid IN UNNEST(['RKPEVDB', 'R8W03AS'])
        )
    
    # join user_id_uids with GA's pianoIDs and For each pianoID, get their fullvids. De-duplicate.
     SELECT 
         DISTINCT
            ga_pianoId,
            user_id_uid,
            ga_fullvisitorid,
            resource_id_rid,
            start_date,
            status, 
            subscription_trial_end_date
    FROM
        eligible_users
         INNER JOIN 
         `api-project-901373404215.DataMart.DataMart6` 
    ON 
        LOWER(ga_pianoId) = LOWER(user_id_uid)
"""


make_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 8.657292604446411 seconds ---


* Query raw subscribers data (sanity check)

In [3]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.raw_subscriber`
"""
raw = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(raw.shape)

--- 3.216677665710449 seconds ---
(125366, 7)


In [4]:
print("Unique unlimited subscribers: ", len(raw.user_id_uid.unique()))
print("Unique unlimited fullvids: ", len(raw.ga_fullvisitorid.unique()), "\n")
print("Duplicates?: ", raw.duplicated().any(), "\n")
raw.head()

Unique unlimited subscribers:  41512
Unique unlimited fullvids:  98089 

Duplicates?:  False 



Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,start_date,status,subscription_trial_end_date
0,pniajfki7qfd903,PNIajFKi7qfd903,16644116536832264178,R8W03AS,2020-11-01 09:17:07 -0500,expired,
1,pniajfki7qfd903,PNIajFKi7qfd903,3242418031940008152,R8W03AS,2020-11-01 09:17:07 -0500,active,
2,pniajfki7qfd903,PNIajFKi7qfd903,3242418031940008152,R8W03AS,2020-11-01 09:17:07 -0500,expired,
3,pniajfki7qfd903,PNIajFKi7qfd903,5815314647275138317,R8W03AS,2020-11-01 09:17:07 -0500,active,
4,pniajfki7qfd903,PNIajFKi7qfd903,16644116536832264178,R8W03AS,2020-11-01 09:17:07 -0500,active,


In [5]:
# every piano ID has on average 2 fullvids

pd.DataFrame(raw.groupby('ga_pianoId').ga_fullvisitorid.nunique().describe())

Unnamed: 0,ga_fullvisitorid
count,41512.0
mean,2.37
std,4.88
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,201.0


In [6]:
print("Min subscription start", raw.start_date.min())
print("Max subscription start", raw.start_date.max())

Min subscription start 2020-11-01 09:17:07 -0500
Max subscription start 2021-11-01 03:01:47 -0400


In [7]:
# EXTRA

# query takes all the 41k users for EDA (despite account status) because they subscribed in the first place so behavior matters

raw_acc = raw[['user_id_uid', 'status']].drop_duplicates(keep='first')
print(":: Types of account status ::")
print(raw_acc.status.value_counts(), "\n")

print(":: Nulls in whole data ::")
print(raw.isna().sum(), "\n")

# duplicate pianoid -fullvid rows present :: because for those combo - multiple account info presnt

inspect = pd.DataFrame(raw.groupby(['user_id_uid', 'ga_fullvisitorid']).size()).reset_index().rename(columns={0:'size'})

# inspect[inspect['size']>1]

print(":: Duplicate pianoID - fvid entries present ::")
raw[(raw.user_id_uid=='29QXHHnco3XmrcrwNTxoP8p687x1') & (raw.ga_fullvisitorid =='1225771203912128738')]

:: Types of account status ::
active             41155
cancelled           5033
payment failure     2908
expired             2188
upgraded              14
Name: status, dtype: int64 

:: Nulls in whole data ::
ga_pianoId                          0
user_id_uid                         0
ga_fullvisitorid                    0
resource_id_rid                     0
start_date                          0
status                              0
subscription_trial_end_date    115684
dtype: int64 

:: Duplicate pianoID - fvid entries present ::


Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,start_date,status,subscription_trial_end_date
79820,29qxhhnco3xmrcrwntxop8p687x1,29QXHHnco3XmrcrwNTxoP8p687x1,1225771203912128738,R8W03AS,2021-04-28 11:21:48 -0400,active,
79821,29qxhhnco3xmrcrwntxop8p687x1,29QXHHnco3XmrcrwNTxoP8p687x1,1225771203912128738,R8W03AS,2021-04-28 11:21:48 -0400,cancelled,
112553,29qxhhnco3xmrcrwntxop8p687x1,29QXHHnco3XmrcrwNTxoP8p687x1,1225771203912128738,R8W03AS,2021-09-20 18:06:18 -0400,active,


* **Pull GA data for these subscribers (no date limit)**

In [8]:
start_time = time.time()

query_string = """ 

CREATE OR REPLACE TABLE `api-project-901373404215.skt.subscriber_ga_data` AS (

   # Get IAB Tiers for articles. There can be articles classified multiple times with different tier1 values given. 
   # To avoid duplication, partition by naturalid and take categorization by latest timestamp
   
   WITH content_iab AS (
   SELECT * FROM (
        SELECT 
            naturalId AS iab_natid, 
            RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
            categoryName,
            tier1,
            tier2
        FROM `api-project-901373404215.Content.mnet_iab_categories` where tier1 IS NOT NULL
        ) 
    WHERE mostrecent = 1
    ),

    # Get the natid and the corresponding title form the main content table. 
    # There are duplicates so partition by naturalid and get latest articles
                
    content AS (
      SELECT 
        natid AS content_natid,
        title,
        publish_date
      FROM (
        SELECT DISTINCT
          LOWER(NaturalId) AS natid,
          RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
          title,
          DATETIME(date, "America/New_York") as publish_date
        FROM
          `api-project-901373404215.Content.content`
        WHERE
          Visible is true
          AND type in (
          'blog',
          'blogslide',
          'magazine')
          )
      WHERE mostrecent = 1
    ),
    
    # Join the content iab table with the main content table to get the title with the iab category
    content_joined AS (
      SELECT
        * 
        EXCEPT (iab_natid, mostrecent)
      FROM
        content
      LEFT JOIN
        content_iab
      ON
        LOWER(content_natid) = LOWER(iab_natid)
    )
    
    # for all eligible subscribers - fetch their GA data for preparing features
    SELECT 
        DISTINCT
            z.ga_pianoId,
            user_id_uid,
            resource_id_rid,
            d.GA_fullVisitorId, 
            GA_visitStartTime, 
            GA_date, 
            GA_pagePath, 
            -- feature cols below
            GA_dfpNewZone, GA_visitNumber,
            GA_pageViews, GA_scrollDepth, timeOnPage,  
            GA_cmsNaturalId, title, publish_date, 
            GA_deviceOperatingSystem, GA_deviceCategory, GA_deviceBrowser,
            GA_country, GA_referralGroup,
            GA_primaryChannel, GA_primarySection, tier1, tier2
 
    FROM 
        `api-project-901373404215.skt.raw_subscriber` z
    INNER JOIN 
        `api-project-901373404215.DataMart.v_DataMart_updated` d
    ON 
        z.ga_fullvisitorid = d.GA_fullVisitorId
    LEFT JOIN
            content_joined cj
        ON d.GA_cmsNaturalId = cj.content_natid
)
     """

make_subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 75.97472429275513 seconds ---


* Query subscribers' GA data (sanity check)

In [9]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.subscriber_ga_data`
"""

subs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(subs_data.shape)

--- 33.3010368347168 seconds ---
(6480915, 24)


In [10]:
print("Unique unlimited subscribers: ", len(subs_data.user_id_uid.unique()))
print("Unique unlimited fullvids: ", len(subs_data.GA_fullVisitorId.unique()), "\n")
print("Duplicates?: ", subs_data.duplicated().any())

subs_data.head()

Unique unlimited subscribers:  41512
Unique unlimited fullvids:  98089 

Duplicates?:  False


Unnamed: 0,ga_pianoId,user_id_uid,resource_id_rid,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,...,publish_date,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
0,pnim0mduuqm6b9u,PNIm0mduuqm6b9u,R8W03AS,8712606165342867898,1633455847,2021-10-05,/profile/charles-simonyi/,none,28,1,...,NaT,macintosh,desktop,safari,united states,organic search,none,none,,
1,pnijcwlt5qs4ed6,PNIjcwLT5qs4ed6,R8W03AS,4947438164918869275,1619354408,2021-04-25,/subscribe,none,54,1,...,NaT,macintosh,desktop,safari,united states,organic search,none,none,,
2,pni4kuxsiqjc6rb,PNI4kUXSiqjc6rb,R8W03AS,7034246835622640955,1617660994,2021-04-05,/sites/windriver/people/paulmiller1/,none,587,1,...,NaT,windows,desktop,chrome,united states,organic search,innovation,none,,
3,pni89jmehqjc1af,PNI89jmehqjc1af,R8W03AS,5931981919139440594,1625668604,2021-07-07,/search/,none,254,1,...,NaT,macintosh,desktop,chrome,united states,organic search,none,none,,
4,pniqp0ddwqro6ly,PNIqp0DdWqro6ly,R8W03AS,4466671869544946889,1618319215,2021-04-13,/midas/,none,2,1,...,NaT,macintosh,desktop,safari,united states,direct,none,none,,


In [11]:
subs_data.isna().sum()

ga_pianoId                        0
user_id_uid                       0
resource_id_rid                   0
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_dfpNewZone                369241
GA_visitNumber                    0
GA_pageViews                      0
GA_scrollDepth              1176445
timeOnPage                    59703
GA_cmsNaturalId                  34
title                       1911521
publish_date                1911520
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                       2976056
tier2                       3263968
dtype: int64

In [12]:
print(subs_data.GA_date.min())
print(subs_data.GA_date.max())

2018-01-01
2021-11-01


* Why no date filter in above query? - May offer more flexibility in experimenting with users' histories falling in different date ranges, such as:

    * full history of user
        * consists of all before + all after subscription user's behavior - obscured <br><br>
    * ga_date > jan 1, '21 
        * may consist of some before + all after subscription behavior - obscured<br><br>
    * ga_date within (subscribed date - 90 days)
        * consists of before/after subscription mutually exclusive behavior per person - hypothesis: something they did in pasy 90 days which led them to subsribe)
        * will need Python function like below or data pipeline


In [None]:
# Note: before/after subscription mutually exclusive behavior per person cannot be directly obtained by filtering pianoID null -- 
# because fvid 123 can have piano id null and fvid GA data after subscription date
    
# Example: ga_fullvisitorid = '2125747004827475854' GA_pianoID = 'pniimmquxqxev7'

In [15]:
# just a prototype on 1 user for -- ga_date within (subscribed date - 90 days)

raw[raw.ga_pianoId =='pniimmquxqxev7l']

Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,start_date,status,subscription_trial_end_date
120468,pniimmquxqxev7l,PNIIMmQuXqxev7l,3663613126660091315,RKPEVDB,2021-08-06 05:20:00 -0400,active,
120469,pniimmquxqxev7l,PNIIMmQuXqxev7l,8210077433885756007,RKPEVDB,2021-08-06 05:20:00 -0400,active,
120470,pniimmquxqxev7l,PNIIMmQuXqxev7l,1961813583447824657,RKPEVDB,2021-08-06 05:20:00 -0400,active,
120471,pniimmquxqxev7l,PNIIMmQuXqxev7l,3237950183657409267,RKPEVDB,2021-08-06 05:20:00 -0400,active,
120472,pniimmquxqxev7l,PNIIMmQuXqxev7l,2446258376655790451,RKPEVDB,2021-08-06 05:20:00 -0400,active,
120473,pniimmquxqxev7l,PNIIMmQuXqxev7l,6255747600586462916,RKPEVDB,2021-08-06 05:20:00 -0400,active,
120474,pniimmquxqxev7l,PNIIMmQuXqxev7l,6489370869177338808,RKPEVDB,2021-08-06 05:20:00 -0400,active,
120475,pniimmquxqxev7l,PNIIMmQuXqxev7l,2125747004827475854,RKPEVDB,2021-08-06 05:20:00 -0400,active,


In [24]:
# showing example on a radom user
user = subs_data[subs_data.ga_pianoId == 'pniimmquxqxev7l']

# user came on all following dates
user.GA_date.drop_duplicates(keep='first').reset_index(drop=True).sort_values()

423    2019-05-31
689    2019-06-01
787    2019-06-02
793    2019-06-03
757    2019-06-04
          ...    
756    2021-10-28
148    2021-10-29
123    2021-10-30
778    2021-10-31
87     2021-11-01
Name: GA_date, Length: 827, dtype: object

In [26]:
import datetime

user.GA_date = pd.to_datetime(user.GA_date)

# get user's subscription date

user['start_date'] = '2021-08-06'
user.start_date = pd.to_datetime(user.start_date)

# calculate cutoff date i.e. subscription date - 90 days
user['cutoff_90day_dt'] = user.start_date - datetime.timedelta(days=90)

# select user's GA data falling within 90 days before subscription -- MAKE FEATURES OUT OF THESE
user[(user.GA_date >=user.cutoff_90day_dt)  & (user.GA_date <user.start_date)].sort_values('GA_date')

Unnamed: 0,ga_pianoId,user_id_uid,resource_id_rid,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_dfpNewZone,GA_visitNumber,GA_pageViews,...,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2,start_date,cutoff_90day_dt
3170653,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1620485482,2021-05-08,/sites/alexknapp/2021/05/08/barney-ford-man-wh...,article/premium/default/standard,2984,1,...,desktop,chrome,united kingdom,organic search,innovation,science,Business and Finance,,2021-08-06,2021-05-08
4947539,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1620485482,2021-05-08,/home_usa/,none,2984,1,...,desktop,chrome,united kingdom,organic search,home,none,,,2021-08-06,2021-05-08
467326,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1620485482,2021-05-08,/,none,2984,1,...,desktop,chrome,united kingdom,organic search,home,none,,,2021-08-06,2021-05-08
5854175,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1620508620,2021-05-08,/,none,2985,1,...,desktop,chrome,united kingdom,organic search,home,none,,,2021-08-06,2021-05-08
156932,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1620485482,2021-05-08,/worlds-billionaires/,none,2984,1,...,desktop,chrome,united kingdom,organic search,billionaires,none,,,2021-08-06,2021-05-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5373310,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1628072296,2021-08-04,/worlds-billionaires/,none,3293,1,...,desktop,chrome,united kingdom,organic search,billionaires,none,,,2021-08-06,2021-05-08
3551490,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1628165866,2021-08-05,/worlds-billionaires/,none,3297,1,...,desktop,chrome,united kingdom,organic search,billionaires,none,,,2021-08-06,2021-05-08
3100677,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1628191917,2021-08-05,/,none,3298,1,...,desktop,chrome,united kingdom,organic search,home,none,,,2021-08-06,2021-05-08
2790426,pniimmquxqxev7l,PNIIMmQuXqxev7l,RKPEVDB,1961813583447824657,1628165866,2021-08-05,/,none,3297,1,...,desktop,chrome,united kingdom,organic search,home,none,,,2021-08-06,2021-05-08


#### --NON-SUBSCRIBERS--
* **Make non-subscriber raw IDs datasets**

#### --Trial 1--
* Initially - I used below as non-subscribers

In [32]:
start_time = time.time()

query_string = """

CREATE OR REPLACE TABLE `api-project-901373404215.skt.raw_nonsubscriber_first_try` AS
    
    SELECT
            rand() AS random_num,
            ga_pianoId,
            ga_fullvisitorid
        FROM
            `api-project-901373404215.DataMart.DataMart6`
        WHERE 
            ga_date >= '2021-01-01' AND ga_fullvisitorid NOT IN 
                (
                SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.raw_subscriber`
                )
            AND GA_dfpNewZone NOT LIKE '%/subscriber/%' AND ga_pianoId IS NULL
        
        ORDER BY random_num
        LIMIT 99000
"""


make_ns_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 82.18251943588257 seconds ---


#### --Trial 2--
* it was discussed - we should include those who are on the verge of subscribing so I put HAVING num_distinct_articles > 3 condition

In [26]:
sql="""
    SELECT
            ga_fullvisitorid, 
            EXTRACT(MONTH FROM ga_date) AS ga_month,
            count(distinct GA_cmsNaturalId) AS num_distinct_articles
        FROM
            `api-project-901373404215.DataMart.DataMart6`
        WHERE 
            ga_date >= '2021-01-01' AND ga_pianoId IS NULL AND STARTS_WITH(GA_cmsNaturalId, "blogandpostid/blog/post/") AND ga_fullvisitorid NOT IN 
                (
                SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.raw_subscriber`
                )
            
        GROUP BY ga_fullvisitorid, ga_month
        HAVING num_distinct_articles > 3
        
    """


test_new = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

In [27]:
print(test_new.shape)
print(len(test_new.ga_fullvisitorid.unique()))

(15893434, 3)
12262131


In [28]:
ns_fvids = list(test_new.ga_fullvisitorid)
s_fvids = list(raw.ga_fullvisitorid.unique())

len(list(set(s_fvids) & set(ns_fvids)))

0

In [33]:
test_new.sort_values('num_distinct_articles') #.sort_values('ga_fullvisitorid') #

Unnamed: 0,ga_fullvisitorid,ga_month,num_distinct_articles
0,10673517748155502561,11,4
9032741,11634748714944319366,3,4
9032747,3080157350841678290,3,4
9032748,11670277183713816511,3,4
9032749,16567267426158311906,3,4
...,...,...,...
8090571,8547335566077223880,2,2098
2245760,8335413032044288449,1,2523
9229107,1863772118756953053,3,2525
3583972,6861662169131553135,1,2634


In [42]:
# test_new[test_new['num_distinct_articles']==6].head(20)

test_new[test_new.ga_fullvisitorid =='5683233371196710015']

Unnamed: 0,ga_fullvisitorid,ga_month,num_distinct_articles
56,5683233371196710015,1,6


In [34]:
test_new['num_distinct_articles'].describe()

count   15893434.00
mean           6.60
std            6.51
min            4.00
25%            4.00
50%            5.00
75%            7.00
max         2709.00
Name: num_distinct_articles, dtype: float64

In [44]:
test_new[test_new.duplicated('ga_fullvisitorid', keep=False)].sort_values('ga_month')

Unnamed: 0,ga_fullvisitorid,ga_month,num_distinct_articles
4764560,5751545322196269271,1,4
4319044,5109890436689758248,1,12
4319043,531832779528080196,1,7
4319040,5428791639433616348,1,6
4319039,4608665911821336249,1,5
...,...,...,...
6164,10632057532602454561,11,5
6163,7445899110979278951,11,4
6161,4434222248022576630,11,4
4136,10872569431440311532,11,4


In [47]:
# sent these to Greg

test_new[test_new.ga_fullvisitorid=='5109890436689758248'].reset_index(drop=True)

Unnamed: 0,ga_fullvisitorid,ga_month,num_distinct_articles
0,5109890436689758248,1,12
1,5109890436689758248,2,19


In [3]:
sql = """SELECT
            ga_fullvisitorid, ga_pianoId,
            EXTRACT(MONTH FROM ga_date) AS ga_month,
            GA_cmsNaturalId
        FROM
            `api-project-901373404215.DataMart.DataMart6`
        WHERE 
            ga_date >= '2021-01-01' AND ga_fullvisitorid = '5109890436689758248' """ #6240966596296019991

wth =  (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 
wth

Unnamed: 0,ga_fullvisitorid,ga_pianoId,ga_month,GA_cmsNaturalId
0,5109890436689758248,,2,blogandpostid/blog/post/50373-60134542cd624f00...
1,5109890436689758248,,2,blogandpostid/blog/post/50531-601ac3e481d69c00...
2,5109890436689758248,,2,blogandpostid/blog/post/5748-6019e4ad77fceb000...
3,5109890436689758248,,2,blogandpostid/blog/post/50769-6018aab382090300...
4,5109890436689758248,,2,blogandpostid/blog/post/50531-6017f3f2257e9800...
5,5109890436689758248,,2,blogandpostid/blog/post/50462-601959d15b470700...
6,5109890436689758248,,2,blogandpostid/blog/post/50276-6019ab74926ee800...
7,5109890436689758248,,2,blogandpostid/blog/post/50276-601998fe0d250e00...
8,5109890436689758248,,1,blogandpostid/blog/post/5621-60157564823d29000...
9,5109890436689758248,,1,blogandpostid/blog/post/5621-6015af7f823d29000...


#### --Trial 3--
* But I found what I claimed as non-subscribers had way more than 5 distinct articles read per month - clarified w/ Greg which gave rise to trial 4
* Also below query is wrong because ga_pianoId IS NULL is in where clause
    * what this does is - in GA, fvid 1234 had pianoID null in jan-2021 but not null in feb-2021.. then also this fvid will get picked up as eligible pool, given the row in jan-2021 had piano ID null

In [None]:
sql="""
    SELECT
            ga_fullvisitorid, 
            EXTRACT(MONTH FROM ga_date) AS ga_month,
            count(distinct GA_cmsNaturalId) AS num_distinct_articles
        FROM
            `api-project-901373404215.DataMart.DataMart6`
        WHERE 
            ga_date >= '2021-09-01' AND ga_pianoId IS NULL AND STARTS_WITH(GA_cmsNaturalId, "blogandpostid/blog/post/") AND ga_fullvisitorid NOT IN 
                (
                SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.raw_subscriber`
                )
            
        GROUP BY ga_fullvisitorid, ga_month
        HAVING num_distinct_articles BETWEEN 2 AND 4
        
    """


test_solution = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

#### --Trial 4--
* Finally using below
    * Was able to find out that piano is in charge of calculating who should see the paywall so it is possible that there are users slipping through the cracks as far as seeing too many articles

In [None]:
start_time = time.time()

query_string = """
    CREATE OR REPLACE TABLE `api-project-901373404215.skt.raw_nonsubscriber_new` AS
    (
    SELECT
                rand() AS random_num,
                piano_id,
                ga_fullvisitorid
    FROM (
                SELECT
                    ga_fullvisitorid, 
                    EXTRACT(MONTH FROM ga_date) AS ga_month,
                    MAX(ga_pianoId) AS piano_id,
                    count(distinct GA_cmsNaturalId) AS num_distinct_articles
                FROM
                    `api-project-901373404215.DataMart.v_DataMart_updated`
                WHERE 
                    ga_date >= '2021-01-01' AND STARTS_WITH(GA_cmsNaturalId, "blogandpostid/blog/post/") AND ga_fullvisitorid NOT IN 
                        (
                        SELECT DISTINCT ga_fullvisitorid FROM `api-project-901373404215.skt.raw_subscriber`
                        )
                GROUP BY 
                    ga_fullvisitorid, ga_month
                HAVING 
                
                    # make sure the fvid never had a piano id and has >4 distinct articles per month in given time frame
                    
                    piano_id IS NULL AND 
                    num_distinct_articles > 4
    )
    ORDER BY random_num
    LIMIT 99000
    )
"""
                    

# make_ns_data = (
#     bqclient.query(query_string)
#     .result()
#     .to_dataframe(bqstorage_client=bqstorageclient)
# ) 

# print("--- %s seconds ---" % (time.time() - start_time)) # took 50sec

* Why not >=4:
    * 

* Query raw non-subscribers IDs' data (sanity check)

In [33]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.raw_nonsubscribers`
"""
ns = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(ns.shape)

--- 3.8889529705047607 seconds ---
(99000, 3)


In [34]:
ns.isna().sum()

random_num              0
ga_pianoId          99000
ga_fullvisitorid        0
dtype: int64

In [35]:
s_fvids = list(ns.ga_fullvisitorid)
ns_fvids = list(raw.ga_fullvisitorid.unique())

print("subscribed fvids: ", len(s_fvids))
print("non-subscribed fvids: ", len(ns_fvids))

print("any intersection between them?: ", list(set(s_fvids) & set(ns_fvids)))

subscribed fvids:  99000
non-subscribed fvids:  98089
any intersection between them?:  []


* **Pull GA data for these non-subscribers (no date limit)**

In [36]:
start_time = time.time()

query_string = """ 

CREATE OR REPLACE TABLE `api-project-901373404215.skt.nonsubscriber_ga_data` AS (

   # Get IAB Tiers for articles. There can be articles classified multiple times with different tier1 values given. 
   # To avoid duplication, partition by naturalid and take categorization by latest timestamp
   
   WITH content_iab AS (
   SELECT * FROM (
        SELECT 
            naturalId AS iab_natid, 
            RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
            categoryName,
            tier1,
            tier2
        FROM `api-project-901373404215.Content.mnet_iab_categories` where tier1 IS NOT NULL
        ) 
    WHERE mostrecent = 1
    ),

    # Get the natid and the corresponding title form the main content table. 
    # There are duplicates so partition by naturalid and get latest articles
    content AS (
      SELECT 
        natid AS content_natid,
        title
      FROM (
        SELECT DISTINCT
          LOWER(NaturalId) AS natid,
          RANK() OVER (PARTITION BY naturalid ORDER BY timestamp DESC) AS mostrecent,
          title
        FROM
          `api-project-901373404215.Content.content`
        WHERE
          Visible is true
          AND type in (
          'blog',
          'blogslide',
          'magazine')
          )
      WHERE mostrecent = 1
    ),
    
    # Join the content iab table with the main content table to get the title with the iab category
    content_joined AS (
      SELECT
        * 
        EXCEPT (iab_natid, mostrecent)
      FROM
        content
      LEFT JOIN
        content_iab
      ON
        LOWER(content_natid) = LOWER(iab_natid)
    )
    
    # for sampled non-subscribers - fetch their GA data for preparing features
    SELECT 
        DISTINCT
            z.ga_pianoId,
            d.GA_fullVisitorId, 
            GA_visitStartTime, 
            GA_date, 
            GA_pagePath, 
            -- feature cols below
            GA_pageViews, GA_scrollDepth, timeOnPage, GA_visitNumber, 
            GA_cmsNaturalId, title, GA_dfpNewZone,
            GA_deviceOperatingSystem, GA_deviceCategory, GA_deviceBrowser,
            GA_country, GA_referralGroup,
            GA_primaryChannel, GA_primarySection, tier1, tier2
  
    FROM 
        `api-project-901373404215.skt.raw_nonsubscriber` z
    INNER JOIN 
        `api-project-901373404215.DataMart.v_DataMart_updated` d
    ON 
        z.ga_fullvisitorid = d.GA_fullVisitorId
    LEFT JOIN
            content_joined cj
        ON d.GA_cmsNaturalId = cj.content_natid
)
     """

make_nsga_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 29.30656147003174 seconds ---


* Query non-subscribers' GA data (sanity check)

In [38]:
start_time = time.time()

query_string = """
  SELECT
       *
    FROM
        `api-project-901373404215.skt.nonsubscriber_ga_data`
"""

nonsubs_data = (
    bqclient.query(query_string)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

print(nonsubs_data.shape)

--- 20.239500284194946 seconds ---
(3428529, 21)


In [42]:
print("Unique unlimited fullvids: ", len(nonsubs_data.GA_fullVisitorId.unique()), "\n")
print("Duplicates?: ", nonsubs_data.duplicated().any())
nonsubs_data.head()

Unique unlimited fullvids:  98932 

Duplicates?:  False


Unnamed: 0,ga_pianoId,GA_fullVisitorId,GA_visitStartTime,GA_date,GA_pagePath,GA_pageViews,GA_scrollDepth,timeOnPage,GA_visitNumber,GA_cmsNaturalId,...,GA_dfpNewZone,GA_deviceOperatingSystem,GA_deviceCategory,GA_deviceBrowser,GA_country,GA_referralGroup,GA_primaryChannel,GA_primarySection,tier1,tier2
0,,1899443648975358559,1612828555,2021-02-08,/sites/maggiemcgrath/2021/02/08/jo-ann-jenkins...,1,,2.0,296,blogandpostid/blog/post/2799-601e122d8abb41000...,...,article/masthead/default/standard,windows,desktop,internet explorer,united states,organic social (dark),leadership,forbeswomen,Business and Finance,Business
1,,5387120297788574187,1614600175,2021-03-01,/sites/maggiemcgrath/2021/02/22/marian-wright-...,1,,556.0,38,blogandpostid/blog/post/2799-602ee37e68def0000...,...,article/masthead/default/standard,windows,desktop,chrome,austria,organic search,leadership,forbeswomen,News and Politics,Law
2,,11177974464118241085,1612089161,2021-01-31,/sites/maggiemcgrath/2021/01/27/an-exclusive-l...,1,1.0,63.0,32,blogandpostid/blog/post/2799-6010155c2ee113000...,...,article/masthead/default/standard,android,mobile,chrome,kenya,organic search,leadership,forbeswomen,Books and Literature,
3,,5387120297788574187,1614622084,2021-03-01,/sites/maggiemcgrath/2021/02/22/marian-wright-...,1,,4.0,43,blogandpostid/blog/post/2799-602ee37e68def0000...,...,article/masthead/default/standard,windows,desktop,chrome,austria,organic search,leadership,forbeswomen,News and Politics,Law
4,,5935135056489733807,1612238510,2021-02-01,/sites/maggiemcgrath/2021/02/01/political-firs...,1,,28.0,181,blogandpostid/blog/post/2799-60143fafa76879000...,...,article/masthead/default/standard,windows,desktop,internet explorer,united states,organic social (dark),leadership,forbeswomen,News and Politics,Politics


In [43]:
nonsubs_data.isna().sum()

ga_pianoId                  3428529
GA_fullVisitorId                  0
GA_visitStartTime                 0
GA_date                           0
GA_pagePath                       0
GA_pageViews                      0
GA_scrollDepth               875931
timeOnPage                   111646
GA_visitNumber                    0
GA_cmsNaturalId                 176
title                       1673649
GA_dfpNewZone                626703
GA_deviceOperatingSystem          0
GA_deviceCategory                 0
GA_deviceBrowser                  0
GA_country                        0
GA_referralGroup                  0
GA_primaryChannel                 0
GA_primarySection                 0
tier1                       2016619
tier2                       2120526
dtype: int64

In [44]:
print(nonsubs_data.GA_date.min())
print(nonsubs_data.GA_date.max())

2018-01-01
2021-10-31


In [None]:
nonsubs_data.GA_date = pd.to_datetime(nonsubs_data.GA_date)
nonsubs_data["mon_year"] = nonsubs_data["GA_date"].dt.to_period('M')
fvid_monyear = pd.DataFrame(nonsubs_data.groupby(['GA_fullVisitorId', 'mon_year']).GA_pageViews.sum()).reset_index()
fvid_monyear.sort_values('mon_year')

In [None]:
pd.DataFrame(fvid_monyear.groupby('GA_fullVisitorId').GA_pageViews.mean()).reset_index().GA_pageViews.describe()

In [None]:
test = nonsubs_data_new.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum'}).reset_index()
test

In [None]:
test = test.groupby(['GA_fullVisitorId']).agg({'GA_pageViews': ['mean', 'median']}).reset_index()

# rename cols
test.columns = [' '.join(col).strip() for col in test.columns.values]
test.rename(columns={'GA_pageViews mean':'pageviews_mean', 'GA_pageViews median': 'pageviews_median'}, inplace=True)
test

In [None]:
test.describe()

In [None]:
s_test = subs_data.groupby(['GA_fullVisitorId', 'GA_visitStartTime']).agg({'GA_pageViews': 'sum'}).reset_index()
s_test

In [None]:
s_test = s_test.groupby(['GA_fullVisitorId']).agg({'GA_pageViews': ['mean', 'median']}).reset_index()

# rename cols
s_test.columns = [' '.join(col).strip() for col in s_test.columns.values]
s_test.rename(columns={'GA_pageViews mean':'pageviews_mean', 'GA_pageViews median': 'pageviews_median'}, inplace=True)
s_test

In [None]:
s_test.describe()

* Joining with predicted C-levels for curiosity

In [62]:
start_time = time.time()

sql = """
 WITH ga_data AS (
        SELECT
            ga_pianoId,
            ga_fullvisitorid,
            SUM(ga_pageviews) AS pageviews
        FROM
            `api-project-901373404215.DataMart.DataMart6`
        WHERE 
            ga_date >= '2021-01-01'
        GROUP BY 
            ga_pianoId,
            ga_fullvisitorid
    ), 
    
    clevels AS (SELECT 
        *
      FROM (
        SELECT 
            DISTINCT *,
            RANK() OVER (PARTITION BY GA_fullVisitorId ORDER BY date DESC) AS mostrecent,
        FROM
            `api-project-901373404215.lookalike.zoom_info_c_level`
          )
      WHERE 
          mostrecent = 1)

    SELECT
        ga_pianoId,
        user_id_uid,
        g.ga_fullvisitorid,
        resource_id_rid,
        pageviews,
        start_date,
        status,
        subscription_trial_end_date,
        c.date,
        managementLevel
        
        # filter for active only
        --AND status='active'
        --AND total__refunded<1
        --AND cast(dt_updated as date)=current_date('America/New_York')
    FROM
        `api-project-901373404215.piano.subscriber_details`
    INNER JOIN 
        ga_data g
    ON 
        LOWER(ga_pianoId) = LOWER(user_id_uid)
    LEFT JOIN
        clevels c
    ON 
        g.ga_fullvisitorid = c.GA_fullVisitorId
    WHERE 
        # Filter for the 'universal' subscriptions only
        resource_id_rid IN UNNEST(['RKPEVDB', 'R8W03AS'])

"""

raw = (
    bqclient.query(sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorageclient)
) 

print("--- %s seconds ---" % (time.time() - start_time))

--- 60.337597370147705 seconds ---


In [27]:
raw

Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,pageviews,start_date,status,subscription_trial_end_date,date,managementLevel
0,pnive7xkiqvxjjw,PNIVE7xkiqvxjjw,481375460805594273,R8W03AS,15,2021-07-08 10:15:08 -0400,active,,2021-07-10,Non-Clevel
1,pnive7xkiqvxjjw,PNIVE7xkiqvxjjw,481375460805594273,R8W03AS,15,2021-07-08 10:15:08 -0400,active,,2021-07-10,Non-Clevel
2,pnive7xkiqvxjjw,PNIVE7xkiqvxjjw,481375460805594273,R8W03AS,15,2021-07-08 10:15:08 -0400,active,,2021-07-10,Non-Clevel
3,pnive7xkiqvxjjw,PNIVE7xkiqvxjjw,481375460805594273,R8W03AS,15,2021-07-08 10:15:08 -0400,active,,2021-07-10,Non-Clevel
4,pnive7xkiqvxjjw,PNIVE7xkiqvxjjw,481375460805594273,R8W03AS,15,2021-07-08 10:15:08 -0400,active,,2021-07-10,Non-Clevel
...,...,...,...,...,...,...,...,...,...,...
20348856,pnij7vcbpqnpfr1,PNIJ7vcbpqnpfr1,5849173886853270318,R8W03AS,2,2021-01-29 11:43:52 -0500,active,,2021-10-03,C-level
20348857,pnij7vcbpqnpfr1,PNIJ7vcbpqnpfr1,5849173886853270318,R8W03AS,2,2021-01-29 11:43:52 -0500,active,,2021-10-03,C-level
20348858,pniyokkkqqjquy1,PNIyokkkqqjquy1,7057134733610564932,R8W03AS,119,2020-11-13 11:56:46 -0500,active,,2021-10-26,Non-Clevel
20348859,pniyokkkqqjquy1,PNIyokkkqqjquy1,7057134733610564932,R8W03AS,119,2020-11-13 11:56:46 -0500,active,,2021-10-26,Non-Clevel


In [63]:
raw

Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,pageviews,start_date,status,subscription_trial_end_date,date,managementLevel
0,pnibxswd0qkzmbo,PNIBXsWD0qkzmbo,6751203650466200715,R8W03AS,34,2020-12-07 16:00:45 -0500,active,,,
1,pnibxswd0qkzmbo,PNIBXsWD0qkzmbo,6751203650466200715,R8W03AS,34,2020-12-07 16:00:45 -0500,active,,,
2,pnibxswd0qkzmbo,PNIBXsWD0qkzmbo,6751203650466200715,R8W03AS,34,2020-12-07 16:00:45 -0500,active,,,
3,pnibxswd0qkzmbo,PNIBXsWD0qkzmbo,6751203650466200715,R8W03AS,34,2020-12-07 16:00:45 -0500,active,,,
4,pnibxswd0qkzmbo,PNIBXsWD0qkzmbo,6751203650466200715,R8W03AS,34,2020-12-07 16:00:45 -0500,active,,,
...,...,...,...,...,...,...,...,...,...,...
20348856,pnizvgd4oqoycyt,PNIZVGD4oqoycyt,5537116288231965113,R8W03AS,5,2021-02-22 17:55:08 -0500,active,,,
20348857,pnizvgd4oqoycyt,PNIZVGD4oqoycyt,5537116288231965113,R8W03AS,5,2021-02-22 17:55:08 -0500,active,,,
20348858,pnizvgd4oqoycyt,PNIZVGD4oqoycyt,5537116288231965113,R8W03AS,5,2021-02-22 17:55:08 -0500,active,,,
20348859,pnizvgd4oqoycyt,PNIZVGD4oqoycyt,5537116288231965113,R8W03AS,5,2021-02-22 17:55:08 -0500,active,,,


In [65]:
raw.isna().sum()

ga_pianoId                            0
user_id_uid                           0
ga_fullvisitorid                      0
resource_id_rid                       0
pageviews                             0
start_date                            0
status                                0
subscription_trial_end_date    18373788
date                           11328661
managementLevel                11328661
dtype: int64

In [66]:
raw.managementLevel

0           None
1           None
2           None
3           None
4           None
            ... 
20348856    None
20348857    None
20348858    None
20348859    None
20348860    None
Name: managementLevel, Length: 20348861, dtype: object

In [31]:
raw[raw.managementLevel.isna()]

Unnamed: 0,ga_pianoId,user_id_uid,ga_fullvisitorid,resource_id_rid,pageviews,start_date,status,subscription_trial_end_date,date,managementLevel
962,pnidymf2rqn8uy4,PNIdymf2Rqn8uy4,711807422658097078,R8W03AS,2,2021-01-20 12:52:35 -0500,active,,,
963,pnidymf2rqn8uy4,PNIdymf2Rqn8uy4,711807422658097078,R8W03AS,2,2021-01-20 12:52:35 -0500,active,,,
964,pnidymf2rqn8uy4,PNIdymf2Rqn8uy4,711807422658097078,R8W03AS,2,2021-01-20 12:52:35 -0500,active,,,
965,pnidymf2rqn8uy4,PNIdymf2Rqn8uy4,711807422658097078,R8W03AS,2,2021-01-20 12:52:35 -0500,active,,,
966,pnidymf2rqn8uy4,PNIdymf2Rqn8uy4,711807422658097078,R8W03AS,2,2021-01-20 12:52:35 -0500,active,,,
...,...,...,...,...,...,...,...,...,...,...
20347997,pnih8oll9qktzlt,PNIH8Oll9qktzlt,1241197781528886256,R8W03AS,1,2020-12-04 15:01:41 -0500,active,,,
20347998,pnih8oll9qktzlt,PNIH8Oll9qktzlt,1241197781528886256,R8W03AS,1,2020-12-04 15:01:41 -0500,active,,,
20347999,pnih8oll9qktzlt,PNIH8Oll9qktzlt,1241197781528886256,R8W03AS,1,2020-12-04 15:01:41 -0500,active,,,
20348000,pnih8oll9qktzlt,PNIH8Oll9qktzlt,1241197781528886256,R8W03AS,1,2020-12-04 15:01:41 -0500,active,,,


In [55]:
len(raw.ga_fullvisitorid.unique())

91774

In [58]:
raw[raw.managementLevel.isna()].drop_duplicates('ga_fullvisitorid', keep='first').shape

(4148, 10)

In [57]:
cs = pd.DataFrame(raw[raw.managementLevel.notna()][['ga_fullvisitorid', 'managementLevel']].groupby('ga_fullvisitorid').managementLevel.max()).reset_index()
cs

Unnamed: 0,ga_fullvisitorid,managementLevel
0,1000096511026234346,C-level
1,1000104336364784244,C-level
2,1000148243863006938,Non-Clevel
3,1000234620854354107,C-level
4,10002649971411353587,C-level
...,...,...
87621,999554412387753230,Non-Clevel
87622,9996488767201202921,Non-Clevel
87623,999745586422280804,Non-Clevel
87624,999931751793163249,C-level


In [61]:
cs.managementLevel.value_counts()

Non-Clevel    61254
C-level       26372
Name: managementLevel, dtype: int64

* Curiosity: Is there change in behavior in terms of count(distinct articles per month) before and after person subscribed?

In [None]:
subs_data.GA_date = pd.to_datetime(subs_data.GA_date)
subs_data["mon_year"] = subs_data["GA_date"].dt.to_period('M')

s_fvid_monyear = pd.DataFrame(subs_data.groupby(['GA_fullVisitorId', 'mon_year']).GA_pageViews.sum()).reset_index()
s_fvid_monyear.sort_values('mon_year')

In [None]:
subs_data.GA_date = pd.to_datetime(subs_data.GA_date)
subs_data["mon_year"] = subs_data["GA_date"].dt.to_period('M')
subs_data.GA_cmsNaturalId = subs_data.GA_cmsNaturalId.fillna('None') 

In [None]:
s_fvid_monyear = pd.DataFrame(subs_data[subs_data['GA_cmsNaturalId'].str.contains("blogandpostid/blog/post/")].groupby(['GA_fullVisitorId', 
                                                                                                                        'mon_year']).GA_cmsNaturalId.nunique()).reset_index().rename(columns = {'GA_cmsNaturalId': 'num_distinct_articles'})

In [None]:
s_fvid_monyear

In [None]:
# this person subscribed on 2020-12-23 :: after subscription # articles much different than before
# not separating before and after subscription behavior will obscure the average, median statistic for pv counts. 

s_fvid_monyear[s_fvid_monyear.GA_fullVisitorId == '1000104336364784244'].sort_values('mon_year')

In [None]:
# this person subscribed on 2020-11-10 :: after subscription # articles much different than before
# here not separating before and after subscription behavior works in our favor 

s_fvid_monyear[s_fvid_monyear.GA_fullVisitorId == '4600017676050042510'].sort_values('mon_year')

In [None]:
# QUERIES TO DEMONSTRATE

SELECT * FROM `api-project-901373404215.skt.raw_subscribers` where ga_pianoId ='pniwyksrkqj4fwr'



        SELECT
        distinct
            ga_pianoId,
            ga_fullvisitorid
        FROM
            `api-project-901373404215.DataMart.DataMart6`
        where GA_pianoID = 'pniwyksrkqj4fwr'
  
            
--     SELECT
--         ga_pianoId, GA_date,
--         ga_fullvisitorid,
--         SUM(ga_pageviews) AS pageviews
--     FROM
--         `api-project-901373404215.DataMart.DataMart6`
--     WHERE 
--         ga_pianoId = 'pniwyksrkqj4fwr'
--     GROUP BY 
--         ga_pianoId,
--         ga_fullvisitorid,
--         GA_date
-- order by ga_pianoId, ga_fullvisitorid, GA_date