## Experiment with different thresholds (eg.45,60,75,90) and observe the effects
- Accurately join f_aa_hits_detail (for post_evar columns) and f_aa_hits_detail_engagement (for time on site seconds and has_engagement test column)



## Imports and Set Up

In [2]:
import boto3
import snowflake.connector
from pathlib import Path
from snowflake.connector.pandas_tools import write_pandas
from sqlalchemy import create_engine
from snowflake.connector.pandas_tools import pd_writer
from snowflake.connector import DictCursor
import pandas as pd
from snowflake.snowpark import functions as fn
from snowflake.snowpark import types as type
from snowflake.snowpark import DataFrame as df
import numpy as np
from snowflake.snowpark import Session
from snowflake.snowpark.types import StringType, BooleanType
from snowflake.snowpark.functions import udf
from snowflake.snowpark.functions import col

In [3]:
def snowflake_sesion(connection_parameters) -> Session:
    return Session.builder.configs(connection_parameters).create()

connection_parameters = {
    "user" : "sophie.jones@contractor.itech.media",
    "account" : "gs46004.eu-west-1",
    "role" : "PRD_ANALYST",
    "warehouse" : "PRD_WH",
    "database" : "PRD_DWH",
    "schema":"SANDBOX",
    "authenticator" : "externalbrowser"
}

main_session = snowflake_sesion(connection_parameters)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://netman.okta.com/app/snowflake/exk5idnkinyNtZ20m417/sso/saml?SAMLRequest=lZJBc9owEIX%2Fikc925IMpKDBZGiYtLQkJcFk2txUeyEabMnVyhjy6ytM3EkPyUxvGum93U%2F7dnx5KItgDxaV0QnhESMB6MzkSm8Tsk6vwyEJ0Emdy8JoSMgRkFxOxijLohLT2j3pe%2FhdA7rAF9Io2oeE1FYLI1Gh0LIEFC4Tq%2BnNQsQRE5U1zmSmIK8s7zskIljnCTtLjsrjPTlXCUqbpomaXmTslsaMMcpG1KtOkg%2Bd%2FuD%2F9IaeU9Y%2F6b3Cy5cvbJ%2BUPo%2FgPaxfZxGKL2m6DJffVykJph3qldFYl2BXYPcqg%2FX94gyAnmCL%2FQvG%2BhHUYeNHF%2FIItWk2hdxBZsqqdr5q5E90AzktzFb5j89nCal2Kp8vDikblp8Xy7uN%2FvptLZ%2F3P7f5cJDaO20M7%2F24WT0bbh%2FUrJ%2BR4KFLNj4lO0esYa5PeTp%2FxeJeyAYhH6Z8IDgT8UU04qNHEsw8lNLStc4OWoMrpY7MzsmWTVYV%2FYtN4bAbqFzvlD7euseYlX3%2BkSIaekqXnBdGtP3t5H%2FHMKav3S%2B7d%2BvjmM%2BWplDZMbg2tpTu7bR4xNsblYebViqglKqY5rkFRJ9aUZjmyoJ0fsWdrYHQybnrv0s%2B%2BQM

## Read in all Dataframes

In [4]:
f_aa_hits_detail_engagement = main_session.table('dimensional.f_aa_hits_detail_engagement')
f_aa_hits_detail_engagement.columns

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'SITE_URL',
 'VISIT_DATE_TIME_UTC',
 'VISITOR_ID',
 'VISIT_NUM',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_USER_LANGUAGE_SK',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'VISIT_START_PAGE_URL',
 'VISIT_START_D_PAGE_HIST_SK',
 'HIT_TIME_UTC',
 'PAGE_URL',
 'D_PAGE_HIST_SK',
 'VISIT_PAGE_NUM',
 'IS_LANDING_PAGE',
 'HAS_CLICKOUT',
 'HAS_ENGAGEMENT',
 'OUTBOUND_LINK',
 'PAGE_VERTICAL',
 'CHANNEL_NAME',
 'TRANSACTION_ID']

#^ has engagement and has clickout on a hit level

In [5]:
f_aa_hits_detail = main_session.table('dimensional.f_aa_hits_detail')
f_aa_hits_detail.columns

['VISIT_DATE_UTC',
 'VISIT_DATE_TIME_UTC',
 'FIRST_HIT_TIME_UTC',
 'LAST_HIT_TIME_UTC',
 'HIT_TIME_UTC',
 'POST_CUST_HIT_TIME_UTC',
 'REPORT_SUITE_DATE_TIME',
 'D_SITE_HIST_SK',
 'D_BROWSER_SK',
 'C_COLOR',
 'COLOR_DEPTH',
 'CONNECTION_TYPE',
 'D_USER_LANGUAGE_SK',
 'DAILY_VISITOR',
 'DOMAIN',
 'EVAR_PAGE_URL',
 'EVAR_PAGE_TAG_CATEGORIES',
 'EVAR_PAGE_TAG_VERTICALS',
 'EVAR_PAGE_TAG_COUNTRY_SK',
 'EVAR_PAGE_LANGUAGE_SK',
 'EVAR_PAGE_BREAD_CRUMBS',
 'EVAR_PAGE_ALTERNATIVE_URL',
 'EVAR_EXPERIMENT_CAMPAIGN_ID',
 'EVAR_EXPERIMENT_USER_ID',
 'EVAR_ARTICLE_AUTHOR',
 'EVAR_ARTICLE_PUBLICATION_DATE',
 'EVAR_ARTICLE_MODIFICATION_DATE',
 'EVAR_ARTICLE_TITLE',
 'EVAR_ARTICLE_CATEGORIES',
 'EVAR_ARTICLE_COMMENTS_NUMBER',
 'EVAR_LINK_LOCATION',
 'EVAR_LINK_URL',
 'EVAR_LINK_TYPE',
 'EVAR_IPHONE_SCREEN_SIZE',
 'EVAR_USER_AGENT',
 'EVAR_MARKETING_CLOUD_ID',
 'EVAR_TRANSACTION_ID',
 'EVAR_USER_PREVIOUS_PAGE',
 'EVAR_USER_LANGUAGE_SK',
 'EVAR_ARCADE_CLIENT_ID',
 'EVAR_ARCADE_GAME_ID',
 'EVAR_ARCADE_GEO

In [6]:
d_site_hist = main_session.table('dimensional.d_site_hist')
d_site_hist.columns

['D_SITE_HIST_SK',
 'SITE_URL',
 'SITE_NAME',
 'PILLAR',
 'ETL_LOAD_TIME',
 'ETL_DAG_ID',
 'ETL_TASK_ID',
 'ETL_LOGICAL_DATE_TIME',
 'SITE_NUMBER']

## Join Site Data from d_site_hist

In [7]:
f_aa_hits_detail = f_aa_hits_detail.select(['VISIT_DATE_UTC','HIT_TIME_UTC','EVAR_PAGE_URL','EVAR_PAGE_TAG_COUNTRY_SK','D_SITE_HIST_SK','post_evar_link_url','post_evar_link_type','post_prop_time_on_page'])
d_site_hist = d_site_hist.select(['D_SITE_HIST_SK','SITE_URL','SITE_NAME'])
#f_aa_hits_detail.count()

In [8]:
hits_detail = f_aa_hits_detail.join(d_site_hist, on ='D_SITE_HIST_SK', how='left')
#f_aa_hits_detail.count()

In [9]:
#filter unmatched rows
unmatched_rows = hits_detail.filter(hits_detail["SITE_NAME"].isNull())

# count the number of unmatched rows
num_unmatched_rows = unmatched_rows.count()

# print the result
print(f"There were {num_unmatched_rows} rows that didn't match.")

There were 0 rows that didn't match.


In [10]:
hits_detail.columns

['D_SITE_HIST_SK',
 'VISIT_DATE_UTC',
 'HIT_TIME_UTC',
 'EVAR_PAGE_URL',
 'EVAR_PAGE_TAG_COUNTRY_SK',
 'POST_EVAR_LINK_URL',
 'POST_EVAR_LINK_TYPE',
 'POST_PROP_TIME_ON_PAGE',
 'SITE_URL',
 'SITE_NAME']

## Has clickout experimentation

Create has clickout column from

CASE s.site_url
        WHEN 'covers.com'
          THEN
            IFF(CONTAINS(LOWER(h.post_evar_link_url), 'sportsbookredirect') OR CONTAINS(LOWER(h.post_evar_link_url), 'metabet'), 1, 0)
        ELSE
          IFF(h.post_evar_link_type = 'Partner', 1, 0)
      END AS has_clickout

and test with column in f_aa_hits_detail_engagement

In [13]:
@udf(name='has_clickout', input_types=[StringType(), StringType(), StringType()], return_type=BooleanType(), is_permanent=False, replace=True, session=main_session)

def has_clickout_function(url,post_evar_link_url,post_evar_link_type) -> bool:
    if 'covers.com' in url:
        if ('sportsbookredirect' in post_evar_link_url) | ('metabet' in post_evar_link_url):
            return True
        elif post_evar_link_type == 'Partner':
            return True
        else:
            return False
    elif post_evar_link_type == 'Partner':
        return True
    else:
        return False

f_aa_hits_detail = f_aa_hits_detail.withColumn('has_clickout_test', has_clickout_function(col('SITE_URL'), col('post_evar_link_url'),col('post_evar_link_type')))
f_aa_hits_detail.columns

['D_SITE_HIST_SK',
 'VISIT_DATE_UTC',
 'EVAR_PAGE_URL',
 'EVAR_PAGE_TAG_COUNTRY_SK',
 'POST_EVAR_LINK_URL',
 'POST_EVAR_LINK_TYPE',
 'POST_PROP_TIME_ON_PAGE',
 'SITE_URL',
 'SITE_NAME',
 'HAS_CLICKOUT_TEST']

In [14]:
f_aa_hits_detail.filter((col('HAS_CLICKOUT_TEST') == True) & (col('SITE_URL')=='covers.com')).show(5)

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"D_SITE_HIST_SK"  |"VISIT_DATE_UTC"  |"EVAR_PAGE_URL"                                 |"EVAR_PAGE_TAG_COUNTRY_SK"  |"POST_EVAR_LINK_URL"                                |"POST_EVAR_LINK_TYPE"  |"POST_PROP_TIME_ON_PAGE"  |"SITE_URL"  |"SITE_NAME"  |"HAS_CLICKOUT_TEST"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|33                |2021-03-15        |contests.covers.com/consensus/topconsensus/nba  |-1                          |https://www.covers.com/betting/sportsbookredire...  |External         

In [15]:
f_aa_hits_detail_engagement.columns

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'SITE_URL',
 'VISIT_DATE_TIME_UTC',
 'VISITOR_ID',
 'VISIT_NUM',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_USER_LANGUAGE_SK',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'VISIT_START_PAGE_URL',
 'VISIT_START_D_PAGE_HIST_SK',
 'HIT_TIME_UTC',
 'PAGE_URL',
 'D_PAGE_HIST_SK',
 'VISIT_PAGE_NUM',
 'IS_LANDING_PAGE',
 'HAS_CLICKOUT',
 'HAS_ENGAGEMENT',
 'OUTBOUND_LINK',
 'PAGE_VERTICAL',
 'CHANNEL_NAME',
 'TRANSACTION_ID']

In [16]:
f_aa_hits_detail.count()

9246003616

In [17]:
f_aa_hits_detail.columns

['D_SITE_HIST_SK',
 'VISIT_DATE_UTC',
 'EVAR_PAGE_URL',
 'EVAR_PAGE_TAG_COUNTRY_SK',
 'POST_EVAR_LINK_URL',
 'POST_EVAR_LINK_TYPE',
 'POST_PROP_TIME_ON_PAGE',
 'SITE_URL',
 'SITE_NAME',
 'HAS_CLICKOUT_TEST']

In [17]:
f_aa_hits_detail = f_aa_hits_detail.with_column_renamed(col("EVAR_PAGE_TAG_COUNTRY_SK"), "D_COUNTRY_HIST_SK")

In [18]:
f_aa_hits_detail = f_aa_hits_detail.with_column_renamed(col("EVAR_PAGE_URL"), "PAGE_URL")

In [19]:
f_aa_hits_detail.count()

KeyboardInterrupt: 

In [20]:
test_df = f_aa_hits_detail.select('PAGE_URL','SITE_URL','VISIT_DATE_UTC',"D_COUNTRY_HIST_SK","post_evar_link_url","post_evar_link_type",'POST_PROP_TIME_ON_PAGE').join(
    f_aa_hits_detail_engagement.select('PAGE_URL','SITE_URL','VISIT_DATE_UTC','D_COUNTRY_HIST_SK','HAS_CLICKOUT','HAS_ENGAGEMENT'), 
    on = ['PAGE_URL','VISIT_DATE_UTC','D_COUNTRY_HIST_SK'] , 
    how='inner')
test_df.count()

In [27]:
clickout_test.columns

['PAGE_URL',
 'VISIT_DATE_UTC',
 'D_COUNTRY_HIST_SK',
 '"l_pgpq_SITE_URL"',
 'HAS_CLICKOUT_TEST',
 'POST_EVAR_LINK_URL',
 'POST_EVAR_LINK_TYPE',
 '"r_rnk7_SITE_URL"',
 'HAS_CLICKOUT',
 'HAS_ENGAGEMENT']

In [28]:
clickout_test.select("PAGE_URL","HAS_CLICKOUT_TEST","HAS_CLICKOUT","post_evar_link_url","post_evar_link_type").show(25)

---------------------------------------------------------------------------------------------------------------------
|"PAGE_URL"                   |"HAS_CLICKOUT_TEST"  |"HAS_CLICKOUT"  |"POST_EVAR_LINK_URL"  |"POST_EVAR_LINK_TYPE"  |
---------------------------------------------------------------------------------------------------------------------
|www.casinospiele.de/spiele/  |False                |0               |[Missing]             |[Missing]              |
|www.casinospiele.de/spiele/  |False                |0               |[Missing]             |[Missing]              |
|www.casinospiele.de/spiele/  |False                |0               |[Missing]             |[Missing]              |
|www.casinospiele.de/spiele/  |False                |0               |[Missing]             |[Missing]              |
|www.casinospiele.de/spiele/  |False                |0               |[Missing]             |[Missing]              |
|www.casinospiele.de/spiele/  |False                |0  

In [34]:
clickout_test.filter(col("HAS_CLICKOUT")== 1).select("PAGE_URL",'VISIT_DATE_UTC','D_COUNTRY_HIST_SK',"HAS_CLICKOUT_TEST","HAS_CLICKOUT","post_evar_link_url","post_evar_link_type").show(25)

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"PAGE_URL"                            |"VISIT_DATE_UTC"  |"D_COUNTRY_HIST_SK"  |"HAS_CLICKOUT_TEST"  |"HAS_CLICKOUT"  |"POST_EVAR_LINK_URL"  |"POST_EVAR_LINK_TYPE"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|www.onlineroulette.org/de/paypal.php  |2021-01-09        |83                   |False                |1               |[Missing]             |[Missing]              |
|www.onlineroulette.org/de/paypal.php  |2021-01-09        |83                   |False                |1               |[Missing]             |[Missing]              |
|www.onlineroulette.org/de/paypal.php  |2021-01-09        |83                   |False                |1               |[Missing]             |[Missing]        

In [35]:
f_aa_hits_detail_engagement.columns

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'SITE_URL',
 'VISIT_DATE_TIME_UTC',
 'VISITOR_ID',
 'VISIT_NUM',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_USER_LANGUAGE_SK',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'VISIT_START_PAGE_URL',
 'VISIT_START_D_PAGE_HIST_SK',
 'HIT_TIME_UTC',
 'PAGE_URL',
 'D_PAGE_HIST_SK',
 'VISIT_PAGE_NUM',
 'IS_LANDING_PAGE',
 'HAS_CLICKOUT',
 'HAS_ENGAGEMENT',
 'OUTBOUND_LINK',
 'PAGE_VERTICAL',
 'CHANNEL_NAME',
 'TRANSACTION_ID']

In [41]:
f_aa_hits_detail_engagement.filter((col('PAGE_URL')=='www.onlineroulette.org/de/paypal.php') & (col('VISIT_DATE_UTC')=='2021-01-09') & (col("D_COUNTRY_HIST_SK")==83) & (col('Has_CLICKOUT')==1)).select(['VISIT_DATE_UTC','D_SITE_HIST_SK','D_COUNTRY_HIST_SK','SITE_URL','VISIT_ID', 'HIT_TIME_UTC','HAS_CLICKOUT','OUTBOUND_LINK',]).show(25)

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"VISIT_DATE_UTC"  |"D_SITE_HIST_SK"  |"D_COUNTRY_HIST_SK"  |"SITE_URL"          |"VISIT_ID"                                          |"HIT_TIME_UTC"       |"HAS_CLICKOUT"  |"OUTBOUND_LINK"     |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2021-01-09        |7                 |83                   |onlineroulette.org  |4715286718389962562_3552708344388170755_1_16101...  |2021-01-09 10:22:32  |1               |/go/betway.php      |
|2021-01-09        |7                 |83                   |onlineroulette.org  |4715286718389962562_3552708344388170755_1_16101...  |2021-01-09 10:22:24  |1               |/go/betway.php      |
|2021-01-09        |

Test if has clickout is correct

f_aa_hits_detail_engagement has clickout and has engagement logic is throughout this notebook and is created from columns from f_aa_hits_detial

## Recreate Engagement

Create Engagement based on has_clickout - go back and troubleshoot above later

Create has engagement test Column from the following logic

CASE
        WHEN has_clickout OR h.post_evar_link_type = 'Internal' OR NVL(TRY_TO_NUMBER(REPLACE(h.post_prop_time_on_page, '+', '')), 0) >= 30
          THEN 1
        ELSE 0
      END AS has_engagement

In [None]:
CASE
        WHEN has_clickout OR h.post_evar_link_type = 'Internal' OR NVL(TRY_TO_NUMBER(REPLACE(h.post_prop_time_on_page, '+', '')), 0) >= 30
          THEN 1
        ELSE 0
      END AS has_engagement

In [11]:
hits_detail.columns

['D_SITE_HIST_SK',
 'VISIT_DATE_UTC',
 'HIT_TIME_UTC',
 'EVAR_PAGE_URL',
 'EVAR_PAGE_TAG_COUNTRY_SK',
 'POST_EVAR_LINK_URL',
 'POST_EVAR_LINK_TYPE',
 'POST_PROP_TIME_ON_PAGE',
 'SITE_URL',
 'SITE_NAME']

In [115]:
hits_detail.select('VISIT_DATE_UTC').count()

9298477464

In [116]:
hits_detail_sample = hits_detail.sample(0.0001)
hits_detail_sample.select('VISIT_DATE_UTC').count()

928969

In [117]:
hits_detail_sample.group_by(['HIT_TIME_UTC','EVAR_PAGE_URL','EVAR_PAGE_TAG_COUNTRY_SK']).agg({'VISIT_DATE_UTC':'count'}).count()

928420

In [118]:
f_aa_hits_detail_engagement_sample = f_aa_hits_detail_engagement.sample(0.0001)
print(f_aa_hits_detail_engagement_sample.select('VISIT_DATE_UTC').count())
print(f_aa_hits_detail_engagement_sample.group_by(['HIT_TIME_UTC','PAGE_URL','D_COUNTRY_HIST_SK']).agg({'VISIT_DATE_UTC':'count'}).count())

916723
916566


In [104]:
f_aa_hits_detail_engagement.columns

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'SITE_URL',
 'VISIT_DATE_TIME_UTC',
 'VISITOR_ID',
 'VISIT_NUM',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_USER_LANGUAGE_SK',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'VISIT_START_PAGE_URL',
 'VISIT_START_D_PAGE_HIST_SK',
 'HIT_TIME_UTC',
 'PAGE_URL',
 'D_PAGE_HIST_SK',
 'VISIT_PAGE_NUM',
 'IS_LANDING_PAGE',
 'HAS_CLICKOUT',
 'HAS_ENGAGEMENT',
 'OUTBOUND_LINK',
 'PAGE_VERTICAL',
 'CHANNEL_NAME',
 'TRANSACTION_ID']

In [12]:
hits_detail = hits_detail.with_column_renamed(col('EVAR_PAGE_URL'),'PAGE_URL')
hits_detail = hits_detail.with_column_renamed(col('EVAR_PAGE_TAG_COUNTRY_SK'),'D_COUNTRY_HIST_SK')

In [13]:
hits_detail_engagement = hits_detail.join(f_aa_hits_detail_engagement.select(['HIT_TIME_UTC','PAGE_URL','D_COUNTRY_HIST_SK','HAS_CLICKOUT','HAS_ENGAGEMENT']), on = ['HIT_TIME_UTC','PAGE_URL','D_COUNTRY_HIST_SK'], how = 'left')

In [121]:
hits_detail_engagement.select('VISIT_DATE_UTC').count()

9300799128

increased by 2%

In [123]:
#filter unmatched rows
unmatched_rows = hits_detail_engagement.filter(hits_detail_engagement["HAS_CLICKOUT"].isNull())

# count the number of unmatched rows
num_unmatched_rows = unmatched_rows.count()

# print the result
print(f"There were {num_unmatched_rows} rows that didn't match.")

KeyboardInterrupt: 

In [124]:
hits_detail_engagement.columns

['HIT_TIME_UTC',
 'PAGE_URL',
 'D_COUNTRY_HIST_SK',
 'D_SITE_HIST_SK',
 'VISIT_DATE_UTC',
 'POST_EVAR_LINK_URL',
 'POST_EVAR_LINK_TYPE',
 'POST_PROP_TIME_ON_PAGE',
 'SITE_URL',
 'SITE_NAME',
 'HAS_CLICKOUT',
 'HAS_ENGAGEMENT']

In [None]:
import re
print(int(re.findall(r'\d+', '120+')[0]))

In [14]:
from snowflake.snowpark.types import StringType, BooleanType, IntegerType
from snowflake.snowpark.functions import udf
from snowflake.snowpark.functions import col
import re

@udf(name='has_engagement', input_types=[BooleanType(), StringType(), StringType()], return_type=IntegerType(), is_permanent=False, replace=True, session=main_session)

def has_engagement_function(has_clickout,post_evar_link_type,post_prop_time_on_page) -> int:
    pptop = 0
    if str(post_prop_time_on_page) == '[Missing]':
        pptop = 0
    else:
        pptop = int(re.findall(r'\d+', str(post_prop_time_on_page))[0])

    if (has_clickout == True) or (post_evar_link_type == 'Internal') or (pptop>=30):
        return 1
    else:
        return 0

hits_detail_engagement_sample = hits_detail_engagement.sample(0.0001)
hits_detail_engagement_sample = hits_detail_engagement_sample.withColumn('has_engagement_test', has_engagement_function(col('HAS_CLICKOUT'),col('post_evar_link_type'),col('post_prop_time_on_page')))
hits_detail_engagement_sample.columns

['HIT_TIME_UTC',
 'PAGE_URL',
 'D_COUNTRY_HIST_SK',
 'D_SITE_HIST_SK',
 'VISIT_DATE_UTC',
 'POST_EVAR_LINK_URL',
 'POST_EVAR_LINK_TYPE',
 'POST_PROP_TIME_ON_PAGE',
 'SITE_URL',
 'SITE_NAME',
 'HAS_CLICKOUT',
 'HAS_ENGAGEMENT',
 'HAS_ENGAGEMENT_TEST']

In [15]:
hits_detail_engagement_sample.filter(col('HAS_ENGAGEMENT') != col('HAS_ENGAGEMENT_TEST')).count()

SnowparkSQLException: (1304): 000630 (57014): Statement reached its statement or warehouse timeout of 3,600 second(s) and was canceled.

In [None]:
hits_detail_engagement_sample.count()

In [127]:
hits_detail_engagement.filter((col("POST_EVAR_LINK_TYPE") != '[Missing]') | (col("POST_PROP_TIME_ON_PAGE")!='[Missing]')).select('PAGE_URL','VISIT_DATE_UTC','D_COUNTRY_HIST_SK','HAS_ENGAGEMENT','HAS_ENGAGEMENT_TEST','HAS_CLICKOUT','post_evar_link_type','post_prop_time_on_page').show(25)

In [None]:
#filter unmatched rows
unmatched_rows = result.filter(result["HAS_CLICKOUT"].isNull())

# count the number of unmatched rows
num_unmatched_rows = unmatched_rows.count()

# print the result
print(f"There were {num_unmatched_rows} rows that didn't match.")

In [None]:
f_aa_hits_detail.select()

In [None]:
could use category from d_page_hist and create and engament rate based on commercial; vs non comericsl

## Changed plan to recreate on visit level 
- as missing tos at hit level - could go back and calculate in the future

In [105]:
f_aa_engagement_by_site = main_session.table('dimensional.f_aa_engagement_by_site')
f_aa_engagement_by_site.columns

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://netman.okta.com/app/snowflake/exk5idnkinyNtZ20m417/sso/saml?SAMLRequest=lVJNT%2BMwFPwrkfec2ElTWKy2qEuEtrsFsrRlJW4meU2tJHbwc5r236%2FTDwQHkPZmPc94xjNvdL2rK28LBqVWYxIGjHigMp1LVYzJannrfyceWqFyUWkFY7IHJNeTEYq6avi0tRv1CK8toPXcQwp5fzEmrVFcC5TIlagBuc34Yno351HAuEAEY50cOVFylE5rY23DKe26LugGgTYFjRhjlF1Rh%2Boh38g7ieZrjcZoqzNdnSk796dPJELK4l7CIZxCeiL%2BkOoYwVcqL0cQ8p%2FLZeqnD4sl8abn391ohW0NZgFmKzNYPc6PBtA5KDC%2BYCwOoPU7F50fBqh0t65ECZmum9a6VwN3omvIaaUL6bKaJWPSlDJPfqVWRzuxTSAp65c%2F6cXm70oscA750%2B%2B7hxiFrET%2Byoq2yIj3dG426pudIbYwU32f1o1YNPDZ0A%2BHy3DAGePxZeBmz8RLnCmphD0wz6YV2FqoQJdWHLyJpqFvtinsyqHMVSnV%2Ft4%2BR6yOw0uKqGnfFjkuDD%2Fom8n%2FxjCi79mn3bt3dcySVFcy23u32tTCft5WGISHicz99QHKoXY5TfPcAKJrrap0d2NAWLfi1rRA6OSo%2BnHJJ%2F8A&RelayState=63546 to auth

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'VISIT_DATE_TIME_UTC',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'D_USER_LANGUAGE_SK',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'TIME_ON_SITE_SECONDS',
 'SITE_VISIT_CONVERSION',
 'SITE_VISIT_UNQIUE_CLICKOUT',
 'SITE_VISIT_TOTAL_CLICKOUT',
 'SITE_VISIT_ENGAGEMENT',
 'SITE_VISIT_BOUNCE',
 'HAS_PAGE_VIEW_POKER',
 'SITE_VISIT_CONVERSION_POKER',
 'SITE_VISIT_UNIQUE_CLICKOUT_POKER',
 'SITE_VISIT_TOTAL_CLICKOUT_POKER',
 'SITE_VISIT_ENGAGEMENT_POKER',
 'SITE_VISIT_BOUNCE_POKER',
 'HAS_PAGE_VIEW_CASINO',
 'SITE_VISIT_CONVERSION_CASINO',
 'SITE_VISIT_UNIQUE_CLICKOUT_CASINO',
 'SITE_VISIT_TOTAL_CLICKOUT_CASINO',
 'SITE_VISIT_ENGAGEMENT_CASINO',
 'SITE_VISIT_BOUNCE_CASINO',
 'HAS_PAGE_VIEW_SPORTS',
 'SITE_VISIT_CONVERSION_SPORTS',
 'SITE_VISIT_UNIQUE_CLICKOUT_SPORTS',
 'SITE_VISIT_TOTAL_CLICKOUT_SPORTS',
 'SITE_VISIT_ENGAGEMENT_SPORTS',
 'SITE_VISIT_BOUNCE_SPORTS',
 'ETL_LOAD_TIME',
 'ETL_DAG_ID',
 'ETL_TAS

In [106]:
d_site_hist = main_session.table('dimensional.d_site_hist')
d_site_hist.columns

['D_SITE_HIST_SK',
 'SITE_URL',
 'SITE_NAME',
 'PILLAR',
 'ETL_LOAD_TIME',
 'ETL_DAG_ID',
 'ETL_TASK_ID',
 'ETL_LOGICAL_DATE_TIME',
 'SITE_NUMBER']

'VISIT_DATE_TIME_UTC',
 'VISIT_ID',
  'D_SITE_HIST_SK',
   'D_COUNTRY_HIST_SK',
 'TIME_ON_SITE_SECONDS',
  'SITE_VISIT_UNQIUE_CLICKOUT',
 'SITE_VISIT_TOTAL_CLICKOUT',
 'SITE_VISIT_ENGAGEMENT',