# Detailed analysis of the "time on site" variable and determine if 30 seconds is the correct threshold to use for identifying engaged users:

Also segmenting by product, device and marketing channel

### Set up required connections and imports

In [2]:
pip install snowflake-snowpark-python --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
#connecting and reading to and from snowflake
import boto3
import snowflake.connector
from pathlib import Path
from snowflake.connector.pandas_tools import write_pandas
from sqlalchemy import create_engine
from snowflake.connector.pandas_tools import pd_writer
from snowflake.connector import DictCursor
import pandas as pd
from snowflake.snowpark import functions as fn
from snowflake.snowpark import types as type
from snowflake.snowpark import DataFrame as df

   
def get_snowflake_credentials():
    """ iTech credentials service for Sagemaker
   
    iTech has a service account for Sagemaker notebooks that has access permission to snowflake.
    We need to use this service account every time we need to build a connection, to obtain the right credentials.
    """
    params = [
        '/sagemaker/snowflake/user_id',
        '/sagemaker/snowflake/password',
        '/sagemaker/snowflake/account_id'
    ]
 
    sm = boto3.client('secretsmanager', "eu-west-1")
 
    param_values = {}
    for param in params:
        param_values[param.replace('/sagemaker/snowflake/','').replace('_id','')] = sm.get_secret_value(SecretId = param)['SecretString']
    return param_values

def create_snowflake_connection(schema='DIMENSIONAL'):
    """
    With the created connection object `conn` you can connect to Snowflake and obtain data with the following way:
 
    ```python
    with conn.cursor() as c:
        df = c.execute("SQL_QUERY_STRING_HERE").fetch_pandas_all()
    ```
    """
    credentials = get_snowflake_credentials()
    return snowflake.connector.connect(
        user=credentials['user'],
        password=credentials['password'],
        account=credentials['account'],
        role="PRD_ANALYST",
        warehouse='PRD_WH',
        database='PRD_DWH',
        schema=schema
    )

def download_from_snowflake(sql_query):
    with create_snowflake_connection().cursor(DictCursor) as cur:
            data = cur.execute(sql_query).fetchall()
            data = pd.DataFrame(data).rename(columns=str.lower)
    return data

def read_sql_query(sql_file):
    return sql_file.read_text()
 

def write_to_table(df, table_name):
    drop_sql = f'DROP TABLE IF EXISTS {table_name.upper()}'
    with create_snowflake_connection(schema='SANDBOX') as conn:
        conn.cursor().execute(drop_sql)
        write_pandas(conn, df, table_name.upper())

  warn_incompatible_dep(
Failed to import ArrowResult. No Apache Arrow result set format can be used. ImportError: DLL load failed while importing arrow_iterator: The specified procedure could not be found.


In [3]:
import numpy as np

def is_outlier(points, thresh=3.5):
    """
    Returns a boolean array with True if points are outliers and False 
    otherwise.

    Parameters:
    -----------
        points : An numobservations by numdimensions array of observations
        thresh : The modified z-score to use as a threshold. Observations with
            a modified z-score (based on the median absolute deviation) greater
            than this value will be classified as outliers.

    Returns:
    --------
        mask : A numobservations-length boolean array.

    References:
    ----------
        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
        Handle Outliers", The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. 
    """
    if len(points.shape) == 1:
        points = points[:,None]
    median = np.median(points, axis=0)
    diff = np.sum((points - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

In [4]:
from snowflake.snowpark import Session

credentials = get_snowflake_credentials()
connection_parameters = {
    "account": str(credentials['account']),
    "user": str(credentials['user']),
    "password" : str(credentials['password']),
    "role": "PRD_ANALYST",
    "warehouse": "PRD_WH",
    "database": "PRD_DWH",
    "schema": "SANDBOX"
}

test_session = Session.builder.configs(connection_parameters).create()
#test_session.sql('select current_date, current_warehouse(), current_database(), current_schema()').collect()
#test_session.close()

NoCredentialsError: Unable to locate credentials

In [6]:
connection_parameters = {
    "user" : "sophie.jones@contractor.itech.media",
    "account" : "gs46004.eu-west-1",
    "role" : "PRD_ANALYST",
    "warehouse" : "PRD_WH",
    "database" : "PRD_DWH",
    "authenticator" : "externalbrowser"
}


test_session = Session.builder.configs(connection_parameters).create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://netman.okta.com/app/snowflake/exk5idnkinyNtZ20m417/sso/saml?SAMLRequest=lZJBb%2BIwEIX%2FSuQ9J3ZCKK0FVCwsKivoogKVlpubDGCS2KnHIdBfvyaUVffQSnuz7PdmPs%2Bb7v2xyL0DGJRa9UgYMOKBSnQq1bZHVsuxf0s8tEKlItcKeuQESO77XRRFXvJBZXfqCV4rQOu5Qgp589AjlVFcC5TIlSgAuU34YjCb8ihgvDTa6kTn5IPla4dABGMd4dWSonR4O2tLTmld10HdCrTZ0ogxRtkddaqz5NtVf3R%2F%2BkQfUhaf9U7h5PN3tu9SXUbwFdbLRYT8Ybmc%2B%2FNfiyXxBlfUoVZYFWAWYA4ygdXT9AKAjmCL8Q1jcQCVX7vR%2BWGAStebXGSQ6KKsrKsauBPdQEpzvZXu45NRj5SZTI%2FxbGzGh%2Bd1lRz3P38vHt7ELtT7znClfuyj9iJ9fWlPxc3pbpYlxHu%2BJhudk50gVjBR5zytu2JRy2exH3WWYcxbjLduA9Zpr4k3clBSCds4r9AKbCFUoDMrGjZRlvQvNoVj1papyqQ6Pdp1xIo47FBETc%2FpksvC8Ka%2F6f%2FvGLr0o%2Ft99x5dHJPRXOcyOXljbQphP08rDMLmRqb%2BppFyKITMB2lqANGllue6HhoQ1q24NRUQ2r90%2FXfJ%2B38A&RelayState=51271 t

### Read in dimensional.f_aa_engagement_by_site as already includes time on site data

In [7]:
f_aa_engagement_by_site = test_session.table('dimensional.f_aa_engagement_by_site')

In [None]:
f_aa_engagement_by_site.columns

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'VISIT_DATE_TIME_UTC',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'D_USER_LANGUAGE_SK',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'TIME_ON_SITE_SECONDS',
 'SITE_VISIT_CONVERSION',
 'SITE_VISIT_UNQIUE_CLICKOUT',
 'SITE_VISIT_TOTAL_CLICKOUT',
 'SITE_VISIT_ENGAGEMENT',
 'SITE_VISIT_BOUNCE',
 'HAS_PAGE_VIEW_POKER',
 'SITE_VISIT_CONVERSION_POKER',
 'SITE_VISIT_UNIQUE_CLICKOUT_POKER',
 'SITE_VISIT_TOTAL_CLICKOUT_POKER',
 'SITE_VISIT_ENGAGEMENT_POKER',
 'SITE_VISIT_BOUNCE_POKER',
 'HAS_PAGE_VIEW_CASINO',
 'SITE_VISIT_CONVERSION_CASINO',
 'SITE_VISIT_UNIQUE_CLICKOUT_CASINO',
 'SITE_VISIT_TOTAL_CLICKOUT_CASINO',
 'SITE_VISIT_ENGAGEMENT_CASINO',
 'SITE_VISIT_BOUNCE_CASINO',
 'HAS_PAGE_VIEW_SPORTS',
 'SITE_VISIT_CONVERSION_SPORTS',
 'SITE_VISIT_UNIQUE_CLICKOUT_SPORTS',
 'SITE_VISIT_TOTAL_CLICKOUT_SPORTS',
 'SITE_VISIT_ENGAGEMENT_SPORTS',
 'SITE_VISIT_BOUNCE_SPORTS',
 'ETL_LOAD_TIME',
 'ETL_DAG_ID',
 'ETL_TAS

In [13]:
from snowflake.snowpark.functions import col, lit, sum as sum_, max as max_
f_aa_engagement_by_site.group_by("time_on_site_seconds").agg((col("*"), "count")).sample(n=500).show(5)

-----------------------------------------------
|"TIME_ON_SITE_SECONDS"  |"COUNT(LITERAL())"  |
-----------------------------------------------
|7                       |1463743             |
|32680                   |5                   |
|9100                    |470                 |
|28383                   |5                   |
|4461                    |3409                |
-----------------------------------------------



## Basic Descriptive Statistics about the time on site column - to understand the spread of the data

In [15]:
# Get basic descriptive statistics
f_aa_engagement_by_site.agg({"time_on_site_seconds": "avg"}).show()
f_aa_engagement_by_site.agg({"time_on_site_seconds": "std"}).show()
f_aa_engagement_by_site.agg({"time_on_site_seconds": "min"}).show()
f_aa_engagement_by_site.agg({"time_on_site_seconds": "max"}).show()
quartiles = f_aa_engagement_by_site.approx_quantile("time_on_site_seconds", [0.25, 0.5, 0.75])
print("Median: {}".format(quartiles[1]))
print("25th percentile: {}".format(quartiles[0]))
print("75th percentile: {}".format(quartiles[2]))


-------------------------------
|"AVG(TIME_ON_SITE_SECONDS)"  |
-------------------------------
|661.095333                   |
-------------------------------

----------------------------------
|"STDDEV(TIME_ON_SITE_SECONDS)"  |
----------------------------------
|1429.2346441707884              |
----------------------------------

-------------------------------
|"MIN(TIME_ON_SITE_SECONDS)"  |
-------------------------------
|0                            |
-------------------------------

-------------------------------
|"MAX(TIME_ON_SITE_SECONDS)"  |
-------------------------------
|43200                        |
-------------------------------

Median: 225.36372891611407
25th percentile: 30.000000000000007
75th percentile: 618.5334082457388


## Basic Descriptive Statistics Segmented

### Device Type

In [16]:
# Get basic descriptive statistics
device_group = f_aa_engagement_by_site.groupBy('device_type')
avg = device_group.agg({"time_on_site_seconds": "avg"})
device_group.agg({"time_on_site_seconds": "avg"}).show()
device_group.agg({"time_on_site_seconds": "std"}).show()
device_group.agg({"time_on_site_seconds": "min"}).show()
device_group.agg({"time_on_site_seconds": "max"}).show()
#print(avg['DEVICE_TYPE'].count())

#quartiles = device_group.approx_quantile("time_on_site_seconds", [0.25, 0.5, 0.75])
#print("Median: {}".format(quartiles[1]))
#print("25th percentile: {}".format(quartiles[0]))
#print("75th percentile: {}".format(quartiles[2]))

distinct_ids = [x.DEVICE_TYPE for x in avg.select('DEVICE_TYPE').distinct().collect()]
for i in distinct_ids:
    print('')
    print(f'{i} quartiles:')
    quartiles = f_aa_engagement_by_site.filter(f_aa_engagement_by_site['device_type'] == i).approx_quantile("time_on_site_seconds", [0.25, 0.5, 0.75])
    print("Median: {}".format(quartiles[1]))
    print("25th percentile: {}".format(quartiles[0]))
    print("75th percentile: {}".format(quartiles[2]))
    

-----------------------------------------------------
|"DEVICE_TYPE"        |"AVG(TIME_ON_SITE_SECONDS)"  |
-----------------------------------------------------
|Mobile Phone         |593.343893                   |
|eReader              |216.772727                   |
|Geolocation Tracker  |455.066667                   |
|Glasses              |440.000000                   |
|Desktop              |799.096699                   |
|Braille Tablet       |981.650888                   |
|[Missing]            |471.364617                   |
|VR Headset           |9.000000                     |
|Security Hub         |149.000000                   |
|TV                   |803.778111                   |
-----------------------------------------------------

--------------------------------------------------------------
|"DEVICE_TYPE"              |"STDDEV(TIME_ON_SITE_SECONDS)"  |
--------------------------------------------------------------
|TV                         |1522.9949687388992       

### Marketing channel

In [17]:
#channel_name

In [18]:
# Get basic descriptive statistics
channel_name = f_aa_engagement_by_site.groupBy('channel_name')
avg = channel_name.agg({"time_on_site_seconds": "avg"})
channel_name.agg({"time_on_site_seconds": "avg"}).show()
channel_name.agg({"time_on_site_seconds": "std"}).show()
channel_name.agg({"time_on_site_seconds": "min"}).show()
channel_name.agg({"time_on_site_seconds": "max"}).show()
#print(avg['DEVICE_TYPE'].count())

#quartiles = device_group.approx_quantile("time_on_site_seconds", [0.25, 0.5, 0.75])
#print("Median: {}".format(quartiles[1]))
#print("25th percentile: {}".format(quartiles[0]))
#print("75th percentile: {}".format(quartiles[2]))

distinct_ids = [x.CHANNEL_NAME for x in avg.select('channel_name').distinct().collect()]
for i in distinct_ids:
    print('')
    print(f'{i} quartiles:')
    quartiles = f_aa_engagement_by_site.filter(f_aa_engagement_by_site['channel_name'] == i).approx_quantile("time_on_site_seconds", [0.25, 0.5, 0.75])
    print("Median: {}".format(quartiles[1]))
    print("25th percentile: {}".format(quartiles[0]))
    print("75th percentile: {}".format(quartiles[2]))

------------------------------------------------------
|"CHANNEL_NAME"        |"AVG(TIME_ON_SITE_SECONDS)"  |
------------------------------------------------------
|Direct                |686.665165                   |
|Email                 |837.114716                   |
|[Missing]             |409.464588                   |
|Paid Social           |91.712333                    |
|Organic Search        |575.976404                   |
|Referring             |950.318589                   |
|News & Blogs          |155.289525                   |
|Display View Through  |917.179542                   |
|Programmatic          |74.412707                    |
|Social Media          |460.930900                   |
------------------------------------------------------

---------------------------------------------------------
|"CHANNEL_NAME"        |"STDDEV(TIME_ON_SITE_SECONDS)"  |
---------------------------------------------------------
|Direct                |1420.2745374718932             

### Product

In [19]:
# HAS_PAGE_VIEW_POKER, HAS_PAGE_VIEW_CASINO, HAS_PAGE_VIEW_SPORTS

In [6]:
f_aa_engagement_by_site.select('HAS_PAGE_VIEW_POKER', 'HAS_PAGE_VIEW_CASINO', 'HAS_PAGE_VIEW_SPORTS').show(5)

---------------------------------------------------------------------------
|"HAS_PAGE_VIEW_POKER"  |"HAS_PAGE_VIEW_CASINO"  |"HAS_PAGE_VIEW_SPORTS"  |
---------------------------------------------------------------------------
|False                  |False                   |False                   |
|False                  |False                   |False                   |
|False                  |False                   |False                   |
|False                  |False                   |False                   |
|False                  |False                   |False                   |
---------------------------------------------------------------------------



In [7]:
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn("HAS_PAGE_VIEW_POKER_ENCODED", fn.when(f_aa_engagement_by_site["HAS_PAGE_VIEW_POKER"], 1).otherwise(0))
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn("HAS_PAGE_VIEW_CASINO_ENCODED", fn.when(f_aa_engagement_by_site["HAS_PAGE_VIEW_CASINO"], 1).otherwise(0))
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn("HAS_PAGE_VIEW_SPORTS_ENCODED", fn.when(f_aa_engagement_by_site["HAS_PAGE_VIEW_SPORTS"], 1).otherwise(0))
f_aa_engagement_by_site.select('HAS_PAGE_VIEW_POKER', 'HAS_PAGE_VIEW_CASINO', 'HAS_PAGE_VIEW_SPORTS','HAS_PAGE_VIEW_POKER_ENCODED','HAS_PAGE_VIEW_CASINO_ENCODED','HAS_PAGE_VIEW_SPORTS_ENCODED').show(5)

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"HAS_PAGE_VIEW_POKER"  |"HAS_PAGE_VIEW_CASINO"  |"HAS_PAGE_VIEW_SPORTS"  |"HAS_PAGE_VIEW_POKER_ENCODED"  |"HAS_PAGE_VIEW_CASINO_ENCODED"  |"HAS_PAGE_VIEW_SPORTS_ENCODED"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|False                  |False                   |False                   |0                              |0                               |0                               |
|False                  |False                   |False                   |0                              |0                               |0                               |
|False                  |False                   |False                   |0                              |0                      

In [8]:
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn(
    "Product_Count", (f_aa_engagement_by_site.HAS_PAGE_VIEW_POKER_ENCODED  + f_aa_engagement_by_site.HAS_PAGE_VIEW_CASINO_ENCODED + f_aa_engagement_by_site.HAS_PAGE_VIEW_SPORTS_ENCODED))

In [9]:
from snowflake.snowpark.functions import col
f_aa_engagement_by_site.group_by(["time_on_site_seconds","device_type"]).agg((col("*"), "count")).show(10) 

---------------------------------------------------------------
|"TIME_ON_SITE_SECONDS"  |"DEVICE_TYPE"  |"COUNT(LITERAL())"  |
---------------------------------------------------------------
|581                     |Desktop        |26326               |
|338                     |Desktop        |55445               |
|56                      |Mobile Phone   |380021              |
|248                     |Mobile Phone   |356559              |
|316                     |Desktop        |65229               |
|763                     |Desktop        |20493               |
|427                     |Mobile Phone   |76524               |
|54                      |Mobile Phone   |349296              |
|450                     |Desktop        |34550               |
|599                     |Mobile Phone   |52146               |
---------------------------------------------------------------



In [10]:
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn("desktop",fn.when(f_aa_engagement_by_site.device_type == 'Desktop', 1).otherwise(0))

In [11]:
f_aa_engagement_by_site.select(['device_type','desktop']).show(5)

-----------------------------
|"DEVICE_TYPE"  |"DESKTOP"  |
-----------------------------
|Mobile Phone   |0          |
|Mobile Phone   |0          |
|Desktop        |1          |
|Mobile Phone   |0          |
|Mobile Phone   |0          |
-----------------------------



In [20]:
device_group = f_aa_engagement_by_site.groupBy('device_type')
avg = device_group.agg({"time_on_site_seconds": "avg"})
distinct_ids = [x.DEVICE_TYPE for x in avg.select('DEVICE_TYPE').distinct().collect()]
dict = {}
keys = range(len(distinct_ids))
for i in distinct_ids:
        dict[i] = "sum"
print(dict)

{'Games Console': 'sum', 'Mobile Phone': 'sum', 'Geolocation Tracker': 'sum', 'eReader': 'sum', 'TV': 'sum', 'Data Collection Terminal': 'sum', 'Projector': 'sum', 'Vehicle Multimedia System': 'sum', 'Glasses': 'sum', 'Security Hub': 'sum', 'VR Headset': 'sum', 'Digital Home Assistant': 'sum', 'Desktop': 'sum', 'Braille Tablet': 'sum', '[Missing]': 'sum', 'Wristwatch': 'sum', 'Set Top Box': 'sum', 'Wireless Hotspot': 'sum', 'Digital Signage Media Player': 'sum', 'Media Player': 'sum', 'Tablet': 'sum', 'Embedded Network Module': 'sum', 'Payment Terminal': 'sum', 'Camera': 'sum'}


In [18]:
for i in distinct_ids:
        f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn(i,fn.when(f_aa_engagement_by_site.device_type == i, 1).otherwise(0))

In [23]:
f_aa_engagement_by_site.agg(dict).show()

AttributeError: DataFrame object has no attribute asc

In [9]:
f_aa_engagement_by_site.select('HAS_PAGE_VIEW_POKER', 'HAS_PAGE_VIEW_CASINO', 'HAS_PAGE_VIEW_SPORTS','HAS_PAGE_VIEW_POKER_ENCODED','HAS_PAGE_VIEW_CASINO_ENCODED','HAS_PAGE_VIEW_SPORTS_ENCODED','PRODUCT_COUNT').show(5)

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"HAS_PAGE_VIEW_POKER"  |"HAS_PAGE_VIEW_CASINO"  |"HAS_PAGE_VIEW_SPORTS"  |"HAS_PAGE_VIEW_POKER_ENCODED"  |"HAS_PAGE_VIEW_CASINO_ENCODED"  |"HAS_PAGE_VIEW_SPORTS_ENCODED"  |"PRODUCT_COUNT"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|False                  |False                   |True                    |0                              |0                               |1                               |1                |
|False                  |False                   |True                    |0                              |0                               |1                               |1                |
|False                  |False          

In [10]:
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn(
    "Product_Named", fn.when(f_aa_engagement_by_site.PRODUCT_COUNT == 2, "Multi-Product").when(f_aa_engagement_by_site.PRODUCT_COUNT == 3, "Multi-Product").when(f_aa_engagement_by_site.HAS_PAGE_VIEW_POKER == True, 'Poker').when(f_aa_engagement_by_site.HAS_PAGE_VIEW_CASINO == True, 'Casino').when(f_aa_engagement_by_site.HAS_PAGE_VIEW_SPORTS == True, 'Sports').otherwise('None'))

In [11]:
f_aa_engagement_by_site.select('HAS_PAGE_VIEW_POKER_ENCODED','HAS_PAGE_VIEW_CASINO_ENCODED','HAS_PAGE_VIEW_SPORTS_ENCODED','PRODUCT_COUNT','Product_Named').show(100)

---------------------------------------------------------------------------------------------------------------------------------------
|"HAS_PAGE_VIEW_POKER_ENCODED"  |"HAS_PAGE_VIEW_CASINO_ENCODED"  |"HAS_PAGE_VIEW_SPORTS_ENCODED"  |"PRODUCT_COUNT"  |"PRODUCT_NAMED"  |
---------------------------------------------------------------------------------------------------------------------------------------
|0                              |0                               |0                               |0                |None             |
|0                              |0                               |0                               |0                |None             |
|0                              |0                               |0                               |0                |None             |
|0                              |0                               |0                               |0                |None             |
|0                              |0              

In [13]:
# Get basic descriptive statistics
Product = f_aa_engagement_by_site.groupBy('Product_Named')
avg = Product.agg({"time_on_site_seconds": "avg"})
Product.agg({"time_on_site_seconds": "avg"}).show()
Product.agg({"time_on_site_seconds": "std"}).show()
Product.agg({"time_on_site_seconds": "min"}).show()
Product.agg({"time_on_site_seconds": "max"}).show()
#print(avg['DEVICE_TYPE'].count())

#quartiles = device_group.approx_quantile("time_on_site_seconds", [0.25, 0.5, 0.75])
#print("Median: {}".format(quartiles[1]))
#print("25th percentile: {}".format(quartiles[0]))
#print("75th percentile: {}".format(quartiles[2]))

distinct_ids = [x.PRODUCT_NAMED for x in avg.select('PRODUCT_NAMED').distinct().collect()]
for i in distinct_ids:
    print('')
    print(f'{i} quartiles:')
    quartiles = f_aa_engagement_by_site.filter(f_aa_engagement_by_site['Product_Named'] == i).approx_quantile("time_on_site_seconds", [0.25, 0.5, 0.75])
    print("Median: {}".format(quartiles[1]))
    print("25th percentile: {}".format(quartiles[0]))
    print("75th percentile: {}".format(quartiles[2]))

-------------------------------------------------
|"PRODUCT_NAMED"  |"AVG(TIME_ON_SITE_SECONDS)"  |
-------------------------------------------------
|Multi-Product    |1262.159187                  |
|Sports           |701.955490                   |
|None             |794.354279                   |
|Poker            |598.298993                   |
|Casino           |352.303339                   |
-------------------------------------------------

----------------------------------------------------
|"PRODUCT_NAMED"  |"STDDEV(TIME_ON_SITE_SECONDS)"  |
----------------------------------------------------
|Multi-Product    |2296.4449146308734              |
|Sports           |1492.3418319560703              |
|Casino           |805.3379624356473               |
|Poker            |1241.2755285789694              |
|None             |1604.4333551222999              |
----------------------------------------------------

-------------------------------------------------
|"PRODUCT_NAMED"  |"M

## Plot the distrubtion of the time on site column - in order to visualise the spread and recognise any patterns

In [None]:
df = f_aa_engagement_by_site.select('time_on_site_seconds').to_pandas()

In [None]:
import matplotlib.pyplot as plt
# Plot the histogram
plt.hist(df, bins=50)
plt.xlabel('Time on site (seconds)')
plt.ylabel("Frequency")
plt.show()

### As the outliers make it diffucult to visualise the distrubtion, define a function to remove outliers and replot

In [None]:
filtered_df = df[~is_outlier(df['TIME_ON_SITE_SECONDS'])]

In [None]:
# Plot the histogram with outliers removed
plt.hist(filtered_df, bins=100)
plt.xlabel('Time on site (seconds)')
plt.ylabel("Frequency")
plt.show()

### Plot distrubtion with harsher outlier removal and showing engaged vs not engaged users

In [None]:
df = f_aa_engagement_by_site.select(['time_on_site_seconds','SITE_VISIT_ENGAGEMENT']).to_pandas()
df = df[~is_outlier(df['TIME_ON_SITE_SECONDS'],thresh=2)]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

x1 = list(df[df['SITE_VISIT_ENGAGEMENT'] == True]['TIME_ON_SITE_SECONDS'])
x2 = list(df[df['SITE_VISIT_ENGAGEMENT'] == False]['TIME_ON_SITE_SECONDS'])
 
# Setting colors and names
colors=['green','blue']
names=['Engaged User', 'None Enagaged User']
 
# Creating plot with list values, colors and names (labels)
# Note the density value set as true which represents the
# probability distribution
#
plt.hist([x1, x2], bins =100, color=colors, label=names, density=True)
plt.show()

### Plot the distrubtion with harsh outlier remove and label the quartiles of the complete (no outlier removal) time on site data

In [None]:
df = f_aa_engagement_by_site.select(['time_on_site_seconds','SITE_VISIT_ENGAGEMENT']).to_pandas()
df = df[~is_outlier(df['TIME_ON_SITE_SECONDS'],thresh=1)]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

x1 = list(df[df['SITE_VISIT_ENGAGEMENT'] == True]['TIME_ON_SITE_SECONDS'])
x2 = list(df[df['SITE_VISIT_ENGAGEMENT'] == False]['TIME_ON_SITE_SECONDS'])
 
# Setting colors and names
colors=['green','blue']
names=['Engaged User', 'None Enagaged User']
 
fig, ax = plt.subplots(figsize = (20,4))

# Creating plot with list values, colors and names (labels)
# Note the density value set as true which represents the
# probability distribution
#
plt.hist([x1, x2 ], bins =100, color=colors, label=names, density=True)

quant_5, quant_25, quant_50, quant_75 = f_aa_engagement_by_site.approx_quantile("time_on_site_seconds", [0.05, 0.25, 0.5, 0.75])
quants = [[quant_5, 0.6, 0.16], [quant_25, 0.8, 0.26], [quant_50, 1, 0.36],  [quant_75, 0.8, 0.46]]
for i in quants:
    ax.axvline(i[0], alpha = i[1], ymax = i[2], linestyle = ":", color= 'black')

ax.xaxis.set_ticks(np.arange(0, 550, 15))

# Annotations
ax.text(quant_5-.3, 0.087, "5th", size = 10, alpha = 0.8)
ax.text(quant_25-.13, 0.03, "25th", size = 11, alpha = 0.85)
ax.text(quant_50-.13, 0.04, "50th", size = 12, alpha = 1)
ax.text(quant_75-.13, 0.05, "75th", size = 11, alpha = 0.85)

# Overall
ax.grid(False)

plt.show()

### Plot the distrubtion of the time on site column with outlier removal and segmented

In [9]:
f_aa_engagement_by_site.columns

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'VISIT_DATE_TIME_UTC',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'D_USER_LANGUAGE_SK',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'TIME_ON_SITE_SECONDS',
 'SITE_VISIT_CONVERSION',
 'SITE_VISIT_UNQIUE_CLICKOUT',
 'SITE_VISIT_TOTAL_CLICKOUT',
 'SITE_VISIT_ENGAGEMENT',
 'SITE_VISIT_BOUNCE',
 'HAS_PAGE_VIEW_POKER',
 'SITE_VISIT_CONVERSION_POKER',
 'SITE_VISIT_UNIQUE_CLICKOUT_POKER',
 'SITE_VISIT_TOTAL_CLICKOUT_POKER',
 'SITE_VISIT_ENGAGEMENT_POKER',
 'SITE_VISIT_BOUNCE_POKER',
 'HAS_PAGE_VIEW_CASINO',
 'SITE_VISIT_CONVERSION_CASINO',
 'SITE_VISIT_UNIQUE_CLICKOUT_CASINO',
 'SITE_VISIT_TOTAL_CLICKOUT_CASINO',
 'SITE_VISIT_ENGAGEMENT_CASINO',
 'SITE_VISIT_BOUNCE_CASINO',
 'HAS_PAGE_VIEW_SPORTS',
 'SITE_VISIT_CONVERSION_SPORTS',
 'SITE_VISIT_UNIQUE_CLICKOUT_SPORTS',
 'SITE_VISIT_TOTAL_CLICKOUT_SPORTS',
 'SITE_VISIT_ENGAGEMENT_SPORTS',
 'SITE_VISIT_BOUNCE_SPORTS',
 'ETL_LOAD_TIME',
 'ETL_DAG_ID',
 'ETL_TAS

Attempting to plot a histrogram of data filtered by device types, time on site column but am unable to plot woth snowpark and everytime i try to convert to pandas the kernel dies
- tried to go pandas in chunks to hdf5 files then to vaex

In [11]:
device_group = f_aa_engagement_by_site.groupBy('device_type')
avg = device_group.agg({"time_on_site_seconds": "avg"})
distinct_ids = [x.DEVICE_TYPE for x in avg.select('DEVICE_TYPE').distinct().collect()]
for i in distinct_ids:
    f_aa_engagement_by_site.filter(f_aa_engagement_by_site['device_type'] == i).select(['device_type','time_on_site_seconds']).show(5)

------------------------------------------
|"DEVICE_TYPE"  |"TIME_ON_SITE_SECONDS"  |
------------------------------------------
|Games Console  |45                      |
|Games Console  |0                       |
|Games Console  |379                     |
|Games Console  |29                      |
|Games Console  |9                       |
------------------------------------------

------------------------------------------
|"DEVICE_TYPE"  |"TIME_ON_SITE_SECONDS"  |
------------------------------------------
|Mobile Phone   |1060                    |
|Mobile Phone   |10                      |
|Mobile Phone   |18                      |
|Mobile Phone   |90                      |
|Mobile Phone   |240                     |
------------------------------------------

------------------------------------------
|"DEVICE_TYPE"  |"TIME_ON_SITE_SECONDS"  |
------------------------------------------
|eReader        |239                     |
|eReader        |1407                    |
|eReader 

In [None]:
device_group = f_aa_engagement_by_site.groupBy('device_type')
avg = device_group.agg({"time_on_site_seconds": "avg"})
distinct_ids = [x.DEVICE_TYPE for x in avg.select('DEVICE_TYPE').distinct().collect()]
for i in distinct_ids:
    for chunk in f_aa_engagement_by_site.filter(f_aa_engagement_by_site['device_type'] == i).to_pandas_batches():
        pandas_df = load_chunk(chunk) # your function to load a piece that does fit into memory
        pandas_df.export(f'data/chunk_{chunk}.hdf5')
    df = vaex.open('data/chunk_*.hdf5')
    df.head()
    break



#df = vaex.open('chunk_*.hdf5', convert='big.hdf5') # will convert to 1 file if it does not exist

In [None]:
df = f_aa_engagement_by_site.select(['device_type','time_on_site_seconds']).to_pandas()

filtered_df = df[~is_outlier(df['TIME_ON_SITE_SECONDS'])]
# Plot the histogram with outliers removed
plt.hist(filtered_df, bins=100)
plt.xlabel('Time on site (seconds)')
plt.ylabel("Frequency")
plt.show()

In [None]:
import matplotlib.pyplot as plt
device_group = f_aa_engagement_by_site.groupBy('device_type')
avg = device_group.agg({"time_on_site_seconds": "avg"})

distinct_ids = [x.DEVICE_TYPE for x in avg.select('DEVICE_TYPE').distinct().collect()]
for i in distinct_ids:
    print(f'{i} distrubtion visualisation:')
    df = f_aa_engagement_by_site.filter(f_aa_engagement_by_site['device_type'] == i)
    print('df created')
    df = df.select('time_on_site_seconds').to_pandas_batches()
    #filtered_df = df[~is_outlier(df['TIME_ON_SITE_SECONDS'])]
    # Plot the histogram with outliers removed
    plt.hist(i, bins=100)
    plt.xlabel('Time on site (seconds)')
    plt.ylabel("Frequency")
    plt.show()
    print('')
    

Games Console distrubtion visualisation:
df created


In [None]:
for df in f_aa_engagement_by_site.select(['visit_date_time_utc','device_type','time_on_site_seconds']).to_pandas_batches():
     print(df.head())

In [None]:
device_df = f_aa_engagement_by_site.select(['visit_date_time_utc','device_type','time_on_site_seconds']).to_pandas()
#'channel_name','product_named',

In [None]:
filtered_df = df[~is_outlier(df['TIME_ON_SITE_SECONDS'])]
# Plot the histogram with outliers removed
plt.hist(filtered_df, bins=100)
plt.xlabel('Time on site (seconds)')
plt.ylabel("Frequency")
plt.show()

## How does time on site change over time?
### Plot the monthly total time on site as a time series

In [None]:
df = f_aa_engagement_by_site.select(['time_on_site_seconds','visit_date_time_utc']).to_pandas()
df['month_year'] = df['VISIT_DATE_TIME_UTC'].dt.to_period('M')
df.groupby('month_year').sum('TIME_ON_SITE_SECONDS').drop('2023-04').plot()

### Plot the weekly total time on site as a time series

In [None]:
df.tail()

In [59]:
df['week_month_year'] = df['VISIT_DATE_TIME_UTC'].dt.to_period('W')
weeklydf = df.groupby('week_month_year').sum('TIME_ON_SITE_SECONDS')
weeklydf.drop('2023-04-10/2023-04-16').plot()

## Observe Correlations between time on site and other key metrics

In [8]:
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn(
    "visit_engagement_encoded",
    fn.when(f_aa_engagement_by_site.SITE_VISIT_ENGAGEMENT == True, 1).otherwise(0)
           )

In [10]:
f_aa_engagement_by_site.columns

['VISIT_DATE_UTC',
 'D_SITE_HIST_SK',
 'VISIT_DATE_TIME_UTC',
 'VISIT_ID',
 'DEVICE_TYPE',
 'CONNECTION_TYPE',
 'D_COUNTRY_HIST_SK',
 'STATE_CODE',
 'D_USER_LANGUAGE_SK',
 'VISIT_REFERRER_TYPE',
 'VISIT_SEARCH_ENGINE',
 'TIME_ON_SITE_SECONDS',
 'SITE_VISIT_CONVERSION',
 'SITE_VISIT_UNQIUE_CLICKOUT',
 'SITE_VISIT_TOTAL_CLICKOUT',
 'SITE_VISIT_ENGAGEMENT',
 'SITE_VISIT_BOUNCE',
 'HAS_PAGE_VIEW_POKER',
 'SITE_VISIT_CONVERSION_POKER',
 'SITE_VISIT_UNIQUE_CLICKOUT_POKER',
 'SITE_VISIT_TOTAL_CLICKOUT_POKER',
 'SITE_VISIT_ENGAGEMENT_POKER',
 'SITE_VISIT_BOUNCE_POKER',
 'HAS_PAGE_VIEW_CASINO',
 'SITE_VISIT_CONVERSION_CASINO',
 'SITE_VISIT_UNIQUE_CLICKOUT_CASINO',
 'SITE_VISIT_TOTAL_CLICKOUT_CASINO',
 'SITE_VISIT_ENGAGEMENT_CASINO',
 'SITE_VISIT_BOUNCE_CASINO',
 'HAS_PAGE_VIEW_SPORTS',
 'SITE_VISIT_CONVERSION_SPORTS',
 'SITE_VISIT_UNIQUE_CLICKOUT_SPORTS',
 'SITE_VISIT_TOTAL_CLICKOUT_SPORTS',
 'SITE_VISIT_ENGAGEMENT_SPORTS',
 'SITE_VISIT_BOUNCE_SPORTS',
 'ETL_LOAD_TIME',
 'ETL_DAG_ID',
 'ETL_TAS

In [11]:
f_aa_engagement_by_site.select('TIME_ON_SITE_SECONDS','SITE_VISIT_ENGAGEMENT','VISIT_ENGAGEMENT_ENCODED').show(5)

---------------------------------------------------------------------------------
|"TIME_ON_SITE_SECONDS"  |"SITE_VISIT_ENGAGEMENT"  |"VISIT_ENGAGEMENT_ENCODED"  |
---------------------------------------------------------------------------------
|151                     |True                     |1                           |
|14                      |False                    |0                           |
|92                      |True                     |1                           |
|119                     |True                     |1                           |
|177                     |True                     |1                           |
---------------------------------------------------------------------------------



In [12]:
#time on site with engagement correlation
corr = f_aa_engagement_by_site.select(fn.corr(f_aa_engagement_by_site.TIME_ON_SITE_SECONDS, f_aa_engagement_by_site.VISIT_ENGAGEMENT_ENCODED)).collect()[0][0]
    
# Print the correlation coefficient
print(f"The correlation coefficient between time on site and visit engagement is {corr:.3f}")

The correlation coefficient between column1 and column2 is 0.203


In [15]:
f_aa_engagement_by_site.select('TIME_ON_SITE_SECONDS','SITE_VISIT_ENGAGEMENT','VISIT_ENGAGEMENT_ENCODED','SITE_VISIT_CONVERSION').show(5)

-----------------------------------------------------------------------------------------------------------
|"TIME_ON_SITE_SECONDS"  |"SITE_VISIT_ENGAGEMENT"  |"VISIT_ENGAGEMENT_ENCODED"  |"SITE_VISIT_CONVERSION"  |
-----------------------------------------------------------------------------------------------------------
|0                       |False                    |0                           |False                    |
|139                     |True                     |1                           |False                    |
|258                     |True                     |1                           |False                    |
|0                       |False                    |0                           |False                    |
|29                      |True                     |1                           |False                    |
-----------------------------------------------------------------------------------------------------------



In [16]:
f_aa_engagement_by_site = f_aa_engagement_by_site.withColumn(
    "visit_conversion_encoded",
    fn.when(f_aa_engagement_by_site.SITE_VISIT_CONVERSION == True, 1).otherwise(0)
           )

In [17]:
f_aa_engagement_by_site.select('TIME_ON_SITE_SECONDS','SITE_VISIT_ENGAGEMENT','VISIT_ENGAGEMENT_ENCODED','SITE_VISIT_CONVERSION','visit_conversion_encoded').show(5)

----------------------------------------------------------------------------------------------------------------------------------------
|"TIME_ON_SITE_SECONDS"  |"SITE_VISIT_ENGAGEMENT"  |"VISIT_ENGAGEMENT_ENCODED"  |"SITE_VISIT_CONVERSION"  |"VISIT_CONVERSION_ENCODED"  |
----------------------------------------------------------------------------------------------------------------------------------------
|1067                    |True                     |1                           |False                    |0                           |
|177                     |True                     |1                           |False                    |0                           |
|363                     |True                     |1                           |False                    |0                           |
|103                     |True                     |1                           |False                    |0                           |
|240                     |True           

In [18]:
#time on site with conversion (clickout) correlation
corr = f_aa_engagement_by_site.select(fn.corr(f_aa_engagement_by_site.TIME_ON_SITE_SECONDS, f_aa_engagement_by_site.VISIT_CONVERSION_ENCODED)).collect()[0][0]
    
# Print the correlation coefficient
print(f"The correlation coefficient between time on site and visit conversion is {corr:.3f}")

The correlation coefficient between column1 and column2 is -0.014


### would this improve with remove of the time on site outliers

In [19]:
f_aa_engagement_by_site.count()

345038879

In [20]:
corrdf = f_aa_engagement_by_site.select(['TIME_ON_SITE_SECONDS','VISIT_ENGAGEMENT_ENCODED','VISIT_CONVERSION_ENCODED']).to_pandas()
corrdf = corrdf[~is_outlier(corrdf['TIME_ON_SITE_SECONDS'],thresh=2)]

  points = points[:,None]


In [21]:
corrdf.count()

TIME_ON_SITE_SECONDS        272900944
VISIT_ENGAGEMENT_ENCODED    272900944
VISIT_CONVERSION_ENCODED    272900944
dtype: int64

In [22]:
#time on site with engagement correlation
corr = corrdf["TIME_ON_SITE_SECONDS"].corr(corrdf["VISIT_ENGAGEMENT_ENCODED"])

# Print the correlation coefficient
print(f"The correlation coefficient between TIME_ON_SITE_SECONDS and VISIT_ENGAGEMENT_ENCODED, with time on site outliers removed, is {corr:.3f}")

The correlation coefficient between TIME_ON_SITE_SECONDS and VISIT_ENGAGEMENT_ENCODED, with time on site outliers removed, is 0.438


In [23]:
#time on site with conversion (clickout) correlation
corr = corrdf["TIME_ON_SITE_SECONDS"].corr(corrdf["VISIT_CONVERSION_ENCODED"])

# Print the correlation coefficient
print(f"The correlation coefficient between TIME_ON_SITE_SECONDS and VISIT_CONVERSION_ENCODED, with time on site outliers removed, is {corr:.3f}")

The correlation coefficient between TIME_ON_SITE_SECONDS and VISIT_CONVERSION_ENCODED, with time on site outliers removed, is 0.030
