In [None]:
import sys
import os
import yaml
import pathlib
import json
from logger import logger

In [None]:
# Guarded import: the notebook keeps loading if the connector cannot be imported,
# but the session-building cell further down still needs this class.
try:
    from RedshiftConnector import RedshiftConnector
except Exception as e:
    # Include the caught exception so the reason for the failure is visible in the
    # log instead of being silently discarded.
    logger.warning(f"Could not import RedshiftConnector: {e!r}")

In [None]:
# Resolve the site configuration file under the current user's home directory.
homedir = os.path.expanduser("~")
siteconfig_path = os.path.join(homedir, ".pb/siteconfig.yaml")

# Parse the YAML, then pick the "dev" output of the "shopify_wh_rs" connection.
with open(siteconfig_path, "r") as f:
    siteconfig = yaml.safe_load(f)
creds = siteconfig["connections"]["shopify_wh_rs"]["outputs"]["dev"]

# Reject unrecognised warehouse types first, then report the selected target.
warehouse_type = creds["type"]
if warehouse_type not in ("snowflake", "redshift"):
    raise Exception(f"Unknown database type: {creds['type']}")

if warehouse_type == "snowflake":
    print(
        f"Using {creds['schema']} schema in snowflake account: {creds['account']}"
    )
else:
    print(f"Using {creds['schema']} schema in Redshift account: {creds['host']}")

In [None]:
# Build the warehouse session used by every query cell below.
# NOTE(review): RedshiftConnector is used even when creds["type"] is "snowflake", and the
# name is undefined if the guarded import above failed — confirm both cases are acceptable.
connector = RedshiftConnector("./")  # "./" is the only constructor argument; its meaning is not visible here
cursor = connector.build_session(creds)  # later cells call .execute(...) on this object

In [None]:
# Interactive sanity check: display the type of the object returned by build_session.
type(cursor)

In [None]:
# Names of the three source tables; the query cells below interpolate them into SQL
# and derive the scratch (`<name>_1`) and snapshot (`<name>_<postfix>`) table names.
pages, tracks, identifies = "pages", "tracks", "identifies"

# For Pages Table:

## Creating pages table copy

In [None]:
# Copy the listed columns of the source `pages` table into the scratch table
# `{pages}_1`. Later cells grow this copy in place, snapshot it, and finally drop it.
# NOTE(review): the statement is a plain CREATE TABLE, so it presumably errors if
# `{pages}_1` is left over from an interrupted earlier run — confirm before re-running.
query_create_temp = f"""
    CREATE TABLE {pages}_1 
    AS (
        SELECT 
            anonymous_id, 
            user_id, 
            timestamp as timestamp, 
            context_campaign_name, 
            context_campaign_medium, 
            context_campaign_source, 
            context_session_id
        FROM {pages}
    );
"""
cursor.execute(query_create_temp)

# Only one table is created here even though the message is plural.
print("Created temp tables successfully.")

In [None]:
# Counts distinct anonymous_id values in the scratch copy; the growth loop in the
# next cell compares this count against each threshold.
query_count_row = f"select count(distinct anonymous_id) from {pages}_1"

# Snapshot-table postfix -> distinct-id threshold, in ascending order (the loop relies
# on dict order because it keeps growing the same scratch table).
data = {"100k": 100_000, "500k": 500_000, "1mn": 1_000_000}

# Counter appended to the ids inside sha1(...) and bumped after every insert.
# NOTE: this name shadows the built-in `iter`; it is kept because the next cell reads it.
iter = 1

In [None]:
# Build one snapshot table per entry in `data`, growing the scratch table `{pages}_1`
# in place between snapshots. Thresholds are visited in dict order, so each snapshot
# continues from the rows accumulated for the previous one.
for postfix, limit in data.items():
    # Distinct anonymous_id count currently held by the scratch table.
    distinct_ids = cursor.execute(query_count_row).fetch_dataframe()['count'][0]

    # reaching limits one by one.
    while distinct_ids <= limit:
        # Re-insert every existing row with its ids re-hashed using the current `iter`
        # value and its timestamp moved by a random day offset; because the SELECT
        # reads the table being inserted into, each pass doubles the row count.
        query_extend_tmp = f"""
            INSERT INTO {pages}_1 (
                (select 
                    sha1(anonymous_id+{iter}) as anonymous_id, 
                    sha1(user_id+{iter}) as user_id, 
                    dateadd(day, CAST(RAND()*28 AS INT), T.timestamp) as timestamp, 
                    context_campaign_name, 
                    context_campaign_medium, 
                    context_campaign_source, 
                    context_session_id 
                from {pages}_1 T)
            );
        """
        cursor.execute(query_extend_tmp)
        iter += 1

        # If the count did not rise (e.g. the scratch table is empty, so the
        # INSERT ... SELECT adds nothing) the loop condition can never become false.
        # Fail loudly instead of issuing the same insert forever.
        grown_ids = cursor.execute(query_count_row).fetch_dataframe()['count'][0]
        if grown_ids <= distinct_ids:
            raise RuntimeError(
                f"{pages}_1 stopped growing at {grown_ids} distinct anonymous_id values "
                f"(limit {limit}); check that the source table has rows with usable anonymous_id values."
            )
        distinct_ids = grown_ids

    #saving the limit results.
    query_save_limit = f"""
        CREATE TABLE {pages}_{postfix}
        AS (
            SELECT 
                anonymous_id, 
                user_id, 
                timestamp as timestamp, 
                context_campaign_name, 
                context_campaign_medium, 
                context_campaign_source, 
                context_session_id
            FROM {pages}_1
        );
    """
    cursor.execute(query_save_limit)
    # Report the measured count: the loop exits once the count exceeds `limit`, and the
    # threshold is on distinct anonymous_ids, not on rows.
    print(f"Saved {distinct_ids} distinct anonymous_ids (limit {limit}) in table {pages}_{postfix}")

## Dropping extra tables

In [None]:
# Remove the scratch table `{pages}_1`; the snapshot tables created by the loop cell
# above are separate tables and are not touched by this statement.
query_drop_temp = f"DROP TABLE {pages}_1"
cursor.execute(query_drop_temp)

# Only one table is dropped here even though the message is plural.
print("Dropped temp tables successfully.")

# For Tracks Table:

## Creating tracks table copy

In [None]:
# Copy the listed columns of the source `tracks` table into the scratch table
# `{tracks}_1`. Later cells grow this copy in place, snapshot it, and finally drop it.
# NOTE(review): the statement is a plain CREATE TABLE, so it presumably errors if
# `{tracks}_1` is left over from an interrupted earlier run — confirm before re-running.
query_create_temp = f"""
    CREATE TABLE {tracks}_1 
    AS (
        SELECT 
            anonymous_id, 
            user_id, 
            timestamp as timestamp, 
            context_campaign_name, 
            context_campaign_medium, 
            context_campaign_source, 
            context_session_id
        FROM {tracks}
    );
"""
cursor.execute(query_create_temp)

# Only one table is created here even though the message is plural.
print("Created temp tables successfully.")

In [None]:
# Counts distinct anonymous_id values in the scratch copy; the growth loop in the
# next cell compares this count against each threshold.
query_count_row = f"select count(distinct anonymous_id) from {tracks}_1"

# Snapshot-table postfix -> distinct-id threshold, in ascending order (the loop relies
# on dict order because it keeps growing the same scratch table).
data = {"100k": 100_000, "500k": 500_000}

# Counter appended to the ids inside sha1(...) and bumped after every insert.
# NOTE: this name shadows the built-in `iter`; it is kept because the next cell reads it.
iter = 1

In [None]:
# Build one snapshot table per entry in `data`, growing the scratch table `{tracks}_1`
# in place between snapshots. Thresholds are visited in dict order, so each snapshot
# continues from the rows accumulated for the previous one.
for postfix, limit in data.items():
    # Distinct anonymous_id count currently held by the scratch table.
    distinct_ids = cursor.execute(query_count_row).fetch_dataframe()['count'][0]

    # reaching limits one by one.
    while distinct_ids <= limit:
        # Re-insert every existing row with its ids re-hashed using the current `iter`
        # value and its timestamp moved by a random day offset; because the SELECT
        # reads the table being inserted into, each pass doubles the row count.
        query_extend_tmp = f"""
            INSERT INTO {tracks}_1 (
                (select 
                    sha1(anonymous_id+{iter}) as anonymous_id, 
                    sha1(user_id+{iter}) as user_id, 
                    dateadd(day, CAST(RAND()*28 AS INT), T.timestamp) as timestamp, 
                    context_campaign_name, 
                    context_campaign_medium, 
                    context_campaign_source, 
                    context_session_id 
                from {tracks}_1 T)
            );
        """
        cursor.execute(query_extend_tmp)
        iter += 1

        # If the count did not rise (e.g. the scratch table is empty, so the
        # INSERT ... SELECT adds nothing) the loop condition can never become false.
        # Fail loudly instead of issuing the same insert forever.
        grown_ids = cursor.execute(query_count_row).fetch_dataframe()['count'][0]
        if grown_ids <= distinct_ids:
            raise RuntimeError(
                f"{tracks}_1 stopped growing at {grown_ids} distinct anonymous_id values "
                f"(limit {limit}); check that the source table has rows with usable anonymous_id values."
            )
        distinct_ids = grown_ids

    #saving the limit results.
    query_save_limit = f"""
        CREATE TABLE {tracks}_{postfix}
        AS (
            SELECT 
                anonymous_id, 
                user_id, 
                timestamp as timestamp, 
                context_campaign_name, 
                context_campaign_medium, 
                context_campaign_source, 
                context_session_id
            FROM {tracks}_1
        );
    """
    cursor.execute(query_save_limit)
    # Report the measured count: the loop exits once the count exceeds `limit`, and the
    # threshold is on distinct anonymous_ids, not on rows.
    print(f"Saved {distinct_ids} distinct anonymous_ids (limit {limit}) in table {tracks}_{postfix}")

## Dropping extra tables

In [None]:
# Remove the scratch table `{tracks}_1`; the snapshot tables created by the loop cell
# above are separate tables and are not touched by this statement.
query_drop_temp = f"DROP TABLE {tracks}_1"
cursor.execute(query_drop_temp)

# Only one table is dropped here even though the message is plural.
print("Dropped temp tables successfully.")

# For Identifies Table:

## Creating identifies table copy

In [None]:
# Copy the listed columns of the source `identifies` table into the scratch table
# `{identifies}_1`. Later cells grow this copy in place, snapshot it, and finally drop it.
# NOTE(review): the statement is a plain CREATE TABLE, so it presumably errors if
# `{identifies}_1` is left over from an interrupted earlier run — confirm before re-running.
query_create_temp = f"""
    CREATE TABLE {identifies}_1 
    AS (
        SELECT 
            anonymous_id, 
            user_id, 
            email,
            timestamp as timestamp, 
            context_device_name, 
            context_device_type,
            context_device_manufacturer, 
            context_campaign_source, 
            address_country,
            currency,
            state,
            first_name,
            last_name
        FROM {identifies}
    );
"""
cursor.execute(query_create_temp)

# Only one table is created here even though the message is plural.
print("Created temp tables successfully.")

In [None]:
# Counts distinct anonymous_id values in the scratch copy; the growth loop in the
# next cell compares this count against each threshold.
query_count_row = f"select count(distinct anonymous_id) from {identifies}_1"

# Snapshot-table postfix -> distinct-id threshold, in ascending order (the loop relies
# on dict order because it keeps growing the same scratch table).
data = {"100k": 100_000, "500k": 500_000, "1mn": 1_000_000}

# Counter appended to the ids inside sha1(...) and bumped after every insert.
# NOTE: this name shadows the built-in `iter`; it is kept because the next cell reads it.
iter = 1

In [None]:
# Build one snapshot table per entry in `data`, growing the scratch table
# `{identifies}_1` in place between snapshots. Thresholds are visited in dict order,
# so each snapshot continues from the rows accumulated for the previous one.
for postfix, limit in data.items():
    # Distinct anonymous_id count currently held by the scratch table.
    distinct_ids = cursor.execute(query_count_row).fetch_dataframe()['count'][0]

    # reaching limits one by one.
    while distinct_ids <= limit:
        # Re-insert every existing row with its ids and email re-hashed using the
        # current `iter` value and its timestamp moved by a random day offset; because
        # the SELECT reads the table being inserted into, each pass doubles the row count.
        query_extend_tmp = f"""
            INSERT INTO {identifies}_1 (
                (select 
                    sha1(anonymous_id+{iter}) as anonymous_id, 
                    sha1(user_id+{iter}) as user_id,
                    sha1(email+{iter}) as email, 
                    dateadd(day, CAST(RAND()*28 AS INT), T.timestamp) as timestamp, 
                    context_device_name, 
                    context_device_type,
                    context_device_manufacturer, 
                    context_campaign_source, 
                    address_country,
                    currency,
                    state,
                    first_name,
                    last_name
                from {identifies}_1 T)
            );
        """
        cursor.execute(query_extend_tmp)
        iter += 1

        # If the count did not rise (e.g. the scratch table is empty, so the
        # INSERT ... SELECT adds nothing) the loop condition can never become false.
        # Fail loudly instead of issuing the same insert forever.
        grown_ids = cursor.execute(query_count_row).fetch_dataframe()['count'][0]
        if grown_ids <= distinct_ids:
            raise RuntimeError(
                f"{identifies}_1 stopped growing at {grown_ids} distinct anonymous_id values "
                f"(limit {limit}); check that the source table has rows with usable anonymous_id values."
            )
        distinct_ids = grown_ids

    #saving the limit results.
    query_save_limit = f"""
        CREATE TABLE {identifies}_{postfix}
        AS (
            SELECT 
                anonymous_id, 
                user_id, 
                email,
                timestamp as timestamp, 
                context_device_name, 
                context_device_type,
                context_device_manufacturer, 
                context_campaign_source, 
                address_country,
                currency,
                state,
                first_name,
                last_name
            FROM {identifies}_1
        );
    """
    cursor.execute(query_save_limit)
    # Report the measured count: the loop exits once the count exceeds `limit`, and the
    # threshold is on distinct anonymous_ids, not on rows.
    print(f"Saved {distinct_ids} distinct anonymous_ids (limit {limit}) in table {identifies}_{postfix}")

## Dropping extra tables

In [None]:
# Remove the scratch table `{identifies}_1`; the snapshot tables created by the loop
# cell above are separate tables and are not touched by this statement.
query_drop_temp = f"DROP TABLE {identifies}_1"
cursor.execute(query_drop_temp)

# Only one table is dropped here even though the message is plural.
print("Dropped temp tables successfully.")