In [0]:
%run ./01-config

In [0]:
# Path formed by appending "/raw" to base_dir_data.
# NOTE(review): base_dir_data / base_dir_checkpoint are not defined in this
# notebook; presumably they come from the `%run ./01-config` cell - confirm.
landing_zone = base_dir_data + "/raw"
# Path formed by appending "/checkpoints" to base_dir_checkpoint.
checkpoint_base = base_dir_checkpoint + "/checkpoints"



# Setup flag: create_database() sets this to True; every create_* function
# below raises ReferenceError while it is still False.
initialized = False

def create_database(catalog, db_name):
    """Create ``catalog.db_name`` if needed and make it the current database.

    Runs ``CREATE DATABASE IF NOT EXISTS`` followed by ``USE`` through the
    notebook's ``spark`` session, then sets the module-level ``initialized``
    flag to True so the table-creation functions are allowed to run.

    Args:
        catalog: Name of the catalog that holds the database.
        db_name: Name of the database to create and switch to.
    """
    global initialized

    print(f"Creating database {catalog}.{db_name}...", end='')
    statements = (
        f"CREATE DATABASE IF NOT EXISTS {catalog}.{db_name}",
        f"USE {catalog}.{db_name}",
    )
    for statement in statements:
        spark.sql(statement)

    # Only flag success once both statements have completed.
    initialized = True
    print("Done")



def create_registered_users_bz(catalog, db_name):
    """Create the ``registered_users_bz`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating registered_users_bz table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.registered_users_bz (
                user_id long,
                device_id long,
                mac_address string,
                registration_timestamp double,
                load_time timestamp,
                source_file string
            )
        """
    spark.sql(ddl)
    print("Done")



def create_gym_logins_bz(catalog, db_name):
    """Create (or replace) the ``gym_logins_bz`` table.

    The statement is ``CREATE OR REPLACE TABLE``, so an existing
    ``gym_logins_bz`` table is replaced by this definition on every call.
    NOTE(review): ``registered_users_bz`` and ``kafka_multiplex_bz`` use
    ``CREATE TABLE IF NOT EXISTS`` instead - confirm the replace is intended.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating gym_logins_bz table...", end='')
    ddl = f"""
            CREATE OR REPLACE TABLE {catalog}.{db_name}.gym_logins_bz(
                mac_address string,
                gym bigint,
                login double,                      
                logout double,                    
                load_time timestamp,
                source_file string
            )
        """
    spark.sql(ddl)
    print("Done")


def create_kafka_multiplex_bz(catalog, db_name):
    """Create the ``kafka_multiplex_bz`` table unless it already exists.

    The table is partitioned by ``topic`` and ``week_part``.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating kafka_multiplex_bz table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.kafka_multiplex_bz(
                key string, 
                value string, 
                topic string, 
                partition bigint, 
                offset bigint, 
                timestamp bigint,                  
                date date, 
                week_part string,                  
                load_time timestamp,
                source_file string)
                PARTITIONED BY (topic, week_part)
        """
    spark.sql(ddl)
    print("Done")



def create_users(catalog, db_name):
    """Create (or replace) the ``users`` table.

    The statement is ``CREATE OR REPLACE TABLE``, so an existing ``users``
    table is replaced by this definition on every call.
    NOTE(review): most sibling tables use ``CREATE TABLE IF NOT EXISTS`` -
    confirm the replace is intended.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating users table...", end='')
    ddl = f"""
            CREATE OR REPLACE TABLE {catalog}.{db_name}.users(
                user_id bigint, 
                device_id bigint, 
                mac_address string,
                registration_timestamp timestamp
            )
        """
    spark.sql(ddl)
    print("Done")



def create_gym_logs(catalog, db_name):
    """Create (or replace) the ``gym_logs`` table.

    The statement is ``CREATE OR REPLACE TABLE``, so an existing ``gym_logs``
    table is replaced by this definition on every call.
    NOTE(review): most sibling tables use ``CREATE TABLE IF NOT EXISTS`` -
    confirm the replace is intended.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating gym_logs table...", end='')
    ddl = f"""
            CREATE OR REPLACE TABLE {catalog}.{db_name}.gym_logs(
                mac_address string,
                gym bigint,
                login timestamp,                      
                logout timestamp
            )
        """
    spark.sql(ddl)
    print("Done")



def create_user_profile(catalog, db_name):
    """Create the ``user_profile`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating user_profile table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.user_profile (
                 user_id bigint, 
                dob DATE, 
                sex STRING, 
                gender STRING, 
                first_name STRING, 
                last_name STRING, 
                street_address STRING, 
                city STRING, 
                state STRING, 
                zip INT, 
                updated TIMESTAMP
            )
        """
    spark.sql(ddl)
    print("Done")



def create_heart_rate(catalog, db_name):
    """Create the ``heart_rate`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating heart_rate table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.heart_rate (
                device_id LONG, 
                time TIMESTAMP, 
                heartrate DOUBLE, 
                valid BOOLEAN
            )
        """
    spark.sql(ddl)
    print("Done")




def create_user_bins(catalog, db_name):
    """Create the ``user_bins`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating user_bins table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.user_bins (
                user_id BIGINT, 
                age STRING, 
                gender STRING, 
                city STRING, 
                state STRING
            )
        """
    spark.sql(ddl)
    print("Done")



def create_workouts(catalog, db_name):
    """Create the ``workouts`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating workouts table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.workouts (
                user_id INT, 
                workout_id INT, 
                time TIMESTAMP, 
                action STRING, 
                session_id INT
            )
        """
    spark.sql(ddl)
    print("Done")
    


def create_completed_workouts(catalog, db_name):
    """Create the ``completed_workouts`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating completed_workouts table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.completed_workouts (
               user_id INT, 
               workout_id INT, 
               session_id INT, 
               start_time TIMESTAMP, 
               end_time TIMESTAMP
            )
        """
    spark.sql(ddl)
    print("Done")


def create_workout_bpm(catalog, db_name):
    """Create the ``workout_bpm`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating workout_bpm table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.workout_bpm (
                user_id INT, 
                workout_id INT, 
                session_id INT,
                start_time TIMESTAMP, 
                end_time TIMESTAMP,
                time TIMESTAMP, 
                heartrate DOUBLE
            )
        """
    spark.sql(ddl)
    print("Done")



def create_date_lookup(catalog, db_name):
    """Create the ``date_lookup`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating date_lookup table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.date_lookup (
                date date, 
                week int, 
                year int, 
                month int, 
                dayofweek int, 
                dayofmonth int, 
                dayofyear int, 
                week_part string
            )
        """
    spark.sql(ddl)
    print("Done")



def workout_bpm_summary(catalog, db_name):
    """Create the ``workout_bpm_summary`` table unless it already exists.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which the table is created.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print("Creating workout_bpm_summary table...", end='')
    ddl = f"""
            CREATE TABLE IF NOT EXISTS {catalog}.{db_name}.workout_bpm_summary (
                workout_id INT, 
                session_id INT, 
                user_id BIGINT, 
                age STRING, 
                gender STRING, 
                city STRING, 
                state STRING, 
                min_bpm DOUBLE, 
                avg_bpm DOUBLE, 
                max_bpm DOUBLE, 
                num_recordings BIGINT
            )
        """
    spark.sql(ddl)
    print("Done")


def create_gym_summary(catalog, db_name):
    """Create or replace the ``gym_summary`` view in ``catalog.db_name``.

    The view joins ``gym_logs`` with completed workouts (mapped to a
    ``mac_address`` through ``users``) whose ``start_time`` falls between the
    gym login and logout, and exposes per-visit ``minutes_in_gym`` and
    ``minutes_exercising``.

    The source tables are referenced by their fully qualified names so the
    view always reads from the same ``catalog.db_name`` it is created in,
    regardless of which schema happens to be current in the session.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database holding both the view and its source tables.

    Raises:
        ReferenceError: If the module-level ``initialized`` flag is falsy,
            i.e. ``create_database`` has not completed yet.
    """
    # Guard clause: refuse to issue DDL before the database has been set up.
    if not initialized:
        raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")

    print(f"Creating gym_summary table...", end='')
    # Qualify every source table: ``initialized`` only records that *some*
    # database was set up, not that it is the one passed in here.
    schema = f"{catalog}.{db_name}"
    spark.sql(f"""CREATE OR REPLACE VIEW {schema}.gym_summary AS
                    SELECT to_date(login::timestamp) date,
                    gym, l.mac_address, workout_id, session_id,
                    round((logout::long - login::long)/60,2) minutes_in_gym,
                    round((end_time::long - start_time::long)/60,2) minutes_exercising
                    FROM {schema}.gym_logs l
                    JOIN (
                    SELECT mac_address, workout_id, session_id, start_time, end_time
                    FROM {schema}.completed_workouts w INNER JOIN {schema}.users u ON w.user_id = u.user_id) w
                    ON l.mac_address = w.mac_address
                    AND w.start_time BETWEEN l.login AND l.logout
                    order by date, gym, l.mac_address, session_id
    """)
    print("Done")




def create_all_tables(catalog, db_name):
    """Create the database and then every bronze, silver and gold object.

    Calls ``create_database`` first, then each creator function in a fixed
    order, and finally prints the elapsed time in whole seconds.

    Args:
        catalog: Catalog that holds the target database.
        db_name: Database in which all objects are created.
    """
    import time

    began = int(time.time())
    create_database(catalog, db_name)

    # Creation order matters: gym_summary is a view over silver tables.
    creators = (
        # Bronze
        create_registered_users_bz,
        create_gym_logins_bz,
        create_kafka_multiplex_bz,
        # Silver
        create_users,
        create_gym_logs,
        create_user_profile,
        create_heart_rate,
        create_user_bins,
        create_workouts,
        create_completed_workouts,
        create_workout_bpm,
        create_date_lookup,
        # Gold
        workout_bpm_summary,
        create_gym_summary,
    )
    for creator in creators:
        creator(catalog, db_name)

    print(f"Setup completed in {int(time.time()) - began} seconds")



def assert_table(catalog, db_name, table_name):
    """Assert that exactly one non-temporary ``table_name`` is listed.

    The check runs ``SHOW TABLES IN catalog.db_name`` and filters the result
    on ``isTemporary == false`` and the given ``tableName``.

    Args:
        catalog: Catalog that holds the database.
        db_name: Database to look in.
        table_name: Name of the table expected to exist.

    Raises:
        AssertionError: If the filtered listing does not contain exactly
            one row.
    """
    listing = spark.sql(f"SHOW TABLES IN {catalog}.{db_name}")
    condition = f"isTemporary == false AND tableName == '{table_name}'"
    match_count = listing.filter(condition).count()

    assert match_count == 1, f"The table {table_name} is missing in {catalog}.{db_name}"
    print(f"Found {table_name} table in {catalog}.{db_name}: Success")



def validate_setup(catalog, db_name):
    """Verify that the database and all expected tables are present.

    First asserts that ``db_name`` appears exactly once in
    ``SHOW DATABASES IN catalog``, then calls ``assert_table`` for each
    expected object, and finally prints the elapsed time in whole seconds.

    Args:
        catalog: Catalog that holds the database.
        db_name: Database to validate.

    Raises:
        AssertionError: If the database or any expected table is missing.
    """
    import time

    # Checked in this order; the first missing object aborts validation.
    expected_tables = (
        "registered_users_bz",
        "gym_logins_bz",
        "kafka_multiplex_bz",
        "users",
        "gym_logs",
        "user_profile",
        "heart_rate",
        "workouts",
        "completed_workouts",
        "workout_bpm",
        "user_bins",
        "date_lookup",
        "workout_bpm_summary",
        "gym_summary",
    )

    began = int(time.time())
    print("\n Starting setup validation ...")

    # Check if database exists
    databases = spark.sql(f"SHOW DATABASES IN {catalog}")
    match_count = databases.filter(f"databaseName == '{db_name}'").count()

    assert match_count == 1, f"The database '{catalog}.{db_name}' is missing"
    print(f"Found database {catalog}.{db_name}: Success")

    for table_name in expected_tables:
        assert_table(catalog, db_name, table_name)

    print(f"Setup validation completed in {int(time.time()) - began} seconds")


def cleanup(catalog, db_name):
    """Drop ``catalog.db_name`` (with CASCADE) if it exists.

    Best-effort: any exception raised while checking for or dropping the
    database is caught and reported with ``print`` instead of propagating.

    NOTE(review): the module-level ``initialized`` flag is not reset here, so
    it stays True after the database has been dropped - confirm whether
    callers rely on that.

    Args:
        catalog: Catalog that holds the database.
        db_name: Database to drop.
    """
    try:
        # Check if the database exists
        databases = spark.sql(f"SHOW DATABASES IN {catalog}")
        match_count = databases.filter(f"databaseName == '{db_name}'").count()

        if match_count != 1:
            print(f"Database {catalog}.{db_name} does not exist. Skipping drop.")
        else:
            print(f"Dropping the database {catalog}.{db_name}...", end='')
            # Clear the session cache before the drop.
            spark.catalog.clearCache()
            spark.sql(f"DROP DATABASE {catalog}.{db_name} CASCADE")
            print("Done")
    except Exception as e:
        print(f"Error while dropping database: {str(e)}")
