In [8]:
import pandas as pd
import numpy as np
from catnip.fla_redshift import FLA_Redshift
from sqlalchemy import null
from datetime import datetime

from prefect.blocks.system import Secret
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

AttributeError: `np.string_` was removed in the NumPy 2.0 release. Use `np.bytes_` instead.

In [3]:
def get_redshift_credentials() -> Dict:
    """Assemble the keyword arguments that are later unpacked into ``FLA_Redshift``.

    Secret values are read from Prefect ``Secret`` blocks; the port, the S3
    subdirectory and the verbosity flag are fixed literals.

    Returns:
        Dict: Redshift connection settings (``dbname``, ``host``, ``port``,
        ``user``, ``password``), AWS/S3 settings (``aws_access_key_id``,
        ``aws_secret_access_key``, ``bucket``, ``subdirectory``) and
        ``verbose``.
    """

    def read_secret(block_name):
        # Load the named Secret block and hand back its stored value.
        return Secret.load(block_name).get()

    # Redshift connection settings.
    connection = {
        "dbname": read_secret("stellar-redshift-db-name"),
        "host": read_secret("stellar-redshift-host"),
        "port": 5439,
        "user": read_secret("stellar-redshift-user-name"),
        "password": read_secret("stellar-redshift-password"),
    }

    # AWS / S3 settings.
    storage = {
        "aws_access_key_id": read_secret("fla-s3-aws-access-key-id-east-1"),
        "aws_secret_access_key": read_secret("fla-s3-aws-secret-access-key-east-1"),
        "bucket": read_secret("fla-s3-bucket-name-east-1"),
        "subdirectory": "us-east-1",
    }

    return {**connection, **storage, "verbose": False}

# Resolve the credentials on a single worker thread and block until the
# result is available.
# NOTE(review): presumably the thread hop keeps Prefect's Secret.load away
# from the notebook's already-running event loop — confirm.
with ThreadPoolExecutor(max_workers=1) as pool:
    rs_creds = pool.submit(get_redshift_credentials).result()

In [4]:
# Historical show rates per (season, tier).
#   * `historical` inner-joins custom.cth_v_historical_ticket to
#     custom.cth_game_descriptions on the calendar date of event_datetime,
#     keeping only the four listed seasons and games dated before
#     current_date. Each joined row gets a 0/1 flag for "comp and attended"
#     and a 0/1 flag for "paid and attended".
#   * `tier_show_rate` divides the summed flags by the summed comp_seats /
#     paid_seats for every season/tier pair.
# NOTE(review): both divisions are unguarded — a season/tier whose seat total
# is 0 would divide by zero; consider NULLIF(sum(...), 0) in the denominators.
# NOTE(review): the second CASE reads is_comp / did_attended without the
# `ticket.` qualifier used in the first CASE — presumably the same columns;
# confirm and qualify for consistency.
q = """
WITH historical AS (
    SELECT
        game_desc.season,
        game_desc.tier,
        ticket.event_date::date,
        ticket.comp_seats::float,
        ticket.paid_seats::float,
        CASE
            WHEN ticket.is_comp = TRUE AND ticket.did_attended = TRUE THEN 1
            ELSE 0
        END AS "comp_seats_attended",
        CASE
            WHEN is_comp = FALSE AND did_attended = TRUE THEN 1
            ELSE 0
        END AS "paid_seats_attended"
    FROM
        custom.cth_v_historical_ticket ticket
    INNER JOIN
        custom.cth_game_descriptions game_desc
            ON ticket.event_datetime::date = game_desc.event_datetime::date
            AND game_desc.season IN ('2021-22', '2022-23', '2023-24', '2024-25')
            AND game_desc.event_datetime < current_date
),
tier_show_rate AS (
    SELECT
        season,
        tier,
        sum(historical.comp_seats_attended)::float / sum(historical.comp_seats) AS "comp_show_rate",
        sum(historical.paid_seats_attended)::float / sum(historical.paid_seats) AS "paid_show_rate"
    FROM
        historical
    GROUP BY
        season, tier
)
SELECT * FROM tier_show_rate
"""

# Run the query through FLA_Redshift using the credentials in rs_creds. The
# final SELECT yields season, tier, comp_show_rate and paid_show_rate, which
# the later cells read from initial_df.
initial_df = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)

NameError: name 'FLA_Redshift' is not defined

In [46]:
# Per-season weights used when blending historical show rates; the later
# seasons carry the larger values.
weights = {
    '2021-22': 0.5,
    '2022-23': 0.75,
    '2023-24': 1.25,
    '2024-25': 1.5,
}

# Attach each row's season weight; a season missing from the mapping gets NaN.
initial_df['weights'] = initial_df['season'].map(weights)

def weighted_paid_average(group):
    """Weighted mean of ``paid_show_rate`` for one group, weighted by ``weights``.

    Rows where either the rate or the weight is missing are left out of both
    the numerator and the denominator. Previously a row with a missing rate
    was skipped in the numerator but its weight still counted in the
    denominator, which pulled the average down.

    Args:
        group: DataFrame with ``paid_show_rate`` and ``weights`` columns.

    Returns:
        pd.Series: a single entry, ``weighted_paid_average``. The value is NaN
        when no row has both a rate and a weight, or when the usable weights
        sum to zero.
    """
    rates = group['paid_show_rate']
    season_weights = group['weights']

    # Only rows with both a rate and a weight may contribute to either sum.
    usable = rates.notna() & season_weights.notna()
    total_weight = season_weights[usable].sum()

    if total_weight == 0:
        # Nothing to average over; report "unknown" instead of dividing by 0.
        wavg = float('nan')
    else:
        wavg = (rates[usable] * season_weights[usable]).sum() / total_weight

    return pd.Series({
        'weighted_paid_average': wavg
    })

# One weighted paid show rate per tier. Only the two columns the helper reads
# are selected, so the grouping column is not handed to apply (pandas >= 2.2
# emits a deprecation warning when it is).
paid_tiers = (
    initial_df
    .groupby('tier')[['paid_show_rate', 'weights']]
    .apply(weighted_paid_average)
    .reset_index()
)

def weighted_comp_average(group):
    """Weighted mean of ``comp_show_rate`` for one group, weighted by ``weights``.

    Rows where either the rate or the weight is missing are left out of both
    the numerator and the denominator. Previously a row with a missing rate
    was skipped in the numerator but its weight still counted in the
    denominator, which pulled the average down.

    Args:
        group: DataFrame with ``comp_show_rate`` and ``weights`` columns.

    Returns:
        pd.Series: a single entry, ``weighted_comp_average``. The value is NaN
        when no row has both a rate and a weight, or when the usable weights
        sum to zero.
    """
    rates = group['comp_show_rate']
    season_weights = group['weights']

    # Only rows with both a rate and a weight may contribute to either sum.
    usable = rates.notna() & season_weights.notna()
    total_weight = season_weights[usable].sum()

    if total_weight == 0:
        # Nothing to average over; report "unknown" instead of dividing by 0.
        wavg = float('nan')
    else:
        wavg = (rates[usable] * season_weights[usable]).sum() / total_weight

    return pd.Series({
        'weighted_comp_average': wavg
    })

# One weighted comp show rate per tier. Only the two columns the helper reads
# are selected, so the grouping column is not handed to apply (pandas >= 2.2
# emits a deprecation warning when it is).
comp_tiers = (
    initial_df
    .groupby('tier')[['comp_show_rate', 'weights']]
    .apply(weighted_comp_average)
    .reset_index()
)

# Combine the paid and comp rates into a single row per tier.
tiers = pd.merge(paid_tiers, comp_tiers, on = 'tier', how = 'left')

In [54]:
# Seat counts per game from custom.cth_v_ticket_2425: paid_seats and
# comp_seats are summed per (event_datetime, tier), and days_from_event is the
# day difference from current_date to the event (negative for past events).
# `tier` is not qualified; presumably it comes from the left-joined
# custom.cth_game_descriptions — confirm.
# NOTE(review): this join matches on game_desc.event_date, while the
# historical query earlier in the notebook matches on
# game_desc.event_datetime — confirm both columns exist and are equivalent.
# NOTE: `q` is reused here and overwrites the historical query string.
q = """
select
    cth_v_ticket_2425.event_datetime,
    tier,
    datediff('days', current_date, cth_v_ticket_2425.event_datetime) as days_from_event,
    sum(paid_seats) as paid_seats,
    sum(comp_seats) as comp_seats
from
    custom.cth_v_ticket_2425
left join
    custom.cth_game_descriptions game_desc ON date(cth_v_ticket_2425.event_datetime) = date(game_desc.event_date)
group by
    cth_v_ticket_2425.event_datetime, tier
order by
    cth_v_ticket_2425.event_datetime
"""

# Run the query through FLA_Redshift using the credentials in rs_creds.
current_df = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)

In [55]:
# Project attendance per game: attach each game's tier-level show rates, then
# scale the seat counts by them. Games whose tier has no entry in `tiers` keep
# NaN rates (left merge) and therefore NaN attendance figures.
final_df = (
    pd.merge(current_df, tiers, on = 'tier', how = 'left')
    .assign(
        paid_attendance = lambda games: games['paid_seats'] * games['weighted_paid_average'],
        comp_attendance = lambda games: games['comp_seats'] * games['weighted_comp_average'],
        # Later keyword arguments can see the columns created just above.
        total_attendance = lambda games: games['paid_attendance'] + games['comp_attendance'],
    )
)

final_df

Unnamed: 0,event_datetime,tier,days_from_event,paid_seats,comp_seats,weighted_paid_average,weighted_comp_average,paid_attendance,comp_attendance,total_attendance
0,2024-09-28 18:00:00,PS,-18,14313,456,0.574348,0.448042,8220.637878,204.307246,8424.945124
1,2024-09-30 19:00:00,PS,-16,13634,421,0.574348,0.448042,7830.655826,188.625769,8019.281595
2,2024-10-08 19:00:00,B,-8,19204,609,0.891159,0.704965,17113.814988,429.323697,17543.138685
3,2024-10-17 19:00:00,E,1,16417,465,0.819578,0.667316,13455.017288,310.30187,13765.319159
4,2024-10-19 18:00:00,D,3,18091,321,0.834918,0.698592,15104.49468,224.248092,15328.742772
5,2024-10-22 18:30:00,E,6,15256,351,0.819578,0.667316,12503.486858,234.227863,12737.714721
6,2024-11-02 18:00:00,F,17,12289,1507,,,,,
7,2024-11-07 19:00:00,E,22,15640,318,0.819578,0.667316,12818.204933,212.20644,13030.411374
8,2024-11-09 18:00:00,C,24,17957,298,0.862451,0.707202,15487.024433,210.746309,15697.770742
9,2024-11-12 19:30:00,E,27,15232,322,0.819578,0.667316,12483.816978,214.875704,12698.692682
