In [None]:
import pandas as pd
import numpy as np
from prefect.blocks.system import Secret
from catnip.fla_redshift import FLA_Redshift
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

import polars as pl
import pyarrow

In [2]:
def get_redshift_credentials() -> Dict:

    cred_dict = {
        "dbname": Secret.load("stellar-redshift-db-name").get(),
        "host": Secret.load("stellar-redshift-host").get(),
        "port": 5439,
        "user": Secret.load("stellar-redshift-user-name").get(),
        "password": Secret.load("stellar-redshift-password").get(),

        "aws_access_key_id": Secret.load("fla-s3-aws-access-key-id-east-1").get(),
        "aws_secret_access_key": Secret.load("fla-s3-aws-secret-access-key-east-1").get(),
        "bucket": Secret.load("fla-s3-bucket-name-east-1").get(),
        "subdirectory": "us-east-1",

        "verbose": False,
    }

    return cred_dict

with ThreadPoolExecutor(1) as pool:
    rs_creds = pool.submit(lambda: get_redshift_credentials()).result()

In [None]:
# get game info data

q = """
select 
    date(event_date) as event_date,
    tier,
    is_premier, 
    cast(original_six_plus_extra*100 as int) as original_six_plus_extra
from  
    custom.cth_game_descriptions
where 
    season = '2023-24'
"""
tier_df = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)
pl_tier_df = pl.from_pandas(tier_df)

In [None]:
# get 24/25 ticket data

#df_2324 = pl.read_csv("C:\\Users\\riffere\\Florida Panthers\\SP-BS - Documents\\Data Science\\Resources\\Files\\emily_ticket_sales_model_data_final.csv")  

q = """
with arena_levels as
    (select
        CASE
            WHEN price_level IN ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', '1', '2', '3', '4', '5', '6', '7', '8') THEN 'Lowers'
            WHEN price_level IN ('K', 'L', 'M') THEN 'Clubs'
            WHEN price_level IN ('N', 'O', 'P', 'Q', 'R', 'S', 'T', 'Y') THEN 'Uppers'
            WHEN price_level IN ('U', 'V', 'W') THEN 'Suites'
            WHEN price_level IN ('X', 'Z') THEN 'Premium'
            ELSE 'Unknown'
        END AS arena_level_internal,
        count(distinct seat_id) as capacity
    from
        custom.cth_v_manifest_2324
    group by
        arena_level_internal),
event_dates as
    (select
        event_datetime,
        date(transaction_date) as transaction_date,
        datediff('days', date(transaction_date), date(event_datetime)) as days_out
    from
        custom.cth_v_ticket_2324
    group by
        event_datetime,
        date(transaction_date)),
cross_join as
    (select
        *
    from
        arena_levels
    cross join
        event_dates),
ticket_info as
    (select
        event_datetime,
        datediff('days',date(transaction_date), date(event_datetime)) as days_out,
        arena_level_internal,
        sum(paid_seats) as paid_seats,
        sum(gross_revenue) as gross_revenue
    from
        custom.cth_v_ticket_2324
    group by
        event_datetime,
        date(transaction_date),
        arena_level_internal)
select
    date(cross_join.event_datetime) as event_date,
    cross_join.days_out,
    cross_join.arena_level_internal,
    capacity,
    coalesce(paid_seats,0) as paid_seats
from
    cross_join
left join
    ticket_info on cross_join.event_datetime = ticket_info.event_datetime
    and cross_join.arena_level_internal = ticket_info.arena_level_internal
    and cross_join.days_out = ticket_info.days_out
order by
    event_date,
    arena_level_internal,
    days_out
"""

df_2324 = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)
df_2324 = pl.from_pandas(df_2324)

### coalesce(gross_revenue::int,0) as gross_revenue

In [None]:
# create tickets left to sell by days out descending

#df_2324['cumulative_tickets']  = df_2324.groupby(['event_date', 'arena_level_internal'])['paid_seats'].cumsum()
df_2324 = df_2324.with_columns(
    pl.col("paid_seats").cum_sum().over(["event_date", "arena_level_internal"]).alias("cumulative_tickets")
)

In [None]:
# join game info data on 24/25 ticket data

#df_2324 = df_2324.merge(tier_df, on = 'event_date', how = 'left')
df_2324 = df_2324.join(pl_tier_df, on="event_date", how="left")

In [None]:
# make all int64 to int16 columns to make it run faster

int64_columns = df_2324.select(pl.col(pl.Int64)).columns
    
df_2324 = df_2324.with_columns([
    pl.col(col).cast(pl.Int16) for col in int64_columns])

In [None]:
# create model columns: dow, tier_num, arena_level_num, cap_remaining and filter out non-regular season games


# df_2324 = df_2324.with_columns([
#     pl.col('event_date').cast(pl.Date)
# ])

# df_2324 = df_2324.with_columns([
#     pl.col('gross_revenue').cast(pl.Int16)
# ])

df_2324 = df_2324.with_columns([
    pl.col('event_date').dt.weekday().cast(pl.Int16).alias('dow')
    ])

# pcs = sorted(df_2324['pc_one'].unique())
# pc_dict = dict((value,count) for count, value in enumerate(pcs))
# df_2324 = df_2324.with_columns([
#     pl.col('pc_one').map_elements(
#         lambda x: pc_dict.get(x, None)
#     ).cast(pl.Int16)
#     .alias('pc_number')
# ])

df_2324 = df_2324.with_columns([
    pl.when(pl.col('tier') == 'A').then(5)
    .when(pl.col('tier') == 'B').then(4)
    .when(pl.col('tier') == 'C').then(3)
    .when(pl.col('tier') == 'D').then(2)
    .otherwise(1)
    .cast(pl.Int16)
    .alias('tier_num')
])

df_2324 = df_2324.with_columns([
    pl.when(pl.col('arena_level_internal') == 'Clubs').then(6)
    .when(pl.col('arena_level_internal') == 'Lowers').then(5)
    .when(pl.col('arena_level_internal') == 'Uppers').then(4)
    .when(pl.col('arena_level_internal') == 'Suites').then(3)
    .when(pl.col('arena_level_internal') == 'Premium').then(2)
    .otherwise(1)
    .cast(pl.Int16)
    .alias('arena_level_num')
])

df_2324 = df_2324.with_columns(
    pl.col('capacity').sub(pl.col('cumulative_tickets')).alias('cap_remaining'))

df_2324 = df_2324.filter(
    (pl.col("tier").is_in(['A','B','C','D','E'])) & (pl.col("days_out") >= 0)
)

In [11]:
# df_2324['dow'] = [datetime.weekday(x) for x in df_2324['event_date']]
# df_2324['tier_num'] = [5 if tier == 'A' else (4 if tier == 'B' else (3 if tier == 'C' else (2 if tier == 'D' else 1))) for tier in df_2324['tier']]
# #df_2324['random'] = [x for x in (np.random.rand(len(df_2324),1)/2)]

# # pcs = sorted(df_2324['pc_one'].unique())
# # pc_dict = dict((value,count) for count, value in enumerate(pcs))
# # df_2324['pc_num'] = df_2324.apply(lambda row: pc_dict[row['pc_one']], axis = 1)


# df_2324['arena_level_num'] = [6 if arena_level_internal == 'Premium' else (5 if arena_level_internal == 'Clubs' else (4 if arena_level_internal == 'Lowers' else 
#                             (3 if arena_level_internal == 'Uppers' else (2 if arena_level_internal == 'Suites' else 1)))) for arena_level_internal in df_2324['arena_level_internal']]

# #df_2324 = df_2324.sample(n=len(df_2324), random_state=1993)
# df_2324 = df_2324.reset_index()

In [12]:
# def run_model(df_train, df_test):

#     #df_train_subset = df_train[df_train['date_diff'] == days_out]

#     X_train = df_train[['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra','days_out', 'cap_remaining']]
#     y_train = df_train[['cumulative_tickets']]

#     #df_test_subset = df_test[(df_test['days_out'] == days_out) & (df_test['ticket_type_final'] == 'Not Sold')]

#     X_test = df_test[['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra', 'days_out', 'cap_remaining']]
#     #y_test = df_test[['is_sold']]
 
#     if len(X_test) > 0:
#         ss = StandardScaler()
#         x_train_scaled = ss.fit_transform(X_train)
#         x_test_scaled = ss.fit_transform(X_test)

#         clf = RandomForestClassifier(random_state = 1993)
#         clf.fit(x_train_scaled, y_train)

#         predicted_df = pd.DataFrame(data = clf.predict(x_test_scaled), columns = ['cumulative_tickets_predicted'])
#         predicted_df = pl.from_pandas(predicted_df)
#         final_df = pl.concat([df_test, predicted_df], how = 'horizontal')

#         return final_df

In [None]:
# get 24/25 ticket data

q = """
with arena_levels as
    (select
        CASE
            WHEN price_level IN ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', '1', '2', '3', '4', '5', '6', '7', '8') THEN 'Lowers'
            WHEN price_level IN ('K', 'L', 'M') THEN 'Clubs'
            WHEN price_level IN ('N', 'O', 'P', 'Q', 'R', 'S', 'T', 'Y') THEN 'Uppers'
            WHEN price_level IN ('U', 'V', 'W') THEN 'Suites'
            WHEN price_level IN ('X', 'Z') THEN 'Premium'
            ELSE 'Unknown'
        END AS arena_level_internal,
        count(distinct seat_id) as capacity
    from
        custom.cth_v_manifest_2425
    group by
        arena_level_internal),
event_dates as
    (select
        distinct event_datetime
    from
        custom.cth_v_ticket_2425),
cross_join as
    (select
        *
    from
        arena_levels
    cross join
        event_dates),
current_info as
    (select
         event_datetime,
         arena_level_internal,
         sum(paid_seats) as paid_seats,
         sum(gross_revenue) as gross_revenue
    from
        custom.cth_v_ticket_2425
    group by
        event_datetime,
        arena_level_internal)
select
    date(cross_join.event_datetime) as event_date,
    cross_join.arena_level_internal,
    datediff('days', current_date, cth_game_descriptions.event_datetime) as days_out,
    tier,
    cast(original_six_plus_extra*100 as int) as original_six_plus_extra,
    is_premier,
    capacity,
    paid_seats,
    capacity-paid_seats as cap_remaining
from
    cross_join
left join
    current_info on cross_join.arena_level_internal = current_info.arena_level_internal
    and cross_join.event_datetime = current_info.event_datetime
left join
    custom.cth_game_descriptions on cross_join.event_datetime = cth_game_descriptions.event_datetime
 """

df_2425 = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)
# df_2425['days_out'] = df_2425['days_out'].astype(np.int64)
# df_2425['original_six_plus_extra'] = df_2425['original_six_plus_extra'].astype(np.float32)
# df_2425['days_out']
# for col in df_2425.columns:
#     print(df_2425[col].dtype)
df_2425 = pl.from_pandas(df_2425)

### gross_revenue,

In [None]:
# create model columns: dow, tier_num, arena_level_num, cap_remaining and filter out non-regular season games


# df_2425['dow'] = [datetime.weekday(x) for x in df_2425['event_datetime']]
# df_2425['tier_num'] = [5 if tier == 'A' else (4 if tier == 'B' else (3 if tier == 'C' else (2 if tier == 'D' else 1))) for tier in df_2425['tier']]
#df_2324['random'] = [x for x in (np.random.rand(len(df_2324),1)/2)]

# pcs = sorted(df_2324['pc_one'].unique())
# pc_dict = dict((value,count) for count, value in enumerate(pcs))
# df_2324['pc_num'] = df_2324.apply(lambda row: pc_dict[row['pc_one']], axis = 1)


# df_2425['arena_level_num'] = [6 if arena_level_internal == 'Premium' else (5 if arena_level_internal == 'Clubs' else (4 if arena_level_internal == 'Lowers' else 
#                             (3 if arena_level_internal == 'Uppers' else (2 if arena_level_internal == 'Suites' else 1)))) for arena_level_internal in df_2425['arena_level_internal']]

#df_2425 = df_2425.sample(n=len(df_2324), random_state=1993)
# df_2425 = df_2425.reset_index()

df_2425 = df_2425.with_columns([
    pl.col('event_date').dt.weekday().cast(pl.Int16).alias('dow')
    ])

# pcs = sorted(df_2425['pc_one'].unique())
# pc_dict = dict((value,count) for count, value in enumerate(pcs))
# df_2425 = df_2425.with_columns([
#     pl.col('pc_one').map_elements(
#         lambda x: pc_dict.get(x, None)
#     ).cast(pl.Int16)
#     .alias('pc_number')
# ])

df_2425 = df_2425.with_columns([
    pl.when(pl.col('tier') == 'A').then(5)
    .when(pl.col('tier') == 'B').then(4)
    .when(pl.col('tier') == 'C').then(3)
    .when(pl.col('tier') == 'D').then(2)
    .otherwise(1)
    .cast(pl.Int16)
    .alias('tier_num')
])


df_2425 = df_2425.with_columns([
    pl.when(pl.col('arena_level_internal') == 'Clubs').then(6)
    .when(pl.col('arena_level_internal') == 'Lowers').then(5)
    .when(pl.col('arena_level_internal') == 'Uppers').then(4)
    .when(pl.col('arena_level_internal') == 'Suites').then(3)
    .when(pl.col('arena_level_internal') == 'Premium').then(2)
    .otherwise(1)
    .cast(pl.Int16)
    .alias('arena_level_num')
])

# df_2425 = df_2425.with_columns(
#     pl.col('capacity').sub(pl.col('cumulative_tickets')).alias('cap_remaining'))

df_2425 = df_2425.filter(
    (pl.col("tier").is_in(['A','B','C','D','E'])) & (pl.col("days_out") >= 0)
)

In [None]:
# make all int64 to int16 columns to make it run faster

int64_columns = df_2425.select(pl.col(pl.Int64)).columns
    
df_2425 = df_2425.with_columns([
    pl.col(col).cast(pl.Int16) for col in int64_columns])

In [None]:
# create model

def run_model(df, df_future, arena_level):

    x_train_table = df.filter(
        (pl.col("arena_level_internal").is_in([arena_level])))

    X_train = x_train_table.select(['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra','days_out', 'cap_remaining'])
    y_train = x_train_table.select(['cumulative_tickets'])

    x_test_table = df_future.filter(
        (pl.col("arena_level_internal").is_in([arena_level])))
    
    X_test = x_test_table.select(['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra', 'days_out', 'cap_remaining'])

    ss = StandardScaler()
    x_train_scaled = ss.fit_transform(X_train)
    x_test_scaled = ss.fit_transform(X_test)

    polynomial = LinearRegression().fit(x_train_scaled, np.array(y_train).ravel())

    return polynomial.predict(x_test_scaled)

In [None]:
# run by arena_level

arena_levels = ['Lowers','Premium','Uppers', 'Suites','Clubs']

final_df = pl.DataFrame(
    schema= {
        'event_date': pl.Date,
        'arena_level_internal': pl.String,
        'days_out': pl.Int16,
        'tier': pl.String,
        'original_six_plus_extra': pl.Int16,
        'is_premier': pl.Boolean,
        'capacity': pl.Int16,
        'paid_seats': pl.Int16,
        'cap_remaining': pl.Int16,
        'dow': pl.Int16,
        'tier_num': pl.Int16,
        'arena_level_num': pl.Int16,
        'literal' : pl.Float64
    }
)

for arena_level in arena_levels:

    temp = df_2425.filter(
        (pl.col("arena_level_internal").is_in([arena_level])))
    
    result = run_model(df_2324, df_2425, arena_level)

    temp = temp.with_columns([result])

    final_df = pl.concat([final_df,temp], how = 'vertical')

final_df = final_df.rename({'literal':'cumulative_tickets_predicted'})

In [None]:
# create cumulative_tickets_predicted column so its greater than 0 and less than cap_remianing

final_df = final_df.with_columns(
        pl.when(pl.col("cumulative_tickets_predicted") < 0)
        .then(0)
        .when(pl.col("cap_remaining") < pl.col("cumulative_tickets_predicted"))
        .then(pl.col("cap_remaining"))
        .otherwise(pl.col("cumulative_tickets_predicted"))
        .alias("cumulative_tickets_predicted")
)

In [None]:
# get total tickets prediction

final_df = final_df.with_columns([
    ((pl.col('paid_seats') + pl.col('cumulative_tickets_predicted'))
    .alias('total_predicted_tickets'))
])

In [None]:
# get historical show rate data

q = """
WITH historical AS (
    SELECT
        game_desc.season,
        game_desc.tier,
        arena_level_internal,
        ticket.event_date::date,
        ticket.comp_seats::float,
        ticket.paid_seats::float,
        CASE
            WHEN ticket.is_comp = TRUE AND ticket.did_attended = TRUE THEN 1
            ELSE 0
        END AS "comp_seats_attended",
        CASE
            WHEN is_comp = FALSE AND did_attended = TRUE THEN 1
            ELSE 0
        END AS "paid_seats_attended"
    FROM
        custom.cth_v_historical_ticket ticket
    INNER JOIN
        custom.cth_game_descriptions game_desc
            ON ticket.event_datetime::date = game_desc.event_datetime::date
            AND game_desc.season IN ('2021-22', '2022-23', '2023-24', '2024-25')
            AND game_desc.event_datetime < current_date
),
tier_show_rate AS (
    SELECT
        season,
        tier,
        arena_level_internal,
        sum(historical.comp_seats_attended)::float / nullif(sum(historical.comp_seats),0) AS "comp_show_rate",
        sum(historical.paid_seats_attended)::float / nullif(sum(historical.paid_seats),0) AS "paid_show_rate"
    FROM
        historical
    GROUP BY
        season, tier, arena_level_internal
)
SELECT
    *
FROM
    tier_show_rate
where 
    tier != 'F'
"""

show_rate = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)

In [None]:
# get weighted show_rate avergaes over last 4 seasons

weights = {'2021-22':0.5, '2022-23': .75, '2023-24':1.25,'2024-25':1.5}

show_rate['weights'] = show_rate['season'].map(weights)

def weighted_paid_average(group):
    # Calculate the weighted sum
    weighted_sum = (group['paid_show_rate'] * group['weights']).sum()
    
    # Calculate the weight sum
    weight_sum = group['weights'].sum()
    
    # Calculate the weighted average
    wavg = weighted_sum / weight_sum
    
    return pd.Series({
        'weighted_paid_average': wavg
    })

paid_tiers = show_rate.groupby(by = ['tier', 'arena_level_internal']).apply(weighted_paid_average).reset_index()

def weighted_comp_average(group):
    # Calculate the weighted sum
    weighted_sum = (group['comp_show_rate'] * group['weights']).sum()
    
    # Calculate the weight sum
    weight_sum = group['weights'].sum()
    
    # Calculate the weighted average
    wavg = weighted_sum / weight_sum
    
    return pd.Series({
        'weighted_comp_average': wavg
    })

comp_tiers = show_rate.groupby(by = ['tier', 'arena_level_internal']).apply(weighted_comp_average).reset_index()

tiers = pd.merge(paid_tiers, comp_tiers, on = ['tier', 'arena_level_internal'], how = 'left')
tiers = pl.from_pandas(tiers)

  paid_tiers = show_rate.groupby(by = ['tier', 'arena_level_internal']).apply(weighted_paid_average).reset_index()
  comp_tiers = show_rate.groupby(by = ['tier', 'arena_level_internal']).apply(weighted_comp_average).reset_index()


In [None]:
# merge onto 24/25 data and predict attendance

final_df = final_df.join(tiers, on = ['tier','arena_level_internal'])

final_df = final_df.with_columns([
    ((pl.col('total_predicted_tickets') * pl.col('weighted_paid_average'))
    .alias('total_attendance'))
])

In [37]:
final_df.write_csv('C:\\Users\\riffere\\Desktop\\output.csv')