In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, date
from prefect.blocks.system import Secret
from catnip.fla_redshift import FLA_Redshift
from typing import Dict
from concurrent.futures import ThreadPoolExecutor
from datetime import date

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from collections import Counter

from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import polars as pl
import pyarrow

In [3]:
def get_redshift_credentials() -> Dict:

    cred_dict = {
        "dbname": Secret.load("stellar-redshift-db-name").get(),
        "host": Secret.load("stellar-redshift-host").get(),
        "port": 5439,
        "user": Secret.load("stellar-redshift-user-name").get(),
        "password": Secret.load("stellar-redshift-password").get(),

        "aws_access_key_id": Secret.load("fla-s3-aws-access-key-id-east-1").get(),
        "aws_secret_access_key": Secret.load("fla-s3-aws-secret-access-key-east-1").get(),
        "bucket": Secret.load("fla-s3-bucket-name-east-1").get(),
        "subdirectory": "us-east-1",

        "verbose": False,
    }

    return cred_dict

with ThreadPoolExecutor(1) as pool:
    rs_creds = pool.submit(lambda: get_redshift_credentials()).result()

In [3]:
# q = """
# with seats as
#     (select
#         distinct seat_id, pc_one
#     from
#         custom.cth_manifest_2223),
# event_dates as
#     (select
#         event_date, tier, is_premier, original_six_plus_extra, abbreviation
#     from
#         custom.cth_game_descriptionsv  
#     where
#         season = '2023-24' and game_type = 1 and event_date = '2023-10-19'),
# transaction_dates as
#     (select
#          transaction_date
#      from
#          custom.cth_v_ticket_2324
#      where
#         date(event_datetime) >= date(transaction_date)),
# other as
#     (select
#         event_datetime, section, row, seat, transaction_date,
#         cast(section as varchar)+'-'+cast(row as varchar)+'-'+cast(seat as varchar) as seat_id, gross_revenue, ticket_type
#     from
#         custom.cth_v_ticket_2324),
# base as
#     (select
#         *
#     from
#         event_dates
#     cross join
#         seats
#     cross join
#         transaction_dates)
# select
#     base.event_date, base.transaction_date, tier, is_premier::int, datediff(day, date(other.transaction_date), date(base.event_date)) as days_out,
#     original_six_plus_extra, base.seat_id, abbreviation, pc_one,
#         CASE
#             WHEN pc_one in ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', '1', '2', '3', '4', '5', '6', '7', '8')
#             THEN 'Lowers'
#             WHEN pc_one in ('K', 'L', 'M') THEN 'Clubs'
#             WHEN pc_one in ('N', 'O', 'P', 'Q', 'R', 'S', 'T') THEN 'Uppers'
#             WHEN pc_one in ('U', 'V', 'W') THEN 'Suites'
#             WHEN pc_one in ('X') THEN 'Amerant'
#             WHEN pc_one in ('Y') THEN 'Loft'
#             WHEN pc_one in ('Z') THEN 'Corona'
#             ELSE 'unknown'
#             END AS location,
#     CASE
#         when gross_revenue > 0 then gross_revenue
#         else 0
#     end as block_purchase_price,
#     CASE
#         when ticket_type IS NOT NULL then ticket_type
#         else 'Not Sold'
#     end as ticket_type_final,
#     CASE
#         when ticket_type_final in ('Full', 'Annual Suites', 'Premier', 'Flex', 'Quarter', 'Sponsor', 'Trade') then 'Plans'
#         when ticket_type_final in ('Not Sold') then 'Not Sold'
#         else 'Nightly'
#     end as ticket_type_group
# from
#     base
# left join
#     other on date(base.event_date) = date(other.event_datetime) and base.seat_id = other.seat_id
# order by
#     base.event_date, base.seat_id
# """
# df_2324 = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)

In [4]:
q = """
select 
    date(event_date) as event_date,
    tier,
    is_premier, 
    original_six_plus_extra::int
from  
    custom.cth_game_descriptions
where 
    season = '2023-24'
"""
tier_df = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)
#pl_tier_df = pl.from_pandas(tier_df)

In [5]:
#df_2324 = pl.read_csv("C:\\Users\\riffere\\Florida Panthers\\SP-BS - Documents\\Data Science\\Resources\\Files\\emily_ticket_sales_model_data_final.csv")  

q = """
with arena_levels as
    (select
        CASE
            WHEN price_level IN ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', '1', '2', '3', '4', '5', '6', '7', '8') THEN 'Lowers'
            WHEN price_level IN ('K', 'L', 'M') THEN 'Clubs'
            WHEN price_level IN ('N', 'O', 'P', 'Q', 'R', 'S', 'T', 'Y') THEN 'Uppers'
            WHEN price_level IN ('U', 'V', 'W') THEN 'Suites'
            WHEN price_level IN ('X', 'Z') THEN 'Premium'
            ELSE 'Unknown'
        END AS arena_level_internal,
        count(distinct seat_id) as capacity
    from
        custom.cth_v_manifest_2324
    group by
        arena_level_internal),
event_dates as
    (select
        event_datetime,
        date(transaction_date) as transaction_date,
        datediff('days', date(transaction_date), date(event_datetime)) as days_out
    from
        custom.cth_v_ticket_2324
    group by
        event_datetime,
        date(transaction_date)),
cross_join as
    (select
        *
    from
        arena_levels
    cross join
        event_dates),
ticket_info as
    (select
        event_datetime,
        datediff('days',date(transaction_date), date(event_datetime)) as days_out,
        arena_level_internal,
        sum(paid_seats) as paid_seats,
        sum(gross_revenue) as gross_revenue
    from
        custom.cth_v_ticket_2324
    group by
        event_datetime,
        date(transaction_date),
        arena_level_internal)
select
    date(cross_join.event_datetime) as event_date,
    cross_join.days_out,
    cross_join.arena_level_internal,
    capacity,
    coalesce(paid_seats,0) as paid_seats,
    coalesce(gross_revenue::int,0) as gross_revenue
from
    cross_join
left join
    ticket_info on cross_join.event_datetime = ticket_info.event_datetime
    and cross_join.arena_level_internal = ticket_info.arena_level_internal
    and cross_join.days_out = ticket_info.days_out
order by
    event_date,
    arena_level_internal,
    days_out
"""

df_2324 = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)
#df_2324 = pl.from_pandas(df_2324)

In [6]:
df_2324['cumulative_tickets']  = df_2324.groupby(['event_date', 'arena_level_internal'])['paid_seats'].cumsum()

In [7]:
#df_2324 = df_2324.sample(n = 2000000, seed = 1993)

In [7]:
df_2324 = df_2324.merge(tier_df, on = 'event_date', how = 'left')
#df_2324 = df_2324.join(pl_tier_df, on="event_date", how="left")

In [9]:
# df_2324 = df_2324.with_columns([
#     pl.col('event_date').cast(pl.Date)
# ])

# df_2324 = df_2324.with_columns([
#     pl.col('gross_revenue').cast(pl.Int16)
# ])

In [10]:
# df_2324 = df_2324.with_columns([
#     pl.col('event_date').dt.weekday().cast(pl.Int16).alias('dow')
#     ])

# pcs = sorted(df_2324['pc_one'].unique())
# pc_dict = dict((value,count) for count, value in enumerate(pcs))
# df_2324 = df_2324.with_columns([
#     pl.col('pc_one').map_elements(
#         lambda x: pc_dict.get(x, None)
#     ).cast(pl.Int16)
#     .alias('pc_number')
# ])

# df_2324 = df_2324.with_columns([
#     pl.when(pl.col('tier') == 'A').then(5)
#     .when(pl.col('tier') == 'B').then(4)
#     .when(pl.col('tier') == 'C').then(3)
#     .when(pl.col('tier') == 'D').then(2)
#     .otherwise(1)
#     .cast(pl.Int16)
#     .alias('tier_num')
# ])

# df_2324 = df_2324.with_columns([
#     pl.when(pl.col('arena_level_internal') == 'Clubs').then(6)
#     .when(pl.col('arena_level_internal') == 'Lowers').then(5)
#     .when(pl.col('arena_level_internal') == 'Uppers').then(4)
#     .when(pl.col('arena_level_internal') == 'Suites').then(3)
#     .when(pl.col('arena_level_internal') == 'Premium').then(2)
#     .otherwise(1)
#     .cast(pl.Int16)
#     .alias('arena_level_num')
# ])

In [8]:
df_2324['cap_remaining'] = df_2324['capacity'] - df_2324['cumulative_tickets']

df_2324 = df_2324[df_2324['tier'].isin(['A','B','C','D','E'])]
df_2324 = df_2324[df_2324['days_out'] >= 0]

In [9]:
df_2324['dow'] = [datetime.weekday(x) for x in df_2324['event_date']]
df_2324['tier_num'] = [5 if tier == 'A' else (4 if tier == 'B' else (3 if tier == 'C' else (2 if tier == 'D' else 1))) for tier in df_2324['tier']]
#df_2324['random'] = [x for x in (np.random.rand(len(df_2324),1)/2)]

# pcs = sorted(df_2324['pc_one'].unique())
# pc_dict = dict((value,count) for count, value in enumerate(pcs))
# df_2324['pc_num'] = df_2324.apply(lambda row: pc_dict[row['pc_one']], axis = 1)


df_2324['arena_level_num'] = [6 if arena_level_internal == 'Premium' else (5 if arena_level_internal == 'Clubs' else (4 if arena_level_internal == 'Lowers' else 
                            (3 if arena_level_internal == 'Uppers' else (2 if arena_level_internal == 'Suites' else 1)))) for arena_level_internal in df_2324['arena_level_internal']]

#df_2324 = df_2324.sample(n=len(df_2324), random_state=1993)
df_2324 = df_2324.reset_index()

In [None]:
def run_model(df_train, df_test):

    #df_train_subset = df_train[df_train['date_diff'] == days_out]

    X_train = df_train[['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra','days_out', 'cap_remaining']]
    y_train = df_train[['cumulative_tickets']]

    #df_test_subset = df_test[(df_test['days_out'] == days_out) & (df_test['ticket_type_final'] == 'Not Sold')]

    X_test = df_test[['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra', 'days_out', 'cap_remaining']]
    #y_test = df_test[['is_sold']]
 
    if len(X_test) > 0:
        ss = StandardScaler()
        x_train_scaled = ss.fit_transform(X_train)
        x_test_scaled = ss.fit_transform(X_test)

        clf = RandomForestClassifier(random_state = 1993)
        clf.fit(x_train_scaled, y_train)

        predicted_df = pd.DataFrame(data = clf.predict(x_test_scaled), columns = ['cumulative_tickets_predicted'])
        predicted_df = pl.from_pandas(predicted_df)
        final_df = pl.concat([df_test, predicted_df], how = 'horizontal')

        return final_df

In [14]:
# def run_model(df_test, df_train):

#     #df_test_subset = df_test[(df_test['days_out'] == days_out) & (df_test['ticket_type_final'] == 'Not Sold')]

#     X_test = df_test[['dow', 'tier_num', 'pc_num', 'is_premier', 'original_six_plus_extra','days_out']]
#     y_test = df_test[['ticket_type_group']]

#     #df_train_subset = df_train[df_train['date_diff'] == days_out]

#     X_train = df_train[['dow', 'tier_num', 'pc_num', 'is_premier', 'original_six_plus_extra','days_out']]
#     y_train = df_train[['ticket_type_group']]

#     if len(X_test) > 0:
#         ss = StandardScaler()
#         x_train_scaled = ss.fit_transform(X_train)
#         x_test_scaled = ss.fit_transform(X_test)

#         clf = RandomForestClassifier(random_state = 1993)
#         clf.fit(x_train_scaled, y_train)

#         predicted_df = pd.DataFrame(data = clf.predict_proba(x_test_scaled), columns = ['Nightly', 'Not Sold', 'Plans'])
#         final_df = pd.concat([df_test.reset_index(), predicted_df], axis = 1)

#         return df_test

In [10]:
q = """
with arena_levels as
    (select
        CASE
            WHEN price_level IN ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', '1', '2', '3', '4', '5', '6', '7', '8') THEN 'Lowers'
            WHEN price_level IN ('K', 'L', 'M') THEN 'Clubs'
            WHEN price_level IN ('N', 'O', 'P', 'Q', 'R', 'S', 'T', 'Y') THEN 'Uppers'
            WHEN price_level IN ('U', 'V', 'W') THEN 'Suites'
            WHEN price_level IN ('X', 'Z') THEN 'Premium'
            ELSE 'Unknown'
        END AS arena_level_internal,
        count(distinct seat_id) as capacity
    from
        custom.cth_v_manifest_2425
    group by
        arena_level_internal),
event_dates as
    (select
        distinct event_datetime
    from
        custom.cth_v_ticket_2425),
cross_join as
    (select
        *
    from
        arena_levels
    cross join
        event_dates),
current_info as
    (select
         event_datetime,
         arena_level_internal,
         sum(paid_seats) as paid_seats,
         sum(gross_revenue) as gross_revenue
    from
        custom.cth_v_ticket_2425
    group by
        event_datetime,
        arena_level_internal)
select
    cross_join.event_datetime,
    cross_join.arena_level_internal,
    datediff('days', current_date, cth_game_descriptions.event_datetime) as days_out,
    tier,
    original_six_plus_extra,
    is_premier,
    capacity,
    paid_seats,
    gross_revenue,
    capacity-paid_seats as cap_remaining
from
    cross_join
left join
    current_info on cross_join.arena_level_internal = current_info.arena_level_internal
    and cross_join.event_datetime = current_info.event_datetime
left join
    custom.cth_game_descriptions on cross_join.event_datetime = cth_game_descriptions.event_datetime
 """

df_2425 = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)
# df_2425['days_out'] = df_2425['days_out'].astype(np.int64)
# df_2425['original_six_plus_extra'] = df_2425['original_six_plus_extra'].astype(np.float32)
# df_2425['days_out']
# for col in df_2425.columns:
#     print(df_2425[col].dtype)
#df_2425 = pl.from_pandas(df_2425)

In [11]:
df_2425['dow'] = [datetime.weekday(x) for x in df_2425['event_datetime']]
df_2425['tier_num'] = [5 if tier == 'A' else (4 if tier == 'B' else (3 if tier == 'C' else (2 if tier == 'D' else 1))) for tier in df_2425['tier']]
#df_2324['random'] = [x for x in (np.random.rand(len(df_2324),1)/2)]

# pcs = sorted(df_2324['pc_one'].unique())
# pc_dict = dict((value,count) for count, value in enumerate(pcs))
# df_2324['pc_num'] = df_2324.apply(lambda row: pc_dict[row['pc_one']], axis = 1)


df_2425['arena_level_num'] = [6 if arena_level_internal == 'Premium' else (5 if arena_level_internal == 'Clubs' else (4 if arena_level_internal == 'Lowers' else 
                            (3 if arena_level_internal == 'Uppers' else (2 if arena_level_internal == 'Suites' else 1)))) for arena_level_internal in df_2425['arena_level_internal']]

#df_2425 = df_2425.sample(n=len(df_2324), random_state=1993)
df_2425 = df_2425.reset_index()

# df_2425 = df_2425.with_columns([
#     pl.col('event_date').dt.weekday().cast(pl.Int16).alias('dow')
#     ])

# pcs = sorted(df_2425['pc_one'].unique())
# pc_dict = dict((value,count) for count, value in enumerate(pcs))
# df_2425 = df_2425.with_columns([
#     pl.col('pc_one').map_elements(
#         lambda x: pc_dict.get(x, None)
#     ).cast(pl.Int16)
#     .alias('pc_number')
# ])

# df_2425 = df_2425.with_columns([
#     pl.when(pl.col('tier') == 'A').then(5)
#     .when(pl.col('tier') == 'B').then(4)
#     .when(pl.col('tier') == 'C').then(3)
#     .when(pl.col('tier') == 'D').then(2)
#     .otherwise(1)
#     .cast(pl.Int16)
#     .alias('tier_num')
# ])

In [None]:
# df_2425_model = df_2425.filter(pl.col('is_sold') == False)
# df_2425_model = df_2425_model.filter(pl.col('days_out') >= 0)

In [12]:
X_train = df_2324[['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra','days_out', 'cap_remaining']]
y_train = df_2324[['cumulative_tickets']]

#df_test_subset = df_test[(df_test['days_out'] == days_out) & (df_test['ticket_type_final'] == 'Not Sold')]

X_test = df_2425[['dow', 'tier_num', 'arena_level_num', 'is_premier', 'original_six_plus_extra', 'days_out', 'cap_remaining']]
#y_test = df_test[['is_sold']]

ss = StandardScaler()
x_train_scaled = ss.fit_transform(X_train)
x_test_scaled = ss.fit_transform(X_test)

clf = RandomForestClassifier(random_state = 1993)
clf.fit(x_train_scaled, y_train)

predicted_df = pd.DataFrame(data = clf.predict(x_test_scaled), columns = ['cumulative_tickets_predicted'])
predicted_df = pl.from_pandas(predicted_df)
final_df = pl.concat([df_2425, predicted_df], how = 'horizontal')

  return fit_method(estimator, *args, **kwargs)


MemoryError: could not allocate 1008730112 bytes

In [25]:
final_df = run_model(df_2324, df_2425)

# cm = confusion_matrix(final_df['is_sold'], final_df['is_sold_predicted'])
# print(accuracy_score(final_df['is_sold'],final_df['is_sold_predicted']))
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Sold','Not Sold'])
# disp.plot()
# plt.show()

  return fit_method(estimator, *args, **kwargs)


MemoryError: could not allocate 1008730112 bytes

In [None]:
# def split_dataframe(df, percent_chunk):
#     total_rows = len(df)
#     chunk_size = int(total_rows * percent_chunk / 100)
    
#     for i in range(0, total_rows, chunk_size):
#         end = min(i + chunk_size, total_rows)
#         yield df.slice(i, end - i)

# def process_partition(partition, model_func):
#     # Apply the model to the current partition
#     result = model_func(partition)
    
#     # Process the result as needed
#     processed_result = result
    
#     return processed_result

# for partition in split_dataframe(df_2324, 1):
#     processed_partition = run_model(partition, partition)
#     print('go')
# processed_partition

In [65]:
# unique_dates = df_2324['event_date'].unique()
# today = str(date.today())
# final_df2 = pd.DataFrame()
# for event_date in unique_dates:
#     days_out = (datetime.strptime(today, '%Y-%m-%d').date() - event_date).days
#     for i in range(1,days_out):
#        df = run_model(df_2324, df_2324)
#        final_df2 = pd.concat([final_df2,df])
#     #df = run_model(df_2324, df_2324, 42)
#     #final_df = pd.concat([final_df,df])
# final_df

In [15]:
df = final_df.group_by(by = 'event_date').agg(pl.col('is_sold_predicted').sum())

In [16]:
df

by,is_sold_predicted
date,u32
2025-03-30,0
2025-04-12,0
2025-03-03,0
2025-03-08,0


In [64]:
df.write_csv("C:\\Users\\riffere\\Desktop\\output.csv")