In [1]:
import pandas as pd
import numpy as np
from catnip.fla_redshift import FLA_Redshift
from sqlalchemy import null
from datetime import datetime

from prefect.blocks.system import Secret
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy import optimize
from sklearn.preprocessing import StandardScaler

In [2]:
def get_redshift_credentials() -> Dict:

    cred_dict = {
        "dbname": Secret.load("stellar-redshift-db-name").get(),
        "host": Secret.load("stellar-redshift-host").get(),
        "port": 5439,
        "user": Secret.load("stellar-redshift-user-name").get(),
        "password": Secret.load("stellar-redshift-password").get(),

        "aws_access_key_id": Secret.load("fla-s3-aws-access-key-id-east-1").get(),
        "aws_secret_access_key": Secret.load("fla-s3-aws-secret-access-key-east-1").get(),
        "bucket": Secret.load("fla-s3-bucket-name-east-1").get(),
        "subdirectory": "us-east-1",

        "verbose": False,
    }

    return cred_dict

with ThreadPoolExecutor(1) as pool:
    rs_creds = pool.submit(lambda: get_redshift_credentials()).result()

In [3]:
q = """
        WITH nightly AS (
            SELECT
                date(event_date) as event_date,
                count(*) AS nightly_tickets
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
                AND is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                count(*) AS nightly_tickets
            FROM
                custom.cth_v_ticket_2324
            WHERE
                is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date
        ),
        atp AS (
            SELECT
                date(event_date) as event_date,
                sum(gross_revenue)/count(*) AS atp
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                sum(gross_revenue)/count(*) AS atp
            FROM
                custom.cth_v_ticket_2324
            GROUP BY
                event_date
        ),
        attendance AS (
            SELECT
                date(event_date) as event_date,
                sum(did_attend) AS attendance
            FROM
                custom.cth_historical_all_1718_2223
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                count(*) AS attendance
            FROM
                custom.cth_v_attendance_2324
            GROUP BY
                event_date
        )
        SELECT
            n.event_date,
            season,
            n.nightly_tickets,
            atp.atp,
            att.attendance,
            week_day,
            trimester,
            original_six_plus_extra,
            is_dense
        FROM
            nightly n
        LEFT JOIN
            atp ON n.event_date = atp.event_date
        LEFT JOIN
            attendance att ON n.event_date = att.event_date
        LEFT JOIN
            custom.cth_game_descriptions ON date(n.event_date) = date(cth_game_descriptions.event_date)
        WHERE
            n.event_date < (GETDATE() - 1)
        ORDER BY
            n.event_date               
    """

df = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)

In [4]:
def get_clusters(X: np.array, n_clusters: int) -> np.array:

    kmeans = KMeans(n_clusters, random_state = 1693)
    kmeans.fit(X)
    clusters = kmeans.labels_

    return clusters

def create_clusters(redshift_creds: Dict, df: pd.DataFrame) -> pd.DataFrame:

    ## season list
    seasons = ['2017-18', '2018-19', '2019-20', '2021-22', '2022-23', '2023-24']

    ## filter out covid & pre/post-season
    df = df[df['season'].isin(['2017-18', '2018-19', '2019-20', '2021-22', '2022-23', '2023-24'])]
    df = df[df['is_regular_season'] == 1]

    ## add summary statistics
    df = pd.merge(left = df, right = get_summary_statistics(redshift_creds), how = "inner", on = "event_date")

    df_final = pd.DataFrame()

    for i in seasons:

        df_temp = df[df['season'] == i]
        df_clust_temp = df_temp[[
            'week_day', 
            'trimester', 
            'original_six_plus_extra', 
            'is_dense', 
            'nightly_tickets', 
            'atp', 
            'attendance'
            ]]
        
        x = np.array(df_clust_temp)
        x_standard = StandardScaler().fit_transform(x)

        ## get clusters
        df_temp['cluster_season'] = get_clusters(x_standard, 4)
        df_final = pd.concat([df_final,df_temp])
    
    ## reset group numbers to match
    df_final['agg'] = df_final.atp * df_final.nightly_tickets
    df_agg_temp = df_final.groupby(by = ['season','cluster_season'])[['agg']].mean()
    df_agg_temp = pd.DataFrame(df_agg_temp).reset_index()
    df_agg_temp['rank'] = df_agg_temp.groupby(by = ['season'])[['agg']].rank('max')
    
    # now reset all the values
    df_final = df_final.merge(right = df_agg_temp, how = 'left', on = ['season', 'cluster_season'])
    df_final = df_final[['week_day', 'trimester', 'original_six_plus_extra', 'is_dense', 'nightly_tickets', 'atp', 'attendance', 'rank']].rename({'rank':'cluster_season'})

    ## select cols 
    df_clust = df_final[[
        'week_day', 
        'trimester', 
        'original_six_plus_extra', 
        'is_dense', 
        'nightly_tickets', 
        'atp', 
        'attendance'
        ]]

    ## scale
    x = np.array(df_clust)
    x_standard = StandardScaler().fit_transform(x)

    ## get clusters
    df_final['cluster'] = get_clusters(x_standard, 4)

    ## select cols
    df_final = df_final[['event_date', 'cluster', 'season_cluster']]

    return df_final

In [5]:
q = """
    WITH nightly AS (
            SELECT
                date(event_date) as event_date,
                count(*) AS nightly_tickets
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
                AND is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                count(*) AS nightly_tickets
            FROM
                custom.cth_v_ticket_2324
            WHERE
                is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date),
        atp AS (
            SELECT
                date(event_date) as event_date,
                sum(gross_revenue)/count(*) AS atp
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                sum(gross_revenue)/count(*) AS atp
            FROM
                custom.cth_v_ticket_2324
            GROUP BY
                event_date)
        SELECT
            n.event_date,
            season,
            n.nightly_tickets,
            atp.atp,
            n.nightly_tickets*atp.atp as agg,
            cluster_season
        FROM
            nightly n
        LEFT JOIN
            atp ON n.event_date = atp.event_date
        LEFT JOIN
            custom.cth_game_descriptions ON date(n.event_date) = date(cth_game_descriptions.event_date)
        WHERE
            n.event_date < (GETDATE() - 1) and game_type = 1
        ORDER BY
            n.event_date"""
df = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)

In [7]:
df1 = df.groupby(by = ['season','cluster_season'])[['agg']].mean()
df1 = pd.DataFrame(df1).reset_index()
df1['rank'] = df1.groupby(by = ['season'])[['agg']].rank('max')
df.merge(right = df1, how = 'left', on = ['season', 'cluster_season'])
df

Unnamed: 0,event_date,season,nightly_tickets,atp,agg,cluster_season
0,2017-10-07,2017-18,3676,44.109010,162144.719443,4
1,2017-10-12,2017-18,868,46.890624,40701.061430,1
2,2017-10-20,2017-18,3748,51.143397,191685.452064,4
3,2017-10-26,2017-18,644,44.742957,28814.464480,1
4,2017-10-28,2017-18,1949,47.110317,91818.008250,2
...,...,...,...,...,...,...
234,2024-03-30,2023-24,2062,76.415893,157569.570785,3
235,2024-04-09,2023-24,1714,58.754066,100704.468400,3
236,2024-04-11,2023-24,1852,47.321129,87638.731227,2
237,2024-04-13,2023-24,2126,73.059234,155323.930624,2
