In [1]:
import pandas as pd
import numpy as np
from catnip.fla_redshift import FLA_Redshift
from sqlalchemy import null
from datetime import datetime

from prefect.blocks.system import Secret
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy import optimize
from sklearn.preprocessing import StandardScaler

In [64]:
q = """
        WITH nightly AS (
            SELECT
                date(event_date) as event_date,
                count(*) AS nightly_tickets
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
                AND is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                count(*) AS nightly_tickets
            FROM
                custom.cth_v_ticket_2324
            WHERE
                is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date
        ),
        atp AS (
            SELECT
                date(event_date) as event_date,
                sum(gross_revenue)/count(*) AS atp
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                sum(gross_revenue)/count(*) AS atp
            FROM
                custom.cth_v_ticket_2324
            GROUP BY
                event_date
        ),
        attendance AS (
            SELECT
                date(event_date) as event_date,
                sum(did_attend) AS attendance
            FROM
                custom.cth_historical_all_1718_2223
            GROUP BY
                event_date
            UNION
            SELECT
                date(event_datetime) as event_date,
                count(*) AS attendance
            FROM
                custom.cth_v_attendance_2324
            GROUP BY
                event_date
        )
        SELECT
            n.event_date,
            season,
            n.nightly_tickets,
            atp.atp,
            att.attendance,
            week_day,
            trimester,
            original_six_plus_extra,
            is_dense
        FROM
            nightly n
        LEFT JOIN
            atp ON n.event_date = atp.event_date
        LEFT JOIN
            attendance att ON n.event_date = att.event_date
        LEFT JOIN
            custom.cth_game_descriptions ON date(n.event_date) = date(cth_game_descriptions.event_date)
        WHERE
            n.event_date < (GETDATE() - 1)
        ORDER BY
            n.event_date               
    """

df = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)

In [65]:
df

Unnamed: 0,event_date,season,nightly_tickets,atp,attendance,week_day,trimester,original_six_plus_extra,is_dense
0,2017-09-26,2017-18,167,47.308669,3829.0,-1.0,-1.0,-1.00,-1.0
1,2017-09-28,2017-18,135,32.243238,3800.0,-1.0,-1.0,-1.00,-1.0
2,2017-10-07,2017-18,3676,44.109010,14432.0,1.0,1.0,1.00,0.0
3,2017-10-12,2017-18,868,46.890624,8367.0,3.0,1.0,0.00,0.0
4,2017-10-20,2017-18,3748,51.143397,13881.0,2.0,1.0,0.75,0.0
...,...,...,...,...,...,...,...,...,...
255,2024-03-16,2023-24,2029,103.305596,17542.0,7.0,3.0,1.50,0.0
256,2024-03-21,2023-24,2952,50.457514,17685.0,5.0,3.0,0.00,0.0
257,2024-03-26,2023-24,1618,105.045374,16493.0,3.0,3.0,1.00,0.0
258,2024-03-28,2023-24,3776,62.687284,17249.0,5.0,3.0,0.00,1.0


In [28]:
def get_clusters(X: np.array, n_clusters: int) -> np.array:

    kmeans = KMeans(n_clusters, random_state = 1693)
    kmeans.fit(X)
    clusters = kmeans.labels_

    return clusters

def create_clusters(redshift_creds: Dict, df: pd.DataFrame) -> pd.DataFrame:

    ## season list
    seasons = ['2017-18', '2018-19', '2019-20', '2021-22', '2022-23', '2023-24']

    ## filter out covid & pre/post-season
    df = df[df['season'].isin(['2017-18', '2018-19', '2019-20', '2021-22', '2022-23', '2023-24'])]
    df = df[df['is_regular_season'] == 1]

    ## add summary statistics
    df = pd.merge(left = df, right = get_summary_statistics(redshift_creds), how = "inner", on = "event_date")

    df_final = pd.DataFrame()

    for i in seasons:

        df_temp = df[df['season'] == i]
        df_clust_temp = df_temp[[
            'week_day', 
            'trimester', 
            'original_six_plus_extra', 
            'is_dense', 
            'nightly_tickets', 
            'atp', 
            'attendance'
            ]]
        
        x = np.array(df_clust_temp)
        x_standard = StandardScaler().fit_transform(x)

        ## get clusters
        df_temp['cluster_season'] = get_clusters(x_standard, 4)
        df_final = pd.concat([df_final,df_temp])

    ## select cols 
    df_clust = df_final[[
        'week_day', 
        'trimester', 
        'original_six_plus_extra', 
        'is_dense', 
        'nightly_tickets', 
        'atp', 
        'attendance'
        ]]

    ## scale
    x = np.array(df_clust)
    x_standard = StandardScaler().fit_transform(x)

    ## get clusters
    df_final['cluster'] = get_clusters(x_standard, 4)

    ## select cols
    df_final = df_final[['event_date', 'cluster', 'season_cluster']]

    return df_final

In [68]:
## season list
seasons = ['2017-18', '2018-19', '2019-20', '2021-22', '2022-23', '2023-24']

## filter out covid & pre/post-season
df = df[df['season'].isin(['2017-18', '2018-19', '2019-20', '2021-22', '2022-23', '2023-24'])]
df = df[df['is_regular_season'] == 1]

df_final = pd.DataFrame()

for i in seasons:
    #print(i)
    df_temp = df[df['season'] == i]
    #print(df_temp)
    df_clust_temp = df_temp[[
        'week_day', 
        'trimester', 
        'original_six_plus_extra', 
        'is_dense', 
        'nightly_tickets', 
        'atp', 
        'attendance'
        ]]
    x = np.array(df_clust_temp)
    x_standard = StandardScaler().fit_transform(x)
    ## get clusters
    df_temp['cluster_season'] = get_clusters(x_standard, 4)
    df_final = pd.concat([df_final,df_temp])

## select cols 
df_clust = df_final[[
    'week_day', 
    'trimester', 
    'original_six_plus_extra', 
    'is_dense', 
    'nightly_tickets', 
    'atp', 
    'attendance'
]]

## scale
x = np.array(df_clust)
x_standard = StandardScaler().fit_transform(x)

## get clusters
df_final['cluster'] = get_clusters(x_standard, 4)

## select cols
df_final = df_final[['event_date', 'cluster', 'season_cluster']]

  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['cluster_season'] = get_clusters(x_standard, 4)
  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['cluster_season'] = get_clusters(x_standard, 4)
  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/panda

KeyError: "['season_cluster'] not in index"

In [67]:
df_final

Unnamed: 0,event_date,cluster
0,2017-09-26,3
1,2017-09-28,3
2,2017-10-07,1
3,2017-10-12,3
4,2017-10-20,1
...,...,...
255,2024-03-16,1
256,2024-03-21,1
257,2024-03-26,1
258,2024-03-28,1


In [10]:
for i in df['cluster'].unique():

        if df['cluster'].value_counts()[i] < 5:

            mean_atp_df = df[['cluster', 'atp']].groupby(['cluster']).mean()
            mean_atp_df = mean_atp_df.loc[~mean_atp_df.index.isin([i])]

            print(mean_atp_df)

            for index, row in df[df['cluster'] == i].iterrows():

                df_closest = mean_atp_df.iloc[(mean_atp_df['atp']-row['atp']).abs().argsort()[:1]].index
                df.loc[df['event_date'] == row['event_date'], 'cluster'] = df_closest[0]

            if i != 0:

                df.loc[df['cluster'] == 0, 'cluster'] = i