In [22]:
import pandas as pd
import numpy as np
from catnip.fla_redshift import FLA_Redshift
from sqlalchemy import null
from datetime import datetime

from prefect.blocks.system import Secret
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy import optimize
from sklearn.preprocessing import StandardScaler

In [24]:
def get_redshift_credentials() -> Dict:

    cred_dict = {
        "dbname": Secret.load("stellar-redshift-db-name").get(),
        "host": Secret.load("stellar-redshift-host").get(),
        "port": 5439,
        "user": Secret.load("stellar-redshift-user-name").get(),
        "password": Secret.load("stellar-redshift-password").get(),

        "aws_access_key_id": Secret.load("fla-s3-aws-access-key-id-east-1").get(),
        "aws_secret_access_key": Secret.load("fla-s3-aws-secret-access-key-east-1").get(),
        "bucket": Secret.load("fla-s3-bucket-name-east-1").get(),
        "subdirectory": "us-east-1",

        "verbose": False,
    }

    return cred_dict

with ThreadPoolExecutor(1) as pool:
    rs_creds = pool.submit(lambda: get_redshift_credentials()).result()

In [23]:
def get_budget_goals(redshift_creds: Dict) -> pd.DataFrame:

    # redshift_creds['input_schema'] = BudgetGoalsSchema

    q = """
        SELECT
            event_date,
            round(sum(budget_goal)) AS "budget_goal"
        FROM
            custom.cth_budget_summary_2324
        GROUP BY
            event_date
        ORDER BY
            event_date
    """

    return FLA_Redshift(**redshift_creds).query_warehouse(sql_string = q)

def get_summary_statistics(redshift_creds: Dict) -> pd.DataFrame:

    '''
        - Need gross revenue field
        - need current attendance scans
        - join on event date?
    '''
    # redshift_creds['input_schema'] = SummaryStatisticsSchema

    q = """
        WITH nightly AS (
            SELECT
                event_date, 
                count(*) AS nightly_tickets
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
                AND is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date
            UNION
            SELECT
                event_date, 
                count(*) AS nightly_tickets
            FROM
                custom.cth_v_ticket_2324
            WHERE
                is_comp = FALSE
                AND ticket_type IN ('Singles', 'Flex')
            GROUP BY
                event_date
        ),
        atp AS (
            SELECT
                event_date, 
                sum(gross_revenue)/count(*) AS atp
            FROM
                custom.cth_historical_all_1718_2223
            WHERE
                season != '2020-21'
            GROUP BY
                event_date
            UNION
            SELECT
                event_date, 
                sum(block_purchase_price)/count(*) AS atp
            FROM
                custom.cth_v_ticket_2324
            GROUP BY
                event_date
        ),
        attendance AS (
            SELECT
                event_date, 
                sum(did_attend) AS attendance
            FROM
                custom.cth_historical_all_1718_2223
            GROUP BY
                event_date
            UNION
            SELECT
                event_date, 
                sum(entry) AS attendance
            FROM
                custom.cth_attendance_scans_2223
            GROUP BY
                event_date
        )
        SELECT
            n.event_date, 
            n.nightly_tickets, 
            atp.atp, 
            att.attendance
        FROM
            nightly n
        LEFT JOIN
            atp ON n.event_date = atp.event_date
        LEFT JOIN
            attendance att ON n.event_date = att.event_date
        WHERE
            n.event_date < (GETDATE() - 1)
        ORDER BY
            n.event_date
    """

    return FLA_Redshift(**redshift_creds).query_warehouse(sql_string = q)

def get_clusters(X: np.array, n_clusters: int) -> np.array:

    kmeans = KMeans(n_clusters, random_state = 1693)
    kmeans.fit(X)
    clusters = kmeans.labels_

    return clusters

def create_clusters(redshift_creds: Dict, df: pd.DataFrame) -> pd.DataFrame:

    ## filter out covid & pre/post-season
    df = df[df['season'].isin(['2023-24'])]
    df = df[df['is_regular_season'] == 1]

    ## add summary statistics
    df = pd.merge(left = df, right = get_summary_statistics(redshift_creds), how = "inner", on = "event_date")

    ## select cols 
    df_clust = df[[
        'week_day', 
        'trimester', 
        'original_six_plus_extra', 
        'is_dense', 
        'nightly_tickets', 
        'atp', 
        'attendance'
    ]]

    ## scale
    x = np.array(df_clust)
    x_standard = StandardScaler().fit_transform(x)

    ## get clusters
    df['cluster'] = get_clusters(x_standard, 4)

    ## select cols
    df = df[['event_date', 'cluster']]

    return df 

def get_offset(event_date: pd.Timestamp):

    if (
        (pd.Timestamp(2021, 11, 6) < event_date < pd.Timestamp(2022, 3, 14))
        or (pd.Timestamp(2022, 11, 5) < event_date < pd.Timestamp(2023, 3, 13))
        or (pd.Timestamp(2023, 11, 4) < event_date < pd.Timestamp(2024, 3, 11))
    ):
        return "-05:00"
    
    else:
        return "-04:00"

In [19]:
def transform(redshift_creds: Dict, df: pd.DataFrame) -> pd.DataFrame:

    ## Clean/create datetime fields
    df['event_datetime'] = [f"{ed.strftime('%Y-%m-%d')} {st}" for ed, st in zip(df['event_date'], df['start_time'])]

    to_datetime_cols = ['event_datetime', 'event_date']
    for col in to_datetime_cols:
        df[col] = pd.to_datetime(df[col])

    df['event_datetime_utc'] = [f"{x.strftime('%Y-%m-%d %H:%M:%S')}{get_offset(x)}" for x in df['event_datetime']]
    df['event_datetime_utc'] = pd.to_datetime(df['event_datetime_utc'], utc=True).dt.tz_convert('US/Eastern')

    ## Event name parameter
    df['event_name_parameter'] = df.apply(
        lambda row: f"{row['game_number']}: {datetime.strftime(row['event_date'], '%m/%d')} - {row['tier']} - {row['abbreviation']} - {datetime.strftime(row['event_date'], '%a')}"
            if row['season'] == "2023-24" else "", axis = 1
    )

    ## Tableau fields
    df['day_of_week'] = df['event_datetime'].dt.strftime("%a")
    df['start_time_tableau'] = [
        f"{t.split(':')[0]} {t.split(' ')[-1]}" if t.split(':')[1].split(' ')[0] == '00'
            else t for t in df['start_time']
    ]

    ## Get new budget goals
    budget_df = get_budget_goals(redshift_creds)
    df = pd.merge(left = df, right = budget_df, on = "event_date", how = "left")

    df['budget_goal'] = df.apply(
        lambda row: row['budget_goal_y'] if row['season'] == "2023-24" else row['budget_goal_x'], axis = 1 
    )
    df = df.drop(columns = ['budget_goal_x', 'budget_goal_y'])

    ## Get clusters
    # df = pd.merge(left = df, right = create_clusters(redshift_creds, df), how = "left", on = "event_date")

    return df

#@task(log_prints = True)
def load(redshift_creds: Dict, df: pd.DataFrame) -> None:

    redshift_creds['output_schema'] = OutputSchema

    FLA_Redshift(**redshift_creds).write_to_warehouse(
        df = df,
        table_name = "cth_game_descriptions"
    )

    ## Create extension table

    q = """
        DROP TABLE IF EXISTS custom.cth_game_descriptions_extension;

        CREATE TABLE custom.cth_game_descriptions_extension AS (
            SELECT
                gd.game_number,
                products.product_id AS "seatgeek_product_id",
                fortress.productcode AS "fortress_product_code",
                gd.event_date,
                gd.abbreviation,
                gd.full_opponent,
                gd.start_time,
                gd.event_datetime,
                to_char(gd.event_datetime, 'Day') AS "day_of_week",
                gd.tier,
                gd.is_premier,
                gd.budget_goal,
                getdate() AS "processed_date"
            FROM
                custom.cth_game_descriptions gd
            LEFT JOIN
                custom.fortress_events fortress ON gd.event_datetime = fortress.eventvaliddate
            LEFT JOIN
                custom.seatgeek_v_products products ON gd.event_date = date(products.event_date)
                    AND products.product_type = 'Event'
                    AND (products.product_description ILIKE '23H%' OR products.product_description ILIKE '24H%')
            WHERE
                season = '2023-24'
            ORDER BY
                event_datetime
        );
    """
    #FLA_Redshift(**redshift_creds).execute_and_commit(sql_string=q)

    return None 

########################################################################
### FLOW ###############################################################
########################################################################

#@flow
def cth_game_descriptions() -> None:

    # bsae credentials
    redshift_creds = get_redshift_credentials()

    df = extract()
    df = transform(redshift_creds, df)
    load(redshift_creds, df)

    return None 


if __name__ == "__main__":

    cth_game_descriptions()

AttributeError: 'coroutine' object has no attribute 'get'

In [5]:
df_clust = df[[
        'week_day', 
        'trimester', 
        'original_six_plus_extra', 
        'is_dense', 
        'nightly_tickets', 
        'atp', 
        'attendance']]

x = np.array(df_clust)
X = StandardScaler().fit_transform(x)

In [11]:
def get_even_clusters(X: np.array, n_clusters: int) -> np.array:

    kmeans = KMeans(n_clusters, random_state = 1693)
    kmeans.fit(X)
    clusters = kmeans.labels_

    return clusters

In [13]:
df['cluster'] = get_even_clusters(X, 4)

  super()._check_params_vs_input(X, default_n_init=10)


In [14]:
df

Unnamed: 0,event_date,nightly_tickets,atp,attendance,week_day,trimester,original_six_plus_extra,is_dense,cluster
0,2024-01-17,3175,64.037092,16009.0,4,2,0.75,1,2
1,2024-02-06,2161,61.352928,14635.0,3,3,1.0,0,1
2,2023-11-12,1670,65.243778,16221.0,1,1,1.0,0,3
3,2024-01-24,1889,48.669586,13849.0,4,2,0.0,0,3
4,2023-12-06,1683,46.431952,14984.0,4,2,0.0,0,3
5,2023-12-30,1362,137.124192,17906.0,7,2,0.75,0,0
6,2023-11-10,1301,62.917105,15450.0,6,1,0.0,0,3
7,2024-02-20,3387,50.582824,16623.0,3,3,0.0,0,1
8,2023-10-19,1187,89.807953,15400.0,5,1,1.0,0,3
9,2023-12-23,1977,95.145841,15615.0,7,2,0.75,0,1


In [10]:
for i in df['cluster'].unique():

        if df['cluster'].value_counts()[i] < 5:

            mean_atp_df = df[['cluster', 'atp']].groupby(['cluster']).mean()
            mean_atp_df = mean_atp_df.loc[~mean_atp_df.index.isin([i])]

            print(mean_atp_df)

            for index, row in df[df['cluster'] == i].iterrows():

                df_closest = mean_atp_df.iloc[(mean_atp_df['atp']-row['atp']).abs().argsort()[:1]].index
                df.loc[df['event_date'] == row['event_date'], 'cluster'] = df_closest[0]

            if i != 0:

                df.loc[df['cluster'] == 0, 'cluster'] = i