In [6]:
import pandas as pd
import numpy as np
from catnip.fla_redshift import FLA_Redshift
from sqlalchemy import null
from datetime import datetime

from prefect.blocks.system import Secret
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
def get_redshift_credentials() -> Dict:

    cred_dict = {
        "dbname": Secret.load("stellar-redshift-db-name").get(),
        "host": Secret.load("stellar-redshift-host").get(),
        "port": 5439,
        "user": Secret.load("stellar-redshift-user-name").get(),
        "password": Secret.load("stellar-redshift-password").get(),

        "aws_access_key_id": Secret.load("fla-s3-aws-access-key-id-east-1").get(),
        "aws_secret_access_key": Secret.load("fla-s3-aws-secret-access-key-east-1").get(),
        "bucket": Secret.load("fla-s3-bucket-name-east-1").get(),
        "subdirectory": "us-east-1",

        "verbose": False,
    }

    return cred_dict

with ThreadPoolExecutor(1) as pool:
    rs_creds = pool.submit(lambda: get_redshift_credentials()).result()

In [3]:
q = """
select 
    season, event_datetime, full_opponent, start_time_tableau, day_of_week, tier, is_premier, trimester, original_six_plus_extra, is_holiday, density
from 
    custom.cth_game_descriptions
where 
    season in ('2022-23','2023-24') and game_type = 1
order 
    by event_datetime
"""
df = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)
df

Unnamed: 0,season,event_datetime,full_opponent,start_time_tableau,day_of_week,tier,is_premier,trimester,original_six_plus_extra,is_holiday,density
0,2022-23,2022-10-19 19:30:00,Philadelphia Flyers,7:30 PM,Wed,C,True,1,0.75,1,2.0
1,2022-23,2022-10-21 19:30:00,Tampa Bay Lightning,7:30 PM,Fri,B,True,1,1.00,0,3.0
2,2022-23,2022-10-23 17:00:00,New York Islanders,5 PM,Sun,D,True,1,0.00,0,2.0
3,2022-23,2022-10-29 16:00:00,Ottawa Senators,4 PM,Sat,C,True,1,0.00,0,1.0
4,2022-23,2022-11-09 19:00:00,Carolina Hurricanes,7 PM,Wed,E,False,1,0.00,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
77,2023-24,2024-03-30 12:30:00,Detroit Red Wings,12:30 PM,Sat,C,True,3,0.75,0,2.0
78,2023-24,2024-04-09 19:00:00,Ottawa Senators,7 PM,Tue,D,False,3,0.00,0,2.0
79,2023-24,2024-04-11 19:00:00,Columbus Blue Jackets,7 PM,Thu,E,False,3,0.00,0,3.0
80,2023-24,2024-04-13 17:00:00,Buffalo Sabres,5 PM,Sat,C,True,3,0.00,0,3.0


In [10]:
df['tier_num'] = df.apply(lambda row: 1 if row['tier'] == 'A' else (2 if row['tier'] == 'B' else (3 if row['tier'] == 'C' else (4 if row['tier'] == 'D' else 5))), axis = 1)

pcs = sorted(df['day_of_week'].unique())
pc_dict = dict((value,count) for count, value in enumerate(pcs))
df['day_of_week_num'] = df.apply(lambda row: pc_dict[row['day_of_week']], axis = 1)

y = df[['tier_num']]
y = np.array(y)

df4 = df[['day_of_week_num', 'tier_num', 'original_six_plus_extra','density', 'is_holiday']]
X = np.array(df4)
scaler = StandardScaler()
Xstandard = scaler.fit_transform(X)

In [11]:
clf = LogisticRegression(random_state = 1993).fit(Xstandard,y)

print(clf.score(Xstandard,y))

y_pred = clf.predict_proba(Xstandard)
y_pred_df = pd.DataFrame(list(map(np.ravel,y_pred)), columns = ['A','B','C','D','E'])
print(len(y_pred_df))

0.8780487804878049
82


  y = column_or_1d(y, warn=True)
