In [1]:
import pandas as pd
import numpy as np
from catnip.fla_redshift import FLA_Redshift
from sqlalchemy import null
from datetime import datetime

from prefect.blocks.system import Secret
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

from sklearn.preprocessing import MinMaxScaler
from dataclasses import dataclass, field
import requests
import calendar
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import confusion_matrix as cm

In [2]:
def get_redshift_credentials() -> Dict:

    cred_dict = {
        "dbname": Secret.load("stellar-redshift-db-name").get(),
        "host": Secret.load("stellar-redshift-host").get(),
        "port": 5439,
        "user": Secret.load("stellar-redshift-user-name").get(),
        "password": Secret.load("stellar-redshift-password").get(),

        "aws_access_key_id": Secret.load("fla-s3-aws-access-key-id-east-1").get(),
        "aws_secret_access_key": Secret.load("fla-s3-aws-secret-access-key-east-1").get(),
        "bucket": Secret.load("fla-s3-bucket-name-east-1").get(),
        "subdirectory": "us-east-1",

        "verbose": False,
    }

    return cred_dict

with ThreadPoolExecutor(1) as pool:
    rs_creds = pool.submit(lambda: get_redshift_credentials()).result()

In [3]:
panthers_df = pd.read_csv("C:\\Users\\riffere\\Desktop\\panthers_results.csv")

In [4]:
panthers_df['weekday'] = panthers_df.apply(lambda row: datetime.strptime(row['event_date'], '%m/%d/%y').weekday(), axis =1)

In [5]:
# density
calendar_df = pd.DataFrame(columns = ['days_of_the_year'], data = 
    ['01-01','01-02','01-03','01-04','01-05','01-06','01-07','01-08','01-09','01-10','01-11',
    '01-12','01-13','01-14','01-15','01-16','01-17','01-18','01-19','01-20','01-21','01-22',
    '01-23','01-24','01-25','01-26','01-27','01-28','01-29','01-30','01-31',
    '02-01','02-02','02-03','02-04','02-05','02-06','02-07','02-08','02-09','02-10','02-11',
    '02-12','02-13','02-14','02-15','02-16','02-17','02-18','02-19','02-20','02-21','02-22',
    '02-23','02-24','02-25','02-26','02-27','02-28',
    '03-01','03-02','03-03','03-04','03-05','03-06','03-07','03-08','03-09','03-10','03-11',
    '03-12','03-13','03-14','03-15','03-16','03-17','03-18','03-19','03-20','03-21','03-22',
    '03-23','03-24','03-25','03-26','03-27','03-28','03-29','03-30','03-31',
    '04-01','04-02','04-03','04-04','04-05','04-06','04-07','04-08','04-09','04-10','04-11',
    '04-12','04-13','04-14','04-15','04-16','04-17','04-18','04-19','04-20','04-21','04-22',
    '04-23','04-24','04-25','04-26','04-27','04-28','04-29','04-30',
    '05-01','05-02','05-03','05-04','05-05','05-06','05-07','05-08','05-09','05-10','05-11',
    '05-12','05-13','05-14','05-15','05-16','05-17','05-18','05-19','05-20','05-21','05-22',
    '05-23','05-24','05-25','05-26','05-27','05-28','05-29','05-30','05-31',
    '06-01','06-02','06-03','06-04','06-05','06-06','06-07','06-08','06-09','06-10','06-11',
    '06-12','06-13','06-14','06-15','06-16','06-17','06-18','06-19','06-20','06-21','06-22',
    '06-23','06-24','06-25','06-26','06-27','06-28','06-29','06-30',
    '07-01','07-02','07-03','07-04','07-05','07-06','07-07','07-08','07-09','07-10','07-11',
    '07-12','07-13','07-14','07-15','07-16','07-17','07-18','07-19','07-20','07-21','07-22',
    '07-23','07-24','07-25','07-26','07-27','07-28','07-29','07-30','07-31',
    '08-01','08-02','08-03','08-04','08-05','08-06','08-07','08-08','08-09','08-10','08-11',
    '08-12','08-13','08-14','08-15','08-16','08-17','08-18','08-19','08-20','08-21','08-22',
    '08-23','08-24','08-25','08-26','08-27','08-28','08-29','08-30','08-31',
    '09-01','09-02','09-03','09-04','09-05','09-06','09-07','09-08','09-09','09-10','09-11',
    '09-12','09-13','09-14','09-15','09-16','09-17','09-18','09-19','09-20','09-21','09-22',
    '09-23','09-24','09-25','09-26','09-27','09-28','09-29','09-30',
    '10-01','10-02','10-03','10-04','10-05','10-06','10-07','10-08','10-09','10-10','10-11',
    '10-12','10-13','10-14','10-15','10-16','10-17','10-18','10-19','10-20','10-21','10-22',
    '10-23','10-24','10-25','10-26','10-27','10-28','10-29','10-30','10-31',
    '11-01','11-02','11-03','11-04','11-05','11-06','11-07','11-08','11-09','11-10','11-11',
    '11-12','11-13','11-14','11-15','11-16','11-17','11-18','11-19','11-20','11-21','11-22',
    '11-23','11-24','11-25','11-26','11-27','11-28','11-29','11-30',
    '12-01','12-02','12-03','12-04','12-05','12-06','12-07','12-08','12-09','12-10','12-11',
    '12-12','12-13','12-14','12-15','12-16','12-17','12-18','12-19','12-20','12-21','12-22',
    '12-23','12-24','12-25','12-26','12-27','12-28','12-29','12-30','12-31'])

def get_range(date : datetime.date, calendar_df_active):
    month = date[0:2]
    day = date[3:5]
    year = int(date[6:])
    if calendar.isleap(year):
        leap_day = pd.DataFrame(columns = ['days_of_the_year'], data = ['02-29'])
        calendar_df_active = pd.concat([calendar_df_active[0:59], leap_day, calendar_df_active[59:]], ignore_index= True)
    date = str(str(month) + '-' + str(day))
    if date < '01-04':
        a = calendar_df_active[:calendar_df_active.index[calendar_df_active['days_of_the_year'] == date].tolist()[0]+4]
        left = 7 - len(a)
        b = calendar_df_active[(365-left):].values
        total = np.concatenate((a,b), axis = 0)
    elif date > '12-28':
        b = calendar_df_active[calendar_df_active.index[calendar_df_active['days_of_the_year'] == date].tolist()[0]-3:]
        left = 7 - len(b)
        a = calendar_df_active[0:(left)].values
        total = np.concatenate((b,a), axis = 0)
    else:
        total = calendar_df_active[calendar_df_active.index[calendar_df_active['days_of_the_year'] == date].tolist()[0]-3:
                calendar_df_active.index[calendar_df_active['days_of_the_year'] == date].tolist()[0]+4].values
    val = 0
    for i in total:
        month = i[0][0:2]
        day = i[0][3:]
        date = str(month + '/' + day + '/'+str(year))
        if date in event_dates:
            val += 1
    return val

In [6]:
event_dates = panthers_df['event_date'].values
panthers_df['fla_density'] = panthers_df.apply(lambda row: get_range(row['event_date'], calendar_df),axis = 1)

In [7]:
def get_back(date, calendar_df_active):
    month = date[0:2]
    day = date[3:5]
    year = int(date[6:])
    if calendar.isleap(year):
        leap_day = pd.DataFrame(columns = ['days_of_the_year'], data = ['02-29'])
        calendar_df_active = pd.concat([calendar_df_active[0:59], leap_day, calendar_df_active[59:]], ignore_index= True)
    date = str(str(month) + '-' + str(day))
    if date == '01-01':
        total = ['12-31','01-01', '01-02']
    elif date == '12-31':
        total = ['12-30', '12-31', '01-01']
    else:
        total = calendar_df_active[calendar_df_active.index[calendar_df_active['days_of_the_year'] == date].tolist()[0]-1:
                calendar_df_active.index[calendar_df_active['days_of_the_year'] == date].tolist()[0]+2].values
    new_list = []
    for i in total:
        month = i[0][0:2]
        day = i[0][3:]
        date = str(month + '/' + day + '/'+str(year))
        new_list.append(date)
    if new_list[0] in event_dates:
        val = 2
    elif new_list[2] in event_dates:
        val = 1
    else:
        val = 0
    return val

In [8]:
panthers_df['fla_back_to_back'] = panthers_df.apply(lambda row: get_back(row['event_date'], calendar_df),axis = 1)

In [9]:
panthers_df['game_loc'] = panthers_df.apply(lambda row: row['opponent'] if row['side'] == 'Away' else 'FLA', axis = 1)
panthers_df['prev_game_loc'] = panthers_df.game_loc.shift(1)
panthers_df.at[0, 'prev_game_loc'] = 'FLA'

In [10]:
nhl_distances = pd.read_csv('C:\\Users\\riffere\\Desktop\\NHL_Distance.csv')
nhl_distances = nhl_distances.set_index('Unnamed: 0')
panthers_df['distance_between'] = panthers_df.apply(lambda row: nhl_distances.at[row['game_loc'], row['prev_game_loc']], axis = 1)

In [11]:
overall_df =  pd.read_csv("C:\\Users\\riffere\\Desktop\\season_team_points.csv")

In [12]:
teams = overall_df['opponent'].unique()

rolling_points_pctg = pd.DataFrame()

for team in teams:
    temp = overall_df[overall_df['opponent'] == team]
    rol_avg = temp['point_pct'].rolling(3, closed='left', min_periods = 1).mean()
    temp['last_3_pp_avg'] = rol_avg
    rolling_points_pctg = pd.concat([rolling_points_pctg, temp], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['last_3_pp_avg'] = rol_avg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['last_3_pp_avg'] = rol_avg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['last_3_pp_avg'] = rol_avg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [13]:
rolling_points_pctg = rolling_points_pctg[['season','opponent','last_3_pp_avg']]

In [14]:
opp_schedules =  pd.read_csv("C:\\Users\\riffere\\Desktop\\entire_nhl_schedule_historical.csv")

In [15]:
opp_schedules_final = pd.DataFrame()
for team in opp_schedules['team_name'].unique():

    temp = opp_schedules[opp_schedules['team_name'] == team]
    event_dates = temp['event_date'].values
    temp['opp_density'] = temp.apply(lambda row: get_range(row['event_date'], calendar_df),axis = 1)

    temp['opp_back_to_back'] = temp.apply(lambda row: get_back(row['event_date'], calendar_df),axis = 1)

    opp_schedules_final = pd.concat([opp_schedules_final, temp], axis = 0)

opp_schedules_final = opp_schedules_final[['game_id','event_date','opp_density','opp_back_to_back', 'team_name']].rename(columns = {'game_id':'id', 'team_name' : 'opponent'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['opp_density'] = temp.apply(lambda row: get_range(row['event_date'], calendar_df),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['opp_back_to_back'] = temp.apply(lambda row: get_back(row['event_date'], calendar_df),axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['o

In [16]:
panthers_df_final = panthers_df.merge(opp_schedules_final, how = 'left', on = ['id', 'event_date', 'opponent'])

panthers_df_final = panthers_df_final.merge(rolling_points_pctg, how = 'left', on = ['season', 'opponent'])

dist = panthers_df_final[['distance_between']]
scaler = MinMaxScaler()
scaled = scaler.fit_transform(dist)
panthers_df_final['scaled_distance'] = scaled

panthers_df_final['home_away_status'] = np.where(panthers_df_final['side'] == 'Home', 0, 1)

panthers_df_final['decision'] = panthers_df_final['result'].map({'Win': 2, 'Loss': 0})

panthers_df_final.loc[287, 'last_3_pp_avg'] = 0.4146
panthers_df_final.loc[317, 'last_3_pp_avg'] = 0.4146

panthers_df_final = panthers_df_final.dropna(subset=['last_3_pp_avg'])

In [17]:
df2425 = panthers_df_final[panthers_df_final['season'] == 20242025]
df2124 = panthers_df_final[panthers_df_final['season'] != 20242025]

x2425 = df2425[['home_away_status', 'fla_back_to_back', 'fla_density', 'scaled_distance', 'last_3_pp_avg', 'opp_back_to_back', 'opp_density', 'decision']]
x2124 = df2124[['home_away_status', 'fla_back_to_back', 'fla_density', 'scaled_distance', 'last_3_pp_avg', 'opp_back_to_back', 'opp_density', 'decision']]

In [18]:
home_x_initial = x2124[x2124['home_away_status'] == 0]
away_x_initial = x2124[x2124['home_away_status'] == 1]

home_x = home_x_initial[['fla_back_to_back', 'fla_density', 'scaled_distance', 'last_3_pp_avg', 'opp_back_to_back', 'opp_density']]
away_x = away_x_initial[['fla_back_to_back', 'fla_density', 'scaled_distance', 'last_3_pp_avg', 'opp_back_to_back', 'opp_density']]

home_y = home_x_initial['decision']
away_y = away_x_initial['decision']

In [19]:
home_x_initial_2425 = x2425[x2425['home_away_status'] == 0]
away_x_initial_2425 = x2425[x2425['home_away_status'] == 1]

home_x_2425 = home_x_initial_2425[['fla_back_to_back', 'fla_density', 'scaled_distance', 'last_3_pp_avg', 'opp_back_to_back', 'opp_density']]
away_x_2425 = away_x_initial_2425[['fla_back_to_back', 'fla_density', 'scaled_distance', 'last_3_pp_avg', 'opp_back_to_back', 'opp_density']]

In [None]:
home_x

Unnamed: 0,fla_back_to_back,fla_density,scaled_distance,last_3_pp_avg,opp_back_to_back,opp_density
0,0,2,0.000000,0.640333,0,3
1,0,3,0.000000,0.616667,0,3
3,0,3,0.066388,0.646000,0,3
5,0,3,0.367409,0.511667,0,3
6,0,4,0.000000,0.672667,1,4
...,...,...,...,...,...,...
237,0,4,0.000000,0.456000,0,3
242,0,3,0.456692,0.474800,0,4
243,0,3,0.000000,0.427533,0,3
244,0,3,0.000000,0.447367,0,3


In [21]:
rfc = RFC(random_state = 1993)
rfc.fit(home_x, home_y)
predicted_home = rfc.predict(home_x)
# print(sum(y_home == predicted_home)/y_home.shape[0])
cm_df_rfc = pd.DataFrame(data = cm(home_y, predicted_home), columns = ['loss', 'win'], index = ['loss', 'win'])
cm_df_rfc

Unnamed: 0,loss,win
loss,38,1
win,0,83


In [22]:
rfc = RFC(random_state = 1993)
rfc.fit(away_x, away_y)
predicted_away = rfc.predict(away_x)
# print(sum(y_away == predicted_away)/y_away.shape[0])
cm_df_rfc = pd.DataFrame(data = cm(away_y, predicted_away), columns = ['loss', 'win'], index = ['loss', 'win'])
cm_df_rfc

Unnamed: 0,loss,win
loss,53,0
win,0,69


In [23]:
predicted_home = rfc.predict(home_x_2425)
predict_soft_home = rfc.predict_proba(home_x_2425)
soft_predict = pd.DataFrame(predict_soft_home, columns = ['loss', 'win'])
home_x_2425['predicted'] = predicted_home
home_x_2425.reset_index(inplace = True)
x_2223_final_1 = pd.concat([home_x_2425, soft_predict], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  home_x_2425['predicted'] = predicted_home


In [24]:
predicted_away = rfc.predict(away_x_2425)
predict_soft_away = rfc.predict_proba(away_x_2425)
soft_predict = pd.DataFrame(predict_soft_away, columns = ['loss', 'win'])
away_x_2425['predicted'] = predicted_away
away_x_2425.reset_index(inplace = True)
x_2223_final = pd.concat([away_x_2425, soft_predict], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  away_x_2425['predicted'] = predicted_away


In [25]:
xx = pd.concat([x_2223_final, x_2223_final_1]).reset_index(drop = True)
xx['diff'] = xx.apply(lambda row: abs(row['win']- row['loss']), axis = 1)

xx_loss = xx[xx['predicted'] == 0]
indxs = list(xx_loss.sort_values(by = 'diff')[0:9].index)
for indx in indxs:
     xx.loc[indx, 'predicted'] = 1

In [26]:
final = df2425.merge(xx, how = 'left', on = ['fla_back_to_back', 'fla_density', 'scaled_distance', 'last_3_pp_avg', 'opp_back_to_back', 'opp_density'])
final = final[['event_date', 'side', 'opponent', 'distance_between','predicted', 'win', 'loss']]
# final = final.drop([31, 65]).reset_index(drop = True)
# # print(final[:42])
# # print(final[42:])

In [27]:
print('overall:', len(final[final['predicted'] == 2]), '-', len(final[final['predicted'] == 0]), '-', len(final[final['predicted'] == 1]))
xx_home = final[final['side'] == 'Home']
print('home:', len(xx_home[xx_home['predicted'] == 2]), '-', len(xx_home[xx_home['predicted'] == 0]), '-', len(xx_home[xx_home['predicted'] == 1]))
xx_away = final[final['side'] == 'Away']
print('away:', len(xx_away[xx_away['predicted'] == 2]), '-', len(xx_away[xx_away['predicted'] == 0]), '-', len(xx_away[xx_away['predicted'] == 1]))
print('points:', sum(xx['predicted']))

overall: 46 - 27 - 9
home: 23 - 14 - 4
away: 23 - 13 - 5
points: 101.0


In [28]:
FLA_Redshift(**rs_creds).write_to_warehouse(df = final, table_name= "nhl_panthers_points_model_2425")

In [29]:
final

Unnamed: 0,event_date,side,opponent,distance_between,predicted,win,loss,processed_date
0,10/08/24,Home,BOS,0.00,2.0,0.58,0.42,2024-10-08 19:20:49.261456
1,10/10/24,Away,OTT,2163.96,2.0,0.71,0.29,2024-10-08 19:20:49.261456
2,10/12/24,Away,BUF,357.74,2.0,0.68,0.32,2024-10-08 19:20:49.261456
3,10/14/24,Away,BOS,641.64,0.0,0.23,0.77,2024-10-08 19:20:49.261456
4,10/15/24,Away,CBJ,1033.70,1.0,0.42,0.58,2024-10-08 19:20:49.261456
...,...,...,...,...,...,...,...,...
77,04/08/25,Home,TOR,1816.57,2.0,0.54,0.46,2024-10-08 19:20:49.261456
78,04/10/25,Home,DET,0.00,2.0,0.52,0.48,2024-10-08 19:20:49.261456
79,04/12/25,Home,BUF,0.00,2.0,0.60,0.40,2024-10-08 19:20:49.261456
80,04/14/25,Home,NYR,0.00,0.0,0.21,0.79,2024-10-08 19:20:49.261456
