# Ticket Prediction 25/26

## 0. Import libraries (& Redshift credentials if in VSCode)

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import null
from datetime import datetime

from typing import Dict
from concurrent.futures import ThreadPoolExecutor

import missingno as msno
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFECV
import random

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor

from prefect.blocks.system import Secret
from catnip.fla_redshift import FLA_Redshift

In [2]:
def get_redshift_credentials() -> Dict:

    cred_dict = {
        "dbname": Secret.load("stellar-redshift-db-name").get(),
        "host": Secret.load("stellar-redshift-host").get(),
        "port": 5439,
        "user": Secret.load("stellar-redshift-user-name").get(),
        "password": Secret.load("stellar-redshift-password").get(),

        "aws_access_key_id": Secret.load("fla-s3-aws-access-key-id-east-1").get(),
        "aws_secret_access_key": Secret.load("fla-s3-aws-secret-access-key-east-1").get(),
        "bucket": Secret.load("fla-s3-bucket-name-east-1").get(),
        "subdirectory": "us-east-1",

        "verbose": False,
    }

    return cred_dict

with ThreadPoolExecutor(1) as pool:
    rs_creds = pool.submit(lambda: get_redshift_credentials()).result()

## 1. Missing Data Evaluation

### 1.1 Import data

For large data sets grab from the warehouse once and then download as a CSV to pull from after. 

In [3]:
df = pd.read_csv("C:\\Users\\riffere\\Florida Panthers\\SP-BS - Documents\\Data Science\\Resources\\Files\\emily_query_data_2025-07-17.csv")

#df_2526 = pd.read_csv("C:\\Users\\riffere\\Desktop\\ticket_data_2526.csv")

df_2425_regular_season = df[df['game_type'] == 1]

df_2425_regular_season = df_2425_regular_season[df_2425_regular_season['ticket_type'].isin(['Singles','Flex','Nightly Suites','Groups','Secondary'])]

In [4]:
q = """
WITH initial AS (
    SELECT
        game.season,
        game.event_datetime,
        game_type,
        tier,
        abbreviation,
        div,
        is_conf,
        start_time,
        day_of_week,
        is_premier,
        trimester,
        original_six_plus_extra,
        ticket_type,
    ---price_level,
    ---arena_level_internal,
    ---is_comp,
    ---is_renewal,
        CASE
            WHEN ticket_type IN ('Singles','Quarter/Flex','Nightly Suites','Groups','Secondary') THEN gross_revenue
        END AS singles_gross_revenue,
        CASE
            WHEN ticket_type IN ('Singles','Quarter/Flex','Nightly Suites','Groups','Secondary') THEN paid_seats
        END AS singles_paid_seats,
        gross_revenue,
        paid_seats
    FROM
        custom.cth_v_ticket_2526 ticket
    LEFT JOIN
        custom.cth_game_descriptions game ON ticket.event_datetime::date = game.event_date)
SELECT
    season,
    event_datetime,
    game_type,
    tier,
    abbreviation,
    div,
    is_conf,
    start_time,
    day_of_week,
    is_premier,
    trimester,
    original_six_plus_extra,
    SUM(singles_gross_revenue) AS singles_gross_revenue,
    SUM(singles_paid_seats) AS singles_paid_seats,
    SUM(gross_revenue) AS gross_revenue,
    SUM(paid_seats) AS paid_seats,
    datediff('days',current_date,event_datetime::date) AS days_out
FROM
    initial
GROUP BY
    event_datetime,
    season,
    game_type,
    tier,
    abbreviation,
    div,
    is_conf,
    start_time,
    day_of_week,
    is_premier,
    trimester,
    original_six_plus_extra
"""

df_2526 = FLA_Redshift(**rs_creds).query_warehouse(sql_string=q)

In [5]:
df_2425 = (
    df_2425_regular_season
    .groupby([
        'season', 'event_datetime', 'tier', 'abbreviation', 'div', 'is_conf',
        'start_time', 'day_of_week', 'is_premier', 'trimester', 'original_six_plus_extra',
        'days_out_from_event'
        #, 'ticket_type', 'price_level', 'arena_level_internal'
    ])
    .agg(
        gross_revenue_sum=('gross_revenue', 'sum'),
        row_count=('gross_revenue', 'size')
    )
    .reset_index().rename(columns = {'row_count':'paid_seats'})
)

df_2425['running_paid_seats'] = df_2425.groupby('event_datetime')['paid_seats'].cumsum()

In [8]:
time_group_map = {
    '12:30 PM': 'early',
    '1:00 PM': 'early',
    '2:00 PM': 'early',
    '3:00 PM': 'early',
    '3:30 PM': 'early',
    '4:00 PM': 'afternoon',
    '5:00 PM': 'afternoon',
    '6:00 PM': 'afternoon',
    '6:30 PM': 'afternoon',
    '7:00 PM': 'evening',
    '7:30 PM': 'evening',
    '7:45 PM': 'evening',
    '8:00 PM': 'evening'
}

df_2425['time_group'] = df_2425['start_time'].map(time_group_map)
df_2526['time_group'] = df_2526['start_time'].map(time_group_map)

day_map = {
    'Sat': 'is_saturday',
    'Fri': 'is_friday_or_sunday',
    'Sun': 'is_friday_or_sunday',
    'Mon': 'is_weekday',
    'Tue': 'is_weekday',
    'Wed': 'is_weekday',
    'Thu': 'is_weekday',
}

df_2425['day_of_week_group'] = df_2425['day_of_week'].map(day_map)
df_2526['day_of_week_group'] = df_2526['day_of_week'].map(day_map)

df_2425['is_premier'] = df_2425['is_premier'].astype(int)
df_2526['is_premier'] = df_2526['is_premier'].astype(int)

df_encoded_2425 = pd.get_dummies(df_2425, columns=['tier','day_of_week_group', 'time_group'], prefix = '', prefix_sep = '')
df_encoded_2526 = pd.get_dummies(df_2526, columns=['tier','day_of_week_group', 'time_group'], prefix = '', prefix_sep = '')

# capacity, price, velocity

## 3. Train & Run Model

### 3.1 Spit data into train & test tables

Sometimes the split will be on season, other times train_test_split from sklearn can be used.

train_test_split info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [9]:
x_train = df_encoded_2425[['is_premier', 'trimester', 'original_six_plus_extra',
        'days_out_from_event', 'early','afternoon','evening','is_weekday',
        'is_friday_or_sunday','is_saturday', 'A','B','C','D','E']].astype(int)

x_test = df_encoded_2526[['is_premier', 'trimester', 'original_six_plus_extra',
        'days_out', 'early','afternoon','evening','is_weekday',
        'is_friday_or_sunday','is_saturday', 'A','B','C','D','E']].astype(int)

y = df_encoded_2425['running_paid_seats']

In [10]:
scaler = StandardScaler()

x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.fit_transform(x_test)

y_log = np.nan_to_num(np.log(np.array(y).ravel()), nan=0.0, posinf=0.0, neginf=0.0)

### 3.2 Write model

Model selection is very important, first you must figure out what your output variable you want which tells you what kind of model you want (classification, clustering, regression). Then you can look at specific model types in that model group. 

Sometimes multiple will need to be tested before settling on the best model for your current data and situation.

Model selection info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

### 3.3 Run model

In [11]:
regr = MLPRegressor(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.001,
    batch_size=512,
    learning_rate_init=0.001,
    max_iter=300,
    early_stopping=True,
    random_state=1993,
    verbose=True
)

regr.fit(x_train_scale, y_log) 

Iteration 1, loss = 31.10032528
Validation score: -151.000046
Iteration 2, loss = 18.42666776
Validation score: -80.335503
Iteration 3, loss = 8.86348638
Validation score: -31.745495
Iteration 4, loss = 3.15698529
Validation score: -9.090616
Iteration 5, loss = 1.02716222
Validation score: -3.006608
Iteration 6, loss = 0.52293127
Validation score: -1.641155
Iteration 7, loss = 0.38278757
Validation score: -1.078873
Iteration 8, loss = 0.31105377
Validation score: -0.731292
Iteration 9, loss = 0.26504573
Validation score: -0.503997
Iteration 10, loss = 0.23425995
Validation score: -0.345912
Iteration 11, loss = 0.21203994
Validation score: -0.228664
Iteration 12, loss = 0.19549052
Validation score: -0.144108
Iteration 13, loss = 0.18293321
Validation score: -0.076916
Iteration 14, loss = 0.17274846
Validation score: -0.020907
Iteration 15, loss = 0.16455245
Validation score: 0.024052
Iteration 16, loss = 0.15762124
Validation score: 0.063352
Iteration 17, loss = 0.15168890
Validation sc

In [12]:
predicted = regr.predict(x_test_scale)

df_encoded_2526['predicted'] = np.exp(predicted)
df_encoded_2526.groupby('event_datetime').sum()['paid_seats']

event_datetime
2025-09-29 18:00:00    15258
2025-10-04 19:00:00    16047
2025-10-07 17:00:00    18459
2025-10-09 19:00:00    16574
2025-10-11 19:00:00    17422
2025-10-23 19:00:00    17179
2025-10-25 18:00:00    17759
2025-10-28 19:00:00    15822
2025-11-01 18:00:00    17124
2025-11-13 19:00:00    15447
2025-11-15 18:00:00    17553
2025-11-17 19:00:00    14948
2025-11-20 19:00:00    15311
2025-11-22 19:00:00    17586
2025-11-26 19:00:00    16765
2025-11-28 16:00:00    17204
2025-12-02 19:30:00    17030
2025-12-04 19:00:00    15934
2025-12-06 15:30:00    17321
2025-12-07 17:00:00    17242
2025-12-17 19:00:00    15297
2025-12-19 19:00:00    15620
2025-12-20 18:00:00    17042
2025-12-27 19:00:00    17141
2025-12-29 19:00:00    16858
2025-12-30 19:00:00    17204
2026-01-04 17:00:00    17278
2026-01-19 19:00:00    16819
2026-01-27 19:00:00    15687
2026-01-31 16:00:00    17967
2026-02-02 19:00:00    15844
2026-02-04 19:00:00    17254
2026-02-26 19:00:00    17271
2026-02-27 19:00:00    17410

In [13]:
df_encoded_2526['total'] = df_encoded_2526['predicted'] + df_encoded_2526['paid_seats']

df_encoded_2526[['event_datetime','days_out','paid_seats','predicted', 'total']]

Unnamed: 0,event_datetime,days_out,paid_seats,predicted,total
0,2026-04-15 19:30:00,219,16938,2078.149121,19016.149121
1,2025-11-17 19:00:00,70,14826,4331.971625,19157.971625
2,2026-03-24 19:00:00,197,15291,6275.125403,21566.125403
3,2026-04-13 19:00:00,217,17329,3470.513485,20799.513485
4,2026-02-04 19:00:00,149,17152,2644.654144,19796.654144
5,2026-01-04 17:00:00,118,16952,3162.336888,20114.336888
6,2025-12-20 18:00:00,103,16901,7774.415641,24675.415641
7,2025-10-07 17:00:00,29,18030,784.113505,18814.113505
8,2026-02-02 19:00:00,147,15645,5613.165569,21258.165569
9,2026-01-27 19:00:00,141,15220,6229.904575,21449.904575


# THIS IS WHERE MY CURRENT WORK ENDS

## 4.1 Model evaluation

### 4.1 Classification Reports

(For classification models) Run classification report to see precision, recall, and fl score breakdown by group

Classification report info: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

In [16]:
def run_classification_report(x,y):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))

    predicted = logi.predict(x)

    print(classification_report(predicted, y))

In [17]:
run_classification_report(x_train, y)

### 4.2 Recursive Feature Elimination (with cross validation)

Use Recursive Feature Elimination (with cross validation) to order the variables on most important to the classification to the least important. 

Especially useful for large datasets when you want to limit the number of variables in your model.

Below is a plot of the mean accuracy based on every possible number of features used in the model. 

It can be used to determine the optimal number of features without outside constraints placed on the model.

RFECV info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

In [18]:
def run_rfe(estimator, step, scoring, x, y, show_plot):
    
    selector = RFECV(estimator = estimator, step = step, scoring = scoring)
    selector.fit(x, np.array(y)).n_features_

    rfe_features = x.columns[selector.support_]

    if show_plot == True:

        cv_results = pd.DataFrame(selector.cv_results_)
        
        # plt.figure()
        # plt.xlabel("Number of features selected")
        # plt.ylabel("Mean test accuracy")

        # plt.errorbar(
        # x=cv_results["n_features"],
        # y=cv_results["mean_test_score"],
        # yerr=cv_results["std_test_score"],
        #     )
        
        # plt.title("Recursive Feature Elimination \nwith correlated features")
        # plt.show()

    return cv_results

#https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html

In [19]:
cv = run_rfe(LogisticRegression(class_weight='balanced'), 1, 'accuracy', x_train, y, True)

### 4.3 Coefficient & Odds Ratio

Similar to RFE, Coefficient & Odds Ratio order the variables by importance. 

Can compare to RFE to see the variables both say are most important.

In [79]:
def get_coeff_odds_ratio(x, y):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))

    # Coefficients and Odds Ratios
    coefficients = logi.coef_[0]
    odds_ratios = np.exp(coefficients)


    # Display feature importance using coefficients and odds ratios
    feature_importance = pd.DataFrame({
        'Feature': x_train.columns,
        'Coefficient': coefficients,
        'Odds Ratio': odds_ratios
    })
    print("\nFeature Importance (Coefficient and Odds Ratio):")
    print(feature_importance.sort_values(by='Coefficient', ascending=False))

In [81]:
get_coeff_odds_ratio(x_train, y)

### 4.4 Grid Search

Grid Search for Hyperparameters to optimize accuracy and precision.

Grid search info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [83]:
def grid_search(C, penalty, class_weight, solver, x, y):

    param_grid = {'C': C,  
              'penalty': penalty, 
              'class_weight' : class_weight,
              'solver' : solver} 

    grid = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, refit = True, verbose = 3,n_jobs=-1) 
   
    # fitting the model for grid search 
    grid.fit(x, y) 
 
    # print best parameter after tuning 
    print(grid.best_params_) 
    grid_predictions = grid.predict(x) 
   
    # print classification report 
    print(classification_report(y, grid_predictions)) 

In [85]:
grid_search([0.1, 1, 10, 100], ['l1', 'l2', 'elasticnet'], ['balanced'], ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], x_train, y)

In [87]:
def auc_roc(x, y, display_labels):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))
    predicted = logi.predict(x)

    roc_auc_score(predicted, y)

    disp = ConfusionMatrixDisplay(confusion_matrix(predicted, y), display_labels= display_labels)
    disp.plot()

    plt.show()

In [89]:
auc_roc(x_train, y, ['a', 'b'])

## 5. Model Tuning

### 5.1 Feature Elimination

Use only variables from RFE and Coefficient and Odds ratio say are most important 

In [1]:
x_train_new = df[['put_columns_here']]
y_new = df[['put_y_here']] 

x_test_new = df[['put_columns_here']]

In [None]:
run_classification_report(x_train_new, y_new)

### 5.2 SMOTE (Synthetic Minority Over-Sampling Technique)

In cases where one class of data is much smaller than the other you can use SMOTE to create new fake data in the smaller class to balance them out and run a more accurate model. 

In [3]:
def smote(k_neighbors, x, y):

    sm = SMOTE(random_state=1993, k_neighbors=k_neighbors)

    X_res, y_res = sm.fit_resample(x, y)

    logi = LogisticRegression(class_weight='balanced').fit(X_res, np.array(y_res))

    predicted = logi.predict(X_res)

    print(classification_report(predicted, y_res))

In [5]:
smote(5, x_train, y)

### 5.3 Re-run Evaluation Techniques

Rerun RFE, grid search, and AUC/ROC to see if best variables and hyperparameters have changed and rerun the model

In [None]:
run_rfe(LogisticRegression(), 1, 'accuracy', x_train, y, False)

In [None]:
grid_search([0.1, 1, 10, 100], ['l1', 'l2', 'elasticnet'], ['balanced'], ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], x_train, y)

In [None]:
auc_roc(x_train, y, ['Non-renewed', 'Renewed'])            

## 6. Final Output

Make sure the final result is in a useable format for your specific scenario.

In [None]:
df_final = [['put_x_columns_here', 'put_y_here']]