# Ticket Prediction 25/26

## 0. Import libraries (& Redshift credentials if in VSCode)

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import null
from datetime import datetime

from typing import Dict
from concurrent.futures import ThreadPoolExecutor

import missingno as msno
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFECV
import random

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor

## 1. Missing Data Evaluation

### 1.1 Import data

For large data sets grab from the warehouse once and then download as a CSV to pull from after. 

In [2]:
df = pd.read_csv("C:\\Users\\riffere\\Florida Panthers\\SP-BS - Documents\\Data Science\\Resources\\Files\\emily_query_data_2025-07-17.csv")

df_2526 = pd.read_csv("C:\\Users\\riffere\\Desktop\\ticket_data_2526.csv")

df_regular_season = df[df['game_type'] == 1]

In [54]:
df_2425 = (
    df_regular_season
    .groupby([
        'season', 'event_datetime', 'tier', 'abbreviation', 'div', 'is_conf',
        'start_time', 'day_of_week', 'is_premier', 'trimester', 'original_six_plus_extra',
        'days_out_from_event'
        #, 'ticket_type', 'price_level', 'arena_level_internal'
    ])
    .agg(
        gross_revenue_sum=('gross_revenue', 'sum'),
        row_count=('gross_revenue', 'size')
    )
    .reset_index().rename(columns = {'row_count':'paid_seats'})
)

In [55]:
df_2425

Unnamed: 0,season,event_datetime,tier,abbreviation,div,is_conf,start_time,day_of_week,is_premier,trimester,original_six_plus_extra,days_out_from_event,gross_revenue_sum,paid_seats
0,2021-22,2021-10-14 19:00:00,B,PIT,2,1,7:00 PM,Thu,True,1,0.75,0,144959.55,1762
1,2021-22,2021-10-14 19:00:00,B,PIT,2,1,7:00 PM,Thu,True,1,0.75,1,182499.36,3948
2,2021-22,2021-10-14 19:00:00,B,PIT,2,1,7:00 PM,Thu,True,1,0.75,2,25606.04,405
3,2021-22,2021-10-14 19:00:00,B,PIT,2,1,7:00 PM,Thu,True,1,0.75,3,23751.33,3165
4,2021-22,2021-10-14 19:00:00,B,PIT,2,1,7:00 PM,Thu,True,1,0.75,4,12445.65,169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,2024-25,2025-04-14 19:00:00,B,NYR,2,1,7:00 PM,Mon,True,3,1.50,279,2970.78,67
36271,2024-25,2025-04-14 19:00:00,B,NYR,2,1,7:00 PM,Mon,True,3,1.50,280,1609.26,43
36272,2024-25,2025-04-14 19:00:00,B,NYR,2,1,7:00 PM,Mon,True,3,1.50,283,913.04,8
36273,2024-25,2025-04-14 19:00:00,B,NYR,2,1,7:00 PM,Mon,True,3,1.50,284,707764.95,6976


In [64]:
time_group_map = {
    '12:30 PM': 'early',
    '1:00 PM': 'early',
    '2:00 PM': 'early',
    '3:00 PM': 'early',
    '4:00 PM': 'afternoon',
    '5:00 PM': 'afternoon',
    '6:00 PM': 'afternoon',
    '6:30 PM': 'afternoon',
    '7:00 PM': 'evening',
    '7:30 PM': 'evening',
    '7:45 PM': 'evening',
    '8:00 PM': 'evening'
}

df_2425['time_group'] = df_2425['start_time'].map(time_group_map)
df_2526['time_group'] = df_2526['start_time'].map(time_group_map)

day_map = {
    'Sat': 'is_saturday',
    'Fri': 'is_friday_or_sunday',
    'Sun': 'is_friday_or_sunday',
    'Mon': 'is_weekday',
    'Tue': 'is_weekday',
    'Wed': 'is_weekday',
    'Thu': 'is_weekday',
}

df_2425['day_of_week_group'] = df_2425['day_of_week'].map(day_map)
df_2526['day_of_week_group'] = df_2526['day_of_week'].map(day_map)

df_2425['is_premier'] = df_2425['is_premier'].astype(int)
df_2526['is_premier'] = df_2526['is_premier'].astype(int)

df_encoded_2425 = pd.get_dummies(df_2425, columns=['tier','day_of_week_group', 'time_group'], prefix = '', prefix_sep = '')
df_encoded_2526 = pd.get_dummies(df_2526, columns=['tier','day_of_week_group', 'time_group'], prefix = '', prefix_sep = '')

# capacity, price, velocity

## 3. Train & Run Model

### 3.1 Spit data into train & test tables

Sometimes the split will be on season, other times train_test_split from sklearn can be used.

train_test_split info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [65]:
x_train = df_encoded_2425[['is_premier', 'trimester', 'original_six_plus_extra',
        'days_out_from_event', 'early','afternoon','evening','is_weekday',
        'is_friday_or_sunday','is_saturday', 'A','B','C','D','E']].astype(int)

x_test = df_encoded_2526[['is_premier', 'trimester', 'original_six_plus_extra',
        'days_out', 'early','afternoon','evening','is_weekday',
        'is_friday_or_sunday','is_saturday', 'A','B','C','D','E']].astype(int)

y = df_encoded_2425['paid_seats']

In [66]:
x_train

Unnamed: 0,is_premier,trimester,original_six_plus_extra,days_out_from_event,early,afternoon,evening,is_weekday,is_friday_or_sunday,is_saturday,A,B,C,D,E
0,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0
1,1,1,0,1,0,0,1,1,0,0,0,1,0,0,0
2,1,1,0,2,0,0,1,1,0,0,0,1,0,0,0
3,1,1,0,3,0,0,1,1,0,0,0,1,0,0,0
4,1,1,0,4,0,0,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,1,3,1,279,0,0,1,1,0,0,0,1,0,0,0
36271,1,3,1,280,0,0,1,1,0,0,0,1,0,0,0
36272,1,3,1,283,0,0,1,1,0,0,0,1,0,0,0
36273,1,3,1,284,0,0,1,1,0,0,0,1,0,0,0


In [67]:
scaler = StandardScaler()

x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.fit_transform(x_test)

y_log = np.nan_to_num(np.log(np.array(y).ravel()), nan=0.0, posinf=0.0, neginf=0.0)

### 3.2 Write model

Model selection is very important, first you must figure out what your output variable you want which tells you what kind of model you want (classification, clustering, regression). Then you can look at specific model types in that model group. 

Sometimes multiple will need to be tested before settling on the best model for your current data and situation.

Model selection info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

### 3.3 Run model

In [68]:
regr = MLPRegressor(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.001,
    batch_size=512,
    learning_rate_init=0.001,
    max_iter=300,
    early_stopping=True,
    random_state=1993,
    verbose=True
)

regr.fit(x_train_scale, y_log) 

Iteration 1, loss = 2.90764070
Validation score: -0.286169
Iteration 2, loss = 0.92981272
Validation score: 0.025331
Iteration 3, loss = 0.85197054
Validation score: 0.056268
Iteration 4, loss = 0.82835759
Validation score: 0.077180
Iteration 5, loss = 0.80914048
Validation score: 0.099053
Iteration 6, loss = 0.79102236
Validation score: 0.117727
Iteration 7, loss = 0.77391745
Validation score: 0.139990
Iteration 8, loss = 0.75905355
Validation score: 0.154794
Iteration 9, loss = 0.74740514
Validation score: 0.167590
Iteration 10, loss = 0.73815526
Validation score: 0.176602
Iteration 11, loss = 0.73120281
Validation score: 0.185440
Iteration 12, loss = 0.72464944
Validation score: 0.190571
Iteration 13, loss = 0.71957970
Validation score: 0.196679
Iteration 14, loss = 0.71563481
Validation score: 0.202164
Iteration 15, loss = 0.71195624
Validation score: 0.207242
Iteration 16, loss = 0.70910913
Validation score: 0.209317
Iteration 17, loss = 0.70572141
Validation score: 0.212108
Itera

In [74]:
predicted = regr.predict(x_train_scale)

df_encoded_2425['predicted'] = np.exp(predicted)
df_encoded_2425.groupby('event_datetime').sum()['paid_seats']

event_datetime
2021-10-14 19:00:00    16294
2021-10-16 18:00:00    13688
2021-10-21 19:00:00    11249
2021-10-25 19:00:00    13943
2021-10-27 19:00:00    12310
                       ...  
2025-03-30 13:00:00    19435
2025-04-08 19:00:00    19263
2025-04-10 19:00:00    19455
2025-04-12 18:00:00    19321
2025-04-14 19:00:00    19460
Name: paid_seats, Length: 164, dtype: int64

## 4.1 Model evaluation

### 4.1 Classification Reports

(For classification models) Run classification report to see precision, recall, and fl score breakdown by group

Classification report info: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

In [16]:
def run_classification_report(x,y):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))

    predicted = logi.predict(x)

    print(classification_report(predicted, y))

In [17]:
run_classification_report(x_train, y)

### 4.2 Recursive Feature Elimination (with cross validation)

Use Recursive Feature Elimination (with cross validation) to order the variables on most important to the classification to the least important. 

Especially useful for large datasets when you want to limit the number of variables in your model.

Below is a plot of the mean accuracy based on every possible number of features used in the model. 

It can be used to determine the optimal number of features without outside constraints placed on the model.

RFECV info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

In [18]:
def run_rfe(estimator, step, scoring, x, y, show_plot):
    
    selector = RFECV(estimator = estimator, step = step, scoring = scoring)
    selector.fit(x, np.array(y)).n_features_

    rfe_features = x.columns[selector.support_]

    if show_plot == True:

        cv_results = pd.DataFrame(selector.cv_results_)
        
        # plt.figure()
        # plt.xlabel("Number of features selected")
        # plt.ylabel("Mean test accuracy")

        # plt.errorbar(
        # x=cv_results["n_features"],
        # y=cv_results["mean_test_score"],
        # yerr=cv_results["std_test_score"],
        #     )
        
        # plt.title("Recursive Feature Elimination \nwith correlated features")
        # plt.show()

    return cv_results

#https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html

In [19]:
cv = run_rfe(LogisticRegression(class_weight='balanced'), 1, 'accuracy', x_train, y, True)

### 4.3 Coefficient & Odds Ratio

Similar to RFE, Coefficient & Odds Ratio order the variables by importance. 

Can compare to RFE to see the variables both say are most important.

In [79]:
def get_coeff_odds_ratio(x, y):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))

    # Coefficients and Odds Ratios
    coefficients = logi.coef_[0]
    odds_ratios = np.exp(coefficients)


    # Display feature importance using coefficients and odds ratios
    feature_importance = pd.DataFrame({
        'Feature': x_train.columns,
        'Coefficient': coefficients,
        'Odds Ratio': odds_ratios
    })
    print("\nFeature Importance (Coefficient and Odds Ratio):")
    print(feature_importance.sort_values(by='Coefficient', ascending=False))

In [81]:
get_coeff_odds_ratio(x_train, y)

### 4.4 Grid Search

Grid Search for Hyperparameters to optimize accuracy and precision.

Grid search info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [83]:
def grid_search(C, penalty, class_weight, solver, x, y):

    param_grid = {'C': C,  
              'penalty': penalty, 
              'class_weight' : class_weight,
              'solver' : solver} 

    grid = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, refit = True, verbose = 3,n_jobs=-1) 
   
    # fitting the model for grid search 
    grid.fit(x, y) 
 
    # print best parameter after tuning 
    print(grid.best_params_) 
    grid_predictions = grid.predict(x) 
   
    # print classification report 
    print(classification_report(y, grid_predictions)) 

In [85]:
grid_search([0.1, 1, 10, 100], ['l1', 'l2', 'elasticnet'], ['balanced'], ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], x_train, y)

In [87]:
def auc_roc(x, y, display_labels):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))
    predicted = logi.predict(x)

    roc_auc_score(predicted, y)

    disp = ConfusionMatrixDisplay(confusion_matrix(predicted, y), display_labels= display_labels)
    disp.plot()

    plt.show()

In [89]:
auc_roc(x_train, y, ['a', 'b'])

## 5. Model Tuning

### 5.1 Feature Elimination

Use only variables from RFE and Coefficient and Odds ratio say are most important 

In [1]:
x_train_new = df[['put_columns_here']]
y_new = df[['put_y_here']] 

x_test_new = df[['put_columns_here']]

In [None]:
run_classification_report(x_train_new, y_new)

### 5.2 SMOTE (Synthetic Minority Over-Sampling Technique)

In cases where one class of data is much smaller than the other you can use SMOTE to create new fake data in the smaller class to balance them out and run a more accurate model. 

In [3]:
def smote(k_neighbors, x, y):

    sm = SMOTE(random_state=1993, k_neighbors=k_neighbors)

    X_res, y_res = sm.fit_resample(x, y)

    logi = LogisticRegression(class_weight='balanced').fit(X_res, np.array(y_res))

    predicted = logi.predict(X_res)

    print(classification_report(predicted, y_res))

In [5]:
smote(5, x_train, y)

### 5.3 Re-run Evaluation Techniques

Rerun RFE, grid search, and AUC/ROC to see if best variables and hyperparameters have changed and rerun the model

In [None]:
run_rfe(LogisticRegression(), 1, 'accuracy', x_train, y, False)

In [None]:
grid_search([0.1, 1, 10, 100], ['l1', 'l2', 'elasticnet'], ['balanced'], ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], x_train, y)

In [None]:
auc_roc(x_train, y, ['Non-renewed', 'Renewed'])            

## 6. Final Output

Make sure the final result is in a useable format for your specific scenario.

In [None]:
df_final = [['put_x_columns_here', 'put_y_here']]