# Ticket Prediction 25/26

## 0. Import libraries (& Redshift credentials if in VSCode)

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import null
from datetime import datetime

from typing import Dict
from concurrent.futures import ThreadPoolExecutor

import missingno as msno
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFECV
import random

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPRegressor

## 1. Missing Data Evaluation

### 1.1 Import data

For large data sets grab from the warehouse once and then download as a CSV to pull from after. 

In [2]:
df = pd.read_csv("C:\\Users\\riffere\\Florida Panthers\\SP-BS - Documents\\Data Science\\Resources\\Files\\emily_query_data_2025-07-17.csv")

df_2526 = pd.read_csv("C:\\Users\\riffere\\Desktop\\ticket_data_2526.csv")

df_regular_season = df[df['game_type'] == 1]

In [3]:
result = (
    df_regular_season
    .groupby([
        'season', 'event_datetime', 'tier', 'abbreviation', 'div', 'is_conf',
        'start_time', 'day_of_week', 'is_premier', 'trimester', 'original_six_plus_extra',
        'days_out_from_event', 'ticket_type', 'price_level', 'arena_level_internal'
    ])
    .agg(
        gross_revenue_sum=('gross_revenue', 'sum'),
        row_count=('gross_revenue', 'size')
    )
    .reset_index()
)

### 1.2 Look for missing data

See how many missing values are in each column

Look for correlations in the missing data to determine MCAR MAR or MNAR

If missing values are MCAR then you can delete rows. If a large % of a column is missing (and the feature is not significant) then you can delete columns.

Else impute:

* For non time series: Can impute with mean, median, mode, etc
* For time series: ffill or bfill

Missing data info: https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python

In [4]:
time_group_map = {
    '12:30 PM': 'early',
    '1:00 PM': 'early',
    '2:00 PM': 'early',
    '3:00 PM': 'early',
    '4:00 PM': 'afternoon',
    '5:00 PM': 'afternoon',
    '6:00 PM': 'afternoon',
    '6:30 PM': 'afternoon',
    '7:00 PM': 'evening',
    '7:30 PM': 'evening',
    '7:45 PM': 'evening',
    '8:00 PM': 'evening'
}

result['time_group'] = result['start_time'].map(time_group_map)
df_2526['time_group'] = df_2526['start_time'].map(time_group_map)

day_map = {
    'Sat': 'is_saturday',
    'Fri': 'is_friday_or_sunday',
    'Sun': 'is_friday_or_sunday',
    'Mon': 'is_weekday',
    'Tue': 'is_weekday',
    'Wed': 'is_weekday',
    'Thu': 'is_weekday',
}

result['day_of_week_group'] = result['day_of_week'].map(day_map)
df_2526['day_of_week_group'] = df_2526['day_of_week'].map(day_map)

result['is_premier'] = result['is_premier'].astype(int)
df_2526['is_premier'] = df_2526['is_premier'].astype(int)

df_encoded = pd.get_dummies(result, columns=['tier', 'arena_level_internal','day_of_week_group', 'time_group'], prefix = '', prefix_sep = '')
df_encoded_2526 = pd.get_dummies(df_2526, columns=['tier', 'arena_level_internal','day_of_week_group', 'time_group'], prefix = '', prefix_sep = '')

# capacity, price, velocity

In [5]:
df_encoded

Unnamed: 0,season,event_datetime,abbreviation,div,is_conf,start_time,day_of_week,is_premier,trimester,original_six_plus_extra,...,Lowers,Premium,Suites,Uppers,is_friday_or_sunday,is_saturday,is_weekday,afternoon,early,evening
0,2021-22,2021-10-14 19:00:00,PIT,2,1,7:00 PM,Thu,1,1,0.75,...,False,False,True,False,False,False,True,False,False,True
1,2021-22,2021-10-14 19:00:00,PIT,2,1,7:00 PM,Thu,1,1,0.75,...,False,False,True,False,False,False,True,False,False,True
2,2021-22,2021-10-14 19:00:00,PIT,2,1,7:00 PM,Thu,1,1,0.75,...,False,False,True,False,False,False,True,False,False,True
3,2021-22,2021-10-14 19:00:00,PIT,2,1,7:00 PM,Thu,1,1,0.75,...,True,False,False,False,False,False,True,False,False,True
4,2021-22,2021-10-14 19:00:00,PIT,2,1,7:00 PM,Thu,1,1,0.75,...,True,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279738,2024-25,2025-04-14 19:00:00,NYR,2,1,7:00 PM,Mon,1,3,1.50,...,True,False,False,False,False,False,True,False,False,True
279739,2024-25,2025-04-14 19:00:00,NYR,2,1,7:00 PM,Mon,1,3,1.50,...,True,False,False,False,False,False,True,False,False,True
279740,2024-25,2025-04-14 19:00:00,NYR,2,1,7:00 PM,Mon,1,3,1.50,...,True,False,False,False,False,False,True,False,False,True
279741,2024-25,2025-04-14 19:00:00,NYR,2,1,7:00 PM,Mon,1,3,1.50,...,False,True,False,False,False,False,True,False,False,True


## 2. Feature Evaluation

### 2.1 Correlation Coefficients

Correlation Coefficients show how related two variables are to see if there is collinearity and if one or multiple columns could be removed. 

The closer to 1 or -1 a correlation coefficient is the more the two variables are correlated. The closer to 0 the less they are correlated.

Correlation info: https://realpython.com/numpy-scipy-pandas-correlation-python/

Interaction Variables: If certain variables are highly correlated you can combine them or variables of that share similar data to create one variable. (ie. Attendance, Spend, Affinity, or Engagement scores)

## 3. Train & Run Model

### 3.1 Spit data into train & test tables

Sometimes the split will be on season, other times train_test_split from sklearn can be used.

train_test_split info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [42]:
x_train = df_encoded[['is_premier', 'trimester', 'original_six_plus_extra',
        'days_out_from_event', 'early','afternoon','evening','is_weekday',
        'is_friday_or_sunday','is_saturday', 'A','B','C','D','E']].astype(int)

x_test = df_encoded_2526[['is_premier', 'trimester', 'original_six_plus_extra',
        'days_out', 'early','afternoon','evening','is_weekday',
        'is_friday_or_sunday','is_saturday', 'A','B','C','D','E']].astype(int)

y = df_encoded['row_count']

In [43]:
x_train

Unnamed: 0,is_premier,trimester,original_six_plus_extra,days_out_from_event,early,afternoon,evening,is_weekday,is_friday_or_sunday,is_saturday,A,B,C,D,E
0,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0
1,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0
2,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0
3,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279738,1,3,1,285,0,0,1,1,0,0,0,1,0,0,0
279739,1,3,1,285,0,0,1,1,0,0,0,1,0,0,0
279740,1,3,1,285,0,0,1,1,0,0,0,1,0,0,0
279741,1,3,1,285,0,0,1,1,0,0,0,1,0,0,0


In [44]:
scaler = StandardScaler()

x_train_scale = scaler.fit_transform(x_train)
x_test_scale = scaler.fit_transform(x_test)

y_log = np.nan_to_num(np.log(np.array(y).ravel()), nan=0.0, posinf=0.0, neginf=0.0)

### 3.2 Write model

Model selection is very important, first you must figure out what your output variable you want which tells you what kind of model you want (classification, clustering, regression). Then you can look at specific model types in that model group. 

Sometimes multiple will need to be tested before settling on the best model for your current data and situation.

Model selection info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

### 3.3 Run model

In [45]:
regr = MLPRegressor(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.001,
    batch_size=512,
    learning_rate_init=0.001,
    max_iter=300,
    early_stopping=True,
    random_state=1993,
    verbose=True
)

regr.fit(x_train_scale, y_log) 

Iteration 1, loss = 0.54416344
Validation score: 0.015254
Iteration 2, loss = 0.49631370
Validation score: 0.022329
Iteration 3, loss = 0.49316112
Validation score: 0.026648
Iteration 4, loss = 0.49075049
Validation score: 0.030868
Iteration 5, loss = 0.48783794
Validation score: 0.040715
Iteration 6, loss = 0.48505568
Validation score: 0.044538
Iteration 7, loss = 0.48198598
Validation score: 0.049915
Iteration 8, loss = 0.47915726
Validation score: 0.055806
Iteration 9, loss = 0.47689137
Validation score: 0.062298
Iteration 10, loss = 0.47468237
Validation score: 0.065622
Iteration 11, loss = 0.47298710
Validation score: 0.068662
Iteration 12, loss = 0.47141164
Validation score: 0.074651
Iteration 13, loss = 0.46984619
Validation score: 0.074461
Iteration 14, loss = 0.46860559
Validation score: 0.072982
Iteration 15, loss = 0.46779864
Validation score: 0.080822
Iteration 16, loss = 0.46653179
Validation score: 0.081653
Iteration 17, loss = 0.46598649
Validation score: 0.080391
Iterat

In [None]:
regr.score(x_train_scale,y)

0.19133249436451993

## 4.1 Model evaluation

### 4.1 Classification Reports

(For classification models) Run classification report to see precision, recall, and fl score breakdown by group

Classification report info: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

In [16]:
def run_classification_report(x,y):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))

    predicted = logi.predict(x)

    print(classification_report(predicted, y))

In [17]:
run_classification_report(x_train, y)

### 4.2 Recursive Feature Elimination (with cross validation)

Use Recursive Feature Elimination (with cross validation) to order the variables on most important to the classification to the least important. 

Especially useful for large datasets when you want to limit the number of variables in your model.

Below is a plot of the mean accuracy based on every possible number of features used in the model. 

It can be used to determine the optimal number of features without outside constraints placed on the model.

RFECV info: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

In [18]:
def run_rfe(estimator, step, scoring, x, y, show_plot):
    
    selector = RFECV(estimator = estimator, step = step, scoring = scoring)
    selector.fit(x, np.array(y)).n_features_

    rfe_features = x.columns[selector.support_]

    if show_plot == True:

        cv_results = pd.DataFrame(selector.cv_results_)
        
        # plt.figure()
        # plt.xlabel("Number of features selected")
        # plt.ylabel("Mean test accuracy")

        # plt.errorbar(
        # x=cv_results["n_features"],
        # y=cv_results["mean_test_score"],
        # yerr=cv_results["std_test_score"],
        #     )
        
        # plt.title("Recursive Feature Elimination \nwith correlated features")
        # plt.show()

    return cv_results

#https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html

In [19]:
cv = run_rfe(LogisticRegression(class_weight='balanced'), 1, 'accuracy', x_train, y, True)

### 4.3 Coefficient & Odds Ratio

Similar to RFE, Coefficient & Odds Ratio order the variables by importance. 

Can compare to RFE to see the variables both say are most important.

In [79]:
def get_coeff_odds_ratio(x, y):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))

    # Coefficients and Odds Ratios
    coefficients = logi.coef_[0]
    odds_ratios = np.exp(coefficients)


    # Display feature importance using coefficients and odds ratios
    feature_importance = pd.DataFrame({
        'Feature': x_train.columns,
        'Coefficient': coefficients,
        'Odds Ratio': odds_ratios
    })
    print("\nFeature Importance (Coefficient and Odds Ratio):")
    print(feature_importance.sort_values(by='Coefficient', ascending=False))

In [81]:
get_coeff_odds_ratio(x_train, y)

### 4.4 Grid Search

Grid Search for Hyperparameters to optimize accuracy and precision.

Grid search info: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [83]:
def grid_search(C, penalty, class_weight, solver, x, y):

    param_grid = {'C': C,  
              'penalty': penalty, 
              'class_weight' : class_weight,
              'solver' : solver} 

    grid = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, refit = True, verbose = 3,n_jobs=-1) 
   
    # fitting the model for grid search 
    grid.fit(x, y) 
 
    # print best parameter after tuning 
    print(grid.best_params_) 
    grid_predictions = grid.predict(x) 
   
    # print classification report 
    print(classification_report(y, grid_predictions)) 

In [85]:
grid_search([0.1, 1, 10, 100], ['l1', 'l2', 'elasticnet'], ['balanced'], ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], x_train, y)

In [87]:
def auc_roc(x, y, display_labels):

    logi = LogisticRegression(class_weight='balanced').fit(x, np.array(y))
    predicted = logi.predict(x)

    roc_auc_score(predicted, y)

    disp = ConfusionMatrixDisplay(confusion_matrix(predicted, y), display_labels= display_labels)
    disp.plot()

    plt.show()

In [89]:
auc_roc(x_train, y, ['a', 'b'])

## 5. Model Tuning

### 5.1 Feature Elimination

Use only variables from RFE and Coefficient and Odds ratio say are most important 

In [1]:
x_train_new = df[['put_columns_here']]
y_new = df[['put_y_here']] 

x_test_new = df[['put_columns_here']]

In [None]:
run_classification_report(x_train_new, y_new)

### 5.2 SMOTE (Synthetic Minority Over-Sampling Technique)

In cases where one class of data is much smaller than the other you can use SMOTE to create new fake data in the smaller class to balance them out and run a more accurate model. 

In [3]:
def smote(k_neighbors, x, y):

    sm = SMOTE(random_state=1993, k_neighbors=k_neighbors)

    X_res, y_res = sm.fit_resample(x, y)

    logi = LogisticRegression(class_weight='balanced').fit(X_res, np.array(y_res))

    predicted = logi.predict(X_res)

    print(classification_report(predicted, y_res))

In [5]:
smote(5, x_train, y)

### 5.3 Re-run Evaluation Techniques

Rerun RFE, grid search, and AUC/ROC to see if best variables and hyperparameters have changed and rerun the model

In [None]:
run_rfe(LogisticRegression(), 1, 'accuracy', x_train, y, False)

In [None]:
grid_search([0.1, 1, 10, 100], ['l1', 'l2', 'elasticnet'], ['balanced'], ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], x_train, y)

In [None]:
auc_roc(x_train, y, ['Non-renewed', 'Renewed'])            

## 6. Final Output

Make sure the final result is in a useable format for your specific scenario.

In [None]:
df_final = [['put_x_columns_here', 'put_y_here']]