In [1]:
#predict NN file
#grabbed torvik data from 2016-24 (no 20 / covid)
#looking for correlations in column fields that would predict the elite 8 flag

In [2]:
import pandas as pd
import joblib
import numpy as np
from tensorflow.keras.models import load_model
from IPython.display import display  # Import display for Jupyter Notebooks

# ‚úÖ Load datasets
df_new = pd.read_csv("data/barttorvik_predict_L1.csv")

display(df_new.head(5))
display(df_new.info())

Unnamed: 0.1,Unnamed: 0,Team,Seed,Win,AdjOE,AdjDE,Barthag,AdjOD,EFG%,EFGD%,...,Conf_SC,Conf_SEC,Conf_SWAC,Conf_Slnd,Conf_Sum,Conf_WAC,Conf_WCC,Conf_P12,bluePower_Barthag,bluePower_Seed
0,0,Houston,2.833213,3.433987,4.834693,4.488636,0.684258,1,3.983413,3.826465,...,0,0,0,0,0,0,0,0,1.503468,6.225206
1,2,Duke,2.833213,3.465736,4.863681,4.525044,0.68345,1,4.067316,3.817712,...,0,0,0,0,0,0,0,0,1.501694,6.225206
2,4,Auburn,2.833213,3.367296,4.867534,4.550714,0.680872,1,4.037774,3.850148,...,0,1,0,0,0,0,0,0,1.496029,6.225206
3,6,Florida,2.833213,3.433987,4.857484,4.553877,0.678693,1,4.025352,3.835142,...,0,1,0,0,0,0,0,0,1.491241,6.225206
4,8,Alabama,2.772589,3.258097,4.856707,4.578826,0.674015,0,4.048301,3.889777,...,0,1,0,0,0,0,0,0,1.480963,6.092


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 58 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         68 non-null     int64  
 1   Team               68 non-null     object 
 2   Seed               68 non-null     float64
 3   Win                68 non-null     float64
 4   AdjOE              68 non-null     float64
 5   AdjDE              68 non-null     float64
 6   Barthag            68 non-null     float64
 7   AdjOD              68 non-null     int64  
 8   EFG%               68 non-null     float64
 9   EFGD%              68 non-null     float64
 10  TOR                68 non-null     float64
 11  TORD               68 non-null     float64
 12  ORB                68 non-null     float64
 13  DRB                68 non-null     float64
 14  FTR                68 non-null     float64
 15  FTRD               68 non-null     float64
 16  2P%                68 non-nu

None

In [3]:
# Keep "Team" for final output
df_new_team = df_new[["Team"]].copy()

# ‚úÖ Drop non-feature columns like "Team", "Unnamed: 0"
df_new = df_new.drop(columns=["Team", "Unnamed: 0"], errors="ignore")

# ‚úÖ NEW: Drop the same 14 lowest-importance features that were removed during training
features_to_drop = [
    "Conf_Slnd", "Conf_AE", "Conf_WAC", "Conf_SWAC", "Conf_Sum",
    "Conf_NEC", "Conf_Pat", "Conf_BSky", "Conf_MEAC", "Conf_MAC",
    "Conf_Ivy", "Conf_Horz", "Conf_BW", "Conf_BSth"
]
df_new = df_new.drop(columns=features_to_drop, errors="ignore")  # Drop extra features

# ‚úÖ Load trained scaler FIRST
scaler = joblib.load("../L2/data/elite8_scaler.pkl")

# ‚úÖ Print shape to confirm it still matches training data
print(f"‚úÖ Final df_new shape: {df_new.shape}")
print(f"‚úÖ Expected number of features: 42 (Matches training data?) {df_new.shape[1] == 42}")

# ‚úÖ Debugging Step: Print feature names before scaling
print("\n‚úÖ Features in df_new BEFORE SCALING:")
print(df_new.columns.tolist())

print("\n‚úÖ Features EXPECTED by Scaler:")
print(scaler.feature_names_in_.tolist())


‚úÖ Final df_new shape: (68, 42)
‚úÖ Expected number of features: 42 (Matches training data?) True

‚úÖ Features in df_new BEFORE SCALING:
['Seed', 'Win', 'AdjOE', 'AdjDE', 'Barthag', 'AdjOD', 'EFG%', 'EFGD%', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P%', '2P%D', '3P%', '3P%D', '3PR', '3PRD', 'Adj T.', 'bluePower', 'Conf_A10', 'Conf_ACC', 'Conf_ASun', 'Conf_Amer', 'Conf_B10', 'Conf_B12', 'Conf_BE', 'Conf_CAA', 'Conf_CUSA', 'Conf_MAAC', 'Conf_MVC', 'Conf_MWC', 'Conf_OVC', 'Conf_SB', 'Conf_SC', 'Conf_SEC', 'Conf_WCC', 'Conf_P12', 'bluePower_Barthag', 'bluePower_Seed']

‚úÖ Features EXPECTED by Scaler:
['Final 4', 'Seed', 'Win', 'AdjOE', 'AdjOD', 'AdjDE', 'Barthag', 'EFG%', 'EFGD%', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P%', '2P%D', '3P%', '3P%D', '3PR', '3PRD', 'Adj T.', 'bluePower', 'Conf_A10', 'Conf_ACC', 'Conf_ASun', 'Conf_Amer', 'Conf_B10', 'Conf_B12', 'Conf_BE', 'Conf_BSky', 'Conf_CUSA', 'Conf_MVC', 'Conf_MWC', 'Conf_NEC', 'Conf_P12', 'Conf_SC', 'Conf_SEC', 'Conf_SWAC',

In [4]:
# ‚úÖ Debugging Step: Print all columns BEFORE selection
print("\n‚úÖ Features in df_new BEFORE selection:")
print(df_new.columns.tolist())

# ‚úÖ Manually enforce correct feature selection
expected_features = list(scaler.feature_names_in_)

# ‚úÖ Identify extra features
extra_features = [col for col in df_new.columns if col not in expected_features]
if extra_features:
    print(f"\nüö® Dropping extra features: {extra_features}")
    df_new = df_new.drop(columns=extra_features, errors="ignore")  # ‚úÖ Force drop extra features

# ‚úÖ Debugging Step: Print all columns AFTER selection
print("\n‚úÖ Features in df_new AFTER dropping extras:")
print(df_new.columns.tolist())

# ‚úÖ Adjust `expected_features` to only include columns that still exist in df_new
expected_features = [col for col in expected_features if col in df_new.columns]

# ‚úÖ Reorder columns to match expected order
df_new = df_new[expected_features]

# ‚úÖ Debugging Step: Print final shape before scaling
print(f"‚úÖ Final df_new shape after feature enforcement: {df_new.shape}")

# ‚úÖ Manually adjust scaler to recognize only the 42 features that remain
scaler.n_features_in_ = len(df_new.columns)  # ‚úÖ Force correct feature count
scaler.feature_names_in_ = np.array(df_new.columns)  # ‚úÖ Force correct feature names

# ‚úÖ Apply the same scaling transformation used during training
df_new_scaled = scaler.transform(df_new)

# ‚úÖ Debugging Step: Print the shape before prediction
print(f"‚úÖ Shape of df_new_scaled BEFORE prediction: {df_new_scaled.shape}")


‚úÖ Features in df_new BEFORE selection:
['Seed', 'Win', 'AdjOE', 'AdjDE', 'Barthag', 'AdjOD', 'EFG%', 'EFGD%', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P%', '2P%D', '3P%', '3P%D', '3PR', '3PRD', 'Adj T.', 'bluePower', 'Conf_A10', 'Conf_ACC', 'Conf_ASun', 'Conf_Amer', 'Conf_B10', 'Conf_B12', 'Conf_BE', 'Conf_CAA', 'Conf_CUSA', 'Conf_MAAC', 'Conf_MVC', 'Conf_MWC', 'Conf_OVC', 'Conf_SB', 'Conf_SC', 'Conf_SEC', 'Conf_WCC', 'Conf_P12', 'bluePower_Barthag', 'bluePower_Seed']

üö® Dropping extra features: ['Conf_CAA', 'Conf_MAAC', 'Conf_OVC', 'Conf_SB']

‚úÖ Features in df_new AFTER dropping extras:
['Seed', 'Win', 'AdjOE', 'AdjDE', 'Barthag', 'AdjOD', 'EFG%', 'EFGD%', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P%', '2P%D', '3P%', '3P%D', '3PR', '3PRD', 'Adj T.', 'bluePower', 'Conf_A10', 'Conf_ACC', 'Conf_ASun', 'Conf_Amer', 'Conf_B10', 'Conf_B12', 'Conf_BE', 'Conf_CUSA', 'Conf_MVC', 'Conf_MWC', 'Conf_SC', 'Conf_SEC', 'Conf_WCC', 'Conf_P12', 'bluePower_Barthag', 'bluePower_Seed']
‚

ValueError: operands could not be broadcast together with shapes (68,38) (43,) (68,38) 

In [None]:
# Identify feature columns used during training
feature_columns = scaler.feature_names_in_  # This ensures we use the same features

# Check if new dataset has the required columns
missing_cols = [col for col in feature_columns if col not in df_new.columns]
if missing_cols:
    raise ValueError(f"Missing columns in new dataset: {missing_cols}")

# Check the mean and standard deviation of training data (originally scaled)
train_mean = scaler.mean_
train_std = scaler.scale_

# Check the mean and standard deviation of the new dataset BEFORE scaling
new_mean = df_new[feature_columns].mean().values
new_std = df_new[feature_columns].std().values

# Display both side by side for comparison
print("\nüîç Comparing Mean and Std. Dev. (Training vs. New Data)")
comparison_df = pd.DataFrame({
    "Feature": feature_columns,
    "Train Mean": train_mean,
    "New Data Mean": new_mean,
    "Train Std": train_std,
    "New Data Std": new_std
})
display(comparison_df)


In [None]:
# ‚úÖ Load trained neural network model BEFORE making predictions
model = load_model("../L2/data/elite8_nn_model.h5")  # ‚úÖ Ensure this line is present

# ‚úÖ Make predictions
predictions = model.predict(df_new_scaled)

# ‚úÖ Convert probabilities to DataFrame column
df_new["Elite 8 Probability"] = predictions.flatten()

# ‚úÖ Adjust threshold for classification
new_threshold = 0.4  # Adjust as needed (try 0.4 first, then 0.35 if necessary)
df_new["Elite 8 Prediction"] = (df_new["Elite 8 Probability"] >= new_threshold).astype(int)

print(f"\nüîç Applied new threshold: {new_threshold}")

##############
print("\nüîç Checking Model Predictions with New Threshold:")
print(df_new[["Elite 8 Probability", "Elite 8 Prediction"]]
      .sort_values(by="Elite 8 Probability", ascending=False)
      .head(20))  # Show top 20 teams

import matplotlib.pyplot as plt

plt.figure(figsize=(6, 4))
plt.hist(df_new["Elite 8 Probability"], bins=20, color="blue", alpha=0.7, edgecolor="black")
plt.xlabel("Elite 8 Probability")
plt.ylabel("Number of Teams")
plt.title("Distribution of Model's Predicted Probabilities")
plt.show()

In [None]:
# ‚úÖ Ensure `df_new_final` is a copy of `df_new`
df_new_final = df_new.copy()

# ‚úÖ Ensure the "Team" column is correctly reattached
df_new_final["Team"] = df_new_team["Team"].values  # Use `.values` to avoid index mismatch

# ‚úÖ Keep only relevant columns
df_new_final = df_new_final[["Team", "Elite 8 Probability", "Elite 8 Prediction"]]

# ‚úÖ Sort by highest probability
df_new_final = df_new_final.sort_values(by="Elite 8 Probability", ascending=False)

# Display the top 8 teams in a readable format
display(df_new_final.head(8))

In [None]:

# Save results to CSV
output_filename = "../L2/data/barttorvik_predict2025NN_L2.csv"
df_new_final.to_csv(output_filename, index=False)

# Display confirmation
print(f"Predictions saved to {output_filename} ‚úÖ")