In [2]:
# Bringing in the data

import pandas as pd

# Read the CSV file into a DataFrame
df_mm = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/mm_data.csv")

df_p = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/p_data.csv")

In [4]:
# defining the cross validation technique for our use case

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

def logistic_cv(df, n):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    """
    results = []

    # Create logistic regression model
    model = LogisticRegression()

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_train = train_data['Champion']

        # Getting selected features for training
        rfe = RFE(model, n_features_to_select = n)  # Update n_features_to_select
        rfe.fit(X_train, y_train)  # Fit the RFE model
        selected_features = X_train.columns[rfe.support_]
        X_train = X_train[selected_features]
        
        X_test = test_data[selected_features]
        y_test = test_data['Champion']

        # Give selected_feature if n = 1
        if len(selected_features) == 1:
            print(selected_features)

        # If more than 1 feature is selected, calculate VIF
        if len(selected_features) > 1:
            vif_data = pd.DataFrame()
            vif_data["Feature"] = X_train.columns
            vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
            print(vif_data)
        
        # Train the model using Logistic Regression
        model.fit(X_train, y_train)
        
        # Make predictions on the test data
        y_pred = model.predict_proba(X_test)
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]
        
        # Find the team with the highest predicted score for level 4
        predicted_champion = test_data.iloc[y_pred[:, 1].argmax()]['Team']
        
        # Check if predicted champion matches the actual champion
        if predicted_champion == actual_champion:
            results.append(1)
        else:
            results.append(0)
    
    accuracy = sum(results) / len(results)
    print(f"Accuracy in predicting the champion: {accuracy * 100:.2f}%")
    return accuracy

In [56]:
ns = [1,2,3,4,5]

for n in ns:
    logistic_cv(df_mm, n)

Index(['W'], dtype='object')
Index(['eFG%'], dtype='object')
Index(['W'], dtype='object')
Index(['eFG%'], dtype='object')
Index(['AST'], dtype='object')
Index(['eFG%'], dtype='object')
Index(['2P%'], dtype='object')
Index(['FG%'], dtype='object')
Index(['eFG%'], dtype='object')
Index(['FG%'], dtype='object')
Accuracy in predicting the champion: 40.00%
  Feature       VIF
0     Age  4.674057
1       W  4.674057
  Feature       VIF
0       W  8.250266
1    eFG%  8.250266
  Feature       VIF
0       W  9.081993
1    eFG%  9.081993
  Feature       VIF
0       W  7.717541
1    eFG%  7.717541
  Feature       VIF
0     AST  4.073269
1    eFG%  4.073269
  Feature       VIF
0     AST  5.490839
1    eFG%  5.490839
  Feature       VIF
0     2P%  1.707784
1       L  1.707784
  Feature        VIF
0      FG  10.341901
1     FG%  10.341901
  Feature        VIF
0     FG%  19.802263
1    eFG%  19.802263
  Feature       VIF
0     FG%  1.762679
1       L  1.762679
Accuracy in predicting the champion: 30.

In [6]:
# defining the cross validation technique for our use case

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

def logistic_top5(df, selected_features):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    """
    results = []

    # Create logistic regression model
    model = LogisticRegression()

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data[selected_features]
        y_train = train_data['Champion']
        
        X_test = test_data[selected_features]
        y_test = test_data['Champion']

        # If more than 1 feature is selected, calculate VIF
        if len(selected_features) > 1:
            vif_data = pd.DataFrame()
            vif_data["Feature"] = X_train.columns
            vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
            print(vif_data)
        
        # Train the model using Logistic Regression
        model.fit(X_train, y_train)
        
        # Make predictions on the test data
        y_pred = model.predict_proba(X_test)

        # Rank teams based on predicted scores
        ranked_teams = test_data[['Team']].copy()
        ranked_teams['Score'] = y_pred[:, 1]
        ranked_teams = ranked_teams.sort_values(by='Score', ascending=False)
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]

        # Get top 5 teams and their scores
        top_teams = ranked_teams.head(5)
        top_teams_list = top_teams.apply(lambda row: f"{row['Team']}: {row['Score']:.4f}", axis=1).tolist()
        
        # Store results
        results.append([test_year, actual_champion] + top_teams_list)
    
    # Create dataframe with results
    columns = ['Year', 'Actual Champion', 'First Pick', 'Second Pick', 'Third Pick', 'Fourth Pick', 'Fifth Pick']
    results_df = pd.DataFrame(results, columns=columns)
    results_df = results_df.sort_values(by="Year", ascending=False)

    # Compute accuracy
    accuracy = (results_df['Actual Champion'] == results_df['First Pick'].str.split(': ').str[0]).mean()
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    
    return results_df

In [8]:
selected_features = ['W', 'eFG%']

logistic_top5(df_mm, selected_features)

  Feature     VIF
0       W  7.7102
1    eFG%  7.7102
  Feature       VIF
0       W  8.250266
1    eFG%  8.250266
  Feature       VIF
0       W  9.081993
1    eFG%  9.081993
  Feature       VIF
0       W  7.717541
1    eFG%  7.717541
  Feature      VIF
0       W  7.93086
1    eFG%  7.93086
  Feature       VIF
0       W  9.962458
1    eFG%  9.962458
  Feature       VIF
0       W  8.377401
1    eFG%  8.377401
  Feature       VIF
0       W  8.606272
1    eFG%  8.606272
  Feature       VIF
0       W  7.979711
1    eFG%  7.979711
  Feature       VIF
0       W  8.130166
1    eFG%  8.130166
Model Accuracy: 50.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 0.0776,Oklahoma City Thunder: 0.0629,Indiana Pacers: 0.0549,Denver Nuggets: 0.0540,Minnesota Timberwolves: 0.0507
8,2022,Denver Nuggets,Denver Nuggets: 0.0757,Boston Celtics: 0.0730,Sacramento Kings: 0.0654,Philadelphia 76ers: 0.0637,Milwaukee Bucks: 0.0602
7,2021,Golden State Warriors,Phoenix Suns: 0.0891,Golden State Warriors: 0.0708,Utah Jazz: 0.0678,Denver Nuggets: 0.0674,Miami Heat: 0.0637
6,2020,Milwaukee Bucks,Brooklyn Nets: 0.0940,Utah Jazz: 0.0906,Phoenix Suns: 0.0889,Los Angeles Clippers: 0.0776,Milwaukee Bucks: 0.0772
5,2019,Los Angeles Lakers,Milwaukee Bucks: 0.1231,Los Angeles Lakers: 0.0849,Utah Jazz: 0.0800,Miami Heat: 0.0760,Toronto Raptors: 0.0748
4,2018,Toronto Raptors,Golden State Warriors: 0.1217,Milwaukee Bucks: 0.0987,Toronto Raptors: 0.0808,Houston Rockets: 0.0683,Utah Jazz: 0.0575
3,2017,Golden State Warriors,Golden State Warriors: 0.1075,Houston Rockets: 0.0939,Toronto Raptors: 0.0612,Cleveland Cavaliers: 0.0539,Philadelphia 76ers: 0.0450
2,2016,Golden State Warriors,Golden State Warriors: 0.1048,Houston Rockets: 0.0571,Cleveland Cavaliers: 0.0530,San Antonio Spurs: 0.0476,Los Angeles Clippers: 0.0449
1,2015,Cleveland Cavaliers,Golden State Warriors: 0.1040,San Antonio Spurs: 0.0573,Cleveland Cavaliers: 0.0453,Oklahoma City Thunder: 0.0434,Los Angeles Clippers: 0.0416
0,2014,Golden State Warriors,Golden State Warriors: 0.0858,Atlanta Hawks: 0.0626,Los Angeles Clippers: 0.0609,San Antonio Spurs: 0.0494,Cleveland Cavaliers: 0.0487


In [None]:
# the model with W and eFG% seems to be the best for logistic regression

In [10]:
selected_features = ['W', 'eFG%']

logistic_top5(df_p, selected_features)

  Feature       VIF
0       W  6.048703
1    eFG%  6.048703
  Feature       VIF
0       W  6.736676
1    eFG%  6.736676
  Feature       VIF
0       W  6.500348
1    eFG%  6.500348
  Feature       VIF
0       W  6.474259
1    eFG%  6.474259
  Feature       VIF
0       W  6.925942
1    eFG%  6.925942
  Feature       VIF
0       W  7.140512
1    eFG%  7.140512
  Feature       VIF
0       W  6.862114
1    eFG%  6.862114
  Feature       VIF
0       W  7.314591
1    eFG%  7.314591
  Feature       VIF
0       W  7.321538
1    eFG%  7.321538
  Feature       VIF
0       W  6.613574
1    eFG%  6.613574
Model Accuracy: 40.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 0.0889,Oklahoma City Thunder: 0.0773,Denver Nuggets: 0.0637,Milwaukee Bucks: 0.0580,Indiana Pacers: 0.0557
8,2022,Denver Nuggets,Denver Nuggets: 0.0792,Boston Celtics: 0.0753,Sacramento Kings: 0.0678,Philadelphia 76ers: 0.0666,Milwaukee Bucks: 0.0606
7,2021,Golden State Warriors,Phoenix Suns: 0.0799,Golden State Warriors: 0.0756,Miami Heat: 0.0693,Utah Jazz: 0.0635,Denver Nuggets: 0.0625
6,2020,Milwaukee Bucks,Brooklyn Nets: 0.0812,Phoenix Suns: 0.0799,Utah Jazz: 0.0784,Los Angeles Clippers: 0.0687,Milwaukee Bucks: 0.0684
5,2019,Los Angeles Lakers,Milwaukee Bucks: 0.0951,Los Angeles Lakers: 0.0732,Toronto Raptors: 0.0671,Utah Jazz: 0.0627,Miami Heat: 0.0600
4,2018,Toronto Raptors,Milwaukee Bucks: 0.0924,Golden State Warriors: 0.0884,Toronto Raptors: 0.0847,Houston Rockets: 0.0692,Utah Jazz: 0.0591
3,2017,Golden State Warriors,Houston Rockets: 0.0938,Golden State Warriors: 0.0894,Toronto Raptors: 0.0786,Cleveland Cavaliers: 0.0712,Philadelphia 76ers: 0.0625
2,2016,Golden State Warriors,Golden State Warriors: 0.0939,Houston Rockets: 0.0793,Cleveland Cavaliers: 0.0706,Los Angeles Clippers: 0.0650,San Antonio Spurs: 0.0621
1,2015,Cleveland Cavaliers,Golden State Warriors: 0.0939,San Antonio Spurs: 0.0863,Cleveland Cavaliers: 0.0759,Oklahoma City Thunder: 0.0696,Los Angeles Clippers: 0.0666
0,2014,Golden State Warriors,Golden State Warriors: 0.0847,Atlanta Hawks: 0.0763,Los Angeles Clippers: 0.0736,Houston Rockets: 0.0631,San Antonio Spurs: 0.0614
