In [6]:
# Bringing in the data

import pandas as pd

# Read the CSV file into a DataFrame
df_mm = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/mm_data.csv")

df_p = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/p_data.csv")

In [99]:
# defining the cross validation technique for our use case

import pandas as pd
from sklearn.model_selection import train_test_split

def XGBoostRanker_cv(df, model):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    """
    results = []

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_train = train_data['Levels']
        
        X_test = test_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_test = test_data['Levels']
        
        # Get the group sizes (number of teams per year)
        group_train = train_data.groupby('Year').size().to_numpy()
        group_test = test_data.groupby('Year').size().to_numpy()
        
        # Create DMatrix for training and testing, which includes the group information
        dtrain = xgb.DMatrix(X_train, label=y_train, group=group_train)
        dtest = xgb.DMatrix(X_test, label=y_test, group=group_test)
        
        # Train the model using XGBoost Ranker (using the DMatrix directly)
        model.fit(X_train, y_train, group=group_train)
        
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]
        
        # Find the team with the highest predicted score for level 4
        predicted_champion = test_data.iloc[y_pred.argmax()]['Team']
        
        # Check if predicted champion matches the actual champion
        if predicted_champion == actual_champion:
            results.append(1)
        else:
            results.append(0)
    
    accuracy = sum(results) / len(results)
    #print(f"Accuracy in predicting the champion: {accuracy * 100:.2f}%")
    return accuracy

In [101]:
# defining function for tuning XGBoostRanker

def ranking_tuner(obj, df):
    """
    Function to tune hyperparameters for XGBoost Ranker using cross-validation.
    This function performs a grid search over max_depth, learning_rate, and n_estimators.
    """
    # Create an empty list to store the results
    results_list = []

    # Triple loop over all combinations of max_depth, learning_rate, and n_estimators
    for max_depth in max_depth_values:
        for learning_rate in learning_rate_values:
            for n_estimators in n_estimators_values:

                # Initialize the XGBRanker model with the current combination of hyperparameters
                model = xgb.XGBRanker(
                    objective=obj,  # Using NDCG for ranking
                    max_depth=max_depth,
                    learning_rate=learning_rate,
                    n_estimators=n_estimators
                )
                
                # Run cross-validation and get the accuracy for the current combination
                accuracy = XGBoostRanker_cv(df, model)
                
                # Store the result with the corresponding parameters
                results_list.append({
                    'max_depth': max_depth,
                    'learning_rate': learning_rate,
                    'n_estimators': n_estimators,
                    'accuracy': accuracy
                })

    # Convert the results to a DataFrame
    results_df = pd.DataFrame(results_list)

    # Sort the results by accuracy in descending order
    results_df = results_df.sort_values(by='accuracy', ascending=False)

    return results_df

In [107]:
#Using all four combinations of methods

# Define the parameter ranges to tune
max_depth_values = [2, 3, 4, 5, 6, 7, 8, 9]         # Depth of the trees
learning_rate_values = [0.01, 0.05, 0.1, .15, 0.2]  # Learning rate
n_estimators_values = [30, 50, 100, 150, 200]    # Number of boosting rounds

results1 = ranking_tuner('rank:ndcg', df_mm)
results2 = ranking_tuner('rank:pairwise', df_mm)
results3 = ranking_tuner('rank:ndcg', df_p)
results4 = ranking_tuner('rank:pairwise', df_p)

In [119]:
# Min Max on rank:ndcg
results1.head(20)

Unnamed: 0,max_depth,learning_rate,n_estimators,accuracy
60,4,0.1,30,0.5
80,5,0.05,30,0.5
76,5,0.01,50,0.5
75,5,0.01,30,0.5
71,4,0.2,50,0.4
141,7,0.15,50,0.4
143,7,0.15,150,0.4
79,5,0.01,200,0.4
78,5,0.01,150,0.4
146,7,0.2,50,0.4


In [121]:
# Min Max on rank:pairwise
results2.head(20)

Unnamed: 0,max_depth,learning_rate,n_estimators,accuracy
115,6,0.15,30,0.5
17,2,0.15,100,0.5
199,9,0.2,200,0.4
149,7,0.2,200,0.4
133,7,0.05,150,0.4
134,7,0.05,200,0.4
66,4,0.15,50,0.4
137,7,0.1,100,0.4
138,7,0.1,150,0.4
139,7,0.1,200,0.4


In [123]:
# Percentile on rank:ndcg
results3.head(20)

Unnamed: 0,max_depth,learning_rate,n_estimators,accuracy
21,2,0.2,50,0.4
79,5,0.01,200,0.4
17,2,0.15,100,0.4
53,4,0.01,150,0.4
154,8,0.01,200,0.4
50,4,0.01,30,0.4
179,9,0.01,200,0.4
129,7,0.01,200,0.4
104,6,0.01,200,0.4
54,4,0.01,200,0.4


In [125]:
# Percentile on rank:pairwise
results4.head(20)

Unnamed: 0,max_depth,learning_rate,n_estimators,accuracy
76,5,0.01,50,0.5
117,6,0.15,100,0.5
92,5,0.15,100,0.5
93,5,0.15,150,0.5
86,5,0.1,50,0.5
80,5,0.05,30,0.5
75,5,0.01,30,0.5
72,4,0.2,100,0.5
70,4,0.2,30,0.5
60,4,0.1,30,0.5


In [None]:
# more investigation into model 4 to identify the best model

Model 117: Depth 6, 0.15 learning rate, 100 n_estimators — A solid model with a good balance between complexity and regularization.
Model 143: Depth 7, 0.15 learning rate, 150 n_estimators — Powerful but may need careful overfitting checks.
Model 92: Depth 5, 0.15 learning rate, 100 n_estimators — This might be a more stable model with fewer chances of overfitting.
Model 93: Depth 5, 0.15 learning rate, 150 n_estimators — Stronger model but needs to be watched for overfitting.

In [153]:
# creating a function to give our top pick for each year along with their scores

import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

def XGBoostRanker_top5(df, model):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    Returns a dataframe with top 5 predicted teams and their scores for each year.
    """
    results = []

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_train = train_data['Levels']
        
        X_test = test_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_test = test_data['Levels']
        
        # Get the group sizes (number of teams per year)
        group_train = train_data.groupby('Year').size().to_numpy()
        
        # Train the model using XGBoost Ranker
        model.fit(X_train, y_train, group=group_train)
        
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Rank teams based on predicted scores
        ranked_teams = test_data[['Team']].copy()
        ranked_teams['Score'] = y_pred
        ranked_teams = ranked_teams.sort_values(by='Score', ascending=False)

        # Get actual champion (team with Level == 4)
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]
        
        # Get top 5 teams and their scores
        top_teams = ranked_teams.head(5)
        top_teams_list = top_teams.apply(lambda row: f"{row['Team']}: {row['Score']:.4f}", axis=1).tolist()
        
        # Store results
        results.append([test_year, actual_champion] + top_teams_list)
    
    # Create dataframe with results
    columns = ['Year', 'Actual Champion', 'First Pick', 'Second Pick', 'Third Pick', 'Fourth Pick', 'Fifth Pick']
    results_df = pd.DataFrame(results, columns=columns)
    results_df = results_df.sort_values(by="Year", ascending=False)

    # Compute accuracy
    accuracy = (results_df['Actual Champion'] == results_df['First Pick'].str.split(': ').str[0]).mean()
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    
    return results_df

In [None]:
# more investigation into model 4 to identify the best model

Model 117: Depth 6, 0.15 learning rate, 100 n_estimators — A solid model with a good balance between complexity and regularization.
Model 143: Depth 7, 0.15 learning rate, 150 n_estimators — Powerful but may need careful overfitting checks.
Model 92: Depth 5, 0.15 learning rate, 100 n_estimators — This might be a more stable model with fewer chances of overfitting.
Model 93: Depth 5, 0.15 learning rate, 150 n_estimators — Stronger model but needs to be watched for overfitting.

In [8]:
model_117 = xgb.XGBRanker(objective='rank:pairwise',  # Using pairwise for ranking
                      max_depth=6,
                      learning_rate=0.15,
                      n_estimators=100)

model_143 = xgb.XGBRanker(objective='rank:pairwise',  # Using pairwise for ranking
                      max_depth=7,
                      learning_rate=0.15,
                      n_estimators=150)

model_92 = xgb.XGBRanker(objective='rank:pairwise',  # Using pairwise for ranking
                      max_depth=5,
                      learning_rate=0.15,
                      n_estimators=100)

model_93 = xgb.XGBRanker(objective='rank:pairwise',  # Using pairwise for ranking
                      max_depth=5,
                      learning_rate=0.15,
                      n_estimators=150)

In [157]:
XGBoostRanker_top5(df_p, model_117)

Model Accuracy: 50.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 3.3660,Minnesota Timberwolves: 2.9699,Los Angeles Clippers: 2.4662,New York Knicks: 2.0864,Oklahoma City Thunder: 2.0434
8,2022,Denver Nuggets,Milwaukee Bucks: 3.9889,Philadelphia 76ers: 3.7533,Cleveland Cavaliers: 3.7420,Memphis Grizzlies: 3.3630,Denver Nuggets: 2.9214
7,2021,Golden State Warriors,Phoenix Suns: 3.8897,Golden State Warriors: 3.4008,Boston Celtics: 2.7717,Milwaukee Bucks: 2.5887,Denver Nuggets: 1.5495
6,2020,Milwaukee Bucks,Brooklyn Nets: 5.2409,Denver Nuggets: 4.2628,Phoenix Suns: 4.0739,Utah Jazz: 3.9781,Los Angeles Clippers: 3.9176
5,2019,Los Angeles Lakers,Los Angeles Lakers: 3.9018,Los Angeles Clippers: 3.7593,Milwaukee Bucks: 3.1386,Toronto Raptors: 2.8542,Boston Celtics: 2.3321
4,2018,Toronto Raptors,Toronto Raptors: 5.4145,Golden State Warriors: 5.2745,Milwaukee Bucks: 4.1338,Denver Nuggets: 2.8703,Utah Jazz: 1.7128
3,2017,Golden State Warriors,Boston Celtics: 4.4633,Toronto Raptors: 4.3199,Golden State Warriors: 3.9153,Houston Rockets: 3.4769,Philadelphia 76ers: 2.8314
2,2016,Golden State Warriors,Golden State Warriors: 3.9498,Houston Rockets: 3.5863,San Antonio Spurs: 3.4033,Boston Celtics: 3.3146,Cleveland Cavaliers: 2.3251
1,2015,Cleveland Cavaliers,Cleveland Cavaliers: 3.8008,Golden State Warriors: 3.6792,San Antonio Spurs: 3.5030,Oklahoma City Thunder: 2.7346,Toronto Raptors: 1.3650
0,2014,Golden State Warriors,San Antonio Spurs: 4.1105,Los Angeles Clippers: 3.1246,Memphis Grizzlies: 2.8268,Golden State Warriors: 1.5728,Chicago Bulls: 1.4918


In [159]:
XGBoostRanker_top5(df_p, model_143)

Model Accuracy: 50.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 3.9746,Minnesota Timberwolves: 3.2269,Los Angeles Clippers: 2.8709,New York Knicks: 2.0176,Oklahoma City Thunder: 2.0156
8,2022,Denver Nuggets,Milwaukee Bucks: 5.1482,Philadelphia 76ers: 4.8648,Cleveland Cavaliers: 4.4226,Memphis Grizzlies: 3.8416,Denver Nuggets: 3.4934
7,2021,Golden State Warriors,Phoenix Suns: 4.9823,Golden State Warriors: 3.7628,Milwaukee Bucks: 3.5504,Boston Celtics: 3.2039,Dallas Mavericks: 2.4469
6,2020,Milwaukee Bucks,Brooklyn Nets: 6.6003,Denver Nuggets: 5.7908,Utah Jazz: 5.0455,Los Angeles Clippers: 4.6864,Phoenix Suns: 4.6715
5,2019,Los Angeles Lakers,Los Angeles Lakers: 4.9195,Los Angeles Clippers: 4.8666,Milwaukee Bucks: 4.0680,Toronto Raptors: 3.6202,Boston Celtics: 2.6149
4,2018,Toronto Raptors,Toronto Raptors: 6.9263,Golden State Warriors: 6.4140,Milwaukee Bucks: 4.9487,Denver Nuggets: 3.8636,Utah Jazz: 1.7616
3,2017,Golden State Warriors,Boston Celtics: 5.7116,Toronto Raptors: 5.0604,Houston Rockets: 4.8879,Golden State Warriors: 4.7533,Philadelphia 76ers: 3.9930
2,2016,Golden State Warriors,Golden State Warriors: 4.8455,San Antonio Spurs: 4.2440,Houston Rockets: 4.1409,Boston Celtics: 3.8742,Cleveland Cavaliers: 2.5949
1,2015,Cleveland Cavaliers,Cleveland Cavaliers: 5.5532,Golden State Warriors: 4.4995,San Antonio Spurs: 4.4535,Oklahoma City Thunder: 3.2411,Toronto Raptors: 2.0286
0,2014,Golden State Warriors,San Antonio Spurs: 5.3783,Los Angeles Clippers: 3.6965,Memphis Grizzlies: 3.1138,Golden State Warriors: 2.0415,Chicago Bulls: 1.9990


In [161]:
XGBoostRanker_top5(df_p, model_92)

Model Accuracy: 50.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 3.2890,Minnesota Timberwolves: 3.0360,Denver Nuggets: 2.5374,Los Angeles Clippers: 1.8172,New York Knicks: 1.6718
8,2022,Denver Nuggets,Milwaukee Bucks: 3.9632,Memphis Grizzlies: 3.5456,Cleveland Cavaliers: 3.4500,Denver Nuggets: 2.8737,Philadelphia 76ers: 2.8253
7,2021,Golden State Warriors,Phoenix Suns: 4.0297,Boston Celtics: 2.7959,Golden State Warriors: 2.7077,Milwaukee Bucks: 2.4415,Dallas Mavericks: 1.7819
6,2020,Milwaukee Bucks,Brooklyn Nets: 5.1562,Denver Nuggets: 4.5837,Utah Jazz: 3.9407,Los Angeles Clippers: 3.7646,Phoenix Suns: 3.5859
5,2019,Los Angeles Lakers,Los Angeles Lakers: 4.1193,Los Angeles Clippers: 3.9658,Milwaukee Bucks: 3.2811,Toronto Raptors: 2.5763,Boston Celtics: 2.0741
4,2018,Toronto Raptors,Toronto Raptors: 5.4956,Golden State Warriors: 5.1426,Milwaukee Bucks: 3.7041,Denver Nuggets: 2.6068,Utah Jazz: 1.8019
3,2017,Golden State Warriors,Boston Celtics: 4.2784,Toronto Raptors: 4.2663,Golden State Warriors: 3.8490,Houston Rockets: 3.5816,Philadelphia 76ers: 3.0889
2,2016,Golden State Warriors,Golden State Warriors: 3.9104,Boston Celtics: 3.5806,San Antonio Spurs: 3.2171,Houston Rockets: 3.1973,Cleveland Cavaliers: 2.3682
1,2015,Cleveland Cavaliers,Cleveland Cavaliers: 4.0243,Golden State Warriors: 3.9529,San Antonio Spurs: 3.3479,Oklahoma City Thunder: 2.5335,Toronto Raptors: 1.0280
0,2014,Golden State Warriors,San Antonio Spurs: 3.8776,Los Angeles Clippers: 3.0372,Memphis Grizzlies: 2.6811,Golden State Warriors: 1.3495,Chicago Bulls: 1.1062


In [163]:
XGBoostRanker_top5(df_p, model_93)

Model Accuracy: 50.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 3.8996,Minnesota Timberwolves: 3.4239,Denver Nuggets: 3.3381,Los Angeles Clippers: 1.9093,New York Knicks: 1.8469
8,2022,Denver Nuggets,Milwaukee Bucks: 4.9039,Cleveland Cavaliers: 4.2949,Memphis Grizzlies: 4.2256,Denver Nuggets: 3.6326,Philadelphia 76ers: 3.6317
7,2021,Golden State Warriors,Phoenix Suns: 5.2523,Golden State Warriors: 3.5392,Boston Celtics: 3.3284,Milwaukee Bucks: 2.9628,Dallas Mavericks: 2.4032
6,2020,Milwaukee Bucks,Brooklyn Nets: 6.5290,Denver Nuggets: 5.9452,Utah Jazz: 4.7678,Los Angeles Clippers: 4.7382,Phoenix Suns: 4.3209
5,2019,Los Angeles Lakers,Los Angeles Lakers: 4.9568,Los Angeles Clippers: 4.8755,Milwaukee Bucks: 3.9565,Toronto Raptors: 3.3871,Boston Celtics: 2.3779
4,2018,Toronto Raptors,Toronto Raptors: 7.0276,Golden State Warriors: 6.3322,Milwaukee Bucks: 4.7960,Denver Nuggets: 3.2975,Utah Jazz: 2.0091
3,2017,Golden State Warriors,Boston Celtics: 5.4954,Toronto Raptors: 5.2525,Houston Rockets: 4.8486,Golden State Warriors: 4.7925,Philadelphia 76ers: 4.0282
2,2016,Golden State Warriors,Golden State Warriors: 4.9287,Boston Celtics: 4.0892,Houston Rockets: 4.0792,San Antonio Spurs: 4.0190,Cleveland Cavaliers: 2.5841
1,2015,Cleveland Cavaliers,Cleveland Cavaliers: 5.1055,Golden State Warriors: 4.8452,San Antonio Spurs: 4.4399,Oklahoma City Thunder: 3.1619,Toronto Raptors: 1.5963
0,2014,Golden State Warriors,San Antonio Spurs: 5.0999,Los Angeles Clippers: 3.8611,Memphis Grizzlies: 3.2172,Golden State Warriors: 1.5669,Chicago Bulls: 1.3489


In [None]:
# model_117 is my pick

In [36]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split

def XGBoostRanker_with_importance_cv(df, model):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    Also calculates feature importance for the model.
    """
    results = []
    feature_importances = pd.DataFrame()  # Initialize dataframe to store feature importance

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_train = train_data['Levels']
        
        X_test = test_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_test = test_data['Levels']
        
        # Create DMatrix for training and testing, which includes the group information
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)

        # Get the group sizes (number of teams per year)
        group_train = train_data.groupby('Year').size().to_numpy()
        
        # Train the model using XGBoost Ranker
        model.fit(X_train, y_train, group=group_train)
        
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]
        
        # Find the team with the highest predicted score for level 4
        predicted_champion = test_data.iloc[y_pred.argmax()]['Team']
        
        # Check if predicted champion matches the actual champion
        if predicted_champion == actual_champion:
            results.append(1)
        else:
            results.append(0)
        
        # Extract feature importance from the trained model
        importance = model.get_booster().get_score(importance_type='weight')  # You can also use 'gain' or 'cover'
        
        # Convert to a DataFrame and append to feature_importances
        importance_df = pd.DataFrame(list(importance.items()), columns=['Feature', 'Importance'])
        feature_importances = pd.concat([feature_importances, importance_df], axis=0)

    # Calculate mean accuracy
    accuracy = sum(results) / len(results)

    # Group by feature and calculate the mean importance across all years
    feature_importances = feature_importances.groupby('Feature').agg('mean').reset_index()

    # Sort by importance (in descending order)
    feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
    print(feature_importances.head(20))
    return accuracy, feature_importances

In [38]:
results = XGBoostRanker_with_importance_cv(df_p, model_117)

         Feature  Importance
97             W  193.100000
36             L   48.125000
32           FT%   34.100000
90           SOS   28.900000
31            FT   27.300000
94          TOV%   27.200000
1       %ASTd_3P   26.600000
93           TOV   25.000000
88            PW   24.900000
21           Age   24.300000
17            3P   24.100000
29           FGA   22.900000
27            FG   22.800000
85            PF   22.400000
12     16-3P_FG%   21.444444
14           2P%   21.000000
70  Opp.FGA_Dist   20.300000
98     corner3P%   19.800000
77        Opp.PF   19.500000
18           3P%   19.000000


In [22]:
print(results)

(0.5,             Feature  Importance
97                W  193.100000
36                L   48.125000
32              FT%   34.100000
90              SOS   28.900000
31               FT   27.300000
..              ...         ...
68          Opp.FG%    4.600000
6           %FGA_3P    4.142857
51  Opp.%FGA_Layups    3.400000
41             ORB%    2.777778
86               PL    2.200000

[100 rows x 2 columns])
