In [6]:
# Bringing in the data
import xgboost as xgb
import pandas as pd

# Read the CSV file into a DataFrame
df_mm = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/mm_data.csv")

df_p = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/p_data.csv")

In [28]:
# defining the cross validation technique for our use case
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split

def XGBoostClassifier_cv(df, model):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    """
    results = []

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_train = train_data['Levels']
        
        X_test = test_data.drop(columns=['Team', 'Levels', 'Champion', 'Year'])
        y_test = test_data['Levels']
        
        # Create DMatrix for training and testing, which includes the group information
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        
        # Train the model using XGBoost Ranker (using the DMatrix directly)
        model.fit(X_train, y_train)
        
        # Make predictions on the test data
        y_pred = model.predict_proba(X_test)[:, 4]
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]
        
        # Find the team with the highest predicted score for level 4
        predicted_champion = test_data.iloc[y_pred.argmax()]['Team']
        
        # Check if predicted champion matches the actual champion
        if predicted_champion == actual_champion:
            results.append(1)
        else:
            results.append(0)
    
    accuracy = sum(results) / len(results)
    #print(f"Accuracy in predicting the champion: {accuracy * 100:.2f}%")
    return accuracy

In [30]:
# defining function for tuning XGBoostRanker

def classifying_tuner(df):
    """
    Function to tune hyperparameters for XGBoost Ranker using cross-validation.
    This function performs a grid search over max_depth, learning_rate, and n_estimators.
    """
    # Create an empty list to store the results
    results_list = []

    # Triple loop over all combinations of max_depth, learning_rate, and n_estimators
    for max_depth in max_depth_values:
        for learning_rate in learning_rate_values:
            for n_estimators in n_estimators_values:

                # Initialize the XGBRanker model with the current combination of hyperparameters
                model = xgb.XGBClassifier(
                    objective='multi:softprob',
                    num_class = 5,
                    max_depth=max_depth,
                    learning_rate=learning_rate,
                    n_estimators=n_estimators
                )
                
                # Run cross-validation and get the accuracy for the current combination
                accuracy = XGBoostClassifier_cv(df, model)
                
                # Store the result with the corresponding parameters
                results_list.append({
                    'max_depth': max_depth,
                    'learning_rate': learning_rate,
                    'n_estimators': n_estimators,
                    'accuracy': accuracy
                })

    # Convert the results to a DataFrame
    results_df = pd.DataFrame(results_list)

    # Sort the results by accuracy in descending order
    results_df = results_df.sort_values(by='accuracy', ascending=False)

    return results_df

In [34]:
#Using all four combinations of methods

# Define the parameter ranges to tune
max_depth_values = [2, 3, 4, 5, 6, 7, 8, 9]         # Depth of the trees
learning_rate_values = [0.01, 0.05, 0.1, .15, 0.2]  # Learning rate
n_estimators_values = [30, 50, 100, 150, 200]    # Number of boosting rounds

results1 = classifying_tuner(df_mm)
results2 = classifying_tuner(df_p)

In [36]:
# Min Max
results1.head(20)

Unnamed: 0,max_depth,learning_rate,n_estimators,accuracy
37,3,0.1,100,0.2
142,7,0.15,100,0.2
45,3,0.2,30,0.2
46,3,0.2,50,0.2
174,8,0.2,200,0.2
173,8,0.2,150,0.2
118,6,0.15,150,0.2
160,8,0.1,30,0.2
20,2,0.2,30,0.2
169,8,0.15,200,0.2


In [38]:
# Percentile
results2.head(20)

Unnamed: 0,max_depth,learning_rate,n_estimators,accuracy
0,2,0.01,30,0.2
25,3,0.01,30,0.2
176,9,0.01,50,0.2
175,9,0.01,30,0.2
151,8,0.01,50,0.2
150,8,0.01,30,0.2
126,7,0.01,50,0.2
125,7,0.01,30,0.2
1,2,0.01,50,0.2
75,5,0.01,30,0.2
