In [6]:
# Bringing in the data

import pandas as pd

# Read the CSV file into a DataFrame
df_mm = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/mm_data.csv")

df_p = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/p_data.csv")

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

def multi_logistic_accuracy(df, model, selected_features):
    results = []

    for test_year in range(2014, 2024):
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        test_data = df[df['Year'] == test_year]

        X_train = train_data[selected_features]
        y_train = train_data['Levels']
        X_test = test_data[selected_features]
        y_test = test_data['Levels']

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 4]

        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]
        predicted_champion = test_data.iloc[y_pred.argmax()]['Team']

        results.append(actual_champion == predicted_champion)

    accuracy = sum(results) / len(results)
    return accuracy


In [16]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

def boosting_tuner(df, base_model, selected_features):
    """
    Function to tune hyperparameters for XGBoost Ranker using cross-validation.
    This function performs a grid search over max_depth, learning_rate, and n_estimators.
    """
    # Create an empty list to store the results
    results_list = []

    # Loop over all combinations of learning_rate and n_estimators
    for learning_rate in learning_rate_values:
        for n_estimators in n_estimators_values:

            # Initialize the AdaBoostClassifier model with the current combination of hyperparameters
            model = AdaBoostClassifier(base_model, n_estimators=n_estimators, learning_rate=learning_rate)
            
            # Run cross-validation and get the accuracy for the current combination
            accuracy = multi_logistic_accuracy(df, model, selected_features)
            
            # Store the result with the corresponding parameters
            results_list.append({
                'learning_rate': learning_rate,
                'n_estimators': n_estimators,
                'accuracy': accuracy
            })

    # Convert the results to a DataFrame
    results_df = pd.DataFrame(results_list)

    # Sort the results by accuracy in descending order
    results_df = results_df.sort_values(by='accuracy', ascending=False)

    return results_df

In [18]:
selected_features = ['W', 'eFG%']

model1 = LogisticRegression(solver='lbfgs', max_iter=1000)

model2 = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000)

In [24]:
import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

learning_rate_values = [0.01, 0.05, 0.1, .15, 0.2]
n_estimators_values = [30, 50, 100, 150, 200]

results1 = boosting_tuner(df_mm, model1, selected_features)
results2 = boosting_tuner(df_mm, model2, selected_features)
results3 = boosting_tuner(df_p, model1, selected_features)
results4 = boosting_tuner(df_p, model2, selected_features)

In [26]:
results1.head(20)

Unnamed: 0,learning_rate,n_estimators,accuracy
24,0.2,200,0.5
1,0.01,50,0.5
23,0.2,150,0.5
22,0.2,100,0.5
4,0.01,200,0.5
6,0.05,50,0.5
19,0.15,200,0.5
18,0.15,150,0.5
20,0.2,30,0.4
17,0.15,100,0.4


In [28]:
results2.head(20)

Unnamed: 0,learning_rate,n_estimators,accuracy
24,0.2,200,0.5
23,0.2,150,0.5
22,0.2,100,0.5
21,0.2,50,0.5
1,0.01,50,0.4
20,0.2,30,0.4
19,0.15,200,0.4
18,0.15,150,0.4
16,0.15,50,0.4
0,0.01,30,0.4


In [30]:
results3.head(20)

Unnamed: 0,learning_rate,n_estimators,accuracy
12,0.1,100,0.6
9,0.05,200,0.5
18,0.15,150,0.5
14,0.1,200,0.5
13,0.1,150,0.5
8,0.05,150,0.5
22,0.2,100,0.4
19,0.15,200,0.4
17,0.15,100,0.4
0,0.01,30,0.4


In [32]:
results4.head(20)

Unnamed: 0,learning_rate,n_estimators,accuracy
6,0.05,50,0.6
12,0.1,100,0.5
19,0.15,200,0.5
18,0.15,150,0.5
17,0.15,100,0.5
8,0.05,150,0.5
14,0.1,200,0.5
13,0.1,150,0.5
1,0.01,50,0.4
23,0.2,150,0.4


In [34]:
# defining the cross validation technique for our use case

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

def multi_logistic_top5(df, model, selected_features):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    """
    results = []

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data[selected_features]
        y_train = train_data['Levels']
        
        X_test = test_data[selected_features]
        y_test = test_data['Levels']

        # If more than 1 feature is selected, calculate VIF
        if len(selected_features) > 1:
            vif_data = pd.DataFrame()
            vif_data["Feature"] = X_train.columns
            vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
            print(vif_data)
        
        # Train the model using Logistic Regression
        model.fit(X_train, y_train)
        
        # Make predictions on the test data
        y_pred = model.predict_proba(X_test)

        # Rank teams based on predicted scores
        ranked_teams = test_data[['Team']].copy()
        ranked_teams['Score'] = y_pred[:, 4]
        ranked_teams = ranked_teams.sort_values(by='Score', ascending=False)
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]

        # Get top 5 teams and their scores
        top_teams = ranked_teams.head(5)
        top_teams_list = top_teams.apply(lambda row: f"{row['Team']}: {row['Score']:.4f}", axis=1).tolist()
        
        # Store results
        results.append([test_year, actual_champion] + top_teams_list)
    
    # Create dataframe with results
    columns = ['Year', 'Actual Champion', 'First Pick', 'Second Pick', 'Third Pick', 'Fourth Pick', 'Fifth Pick']
    results_df = pd.DataFrame(results, columns=columns)
    results_df = results_df.sort_values(by="Year", ascending=False)

    # Compute accuracy
    accuracy = (results_df['Actual Champion'] == results_df['First Pick'].str.split(': ').str[0]).mean()
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    
    return results_df

In [40]:
selected_features = ['W', 'eFG%']

boosted_model1 = AdaBoostClassifier(model1, n_estimators = 100, learning_rate = 0.1)

In [42]:
selected_features = ['W', 'eFG%']

boosted_model2 = AdaBoostClassifier(model2, n_estimators = 50, learning_rate = 0.05)

In [44]:
multi_logistic_top5(df_p, boosted_model1, selected_features)

  Feature       VIF
0       W  6.048703
1    eFG%  6.048703
  Feature       VIF
0       W  6.736676
1    eFG%  6.736676
  Feature       VIF
0       W  6.500348
1    eFG%  6.500348
  Feature       VIF
0       W  6.474259
1    eFG%  6.474259
  Feature       VIF
0       W  6.925942
1    eFG%  6.925942
  Feature       VIF
0       W  7.140512
1    eFG%  7.140512
  Feature       VIF
0       W  6.862114
1    eFG%  6.862114
  Feature       VIF
0       W  7.314591
1    eFG%  7.314591
  Feature       VIF
0       W  7.321538
1    eFG%  7.321538
  Feature       VIF
0       W  6.613574
1    eFG%  6.613574
Model Accuracy: 70.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 0.1936,Oklahoma City Thunder: 0.1926,Indiana Pacers: 0.1897,Denver Nuggets: 0.1896,Milwaukee Bucks: 0.1888
8,2022,Denver Nuggets,Denver Nuggets: 0.1955,Sacramento Kings: 0.1949,Philadelphia 76ers: 0.1940,Boston Celtics: 0.1940,Golden State Warriors: 0.1930
7,2021,Golden State Warriors,Golden State Warriors: 0.1962,Phoenix Suns: 0.1962,Utah Jazz: 0.1941,Denver Nuggets: 0.1940,Miami Heat: 0.1939
6,2020,Milwaukee Bucks,Milwaukee Bucks: 0.1950,Los Angeles Clippers: 0.1950,Brooklyn Nets: 0.1950,Utah Jazz: 0.1950,Phoenix Suns: 0.1950
5,2019,Los Angeles Lakers,Milwaukee Bucks: 0.1959,Utah Jazz: 0.1933,Los Angeles Lakers: 0.1930,Miami Heat: 0.1922,Toronto Raptors: 0.1922
4,2018,Toronto Raptors,Golden State Warriors: 0.1942,Milwaukee Bucks: 0.1929,Toronto Raptors: 0.1922,Houston Rockets: 0.1904,Utah Jazz: 0.1894
3,2017,Golden State Warriors,Houston Rockets: 0.1927,Golden State Warriors: 0.1919,Cleveland Cavaliers: 0.1905,Philadelphia 76ers: 0.1893,Toronto Raptors: 0.1893
2,2016,Golden State Warriors,Golden State Warriors: 0.1907,Houston Rockets: 0.1890,Cleveland Cavaliers: 0.1890,Los Angeles Clippers: 0.1890,San Antonio Spurs: 0.1877
1,2015,Cleveland Cavaliers,Cleveland Cavaliers: 0.1888,San Antonio Spurs: 0.1888,Toronto Raptors: 0.1888,Oklahoma City Thunder: 0.1888,Golden State Warriors: 0.1888
0,2014,Golden State Warriors,Golden State Warriors: 0.1917,Atlanta Hawks: 0.1912,Los Angeles Clippers: 0.1906,Houston Rockets: 0.1877,San Antonio Spurs: 0.1877


In [46]:
import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

multi_logistic_top5(df_p, boosted_model2, selected_features)

  Feature       VIF
0       W  6.048703
1    eFG%  6.048703
  Feature       VIF
0       W  6.736676
1    eFG%  6.736676
  Feature       VIF
0       W  6.500348
1    eFG%  6.500348
  Feature       VIF
0       W  6.474259
1    eFG%  6.474259
  Feature       VIF
0       W  6.925942
1    eFG%  6.925942
  Feature       VIF
0       W  7.140512
1    eFG%  7.140512
  Feature       VIF
0       W  6.862114
1    eFG%  6.862114
  Feature       VIF
0       W  7.314591
1    eFG%  7.314591
  Feature       VIF
0       W  7.321538
1    eFG%  7.321538
  Feature       VIF
0       W  6.613574
1    eFG%  6.613574
Model Accuracy: 60.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 0.1877,Oklahoma City Thunder: 0.1877,Denver Nuggets: 0.1877,Minnesota Timberwolves: 0.1877,Milwaukee Bucks: 0.1876
8,2022,Denver Nuggets,Denver Nuggets: 0.1877,Sacramento Kings: 0.1877,Boston Celtics: 0.1877,Philadelphia 76ers: 0.1877,Milwaukee Bucks: 0.1877
7,2021,Golden State Warriors,Phoenix Suns: 0.1877,Golden State Warriors: 0.1877,Milwaukee Bucks: 0.1877,Utah Jazz: 0.1877,Miami Heat: 0.1877
6,2020,Milwaukee Bucks,Milwaukee Bucks: 0.1878,Utah Jazz: 0.1878,Phoenix Suns: 0.1878,Brooklyn Nets: 0.1878,Los Angeles Clippers: 0.1878
5,2019,Los Angeles Lakers,Los Angeles Lakers: 0.1878,Utah Jazz: 0.1878,Toronto Raptors: 0.1878,Milwaukee Bucks: 0.1878,Los Angeles Clippers: 0.1877
4,2018,Toronto Raptors,Milwaukee Bucks: 0.1877,Golden State Warriors: 0.1877,Toronto Raptors: 0.1877,Philadelphia 76ers: 0.1877,Houston Rockets: 0.1877
3,2017,Golden State Warriors,Golden State Warriors: 0.1876,Toronto Raptors: 0.1876,Houston Rockets: 0.1876,New Orleans Pelicans: 0.1876,Cleveland Cavaliers: 0.1876
2,2016,Golden State Warriors,Houston Rockets: 0.1876,Golden State Warriors: 0.1876,Cleveland Cavaliers: 0.1875,Los Angeles Clippers: 0.1875,Boston Celtics: 0.1875
1,2015,Cleveland Cavaliers,Cleveland Cavaliers: 0.1876,Golden State Warriors: 0.1876,San Antonio Spurs: 0.1876,Oklahoma City Thunder: 0.1876,Los Angeles Clippers: 0.1876
0,2014,Golden State Warriors,Los Angeles Clippers: 0.1877,Golden State Warriors: 0.1877,Atlanta Hawks: 0.1877,Houston Rockets: 0.1877,San Antonio Spurs: 0.1877
