In [1]:
# Bringing in the data

import pandas as pd

# Read the CSV file into a DataFrame
df_mm = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/mm_data.csv")

df_p = pd.read_csv("C:/Users/hagen/Downloads/NBA DATA/Teams/Transformed Data/p_data.csv")

In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

def multi_logistic_accuracy(df, model, selected_feature_tuple):
    """
    Custom cross-validation for predicting NBA champion using Logistic Regression.
    
    Parameters:
    - df: DataFrame containing the data.
    - model: Logistic Regression model.
    - selected_feature_tuple: Tuple of selected features.
    
    Returns:
    - Accuracy DataFrame sorted by accuracy.
    """
    selected_features = list(selected_feature_tuple)  # Convert tuple to list
    results = []

    for test_year in range(2014, 2024):
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        test_data = df[df['Year'] == test_year]

        X_train, y_train = train_data[selected_features], train_data['Levels']
        X_test, y_test = test_data[selected_features], test_data['Levels']

        # Print VIF if multiple features
        #if len(selected_features) > 1:
            #vif_data = pd.DataFrame({
                #"Feature": selected_features,
                #"VIF": [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
            #})
            #print(f"VIF for {selected_features}:\n{vif_data}\n")

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        # Rank teams based on predicted scores
        ranked_teams = test_data[['Team']].copy()
        ranked_teams['Score'] = y_pred[:, 4]
        ranked_teams = ranked_teams.sort_values(by='Score', ascending=False)
        predicted_champion = ranked_teams.iloc[0]['Team']
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]

        # Convert boolean comparison to integer (1 if correct, 0 if incorrect)
        accuracy = int(predicted_champion == actual_champion)
        results.append(accuracy)  # Store for each year

    # Compute final accuracy as a percentage
    overall_accuracy = sum(results) / len(results)

    accuracies = pd.DataFrame({'Selected Features': [selected_feature_tuple], 'Accuracy': [overall_accuracy]})
    
    # Sort by Accuracy in descending order (higher accuracy first)
    accuracies_sorted = accuracies.sort_values(by='Accuracy', ascending=False)

    return accuracies_sorted

In [4]:
# defining the cross validation technique for our use case

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

def multi_logistic_top5(df, model, selected_features):
    """
    Custom cross-validation for predicting NBA champion using XGBoostRanker,
    with group information to rank the teams by their predicted level (score).
    """
    results = []

    # Iterate over the years you want to predict (2014 to 2023)
    for test_year in range(2014, 2024):
        
        # Training data: the previous 5 years of data (e.g., 2009-2013 for 2014 prediction)
        train_data = df[df['Year'].isin(range(test_year - 5, test_year))]
        
        # Test data: current year
        test_data = df[df['Year'] == test_year]
        
        # Prepare features and target for training and testing
        X_train = train_data[selected_features]
        y_train = train_data['Levels']
        
        X_test = test_data[selected_features]
        y_test = test_data['Levels']

        # If more than 1 feature is selected, calculate VIF
        if len(selected_features) > 1:
            vif_data = pd.DataFrame()
            vif_data["Feature"] = X_train.columns
            vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
            print(vif_data)
        
        # Train the model using Logistic Regression
        model.fit(X_train, y_train)
        
        # Make predictions on the test data
        y_pred = model.predict_proba(X_test)

        # Rank teams based on predicted scores
        ranked_teams = test_data[['Team']].copy()
        ranked_teams['Score'] = y_pred[:, 4]
        ranked_teams = ranked_teams.sort_values(by='Score', ascending=False)
        
        # Get the actual champion team (level 4) from the test data
        actual_champion = test_data[test_data['Levels'] == 4]['Team'].values[0]

        # Get top 5 teams and their scores
        top_teams = ranked_teams.head(5)
        top_teams_list = top_teams.apply(lambda row: f"{row['Team']}: {row['Score']:.4f}", axis=1).tolist()
        
        # Store results
        results.append([test_year, actual_champion] + top_teams_list)
    
    # Create dataframe with results
    columns = ['Year', 'Actual Champion', 'First Pick', 'Second Pick', 'Third Pick', 'Fourth Pick', 'Fifth Pick']
    results_df = pd.DataFrame(results, columns=columns)
    results_df = results_df.sort_values(by="Year", ascending=False)

    # Compute accuracy
    accuracy = (results_df['Actual Champion'] == results_df['First Pick'].str.split(': ').str[0]).mean()
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    
    return results_df

In [7]:
# min max feature importance

from itertools import combinations
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

champion_mm = df_mm[df_mm['Champion'] == 1]
champion_mm = champion_mm.drop(columns=['L', 'PL', 'Champion', 'Levels','Year', 'Team'])

col_means = champion_mm.mean()
sorted_col_means = col_means.sort_values(ascending=False)
print(sorted_col_means.head(10))

col_meds = champion_mm.median()
sorted_col_meds = col_meds.sort_values(ascending=False)
print(sorted_col_meds.head(10))

col_mins = champion_mm.min()
sorted_col_mins = col_mins.sort_values(ascending=False)
print(sorted_col_mins.head(10))

important_mm = ['W', 'PW', 'SRS', 'NRtg', 'MOV', 'eFG%', 'FG%', 'TS%']

print(champion_mm[important_mm])

pairs1_mm = list(combinations(important_mm, 1))
pairs2_mm = list(combinations(important_mm, 2))
pairs3_mm = list(combinations(important_mm, 3))
pairs4_mm = list(combinations(important_mm, 4))
pairs5_mm = list(combinations(important_mm, 5))

for pair in pairs2_mm:
    vif_data = pd.DataFrame()
    df_subset = df_mm[list(pair)]  # Select only the columns in the pair

    vif_data["Feature"] = df_subset.columns
    vif_data["VIF"] = [variance_inflation_factor(df_subset.values, i) for i in range(df_subset.shape[1])]

    print(f"VIF for features: {pair}")
    print(vif_data)
    print("-" * 50)  # Separator for better readability

W            0.907581
PW           0.891975
SRS          0.887981
NRtg         0.883819
MOV          0.882850
eFG%         0.860737
FG%          0.836710
2P%          0.828092
TS%          0.827180
0-3ft_FG%    0.825777
dtype: float64
eFG%         0.935484
W            0.906977
PW           0.880952
MOV          0.859944
NRtg         0.859459
FG%          0.859375
SRS          0.857731
TS%          0.857143
0-3ft_FG%    0.853448
2P%          0.842857
dtype: float64
PW      0.780488
NRtg    0.774194
MOV     0.771687
SRS     0.757143
W       0.746032
ORtg    0.557823
2P%     0.550725
Age     0.507463
FG      0.466667
FG%     0.444444
dtype: float64
            W        PW       SRS      NRtg       MOV      eFG%       FG%  \
11   0.918367  0.840909  0.855377  0.833333  0.837917  0.431818  0.444444   
40   0.888889  0.813953  0.849744  0.803371  0.803886  0.966667  0.803571   
66   0.906977  0.909091  0.920056  0.888889  0.900407  0.741573  0.859375   
94   1.000000  0.957447  0.885033  0.

In [9]:
# percentile feature importance

from itertools import combinations
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

champion_p = df_p[df_p['Champion'] == 1]
champion_p = champion_p.drop(columns=['L', 'PL', 'Champion', 'Levels', 'Year', 'Team'])

col_means = champion_p.mean()
sorted_col_means = col_means.sort_values(ascending=False)
print(sorted_col_means.head(10))

col_meds = champion_p.median()
sorted_col_meds = col_meds.sort_values(ascending=False)
print(sorted_col_meds.head(10))

col_mins = champion_p.min()
sorted_col_mins = col_mins.sort_values(ascending=False)
print(sorted_col_mins.head(10))

important_p = ['W', 'PW', 'SRS', 'NRtg', 'MOV', 'eFG%', 'FG%', 'TS%', '2P%', 'FG']

print(champion_p[important_p])

pairs1_p = list(combinations(important_p, 1))
pairs2_p = list(combinations(important_p, 2))
pairs3_p = list(combinations(important_p, 3))
pairs4_p = list(combinations(important_p, 4))
pairs5_p = list(combinations(important_p, 5))

for pair in pairs2_p:
    vif_data = pd.DataFrame()
    df_subset = df_p[list(pair)]  # Select only the columns in the pair

    vif_data["Feature"] = df_subset.columns
    vif_data["VIF"] = [variance_inflation_factor(df_subset.values, i) for i in range(df_subset.shape[1])]

    print(f"VIF for features: {pair}")
    print(vif_data)
    print("-" * 50)  # Separator for better readability


W       0.937931
eFG%    0.920690
SRS     0.919540
2P%     0.913793
MOV     0.912644
PW      0.911494
NRtg    0.910345
TS%     0.893103
FG%     0.877011
Age     0.837931
dtype: float64
eFG%         0.965517
W            0.931034
FG%          0.931034
SRS          0.931034
MOV          0.931034
2P%          0.931034
PW           0.931034
FG           0.896552
0-3ft_FG%    0.896552
TS%          0.896552
dtype: float64
W       0.793103
NRtg    0.758621
SRS     0.758621
MOV     0.758621
PW      0.758621
2P%     0.568966
PTS     0.517241
DRB     0.482759
eFG%    0.482759
Age     0.482759
dtype: float64
            W        PW       SRS      NRtg       MOV      eFG%       FG%  \
11   0.931034  0.827586  0.862069  0.810345  0.827586  0.482759  0.413793   
40   0.879310  0.758621  0.758621  0.758621  0.758621  0.931034  0.879310   
66   0.896552  0.931034  0.896552  0.896552  0.896552  0.844828  0.896552   
94   1.000000  0.965517  0.965517  0.965517  0.965517  1.000000  1.000000   
125  1.000

In [11]:
model1 = LogisticRegression(solver='lbfgs', max_iter=1000, class_weight='balanced')

model2 = LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000, class_weight='balanced')

In [13]:
#min max data on model1 with 3 variables

results1_mm3 =pd.concat([multi_logistic_accuracy(df_mm, model1, pair) for pair in pairs3_mm])
results1_mm3 = results1_mm3.sort_values(by="Accuracy", ascending=False)
results1_mm3.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(PW, NRtg, eFG%)",0.5
0,"(SRS, NRtg, eFG%)",0.5
0,"(W, MOV, eFG%)",0.5
0,"(PW, MOV, eFG%)",0.5
0,"(W, eFG%, TS%)",0.5
0,"(W, NRtg, eFG%)",0.5
0,"(NRtg, MOV, eFG%)",0.5
0,"(W, SRS, eFG%)",0.5
0,"(PW, SRS, eFG%)",0.5
0,"(W, PW, eFG%)",0.5


In [15]:
#min max data on model1 with 2 variables

results1_mm2 =pd.concat([multi_logistic_accuracy(df_mm, model1, pair) for pair in pairs2_mm])
results1_mm2 = results1_mm2.sort_values(by="Accuracy", ascending=False)
results1_mm2.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(MOV, eFG%)",0.5
0,"(SRS, eFG%)",0.5
0,"(W, eFG%)",0.5
0,"(W, FG%)",0.5
0,"(NRtg, eFG%)",0.5
0,"(PW, eFG%)",0.5
0,"(PW, FG%)",0.4
0,"(SRS, FG%)",0.4
0,"(NRtg, FG%)",0.4
0,"(NRtg, TS%)",0.4


In [17]:
#min max data on model1 with 1 variables

results1_mm1 =pd.concat([multi_logistic_accuracy(df_mm, model1, pair) for pair in pairs1_mm])
results1_mm1 = results1_mm1.sort_values(by="Accuracy", ascending=False)
results1_mm1.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(FG%,)",0.5
0,"(eFG%,)",0.4
0,"(TS%,)",0.4
0,"(W,)",0.3
0,"(PW,)",0.3
0,"(SRS,)",0.3
0,"(NRtg,)",0.3
0,"(MOV,)",0.3


In [19]:
#min max data on model2 with 3 variables

import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

results2_mm3 =pd.concat([multi_logistic_accuracy(df_mm, model2, pair) for pair in pairs3_mm])
results2_mm3 = results2_mm3.sort_values(by="Accuracy", ascending=False)
results2_mm3.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(W, MOV, eFG%)",0.5
0,"(W, SRS, FG%)",0.5
0,"(PW, NRtg, eFG%)",0.5
0,"(SRS, NRtg, eFG%)",0.5
0,"(PW, SRS, eFG%)",0.5
0,"(SRS, MOV, eFG%)",0.5
0,"(NRtg, MOV, eFG%)",0.5
0,"(W, NRtg, eFG%)",0.5
0,"(PW, MOV, eFG%)",0.5
0,"(W, SRS, eFG%)",0.5


In [21]:
#min max data on model2 with 2 variables

import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

results2_mm2 =pd.concat([multi_logistic_accuracy(df_mm, model2, pair) for pair in pairs2_mm])
results2_mm2 = results2_mm2.sort_values(by="Accuracy", ascending=False)
results2_mm2.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(MOV, eFG%)",0.5
0,"(SRS, FG%)",0.5
0,"(SRS, eFG%)",0.5
0,"(W, eFG%)",0.5
0,"(W, FG%)",0.5
0,"(NRtg, FG%)",0.5
0,"(NRtg, eFG%)",0.5
0,"(PW, eFG%)",0.5
0,"(PW, FG%)",0.4
0,"(NRtg, TS%)",0.4


In [23]:
#min max data on model2 with 1 variables

import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

results2_mm1 =pd.concat([multi_logistic_accuracy(df_mm, model2, pair) for pair in pairs1_mm])
results2_mm1 = results2_mm1.sort_values(by="Accuracy", ascending=False)
results2_mm1.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(FG%,)",0.5
0,"(eFG%,)",0.4
0,"(TS%,)",0.4
0,"(W,)",0.3
0,"(PW,)",0.3
0,"(SRS,)",0.3
0,"(NRtg,)",0.3
0,"(MOV,)",0.3


In [25]:
#percentile data on model1 with 3 variables

results1_p3 =pd.concat([multi_logistic_accuracy(df_p, model1, pair) for pair in pairs3_p])
results1_p3 = results1_p3.sort_values(by="Accuracy", ascending=False)
results1_p3.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(SRS, 2P%, FG)",0.6
0,"(PW, eFG%, FG)",0.6
0,"(SRS, eFG%, FG)",0.6
0,"(W, eFG%, FG)",0.6
0,"(W, eFG%, TS%)",0.6
0,"(MOV, eFG%, FG)",0.6
0,"(NRtg, eFG%, FG)",0.6
0,"(MOV, 2P%, FG)",0.5
0,"(NRtg, eFG%, TS%)",0.5
0,"(SRS, eFG%, 2P%)",0.5


In [27]:
#percentile data on model1 with 2 variables

results1_p2 =pd.concat([multi_logistic_accuracy(df_p, model1, pair) for pair in pairs2_p])
results1_p2 = results1_p2.sort_values(by="Accuracy", ascending=False)
results1_p2.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(PW, eFG%)",0.5
0,"(FG%, FG)",0.5
0,"(W, eFG%)",0.5
0,"(W, FG%)",0.5
0,"(W, 2P%)",0.5
0,"(eFG%, FG)",0.5
0,"(eFG%, TS%)",0.5
0,"(SRS, eFG%)",0.5
0,"(NRtg, eFG%)",0.5
0,"(MOV, eFG%)",0.5


In [29]:
#percentile data on model1 with 1 variables

results1_p1 =pd.concat([multi_logistic_accuracy(df_p, model1, pair) for pair in pairs1_p])
results1_p1 = results1_p1.sort_values(by="Accuracy", ascending=False)
results1_p1.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(FG%,)",0.5
0,"(eFG%,)",0.4
0,"(TS%,)",0.4
0,"(FG,)",0.4
0,"(W,)",0.3
0,"(PW,)",0.3
0,"(SRS,)",0.3
0,"(NRtg,)",0.3
0,"(MOV,)",0.3
0,"(2P%,)",0.2


In [31]:
# percentile data on model2 with 3 variables

import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

results2_p3 =pd.concat([multi_logistic_accuracy(df_p, model2, pair) for pair in pairs3_p])
results2_p3 = results2_p3.sort_values(by="Accuracy", ascending=False)
results2_p3.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(PW, FG%, FG)",0.5
0,"(NRtg, FG%, FG)",0.5
0,"(PW, eFG%, FG)",0.5
0,"(PW, NRtg, eFG%)",0.5
0,"(SRS, MOV, eFG%)",0.5
0,"(SRS, eFG%, TS%)",0.5
0,"(W, FG%, FG)",0.5
0,"(SRS, eFG%, FG)",0.5
0,"(W, eFG%, FG)",0.5
0,"(SRS, FG%, FG)",0.5


In [33]:
# percentile data on model2 with 2 variables

import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

results2_p2 =pd.concat([multi_logistic_accuracy(df_p, model2, pair) for pair in pairs2_p])
results2_p2 = results2_p2.sort_values(by="Accuracy", ascending=False)
results2_p2.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(MOV, eFG%)",0.6
0,"(NRtg, eFG%)",0.6
0,"(PW, eFG%)",0.6
0,"(SRS, eFG%)",0.6
0,"(W, eFG%)",0.5
0,"(W, FG%)",0.5
0,"(FG%, FG)",0.5
0,"(eFG%, FG)",0.5
0,"(eFG%, TS%)",0.5
0,"(SRS, FG%)",0.5


In [35]:
# percentile data on model2 with 1 variable

import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

results2_p1 =pd.concat([multi_logistic_accuracy(df_p, model2, pair) for pair in pairs1_p])
results2_p1 = results2_p1.sort_values(by="Accuracy", ascending=False)
results2_p1.head(10)

Unnamed: 0,Selected Features,Accuracy
0,"(FG%,)",0.5
0,"(eFG%,)",0.4
0,"(TS%,)",0.4
0,"(FG,)",0.4
0,"(W,)",0.3
0,"(PW,)",0.3
0,"(SRS,)",0.3
0,"(NRtg,)",0.3
0,"(MOV,)",0.3
0,"(2P%,)",0.2


In [37]:
import warnings

# Mute the specific FutureWarning for 'multi_class' deprecation in LogisticRegression
warnings.filterwarnings("ignore", category=FutureWarning, 
                        message=".*'multi_class' was deprecated.*")

selected_features = ['MOV', 'eFG%']

multi_logistic_top5(df_p, model2, selected_features)

  Feature     VIF
0     MOV  6.4571
1    eFG%  6.4571
  Feature       VIF
0     MOV  6.948479
1    eFG%  6.948479
  Feature       VIF
0     MOV  6.981837
1    eFG%  6.981837
  Feature       VIF
0     MOV  6.931575
1    eFG%  6.931575
  Feature       VIF
0     MOV  7.018316
1    eFG%  7.018316
  Feature       VIF
0     MOV  7.372382
1    eFG%  7.372382
  Feature       VIF
0     MOV  7.176522
1    eFG%  7.176522
  Feature       VIF
0     MOV  7.394214
1    eFG%  7.394214
  Feature       VIF
0     MOV  7.482804
1    eFG%  7.482804
  Feature       VIF
0     MOV  6.714873
1    eFG%  6.714873
Model Accuracy: 60.00%


Unnamed: 0,Year,Actual Champion,First Pick,Second Pick,Third Pick,Fourth Pick,Fifth Pick
9,2023,Boston Celtics,Boston Celtics: 0.2629,Indiana Pacers: 0.2602,Oklahoma City Thunder: 0.2549,Milwaukee Bucks: 0.2334,Phoenix Suns: 0.2252
8,2022,Denver Nuggets,Denver Nuggets: 0.2563,Boston Celtics: 0.2486,Sacramento Kings: 0.2483,Philadelphia 76ers: 0.2355,Golden State Warriors: 0.2339
7,2021,Golden State Warriors,Utah Jazz: 0.2534,Phoenix Suns: 0.2481,Golden State Warriors: 0.2451,Denver Nuggets: 0.2392,Miami Heat: 0.2307
6,2020,Milwaukee Bucks,Milwaukee Bucks: 0.2542,Brooklyn Nets: 0.2512,Los Angeles Clippers: 0.2498,Phoenix Suns: 0.2461,Utah Jazz: 0.2455
5,2019,Los Angeles Lakers,Milwaukee Bucks: 0.2577,Utah Jazz: 0.2342,Dallas Mavericks: 0.2327,Miami Heat: 0.2316,Los Angeles Lakers: 0.2301
4,2018,Toronto Raptors,Golden State Warriors: 0.2555,Milwaukee Bucks: 0.2535,Toronto Raptors: 0.2450,Houston Rockets: 0.2332,Utah Jazz: 0.2310
3,2017,Golden State Warriors,Golden State Warriors: 0.2557,Houston Rockets: 0.2540,Toronto Raptors: 0.2375,Philadelphia 76ers: 0.2173,Cleveland Cavaliers: 0.1970
2,2016,Golden State Warriors,Golden State Warriors: 0.2597,Houston Rockets: 0.2497,Cleveland Cavaliers: 0.2465,Los Angeles Clippers: 0.2426,San Antonio Spurs: 0.2128
1,2015,Cleveland Cavaliers,Golden State Warriors: 0.2610,San Antonio Spurs: 0.2569,Oklahoma City Thunder: 0.2462,Cleveland Cavaliers: 0.2449,Los Angeles Clippers: 0.2416
0,2014,Golden State Warriors,Golden State Warriors: 0.2600,Los Angeles Clippers: 0.2574,Atlanta Hawks: 0.2527,San Antonio Spurs: 0.2486,Cleveland Cavaliers: 0.2479
