In [None]:
import kagglehub
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from kagglehub import KaggleDatasetAdapter

In [2]:
def load_data(dataset_path: str, file_path: str) -> pd.DataFrame:
    """Load one file of a Kaggle dataset through kagglehub's pandas adapter.

    Args:
        dataset_path: Kaggle dataset handle in ``"owner/dataset-name"`` form.
        file_path: Name of the file inside that dataset to read.

    Returns:
        The object produced by ``kagglehub.dataset_load`` for the pandas
        adapter (annotated here as a ``pd.DataFrame``).
    """
    adapter = KaggleDatasetAdapter.PANDAS
    frame = kagglehub.dataset_load(adapter, dataset_path, file_path)
    return frame

def get_strongest_correlations(df: pd.DataFrame, target_column: str, n: int = 5) -> pd.Series:
    """Return the ``n`` numeric columns most strongly correlated with a target.

    Correlations are computed with ``DataFrame.corr(numeric_only=True)``, so
    non-numeric columns are ignored. The sign is discarded: columns are ranked
    by absolute correlation, strongest first.

    Args:
        df: Frame containing the target and the candidate feature columns.
        target_column: Name of the numeric column to correlate against.
        n: Maximum number of columns to return (values below 0 are treated as 0).

    Returns:
        A Series of absolute correlations indexed by column name, sorted in
        descending order. The target itself is never included, and columns
        whose correlation is undefined (NaN) are left out, so fewer than ``n``
        entries may be returned.

    Raises:
        KeyError: If ``target_column`` is not one of the numeric columns of ``df``.
    """
    correlations = df.corr(numeric_only=True)[target_column]

    strongest = (
        correlations
        # Remove the target by label rather than by position: after sorting, a
        # column that ties with the target at |r| == 1.0 could otherwise take
        # the first slot and the target itself would be returned as a feature.
        .drop(labels=target_column)
        .abs()
        # Constant columns have an undefined (NaN) correlation; they carry no
        # signal and must not be selected as features.
        .dropna()
        # A stable sort keeps the original column order among exact ties.
        .sort_values(ascending=False, kind="mergesort")
    )
    return strongest.head(max(n, 0))

In [None]:
# Load the raw tables; rows containing any missing value are discarded on load
mvp_dataset = "parthdande/nba-mvp-voting-dataset-2000-2021"
training_df_1 = load_data(mvp_dataset, "2001-2010 MVP Data.csv").dropna()
training_df_2 = load_data(mvp_dataset, "2010-2021 MVP Data.csv").dropna()
test_data = load_data(mvp_dataset, "2022-2023 MVP Data.csv").dropna()
team_data = load_data("sumitrodatta/nba-aba-baa-stats", "Team Summaries.csv").dropna()

# Rename the team table's key columns so they match the merge keys used below
team_data = team_data.rename(columns={"abbreviation": "Tm", "season": "year"})
merge_keys = ["Tm", "year"]

# Training set: stack both MVP files, attach team stats, then drop the listed columns
dropped_columns = ["Share", "Unnamed: 0", "Pts Max", "First", "year"]
training_df = (
    pd.concat([training_df_1, training_df_2], ignore_index=True)
    .merge(team_data, on=merge_keys, how="inner")
    .drop(columns=dropped_columns)
)

# Test set: keep only the 2023 rows, then attach team stats the same way
is_2023 = test_data["year"] == 2023
test_data = test_data.loc[is_2023].merge(team_data, on=merge_keys, how="inner")

In [4]:
# Pick the ten training columns returned by the correlation helper for 'Pts Won'
features = get_strongest_correlations(training_df, target_column="Pts Won", n=10)
print(
    "### Selected Features based on Correlation with 'Pts Won' ###",
    features,
    sep="\n",
)

### Selected Features based on Correlation with 'Pts Won' ###
WS       0.608777
WS/48    0.581054
PTS      0.372497
l        0.345723
w        0.334799
n_rtg    0.306670
mov      0.306197
srs      0.301769
pl       0.294550
pw       0.285400
Name: Pts Won, dtype: float64


In [6]:
# Build the feature matrices and targets from the selected columns
feature_columns = features.index.tolist()
X_train, y_train = training_df[feature_columns], training_df["Pts Won"]
X_test, y_test = test_data[feature_columns], test_data["Pts Won"]

# Standardize: the scaler is fitted on the training rows only and then
# applied unchanged to both the training and the test rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge regression with a fixed regularization strength
best_alpha = 5
linear_model = Ridge(alpha=best_alpha).fit(X_train_scaled, y_train)
y_pred = linear_model.predict(X_test_scaled)

In [7]:
# Attach true values, predictions and their difference to a copy of the test
# rows, ordered from the highest predicted points to the lowest
results_df = (
    test_data
    .assign(
        True_Pts_Won=y_test.values,
        Predicted_Pts_Won=y_pred,
        Error=lambda frame: frame["True_Pts_Won"] - frame["Predicted_Pts_Won"],
    )
    .sort_values(by="Predicted_Pts_Won", ascending=False)
)

# Show the ten highest predictions as a markdown table
report_columns = ["Player", "Tm", "True_Pts_Won", "Predicted_Pts_Won", "Error"]
top_ten = results_df[report_columns].head(10)
print("### 2023 NBA MVP Model Predictions vs. True Values (Top 10 Predictions) ###")
print(top_ten.to_markdown(index=False))

# Mean squared error over every test row (not just the ten shown above)
mse = mean_squared_error(y_test, y_pred)
print(f"\nMean Squared Error (MSE): {mse:.4f}")

### Model Predictions vs. True Values (Top 10 Predictions) ###
| Player                  | Tm   |   True_Pts_Won |   Predicted_Pts_Won |      Error |
|:------------------------|:-----|---------------:|--------------------:|-----------:|
| Nikola Jokić            | DEN  |            674 |            476.34   |  197.66    |
| Joel Embiid             | PHI  |            915 |            454.768  |  460.232   |
| Giannis Antetokounmpo   | MIL  |            606 |            315.291  |  290.709   |
| Jayson Tatum            | BOS  |            280 |            314.931  |  -34.9306  |
| Jimmy Butler            | MIA  |              3 |            197.388  | -194.388   |
| Shai Gilgeous-Alexander | OKC  |             46 |            170.942  | -124.942   |
| Domantas Sabonis        | SAC  |             27 |            169.889  | -142.889   |
| Donovan Mitchell        | CLE  |             30 |            111.211  |  -81.2115  |
| Luka Dončić             | DAL  |             10 |            103.

In [None]:
# Build a 2022 test set from the same source file, mirroring the 2023 steps:
# drop incomplete rows, keep one season, then attach team stats
raw_2022_2023 = load_data(
    "parthdande/nba-mvp-voting-dataset-2000-2021",
    "2022-2023 MVP Data.csv",
).dropna()
is_2022 = raw_2022_2023["year"] == 2022
test_data_2022 = raw_2022_2023.loc[is_2022].merge(team_data, on=["Tm", "year"], how="inner")

# Same selected feature columns and target as the training data
feature_columns_2022 = features.index.tolist()
X_test_2022 = test_data_2022[feature_columns_2022]
y_test_2022 = test_data_2022["Pts Won"]

# Reuse the scaler and model fitted earlier; nothing is refitted here
X_test_2022_scaled = scaler.transform(X_test_2022)
y_pred_2022 = linear_model.predict(X_test_2022_scaled)

# Attach true values, predictions and their difference, highest prediction first
results_df_2022 = (
    test_data_2022
    .assign(
        True_Pts_Won=y_test_2022.values,
        Predicted_Pts_Won=y_pred_2022,
        Error=lambda frame: frame["True_Pts_Won"] - frame["Predicted_Pts_Won"],
    )
    .sort_values(by="Predicted_Pts_Won", ascending=False)
)

report_columns_2022 = ["Player", "Tm", "True_Pts_Won", "Predicted_Pts_Won", "Error"]
top_ten_2022 = results_df_2022[report_columns_2022].head(10)
print("### 2022 NBA MVP Model Predictions vs. True Values (Top 10 Predictions) ###")
print(top_ten_2022.to_markdown(index=False))

# Mean squared error over every 2022 row
mse_2022 = mean_squared_error(y_test_2022, y_pred_2022)
print(f"Mean Squared Error (MSE): {mse_2022:.4f}")

### 2022 NBA MVP Model Predictions vs. True Values (Top 10 Predictions) ###
| Player                | Tm   |   True_Pts_Won |   Predicted_Pts_Won |     Error |
|:----------------------|:-----|---------------:|--------------------:|----------:|
| Nikola Jokić          | DEN  |            875 |            448.366  |  426.634  |
| Giannis Antetokounmpo | MIL  |            595 |            406.844  |  188.156  |
| Joel Embiid           | PHI  |            706 |            378.395  |  327.605  |
| Devin Booker          | PHO  |            216 |            237.304  |  -21.3045 |
| Chris Paul            | PHO  |              2 |            155.903  | -153.903  |
| DeMar DeRozan         | CHI  |              1 |            109.954  | -108.954  |
| Luka Dončić           | DAL  |            146 |            105.935  |   40.0648 |
| Jayson Tatum          | BOS  |             43 |             86.3572 |  -43.3572 |
| Ja Morant             | MEM  |             10 |             79.3539 |  -69.3539 |
