# Load the NBA Dataset

In [15]:
import kagglehub  # spell-checker: ignore
import pandas as pd
import os

path = kagglehub.dataset_download("szymonjwiak/nba-play-by-play-data-1997-2023")  # spell-checker: ignore

print("Dataset downloaded to:", path)

df = pd.read_csv(os.path.join(path, 'pbp2010.csv'))

print("First 5 records:", df.head())
print("Columns:", df.columns.tolist())

Dataset downloaded to: /Users/rahulkarthikt/.cache/kagglehub/datasets/szymonjwiak/nba-play-by-play-data-1997-2023/versions/1
First 5 records:      gameid  period        clock  h_pts  a_pts team  playerid      player  \
0  20900001       1  PT12M00.00S    0.0    0.0  NaN         0         NaN   
1  20900001       1  PT12M00.00S    NaN    NaN  CLE       406   S. O'Neal   
2  20900001       1  PT11M31.00S    2.0    0.0  CLE      2760  A. Varejao   
3  20900001       1  PT11M12.00S    NaN    NaN  BOS       951    R. Allen   
4  20900001       1  PT11M10.00S    NaN    NaN  CLE       406   S. O'Neal   

          type              subtype  result    x   y  dist  \
0       period                start     NaN    0   0     0   
1    Jump Ball                  NaN     NaN    0   0     0   
2    Made Shot  Step Back Jump shot    Made   36  93    10   
3  Missed Shot            Jump Shot  Missed  214  83    23   
4      Rebound              Unknown     NaN    0   0     0   

                      

# Process Data for Synergy Prediction

In [16]:
import pandas as pd
import os
from collections import defaultdict
import re

path = '/Users/rahulkarthikt/.cache/kagglehub/datasets/szymonjwiak/nba-play-by-play-data-1997-2023/versions/1'  # spell-checker: ignore

years = [2010, 2011, 2012]

dfs = []

for year in years:
    df = pd.read_csv(os.path.join(path, f'pbp{year}.csv'))
    dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)

# Calculate player stats

# Points
df_shots = df_all[df_all['type'] == 'Made Shot']
df_shots = df_shots.copy()
df_shots['pts'] = df_shots['desc'].str.extract(r'\((\d+) PTS\)')[0].astype(float)
player_pts = df_shots.groupby(['playerid', 'season'])['pts'].sum().reset_index(name='pts')  # spell-checker: ignore

# Rebounds
df_rebounds = df_all[df_all['type'] == 'Rebound']
player_reb = df_rebounds.groupby(['playerid', 'season']).size().reset_index(name='reb')  # spell-checker: ignore

# Assists
df_assists = df_all[df_all['subtype'] == 'Assist']
player_ast = df_assists.groupby(['playerid', 'season']).size().reset_index(name='ast')  # spell-checker: ignore

# Merge
player_stats = player_pts.merge(player_reb, on=['playerid', 'season'], how='outer').merge(player_ast, on=['playerid', 'season'], how='outer').fillna(0)

print("Player stats sample:")
print(player_stats.head())

# For duo net rating, simplified calculation
# For demonstration, create synthetic duos and net rating

# Take top players
top_players = player_stats.groupby('playerid')['pts'].sum().nlargest(20).index

duo_list = []

for i in range(len(top_players)):
    for j in range(i+1, len(top_players)):
        duo_list.append((top_players[i], top_players[j]))

duo_df = pd.DataFrame(duo_list, columns=['player_a', 'player_b'])

# Add season, assume 2010
duo_df['season'] = 2010

# Merge stats
player_a_stats = player_stats.rename(columns={'playerid': 'player_a', 'pts': 'pts_a', 'reb': 'reb_a', 'ast': 'ast_a'})  # spell-checker: ignore
player_b_stats = player_stats.rename(columns={'playerid': 'player_b', 'pts': 'pts_b', 'reb': 'reb_b', 'ast': 'ast_b'})  # spell-checker: ignore

duo_df = duo_df.merge(player_a_stats[['player_a', 'season', 'pts_a', 'reb_a', 'ast_a']], on=['player_a', 'season'], how='left', validate='many_to_one')
duo_df = duo_df.merge(player_b_stats[['player_b', 'season', 'pts_b', 'reb_b', 'ast_b']], on=['player_b', 'season'], how='left', validate='many_to_one')

# Combined features
duo_df['combined_pts'] = duo_df['pts_a'] + duo_df['pts_b']
duo_df['combined_reb'] = duo_df['reb_a'] + duo_df['reb_b']
duo_df['combined_ast'] = duo_df['ast_a'] + duo_df['ast_b']

# Synthetic target: net rating as a function of combined stats
import numpy as np
rng = np.random.default_rng(42)
duo_df['net_rating'] = (duo_df['combined_pts'] + duo_df['combined_reb'] + duo_df['combined_ast']) / 10000 - 5 + rng.normal(0, 1, len(duo_df))

print("Duo data sample:")
print(duo_df.head())

X = duo_df[['combined_pts', 'combined_reb', 'combined_ast']]
y = duo_df['net_rating']

Player stats sample:
   playerid  season     pts    reb  ast
0       255    2010  3120.0  538.0  0.0
1       255    2011  3663.0  338.0  0.0
2       255    2012  1480.0  171.0  0.0
3       283    2010    13.0   14.0  0.0
4       406    2010  2386.0  416.0  0.0
Duo data sample:
   player_a  player_b  season    pts_a  reb_a  ast_a    pts_b  reb_b  ast_b  \
0      2544    201142    2010  14385.0  656.0    0.0  13737.0  669.0    0.0   
1      2544       977    2010  14385.0  656.0    0.0  14663.0  529.0    0.0   
2      2544      2548    2010  14385.0  656.0    0.0  11750.0  401.0    0.0   
3      2544      2546    2010  14385.0  656.0    0.0  11935.0  505.0    0.0   
4      2544      1717    2010  14385.0  656.0    0.0  10897.0  669.0    0.0   

   combined_pts  combined_reb  combined_ast  net_rating  
0       28122.0        1325.0           0.0   -1.750583  
1       29048.0        1185.0           0.0   -3.016684  
2       26135.0        1057.0           0.0   -1.530349  
3       26320.0

# Build the Multiple Linear Regression Model

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(X_train, y_train)

print("Model trained.")

Model trained.


# Evaluate the Regression Model

In [18]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"R-squared: {r2}")

MSE: 0.8962178453075399
R-squared: 0.1584398097906865


# Create Synergy Classes

In [19]:
duo_df['synergy_class'] = pd.cut(duo_df['net_rating'], bins=[-float('inf'), -3.5, 3.5, float('inf')], labels=[0, 1, 2])

print(duo_df['synergy_class'].value_counts())

synergy_class
1    126
0     64
2      0
Name: count, dtype: int64


# Train the Logistic Regression Classifier

In [20]:
from sklearn.linear_model import LogisticRegression

y_class = duo_df['synergy_class']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_class, test_size=0.2, random_state=42)

log_model = LogisticRegression()

log_model.fit(X_train_c, y_train_c)

print("Logistic Regression trained.")

Logistic Regression trained.


# Train the Decision Tree Classifier

In [21]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(ccp_alpha=0.0)

dt_model.fit(X_train_c, y_train_c)

print("Decision Tree trained.")

Decision Tree trained.


# Evaluate the Classification Models

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold

log_pred = log_model.predict(X_test_c)
dt_pred = dt_model.predict(X_test_c)

log_acc = accuracy_score(y_test_c, log_pred)
dt_acc = accuracy_score(y_test_c, dt_pred)

print(f"Logistic Regression Accuracy: {log_acc}")
print(f"Decision Tree Accuracy: {dt_acc}")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
log_cv = cross_val_score(log_model, X, y_class, cv=kf)
dt_cv = cross_val_score(dt_model, X, y_class, cv=kf)

print(f"Logistic Regression CV Mean: {log_cv.mean()}")
print(f"Decision Tree CV Mean: {dt_cv.mean()}")

Logistic Regression Accuracy: 0.6578947368421053
Decision Tree Accuracy: 0.5789473684210527
Logistic Regression CV Mean: 0.6736842105263158
Decision Tree CV Mean: 0.5473684210526315


# Compare Models with McNemar's Test

In [23]:
from statsmodels.stats.contingency_tables import mcnemar

# Build contingency table
table = [[0, 0], [0, 0]]
for i in range(len(y_test_c)):
    if y_test_c.iloc[i] == dt_pred[i] and y_test_c.iloc[i] != log_pred[i]:
        table[0][1] += 1
    elif y_test_c.iloc[i] != dt_pred[i] and y_test_c.iloc[i] == log_pred[i]:
        table[1][0] += 1

for i in range(len(y_test_c)):
    if y_test_c.iloc[i] == dt_pred[i] and y_test_c.iloc[i] == log_pred[i]:
        table[0][0] += 1
    elif y_test_c.iloc[i] != dt_pred[i] and y_test_c.iloc[i] != log_pred[i]:
        table[1][1] += 1

print("Contingency Table (McNemar):", table)

# Run McNemar's test
result = mcnemar(table, exact=True)  
stat = getattr(result, 'statistic')
pval = getattr(result, 'pvalue')

print(f"McNemar's test statistic: {stat}")
print(f"p-value: {pval}")

if pval < 0.05:
    print("Significant difference between models (reject H0)")
else:
    print("No significant difference between models (fail to reject H0)")

Contingency Table (McNemar): [[19, 3], [6, 10]]
McNemar's test statistic: 3.0
p-value: 0.5078125
No significant difference between models (fail to reject H0)
