Data downloaded from https://www.kaggle.com/datasets/nathanlauga/nba-games?select=games.csv, originally from Sports Data LLC (https://www.sports-reference.com/)

In [54]:
import csv
import pandas as pd
import numpy as np
import math
from scipy import stats

## Processing Data

In [2]:
big_game_df = pd.read_csv("data/nba/games.csv")
team_df = pd.read_csv("data/nba/teams.csv")

In [3]:
game_df = big_game_df[['SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away']].copy()

In [4]:
# only for 2021 season
game_df = game_df[game_df['SEASON'] == 2021]
game_df.dropna( inplace = True)
print(game_df.shape)

(1076, 5)


## Naive Ranking

In [7]:
# each team scored based on the point difference of every game they've played
scores = {team_ID: 0 for team_ID in teams}
games_played = {team_ID: 0 for team_ID in teams}
for index, game in game_df.iterrows():
    diff = int(game['PTS_away']) - int(game['PTS_home'])
    scores[game['HOME_TEAM_ID']] += -diff
    scores[game['VISITOR_TEAM_ID']] += diff
    games_played[game['HOME_TEAM_ID']] += 1
    games_played[game['VISITOR_TEAM_ID']] += 1

In [8]:
naive_r = []
for team, score in scores.items():
    naive_r.append(score/ games_played[team])

In [9]:
naive_rank_df = pd.DataFrame({
    'team': team_names,
    'r': naive_r
})
naive_rank_df = naive_rank_df.sort_values(by =['r'],  ascending = False)
naive_rank_df = naive_rank_df.reset_index(drop = True)
print(naive_rank_df)

             team         r
0            Suns  8.281690
1        Warriors  6.794521
2            Jazz  6.014085
3       Grizzlies  5.270270
4         Celtics  5.125000
5            Heat  4.546667
6       Mavericks  4.253521
7           Bucks  3.465753
8    Timberwolves  3.178082
9           Bulls  2.718310
10      Cavaliers  2.402778
11        Nuggets  2.315068
12          76ers  1.884058
13        Raptors  1.625000
14          Hawks  0.785714
15          Spurs  0.136986
16           Nets -0.056338
17         Knicks -0.309859
18       Clippers -1.383562
19        Hornets -1.472222
20         Pacers -2.541667
21       Pelicans -2.690141
22         Lakers -3.152778
23        Wizards -3.157143
24          Kings -4.342466
25  Trail Blazers -7.257143
26          Magic -7.388889
27        Thunder -7.857143
28        Pistons -8.830986
29        Rockets -9.239437


## Building the graph, matrices, and vectors

In [30]:
def rank(game_df, teams):
    teams = set(game_df['HOME_TEAM_ID'].tolist())
    teams.update(set(game_df['VISITOR_TEAM_ID'].tolist()))
    teams = list(teams)

    team_names = []
    for team in teams:
        team_names.append(str(team_df[team_df['TEAM_ID'] == team].iloc[0]['NICKNAME']))
    
    #init edges, triangles, adj_matrix, curl, neg_divergence, f, and W
    edges = []
    num_nodes = len(teams)
    for i in range(num_nodes):
            for j in range(i + 1, num_nodes):
                edges.append((i,j))
    neg_divergence = np.zeros((len(edges), num_nodes))

    f = np.zeros((len(edges)))
    W = np.zeros((len(edges), len(edges)))
    
    # f, w
    # f is a vector representing pairwise differences between the nodes
    # w is a diagonal matrix containing the weights of each edge

    for index, game in game_df.iterrows():
        home_index = teams.index(game['HOME_TEAM_ID'])
        opp_index = teams.index(game['VISITOR_TEAM_ID'])
        diff = int(game['PTS_away']) - int(game['PTS_home'])
        if home_index < opp_index:
            edge = (home_index, opp_index)
        else:
            diff = -diff
            edge = (opp_index, home_index)
        i = edges.index(edge)
        W[i, i] += 1
        f[i] += diff
    for i in range(len(edges)):
        if W[i, i] != 0:
            f[i] = f[i]*1/W[i,i]
        else:
            f[i] = 0
            
    # neg_divergence
    for i in range(len(edges)):
        for j in range(num_nodes):
            if edges[i][0] == j:
                neg_divergence[i,j] = -1
            elif edges[i][1] == j:
                neg_divergence[i,j] = 1
    
    right_side = np.matmul(np.transpose(neg_divergence), np.matmul(W, f))
    left_side = np.matmul(np.matmul(np.transpose(neg_divergence), W), neg_divergence)
    r = np.matmul(np.linalg.pinv(left_side), right_side)
    
    rank_df = pd.DataFrame({
    'team': team_names,
    'r': r
    })

    rank_df = rank_df.sort_values(by =['r'],  ascending = False)
    rank_df = rank_df.reset_index(drop = True)
    return(rank_df)

    #rank_df.to_csv('data/hodge_ranking.csv')

In [52]:
r_regular = rank(game_df, teams)
print(r_regular)

             team         r
0            Suns  7.556319
1        Warriors  6.492398
2            Jazz  5.609594
3       Grizzlies  4.829419
4         Celtics  4.436565
5            Heat  4.335416
6       Mavericks  4.225644
7           Bucks  3.164290
8    Timberwolves  2.894025
9       Cavaliers  2.364359
10          Bulls  2.351655
11        Nuggets  2.102093
12          76ers  1.767214
13        Raptors  1.702416
14          Hawks  1.282467
15           Nets  0.101050
16          Spurs  0.045777
17         Knicks -0.306826
18       Clippers -1.228548
19        Hornets -1.299388
20       Pelicans -2.183597
21         Pacers -2.251863
22         Lakers -3.192266
23        Wizards -3.263490
24          Kings -4.236429
25  Trail Blazers -6.199932
26          Magic -6.842637
27        Thunder -7.306982
28        Pistons -8.199562
29        Rockets -8.749182


## Grouping method: grouping teams with lots of W's and teams with lots of L's together

In [62]:
"""
a (very convulated) way to distribute all nodes into 
groupings such that each group has at least 
group_size number of nodes
"""
num_nodes = len(teams)
group_size = 10
group_lengths = [group_size]*(math.floor(num_nodes/group_size))
for n in range(num_nodes - group_size * len(group_lengths)):
    group_lengths[n % len(group_lengths)] += 1
print(group_lengths)

[10, 10, 10]


In [69]:
# get each teams total number of wins and total number of games played
better_than = {team_ID: 0 for team_ID in teams}
games_played = {team_ID: 0 for team_ID in teams}
for index, game in game_df.iterrows():
    diff = int(game['PTS_away']) - int(game['PTS_home'])
    if diff > 0:
        better_than[game['VISITOR_TEAM_ID']] += 1
    else:
        better_than[game['HOME_TEAM_ID']] += 1
    games_played[game['HOME_TEAM_ID']] += 1
    games_played[game['VISITOR_TEAM_ID']] += 1

# get overall score: (number of wins)/(total games)
for team, score in better_than.items():
    better_than[team] = score/games_played[team]

# sort by score
scores_df = pd.DataFrame(list(better_than.items()), columns = ['TEAM_ID', 'SCORE'])
scores_df = scores_df.sort_values(by =['SCORE'],  ascending = False)
scores_df = scores_df.reset_index(drop = True)
ranked_teams = list(scores_df['TEAM_ID'])
#print(scores_df)

# fill list with groups of teams, sorted by score
groupings = []
for group_length in group_lengths:
    groupings.append(ranked_teams[0:group_length])
    ranked_teams = ranked_teams[group_length:]

In [66]:
r_groups = pd.DataFrame(columns = ['team', 'r'])

# create ranking for each grouping
for grouping in groupings:
    small_game_df = game_df[game_df['HOME_TEAM_ID'].isin(grouping)]
    small_game_df = small_game_df[small_game_df['VISITOR_TEAM_ID'].isin(grouping)]
    r_groups = r_groups.append(rank(small_game_df, grouping))

r_groups = r_groups.reset_index(drop = True)
print(r_groups)
    

             team         r
0        Warriors  4.317160
1            Suns  3.970623
2            Jazz  3.110800
3            Heat  2.349909
4       Mavericks  2.134221
5         Celtics  1.917020
6       Grizzlies -1.788253
7           Bucks -2.190572
8           76ers -6.022508
9           Bulls -7.798399
10      Cavaliers  2.738796
11        Hornets  1.021284
12        Nuggets  1.021019
13        Raptors  0.967144
14           Nets -0.353357
15       Clippers -0.399147
16          Hawks -0.568799
17         Knicks -0.910685
18        Wizards -1.344098
19   Timberwolves -2.172159
20          Spurs  8.230980
21       Pelicans  2.781119
22          Kings  1.681835
23         Pacers  1.610577
24         Lakers  1.537968
25        Thunder -2.001000
26  Trail Blazers -2.627769
27          Magic -2.651659
28        Rockets -3.934368
29        Pistons -4.627683


In [68]:
print(stats.kendalltau(r_groups['team'], r_regular['team']))

KendalltauResult(correlation=0.21839080459770116, pvalue=0.09369487956659167)
