action items:
- get r^2 error
    - consider rescaling
- save edge flows
- do analysis on k
- look into doing an uneven splitting

## 0. Preprocessing Data
- Loading data and dropping unnecessary columns

## 1. Naive Ranking
- ranks based on average point difference

## 2. Regular HR Rank
- Runs on all 30 nodes once

## 3. Initial Grouping Rank
- Runs on 30/3 = 10 nodes at a time

## 4. Improved Grouping Rank
- Runs on 30/3 + 2 = 12 nodes at a time

Data downloaded from https://www.kaggle.com/datasets/nathanlauga/nba-games?select=games.csv, originally from Sports Data LLC (https://www.sports-reference.com/)

In [77]:
import csv
import pandas as pd
import numpy as np
import math
from scipy import stats

## 0. Preprocessing Data

In [78]:
# Load the raw game log and the team lookup table (paths are relative to the working directory).
big_game_df = pd.read_csv("data/nba/games.csv")
team_df = pd.read_csv("data/nba/teams.csv")
# List the available columns before narrowing them down.
print(big_game_df.columns)

Index(['GAME_DATE_EST', 'GAME_ID', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID',
       'VISITOR_TEAM_ID', 'SEASON', 'TEAM_ID_home', 'PTS_home', 'FG_PCT_home',
       'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home', 'TEAM_ID_away',
       'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away',
       'REB_away', 'HOME_TEAM_WINS'],
      dtype='object')


In [79]:
# Keep only the game/season identifiers, the two team IDs and the final scores;
# .copy() gives an independent frame so later edits do not touch big_game_df.
game_df = big_game_df[['GAME_ID','SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away']].copy()

In [80]:
# Restrict the analysis to the 2021 season and discard rows with missing values.
is_2021 = game_df['SEASON'] == 2021
game_df = game_df[is_2021].dropna()
print(game_df.shape)

(1076, 6)


### Teams

In [81]:
# Every team ID that appears as either the home or the visiting side.
teams = set(game_df['HOME_TEAM_ID'].tolist())
teams.update(set(game_df['VISITOR_TEAM_ID'].tolist()))
teams = list(teams)

# Nickname of each team, in the same order as `teams`
# (first matching row of team_df is used).
team_names = [
    str(team_df[team_df['TEAM_ID'] == team].iloc[0]['NICKNAME'])
    for team in teams
]

## 1. Naive Ranking

In [82]:
def naive_rank(game_df = game_df, teams = teams, team_names = team_names):
    """
    Score every team by its average point margin per game.

    The default arguments are bound to the module-level game_df, teams and
    team_names that exist when this function is defined.

    game_df    -- one row per game with HOME_TEAM_ID, VISITOR_TEAM_ID,
                  PTS_home and PTS_away
    teams      -- team IDs to score; every team appearing in game_df must be listed
    team_names -- display names, in the same order as teams

    Returns a DataFrame with columns team, team_ID and r (average margin),
    sorted by r in descending order with a fresh 0..n-1 index.
    """
    total_margin = dict.fromkeys(teams, 0)
    n_games = dict.fromkeys(teams, 0)

    for _, row in game_df.iterrows():
        home_id = row['HOME_TEAM_ID']
        away_id = row['VISITOR_TEAM_ID']
        # positive when the home side won
        home_margin = int(row['PTS_home']) - int(row['PTS_away'])
        total_margin[home_id] += home_margin
        total_margin[away_id] -= home_margin
        n_games[home_id] += 1
        n_games[away_id] += 1

    # dict order follows `teams`, so the averages line up with team_names
    averages = [total_margin[team_id] / n_games[team_id] for team_id in total_margin]

    table = pd.DataFrame({
        'team': team_names,
        'team_ID': teams,
        'r': averages
    })
    return table.sort_values(by =['r'],  ascending = False).reset_index(drop = True)

In [83]:
# Show the naive ranking (team name and average point margin only).
print(naive_rank()[['team', 'r']])

             team         r
0            Suns  8.281690
1        Warriors  6.794521
2            Jazz  6.014085
3       Grizzlies  5.270270
4         Celtics  5.125000
5            Heat  4.546667
6       Mavericks  4.253521
7           Bucks  3.465753
8    Timberwolves  3.178082
9           Bulls  2.718310
10      Cavaliers  2.402778
11        Nuggets  2.315068
12          76ers  1.884058
13        Raptors  1.625000
14          Hawks  0.785714
15          Spurs  0.136986
16           Nets -0.056338
17         Knicks -0.309859
18       Clippers -1.383562
19        Hornets -1.472222
20         Pacers -2.541667
21       Pelicans -2.690141
22         Lakers -3.152778
23        Wizards -3.157143
24          Kings -4.342466
25  Trail Blazers -7.257143
26          Magic -7.388889
27        Thunder -7.857143
28        Pistons -8.830986
29        Rockets -9.239437


## 2. HodgeRank

In [84]:
def get_edges(num_nodes):
    """
    List every undirected edge of the complete graph on num_nodes nodes.

    Each edge is a tuple (i, j) with i < j, ordered first by i and then by j.
    Returns an empty list when num_nodes is 0 or 1.
    """
    return [(i, j) for i in range(num_nodes) for j in range(i + 1, num_nodes)]

In [85]:
# f is a vector representing pairwise differences between the nodes
# w is a diagonal matrix containing the weights of each edge

def get_f_W(game_df, teams):
    """
    Build the edge flow f and the edge weight matrix W from the game results.

    game_df -- one row per game with HOME_TEAM_ID, VISITOR_TEAM_ID,
               PTS_home and PTS_away
    teams   -- list of team IDs; a team's position in this list is its node index

    For the edge (a, b) with a < b (ordering as returned by get_edges):
      * W[i, i] is the number of games played between nodes a and b
      * f[i] is the average of (points of b - points of a) over those games,
        or 0 when the two teams never met

    Returns the tuple (f, W).
    Raises ValueError when a game references a team that is not in `teams`
    or when a team is listed as playing against itself.
    """
    num_nodes = len(teams)
    edges = get_edges(num_nodes)

    # Position lookups built once, instead of list.index scans for every game.
    # setdefault keeps the first occurrence, like list.index would.
    team_pos = {}
    for pos, team_id in enumerate(teams):
        team_pos.setdefault(team_id, pos)
    edge_pos = {edge: pos for pos, edge in enumerate(edges)}

    f = np.zeros((len(edges)))
    W = np.zeros((len(edges), len(edges)))
    for _, game in game_df.iterrows():
        try:
            home_index = team_pos[game['HOME_TEAM_ID']]
            opp_index = team_pos[game['VISITOR_TEAM_ID']]
        except KeyError as err:
            raise ValueError("team {!r} is not in teams".format(err.args[0])) from err

        diff = int(game['PTS_away']) - int(game['PTS_home'])
        if home_index < opp_index:
            edge = (home_index, opp_index)
        else:
            # edge is stored as (smaller index, larger index), so flip the sign
            diff = -diff
            edge = (opp_index, home_index)

        if edge not in edge_pos:
            raise ValueError("no edge for pair {!r}; a team cannot play itself".format(edge))
        i = edge_pos[edge]
        W[i, i] += 1
        f[i] += diff

    # Turn summed differences into per-game averages; unplayed edges stay at 0.
    games_on_edge = np.diag(W)
    played = games_on_edge != 0
    f[played] = f[played] / games_on_edge[played]
    return (f, W)

In [86]:
def get_error(f, W, edges, r):
    """
    Sum of squared residuals between the edge flow f and the scores r.

    f     -- one value per edge, indexed like edges
    W     -- accepted for signature compatibility; not used, every edge
             contributes with weight 1
    edges -- sequence of (a, b) node-index pairs
    r     -- one score per node

    For each edge (a, b) the residual is f[i] + (r[a] - r[b]).
    Returns 0 when edges is empty.
    """
    total = 0
    for i, edge in enumerate(edges):
        residual = f[i] + (r[edge[0]] - r[edge[1]])
        total += 1*residual**2
    return total

In [87]:
def get_neg_divergence(edges, num_nodes):
    """
    Build the edge-by-node incidence matrix used by the ranking.

    Row i describes edges[i] = (a, b): column a holds -1, column b holds +1
    and every other column is 0. The result has shape (len(edges), num_nodes).
    """
    incidence = np.zeros((len(edges), num_nodes))
    node_ids = np.arange(num_nodes)
    for row, edge in enumerate(edges):
        at_tail = node_ids == edge[0]
        # the tail takes precedence if both endpoints name the same node
        at_head = (node_ids == edge[1]) & ~at_tail
        incidence[row, at_tail] = -1
        incidence[row, at_head] = 1
    return incidence

In [88]:
def rank(game_df):
    """
    Rank the teams appearing in game_df from their pairwise score differences.

    The node set is every ID found in HOME_TEAM_ID or VISITOR_TEAM_ID; display
    names are looked up in the module-level team_df. With A the incidence
    matrix from get_neg_divergence and (f, W) from get_f_W, the scores are
    r = pinv(A^T W A) (A^T W f).

    Returns (rank_df, error): rank_df has columns team and r, sorted by r in
    descending order with a fresh index; error is get_error(f, W, edges, r).
    """
    teams = set(game_df['HOME_TEAM_ID'].tolist())
    teams.update(set(game_df['VISITOR_TEAM_ID'].tolist()))
    teams = list(teams)

    team_names = [
        str(team_df[team_df['TEAM_ID'] == team].iloc[0]['NICKNAME'])
        for team in teams
    ]

    node_count = len(teams)
    edges = get_edges(node_count)
    incidence = get_neg_divergence(edges, node_count)
    f, W = get_f_W(game_df, teams)

    # normal equations of the weighted least-squares problem
    incidence_t = np.transpose(incidence)
    right_side = incidence_t @ (W @ f)
    left_side = (incidence_t @ W) @ incidence
    r = np.linalg.pinv(left_side) @ right_side

    rank_df = pd.DataFrame({'team': team_names, 'r': r})
    rank_df = rank_df.sort_values(by =['r'],  ascending = False).reset_index(drop = True)

    return (rank_df, get_error(f, W, edges, r))

    #rank_df.to_csv('data/hodge_ranking.csv')

In [89]:
# Baseline: rank all teams at once using every 2021 game.
(r_regular, error_regular) = rank(game_df)
print(r_regular)
print("error: ", error_regular)

             team         r
0            Suns  7.556319
1        Warriors  6.492398
2            Jazz  5.609594
3       Grizzlies  4.829419
4         Celtics  4.436565
5            Heat  4.335416
6       Mavericks  4.225644
7           Bucks  3.164290
8    Timberwolves  2.894025
9       Cavaliers  2.364359
10          Bulls  2.351655
11        Nuggets  2.102093
12          76ers  1.767214
13        Raptors  1.702416
14          Hawks  1.282467
15           Nets  0.101050
16          Spurs  0.045777
17         Knicks -0.306826
18       Clippers -1.228548
19        Hornets -1.299388
20       Pelicans -2.183597
21         Pacers -2.251863
22         Lakers -3.192266
23        Wizards -3.263490
24          Kings -4.236429
25  Trail Blazers -6.199932
26          Magic -6.842637
27        Thunder -7.306982
28        Pistons -8.199562
29        Rockets -8.749182
error:  37727.69548748602


## 3. Initial grouping method
grouping teams together based on naive rank

In [90]:
num_nodes = len(teams)
k = 3 #TODO: modify this value

def get_group_lengths(k = 3):
    """
    Split the module-level num_nodes into k group sizes that are as even as
    possible.

    Every group gets floor(num_nodes / k) nodes; the nodes left over are
    handed out one at a time starting from the first group, so each group
    has at least that base size.

    Returns a list of k integers that sum to num_nodes.
    """
    base_size = math.floor(num_nodes/k)
    sizes = [base_size for _ in range(k)]
    leftover = num_nodes - base_size * len(sizes)
    for extra in range(leftover):
        sizes[extra % len(sizes)] += 1
    return sizes

group_lengths = get_group_lengths()
#print(group_lengths)

In [91]:
def group_similar_scoring(k = k):
    """
    Partition the teams into k groups of neighbouring naive rank.

    Teams are ordered by their naive_rank score (best first) and cut into
    consecutive chunks whose sizes come from get_group_lengths(k). The
    default k is the module-level k at the time this function is defined.

    Returns a list of k lists of team IDs.
    """
    ordered = naive_rank()[['r', 'team_ID']]
    ordered = ordered.sort_values(by =['r'],  ascending = False).reset_index(drop = True)
    team_ids = list(ordered['team_ID'])

    # walk through the ordered IDs, taking one chunk per group size
    groupings = []
    start = 0
    for size in get_group_lengths(k):
        groupings.append(team_ids[start:start + size])
        start += size
    return groupings

groupings = group_similar_scoring()
print(groupings)

[[1610612756, 1610612744, 1610612762, 1610612763, 1610612738, 1610612748, 1610612742, 1610612749, 1610612750, 1610612741], [1610612739, 1610612743, 1610612755, 1610612761, 1610612737, 1610612759, 1610612751, 1610612752, 1610612746, 1610612766], [1610612754, 1610612740, 1610612747, 1610612764, 1610612758, 1610612757, 1610612753, 1610612760, 1610612765, 1610612745]]


In [92]:
error_groups1 = 0
group_ranks = []
# Rank each group on its own, using only games where both sides belong to it.
for grouping in groupings:
    small_game_df = game_df[game_df['HOME_TEAM_ID'].isin(grouping)]
    small_game_df = small_game_df[small_game_df['VISITOR_TEAM_ID'].isin(grouping)]
    (group_rank, group_error) = rank(small_game_df)
    group_ranks.append(group_rank)
    error_groups1 += group_error

# DataFrame.append was removed in pandas 2.0, so the per-group tables are
# stacked once with pd.concat (groups stay in naive-rank order, best group first).
if group_ranks:
    r_groups = pd.concat(group_ranks, ignore_index = True)
else:
    r_groups = pd.DataFrame(columns = ['team', 'r'])
print(r_groups)
#print("error: ", error_groups1)
    

             team         r
0            Suns  3.633397
1        Warriors  2.932553
2            Jazz  1.863423
3            Heat  1.362069
4       Mavericks  1.341824
5         Celtics  0.977510
6    Timberwolves  0.826573
7           Bucks -2.960224
8       Grizzlies -4.180203
9           Bulls -5.796922
10      Cavaliers  2.689654
11          76ers  2.506727
12        Raptors  2.068132
13           Nets  1.782542
14        Nuggets  1.619619
15        Hornets  1.173192
16         Knicks -0.754015
17          Hawks -2.177948
18       Clippers -2.330151
19          Spurs -6.577751
20       Pelicans  4.890045
21          Kings  3.450702
22         Lakers  3.178861
23  Trail Blazers  0.362617
24         Pacers -0.279938
25        Rockets -0.898655
26        Thunder -1.047710
27        Wizards -2.308111
28          Magic -2.381868
29        Pistons -4.965944


In [93]:
# Kendall's tau between the grouped ranking and the full ranking: first over
# all rows, then over the first 10 rows of each table.
# NOTE(review): the inputs are team-name strings listed in rank order, so the
# statistic compares how the names sort at each position rather than each
# team's position in the two rankings — confirm this is the intended measure.
print(stats.kendalltau(r_groups['team'], r_regular['team']))
print(stats.kendalltau(list(r_groups['team'])[:10], list(r_regular['team'])[:10]))

KendalltauResult(correlation=0.39310344827586213, pvalue=0.001965544378759685)
KendalltauResult(correlation=0.5555555555555555, pvalue=0.02860945767195767)


## 4. Improved grouping method
including the other groups as nodes

In [94]:
# Rebuild the group sizes and groupings for this section.
# group_size is only a local inside get_group_lengths and is not defined at
# module level here, so the number of groups k is passed instead.
group_lengths = get_group_lengths(k)
groupings = group_similar_scoring()

In [95]:
fake_teams = set()
error_groups2 = 0
group_ranks_2 = []

# Rank each group again, this time keeping its games against outside teams:
# every other group is collapsed into one placeholder node, so the edges to
# that node carry all games played against any member of that group.
for grouping in groupings:
    new_grouping = grouping.copy()
    small_game_df = game_df.copy()

    for i, other_grouping in enumerate(groupings):
        if other_grouping == grouping:
            continue
        # games played entirely inside the other group say nothing about this one
        df_to_remove = game_df[game_df['HOME_TEAM_ID'].isin(other_grouping)]
        df_to_remove = df_to_remove[df_to_remove['VISITOR_TEAM_ID'].isin(other_grouping)]
        cond = small_game_df['GAME_ID'].isin(df_to_remove['GAME_ID'])
        small_game_df.drop(small_game_df[cond].index, inplace = True)

        new_team_name = 'OTHER_TEAM' + str(i)
        fake_teams.add(new_team_name)
        new_grouping.append(new_team_name)
        # rank() looks nicknames up in team_df, so the placeholder needs a row there.
        # Add it only once (DataFrame.append was removed in pandas 2.0, hence pd.concat).
        if not team_df['TEAM_ID'].isin([new_team_name]).any():
            placeholder_row = pd.DataFrame([{'TEAM_ID': new_team_name, 'NICKNAME': new_team_name}])
            team_df = pd.concat([team_df, placeholder_row], ignore_index = True)

        # relabel every member of the other group as the placeholder node
        small_game_df.loc[small_game_df['HOME_TEAM_ID'].isin(other_grouping), 'HOME_TEAM_ID'] = new_team_name
        small_game_df.loc[small_game_df['VISITOR_TEAM_ID'].isin(other_grouping), 'VISITOR_TEAM_ID'] = new_team_name
    #print(small_game_df.head())
    (group_r, group_error) = rank(small_game_df)
    group_ranks_2.append(group_r)
    error_groups2 += group_error

if group_ranks_2:
    r_groups_2 = pd.concat(group_ranks_2, ignore_index = True)
else:
    r_groups_2 = pd.DataFrame(columns = ['team', 'r'])
# the placeholder nodes are only scaffolding; drop them from the final table
r_groups_2 = r_groups_2[~r_groups_2['team'].isin(fake_teams)]
r_groups_2 = r_groups_2.reset_index(drop = True)
print(r_groups_2)
#print("error: ", error_groups2)
    

             team         r
0            Suns  4.161130
1        Warriors  3.083983
2            Jazz  2.043965
3       Grizzlies  1.469666
4         Celtics  1.050483
5            Heat  0.865651
6       Mavericks  0.844736
7           Bucks -0.230932
8    Timberwolves -0.570963
9           Bulls -1.189181
10      Cavaliers  1.799867
11        Nuggets  1.664072
12          76ers  1.476747
13        Raptors  1.210336
14          Hawks  0.866823
15           Nets -0.369443
16          Spurs -0.490471
17         Knicks -0.823813
18        Hornets -1.753454
19       Clippers -1.880912
20         Pacers  1.823121
21       Pelicans  1.715692
22         Lakers  0.751654
23        Wizards  0.678636
24          Kings -0.407239
25  Trail Blazers -2.455151
26          Magic -2.718680
27        Thunder -3.434886
28        Pistons -4.196151
29        Rockets -4.882323


In [96]:
#Kendall's Tau between the improved grouped ranking and the full ranking
# NOTE(review): the inputs are team-name strings listed in rank order, so the
# statistic compares how the names sort at each position rather than each
# team's position in the two rankings — confirm this is the intended measure.
print(stats.kendalltau(r_groups_2['team'], r_regular['team']))

#Kendall's Tau of first n teams (first n rows of each table)
n = 10
print(stats.kendalltau(list(r_groups_2['team'])[:n], list(r_regular['team'])[:n]))

KendalltauResult(correlation=0.9586206896551724, pvalue=1.1518295418806186e-24)
KendalltauResult(correlation=0.9999999999999999, pvalue=5.511463844797178e-07)


In [50]:
#print(error_regular, error_groups1, error_groups2)