action items:
- get r^2 error
    - consider rescaling
- save edge flows
- do analysis on k
- look into doing an uneven splitting

## 0. Processing Data
- Loading data and dropping unecessary columns

## 1. Naive Ranking
- ranks based on average point difference

## 2. Regular HR Rank
- Runs on all 30 nodes once

## 3. Initial Grouping Rank
- Runs on 30/3 = 10 nodes at a time

## 4. Improved Grouping Rank
- Runs on 30/3 + 2 = 12 nodes at a time

In [1]:
import csv
import pandas as pd
import numpy as np
import math
from scipy import stats
from hodgerank_tools import *

## 0. Processing Data

In [2]:
big_game_df = pd.read_csv("data/nba/games.csv")
team_df = pd.read_csv("data/nba/teams.csv")
og_game_df = big_game_df[['GAME_ID','SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away']].copy()
og_game_df = og_game_df[og_game_df['SEASON'] == 2021]
og_game_df.dropna( inplace = True)

teams = set(og_game_df['HOME_TEAM_ID'].tolist())
teams.update(set(og_game_df['VISITOR_TEAM_ID'].tolist()))
teams = list(teams)

team_names = []
for team in teams:
    team_names.append(str(team_df[team_df['TEAM_ID'] == team].iloc[0]['NICKNAME']))

In [3]:
data = []
for i, game in og_game_df.iterrows():
    to_add = [np.nan]*len(teams)
    to_add[teams.index(game['VISITOR_TEAM_ID'])] = int(game['PTS_away'])
    to_add[teams.index(game['HOME_TEAM_ID'])] =  int(game['PTS_home'])
    data.append(to_add)

game_df = pd.DataFrame(data = data, columns = team_names)
print(game_df.head())

   Hawks  Celtics  Cavaliers  Pelicans  Bulls  Mavericks  Nuggets  Warriors  \
0    NaN      NaN        NaN       NaN    NaN        NaN      NaN       NaN   
1    NaN      NaN       91.0       NaN  101.0        NaN      NaN       NaN   
2    NaN      NaN        NaN       NaN    NaN        NaN      NaN       NaN   
3    NaN      NaN        NaN       NaN    NaN        NaN      NaN     122.0   
4    NaN      NaN        NaN       NaN    NaN        NaN    115.0       NaN   

   Rockets  Clippers  ...  Trail Blazers  Kings  Spurs  Thunder  Raptors  \
0      NaN       NaN  ...            NaN    NaN    NaN      NaN      NaN   
1      NaN       NaN  ...            NaN    NaN    NaN      NaN      NaN   
2      NaN       NaN  ...            NaN    NaN  108.0      NaN      NaN   
3      NaN       NaN  ...            NaN    NaN    NaN      NaN      NaN   
4      NaN       NaN  ...            NaN    NaN    NaN      NaN    127.0   

   Jazz  Grizzlies  Wizards  Pistons  Hornets  
0   NaN        NaN  

## 1. Naive Ranks

In [4]:
(naive_r0, naive_r0_error) = naive_rank_0(game_df)
print(naive_r0)
print("error: ", naive_r0_error)

          element           r
0    Timberwolves  114.602740
1           Bucks  114.150685
2            Suns  113.788732
3       Grizzlies  113.648649
4            Jazz  113.605634
5         Hornets  113.402778
6           Bulls  112.605634
7           Spurs  112.287671
8           Hawks  112.000000
9        Warriors  111.821918
10           Nets  111.464789
11        Nuggets  110.917808
12         Lakers  110.444444
13          Kings  110.397260
14         Pacers  110.055556
15           Heat  109.533333
16        Celtics  109.375000
17        Raptors  109.138889
18        Rockets  108.704225
19          76ers  108.695652
20        Wizards  108.371429
21      Mavericks  107.436620
22       Pelicans  107.394366
23       Clippers  106.753425
24         Knicks  106.507042
25  Trail Blazers  106.342857
26      Cavaliers  106.277778
27        Pistons  103.661972
28          Magic  103.625000
29        Thunder  102.500000
error:  48872.85631469036


In [5]:
(naive_r, naive_r_error) = naive_rank(game_df)
print(naive_r)
print("error: ", naive_r_error)

          element         r
0            Suns  8.281690
1        Warriors  6.794521
2            Jazz  6.014085
3       Grizzlies  5.270270
4         Celtics  5.125000
5            Heat  4.546667
6       Mavericks  4.253521
7           Bucks  3.465753
8    Timberwolves  3.178082
9           Bulls  2.718310
10      Cavaliers  2.402778
11        Nuggets  2.315068
12          76ers  1.884058
13        Raptors  1.625000
14          Hawks  0.785714
15          Spurs  0.136986
16           Nets -0.056338
17         Knicks -0.309859
18       Clippers -1.383562
19        Hornets -1.472222
20         Pacers -2.541667
21       Pelicans -2.690141
22         Lakers -3.152778
23        Wizards -3.157143
24          Kings -4.342466
25  Trail Blazers -7.257143
26          Magic -7.388889
27        Thunder -7.857143
28        Pistons -8.830986
29        Rockets -9.239437
error:  37837.91328543654


## 2. HodgeRank

In [6]:
(r_regular, error_regular) = rank(game_df)
print(r_regular)
print("error: ", error_regular)

          element         r
0            Suns  7.556319
1        Warriors  6.492398
2            Jazz  5.609594
3       Grizzlies  4.829419
4         Celtics  4.436565
5            Heat  4.335416
6       Mavericks  4.225644
7           Bucks  3.164290
8    Timberwolves  2.894025
9       Cavaliers  2.364359
10          Bulls  2.351655
11        Nuggets  2.102093
12          76ers  1.767214
13        Raptors  1.702416
14          Hawks  1.282467
15           Nets  0.101050
16          Spurs  0.045777
17         Knicks -0.306826
18       Clippers -1.228548
19        Hornets -1.299388
20       Pelicans -2.183597
21         Pacers -2.251863
22         Lakers -3.192266
23        Wizards -3.263490
24          Kings -4.236429
25  Trail Blazers -6.199932
26          Magic -6.842637
27        Thunder -7.306982
28        Pistons -8.199562
29        Rockets -8.749182
error:  37727.69548748602


## 3. Grouping method
Runs on 30/3 = 10 nodes at a time

In [7]:
k = 3
(r_groups_1, error_groups_1) = simple_group_rank(game_df, k)
print(r_groups_1)
print("error: ", error_groups_1)

          element         r
0            Suns  3.633397
1        Warriors  2.932553
2            Jazz  1.863423
3            Heat  1.362069
4       Mavericks  1.341824
5         Celtics   0.97751
6    Timberwolves  0.826573
7           Bucks -2.960224
8       Grizzlies -4.180203
9           Bulls -5.796922
10      Cavaliers  2.689654
11          76ers  2.506727
12        Raptors  2.068132
13           Nets  1.782542
14        Nuggets  1.619619
15        Hornets  1.173192
16         Knicks -0.754015
17          Hawks -2.177948
18       Clippers -2.330151
19          Spurs -6.577751
20       Pelicans  4.890045
21          Kings  3.450702
22         Lakers  3.178861
23  Trail Blazers  0.362617
24         Pacers -0.279938
25        Rockets -0.898655
26        Thunder  -1.04771
27        Wizards -2.308111
28          Magic -2.381868
29        Pistons -4.965944
error:  7929.2563512115785


In [8]:
print(stats.kendalltau(r_groups_1['element'], r_regular['element']))
print(stats.kendalltau(list(r_groups_1['element'])[:10], list(r_regular['element'])[:10]))

KendalltauResult(correlation=0.39310344827586213, pvalue=0.001965544378759685)
KendalltauResult(correlation=0.5555555555555555, pvalue=0.02860945767195767)


## 4. Improved grouping method
Runs on 30/3 + 2 = 12 nodes at a time

In [9]:
(r_groups_2, error_groups_2) = group_rank(game_df, k)
print(r_groups_2)
print("error: ", error_groups_2)

          element         r
0            Suns   4.16113
1        Warriors  3.083983
2            Jazz  2.043965
3       Grizzlies  1.469666
4         Celtics  1.050483
5            Heat  0.865651
6       Mavericks  0.844736
7           Bucks -0.230932
8    Timberwolves -0.570963
9           Bulls -1.189181
10      Cavaliers  1.799867
11        Nuggets  1.664072
12          76ers  1.476747
13        Raptors  1.210336
14          Hawks  0.866823
15           Nets -0.369443
16          Spurs -0.490471
17         Knicks -0.823813
18        Hornets -1.753454
19       Clippers -1.880912
20         Pacers  1.823121
21       Pelicans  1.715692
22         Lakers  0.751654
23        Wizards  0.678636
24          Kings -0.407239
25  Trail Blazers -2.455151
26          Magic  -2.71868
27        Thunder -3.434886
28        Pistons -4.196151
29        Rockets -4.882323
error:  9912.05272296774


In [10]:
#Kendall's Tau
print(stats.kendalltau(r_groups_2['element'], r_regular['element']))

#Kendall's Tau of first n teams
n = 10
print(stats.kendalltau(list(r_groups_2['element'])[:n], list(r_regular['element'])[:n]))

KendalltauResult(correlation=0.9586206896551724, pvalue=1.1518295418806186e-24)
KendalltauResult(correlation=0.9999999999999999, pvalue=5.511463844797178e-07)
