action items:
- get r^2 error
    - consider rescaling
- save edge flows
- do analysis on k
- look into doing an uneven splitting

## 0. Processing Data
- Loading data and dropping unecessary columns

## 1. Naive Ranking
- ranks based on average point difference

## 2. Regular HR Rank
- Runs on all 30 nodes once

## 3. Initial Grouping Rank
- Runs on 30/3 = 10 nodes at a time

## 4. Improved Grouping Rank
- Runs on 30/3 + 2 = 12 nodes at a time

In [1]:
import csv
import pandas as pd
import numpy as np
import math
from scipy import stats
from hodgerank_tools import *

## 0. Processing Data

In [2]:
big_game_df = pd.read_csv("data/nba/games.csv")
team_df = pd.read_csv("data/nba/teams.csv")
og_game_df = big_game_df[['GAME_ID','SEASON', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away']].copy()
og_game_df = og_game_df[og_game_df['SEASON'] == 2021]
og_game_df.dropna( inplace = True)

teams = set(og_game_df['HOME_TEAM_ID'].tolist())
teams.update(set(og_game_df['VISITOR_TEAM_ID'].tolist()))
teams = list(teams)

team_names = []
for team in teams:
    team_names.append(str(team_df[team_df['TEAM_ID'] == team].iloc[0]['NICKNAME']))

In [3]:
data = np.empty((len(teams), len(teams)))*np.nan
for i, game in og_game_df.iterrows():
    visiting_i = teams.index(game['VISITOR_TEAM_ID'])
    home_i = teams.index(game['HOME_TEAM_ID'])
    data[visiting_i, home_i] = np.nansum([data[visiting_i][home_i], int(game['PTS_home']) - int(game['PTS_away'])])
    data[home_i, visiting_i] = np.nansum([data[home_i][visiting_i],  int(game['PTS_away']) - int(game['PTS_home'])])
game_df = pd.DataFrame(data = data, columns = team_names)
print( og_game_df)
print(game_df.head())

       GAME_ID  SEASON  HOME_TEAM_ID  VISITOR_TEAM_ID  PTS_home  PTS_away
0     22101005    2021    1610612748       1610612750     104.0     113.0
1     22101006    2021    1610612741       1610612739     101.0      91.0
2     22101007    2021    1610612759       1610612754     108.0     119.0
3     22101008    2021    1610612744       1610612749     122.0     109.0
4     22101009    2021    1610612743       1610612761     115.0     127.0
...        ...     ...           ...              ...       ...       ...
1071  12100007    2021    1610612759       1610612762     111.0      85.0
1072  12100008    2021    1610612757       1610612744     107.0     121.0
1073  12100009    2021    1610612758       1610612756     117.0     106.0
1074  12100010    2021    1610612746       1610612743     103.0     102.0
1075  12100001    2021    1610612747       1610612751      97.0     123.0

[1076 rows x 6 columns]
   Hawks  Celtics  Cavaliers  Pelicans  Bulls  Mavericks  Nuggets  Warriors  \
0    NaN

## 1. Naive Ranks

In [4]:
(naive_r0, naive_r0_error) = naive_rank_0(game_df)
print(naive_r0)
print("error: ", naive_r0_error)

             node          r
0            Suns  20.275862
1        Warriors  17.714286
2            Jazz  14.724138
3       Grizzlies  13.448276
4         Celtics  12.724138
5            Heat  11.758621
6       Mavericks  10.413793
7           Bucks   8.724138
8    Timberwolves   8.000000
9           Bulls   6.655172
10      Cavaliers   5.965517
11        Nuggets   5.827586
12          76ers   4.482759
13        Raptors   4.178571
14          Hawks   1.896552
15          Spurs   0.344828
16           Nets  -0.137931
17         Knicks  -0.758621
18       Clippers  -3.482759
19        Hornets  -3.655172
20         Pacers  -6.310345
21       Pelicans  -6.586207
22        Wizards  -7.892857
23         Lakers  -8.107143
24          Kings -10.931034
25  Trail Blazers -17.517241
26          Magic -19.000000
27        Thunder -19.642857
28        Pistons -21.620690
29        Rockets -22.620690
error:  1232.7949859765397


In [5]:
(naive_r, naive_r_error) = naive_rank(game_df)
print(naive_r)
print("error: ", naive_r_error)

             node          r
0            Suns  20.228288
1        Warriors  17.975641
2            Jazz  14.740695
3       Grizzlies  13.388337
4         Celtics  12.725806
5            Heat  11.743176
6       Mavericks  10.441687
7           Bucks   8.606700
8    Timberwolves   7.924318
9           Bulls   6.700993
10      Cavaliers   5.961538
11        Nuggets   5.858561
12        Raptors   4.450000
13          76ers   4.449132
14          Hawks   1.856079
15          Spurs   0.260546
16           Nets  -0.151365
17         Knicks  -0.729529
18       Clippers  -3.540943
19        Hornets  -3.741935
20         Pacers  -6.341191
21       Pelicans  -6.692308
22         Lakers  -8.332051
23        Wizards  -8.566667
24          Kings -10.983871
25  Trail Blazers -17.500000
26          Magic -18.303846
27        Thunder -18.965385
28        Pistons -21.739454
29        Rockets -22.746898
error:  1173.5181000448663


## 2. HodgeRank

In [6]:
(r_regular, error_regular) = rank(game_df)
print(r_regular)
print("error: ", error_regular)

             node          r
0            Suns  19.585811
1        Warriors  17.413335
2            Jazz  14.282454
3       Grizzlies  12.975500
4         Celtics  12.335212
5            Heat  11.385572
6       Mavericks  10.127778
7           Bucks   8.354396
8    Timberwolves   7.694924
9           Bulls   6.512670
10      Cavaliers   5.798042
11        Nuggets   5.698521
12          76ers   4.336411
13        Raptors   4.324488
14          Hawks   1.830416
15          Spurs   0.288449
16           Nets  -0.109632
17         Knicks  -0.668385
18       Clippers  -3.385412
19        Hornets  -3.579656
20         Pacers  -6.091647
21       Pelicans  -6.430975
22         Lakers  -8.014621
23        Wizards  -8.209189
24          Kings -10.578457
25  Trail Blazers -16.875819
26          Magic -17.721033
27        Thunder -18.359647
28        Pistons -20.972942
29        Rockets -21.946563
error:  1036.9539000883763


In [7]:
print(stats.kendalltau(naive_r['node'], r_regular['node']))
print(stats.kendalltau(list(naive_r['node'])[:10], list(r_regular['node'])[:10]))

KendalltauResult(correlation=0.8114942528735632, pvalue=1.2020446998209334e-13)
KendalltauResult(correlation=0.9999999999999999, pvalue=5.511463844797178e-07)


## 3. Grouping method
Runs on 30/3 = 10 nodes at a time

In [8]:
k = 3
(r_groups_1, error_groups_1) = simple_group_rank(game_df, k)
print(r_groups_1)
print("error: ", error_groups_1)

             node         r
0            Suns  7.573689
1        Warriors  4.940741
2            Jazz  2.139997
3       Grizzlies   1.87118
4         Celtics   0.12566
5            Heat -1.132404
6       Mavericks -2.265021
7           Bucks -3.358211
8    Timberwolves -4.616275
9           Bulls -5.279357
10      Cavaliers  4.280101
11        Nuggets  4.047126
12        Raptors  2.851852
13          76ers  2.645692
14          Hawks  0.961104
15          Spurs -0.210939
16           Nets -1.974379
17         Knicks -2.139254
18       Clippers -4.985132
19        Hornets -5.476172
20         Pacers  7.875161
21       Pelicans  6.802697
22        Wizards  5.870468
23         Lakers  5.746408
24          Kings  2.628784
25  Trail Blazers -3.429187
26          Magic  -4.31029
27        Thunder  -4.85103
28        Pistons -7.429187
29        Rockets -8.903824
error:  199.2041572225318


In [9]:
print(stats.kendalltau(r_groups_1['node'], r_regular['node']))
print(stats.kendalltau(list(r_groups_1['node'])[:10], list(r_regular['node'])[:10]))

KendalltauResult(correlation=0.67816091954023, pvalue=7.734459562397896e-09)
KendalltauResult(correlation=0.9999999999999999, pvalue=5.511463844797178e-07)


## 4. Improved grouping method
Runs on 30/3 + 2 = 12 nodes at a time

In [10]:
(r_groups_2, error_groups_2) = group_rank(game_df, k)
print(r_groups_2)
print("error: ", error_groups_2)

             node          r
0            Suns  18.498172
1        Warriors  16.848874
2            Jazz  13.406184
3       Grizzlies  12.676214
4         Celtics  11.542682
5            Heat   10.48927
6       Mavericks   9.394314
7           Bucks   8.050101
8    Timberwolves   7.177697
9           Bulls   6.183632
10      Cavaliers   5.444573
11        Nuggets   5.204217
12        Raptors   4.467142
13          76ers   3.957927
14          Hawks   1.963861
15          Spurs   0.631517
16           Nets  -0.347711
17         Knicks  -0.700827
18       Clippers  -3.329907
19        Hornets  -3.579165
20         Pacers  -5.494859
21       Pelicans  -6.096656
22         Lakers  -7.854916
23        Wizards   -8.45119
24          Kings -10.036776
25          Magic -15.703104
26  Trail Blazers -15.854141
27        Thunder -16.261387
28        Pistons -19.797255
29        Rockets -20.896057
error:  1002.1182894341291


In [11]:
#Kendall's Tau
print(stats.kendalltau(r_groups_2['node'], r_regular['node']))

#Kendall's Tau of first n teams
n = 10
print(stats.kendalltau(list(r_groups_2['node'])[:n], list(r_regular['node'])[:n]))

KendalltauResult(correlation=0.7057471264367816, pvalue=1.1932043519690424e-09)
KendalltauResult(correlation=0.9999999999999999, pvalue=5.511463844797178e-07)
