# BSS-AUS: Basketball Statistic System (AUS)

This system allows you to **incrementally** build a database of _stint_ (advanced) statistics for each game and each team.

A **stint** is a lineup of players who play together during one or more intervals of a game. The system builds the stints for each team from the play-by-play data and computes various statistics for them.

The data comes as a raw JSON file:

https://fibalivestats.dcd.shared.geniussports.com/data/2087737/data.json

In [1]:
# Let's first load all required packages...
import os
import pandas as pd
import numpy as np

from config import *
import bball_stats
import tools


## 1. Compute stat tables for various games


## 1. Define games to scrape and saved data

First, set up the games we want to scrape and compute, as well as the files holding the existing data to append to.

In [2]:
# games to be computed
# format: (game id, game no for team 1, game no for team 2)
games = [(1999318, 1, 1), (2087737, 2, 2), (2090351, 3, 3)]
# NOTE: this second assignment replaces the list above, so only game 1976463
# is processed; comment out whichever of the two lines is not wanted.
games = [(1976463, 0, 0)]
# games = [1976446,1976447,1976448,1976452,1976454,2004608,2004609,1976449,1976451,1976453,1976455,1976458,1976456,1976457,1976459,1976460,1976461,1976462,1976463,1976464,2004610,1976465,1976468,1976469,1976474,1976473,1976482,2036215,2031329,2031330,2031332,2031333,2031334,2031335,2031336,2031337,2031338,2031340,2031341,2046695,2046696,2046697,2031342,2031343,2031344,2031345,2031346,2031347,2046698,2046700,2046701,2046702,2046703,2046704,2046706,2046707,2046709,2046710,2046711,2046712,2046713,2051763,2053811,2053812,2053813,2053814,2053815,2053816,2053817,2053818,2053819,2053820,2053821,2053822,2053823,2053824,2053825,2056454,2056455,2056457,2056458,2056461,2056462,2056460,2056463,2056464,2056466,2056467,2056469,2056471,2056472,2056473,2065653,2065654,2065655,2065656,2065657,2065658,2065659,2069165,2069166,2069167,2069168,2069169,2069170,2069171,2069172,2069175,2069177,2069179,2069181,2069183,2069184,2069186,2069187,2069191,2069192,2069194,2069196,2069199,2069202,2069203,2069204,2069190,2069193,2069195,2069197,2069198,2069200,2069201,2069205,2069173,2069174,2069176,2069178,2069180,2069182,2069180,2069188,2069189]
# games = [(n, 0, 0) for n in games]

# When True, the tables already saved on disk are ignored and every game in
# `games` is computed again; when False, the saved tables are loaded and only
# the games they do not contain yet are added.
reload = True

# Pickle files holding the previously computed stats and games tables.
file_stats_df, file_games_df = (
    os.path.join(DATA_DIR, pickle_name)
    for pickle_name in ("stats_df.pkl", "games_df.pkl")
)

## 2. Compute stat and game tables

Now, let us run the system that scrapes the games' data, computes stats and game info, and adds them to the initial tables of stats and games.

In [3]:
# Build the stint-stats table (`stats_df`) and the game-info table
# (`games_df`) for every game listed in `games`.
#
# Unless `reload` is set, the tables previously saved on disk are loaded first
# and any game already present in them is skipped, so only new games are
# scraped and computed.
init_stats_df = None
game_df = None
if os.path.exists(file_stats_df) and not reload:
    # load the stat dataframe already stored as a file
    print(f"Loading initial stats df: {file_stats_df}")
    init_stats_df = pd.read_pickle(file_stats_df)
    game_df = pd.read_pickle(file_games_df)
    existing_games = init_stats_df.game_id.unique()
else:
    existing_games = []

stats_dfs = [init_stats_df] if init_stats_df is not None else []
games_data = []
for game in games:
    game_id, game_no1, game_no2 = game
    if game_id in existing_games:
        print(f"Game {game_id} is already in table")
        continue
    print(f"Computing game {game_id}...")

    # now compute the actual stats for the game
    result = bball_stats.build_game_stints_stats_df(game_id)
    df = result['stint_stats_df']
    team1 = result['team1']     # indexed below as (name, score)
    team2 = result['team2']
    try:
        game_info = tools.get_game_info(game_id)    # extract date of game from HTML
    except Exception as err:
        # Best effort: a missing venue/date must not abort the whole run, but
        # say why it is missing instead of hiding the failure. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        print(f"\t .... could not get game info for game {game_id}: {err!r}")
        game_info = {"venue": np.nan, "date": np.nan}
    print(f"\t .... done: {team1[0]} ({team1[1]}) vs {team2[0]} ({team2[1]}) on {game_info['date']}")

    # fill game info
    df.insert(0, 'game_id', game_id)
    df.insert(3, 'game_no1', game_no1)
    df.insert(4, 'game_no2', game_no2)
    stats_dfs.append(df)

    # build game dataframe table
    games_data.append({
        "game_id": game_id,
        "date": game_info['date'],
        "team1": team1[0], "team2": team2[0],
        "s1": team1[1], "s2": team2[1],
        # each team keeps its own game number (same values as in stats_df)
        "game_no1": game_no1, "game_no2": game_no2,
        "winner": 1 if team1[1] > team2[1] else 2,
        "venue": game_info["venue"],
    })

# put all dfs together into a single dataframe with a fresh 0..n-1 index
stats_df = pd.concat(stats_dfs, ignore_index=True)

# append the newly computed games to the previously saved games (if any)
new_games_df = pd.DataFrame(games_data)
if game_df is not None:
    games_df = pd.concat([game_df, new_games_df], ignore_index=True)
else:
    games_df = new_games_df

Computing game 1976463...
	 .... done: Melbourne United (83) vs New Zealand Breakers (60) on 2021-12-19 00:00:00


If we want we can do some sanity checks, before saving to disk:

In [4]:
# Display the games table (one row per computed game) for a visual check.
games_df

Unnamed: 0,game_id,date,team1,team2,s1,s2,game_no1,game_no2,winner,venue
0,1976463,2021-12-19,Melbourne United,New Zealand Breakers,83,60,0,0,1,John Cain Arena


In [5]:
# Display the combined stint statistics table for a visual check.
stats_df
# stats_df.loc[4]

Unnamed: 0,game_id,tno,team,game_no1,game_no2,stint,poss,ortg,drtg,nrtg,...,tov_bh_opp,tov_bp_opp,tov_ofoul_opp,tov_3sec_opp,tov_8sec_opp,tov_24sec_opp,opp_fga_blocked_opp,lineup,intervals,mins
0,1976463,1,Melbourne United,0,0,1,18.88,111.23,99.01,12.22,...,2.0,2.0,0.0,1.0,0.0,0.0,0.0,"(M. Dellavedova, D. Barlow, C. Agada, J. Lual-...","[(1, 00:10:00, 00:03:44), (2, 00:03:52, 00:00:...",10.133333
1,1976463,1,Melbourne United,0,0,2,2.0,100.0,0.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(A. Hukporti, B. Newley, M. Peatling, C. Agada...","[(1, 00:03:44, 00:02:48)]",0.933333
2,1976463,1,Melbourne United,0,0,3,5.88,85.03,58.14,26.89,...,2.0,0.0,0.0,0.0,1.0,0.0,20.0,"(A. Hukporti, J. White, M. Peatling, B. Newley...","[(1, 00:02:48, 00:00:00), (2, 00:10:00, 00:10:...",2.8
3,1976463,1,Melbourne United,0,0,4,3.0,66.67,100.0,-33.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(J. White, D. Barlow, J. Lual-Acuil, S. Ili, C...","[(2, 00:10:00, 00:07:19)]",2.683333
4,1976463,1,Melbourne United,0,0,5,1.0,200.0,0.0,200.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(M. Dellavedova, M. Peatling, J. Lual-Acuil, J...","[(2, 00:07:19, 00:06:45)]",0.566667
5,1976463,1,Melbourne United,0,0,6,1.0,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,"(A. Hukporti, M. Dellavedova, M. Peatling, J. ...","[(2, 00:06:45, 00:06:29)]",0.266667
6,1976463,1,Melbourne United,0,0,7,6.88,29.07,42.86,-13.79,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,"(A. Hukporti, M. Dellavedova, M. Peatling, C. ...","[(2, 00:06:29, 00:03:52)]",2.616667
7,1976463,1,Melbourne United,0,0,8,10.0,160.0,33.78,126.22,...,0.0,1.0,0.0,0.0,0.0,0.0,8.33,"(A. Hukporti, M. Dellavedova, C. Agada, J. Whi...","[(3, 00:10:00, 00:06:59), (4, 00:05:39, 00:03:...",5.4
8,1976463,1,Melbourne United,0,0,9,6.88,174.42,87.5,86.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(M. Dellavedova, J. Lual-Acuil, C. Agada, J. W...","[(3, 00:06:59, 00:04:03), (4, 00:03:16, 00:02:...",4.183333
9,1976463,1,Melbourne United,0,0,10,3.0,100.0,100.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,"(B. Newley, M. Peatling, J. Lual-Acuil, S. Ili...","[(3, 00:04:03, 00:02:09)]",1.9


In [5]:
# Sanity check on a single row: (ortg, drtg) should mirror
# (drtg_opp, ortg_opp), i.e. ortg == drtg_opp and drtg == ortg_opp.
stats_df.iloc[41][['poss', 'ortg', 'drtg', "poss_opp", "ortg_opp", "drtg_opp"]]

poss         10.88
ortg         73.53
drtg        101.21
poss_opp      9.88
ortg_opp    101.21
drtg_opp     73.53
Name: 41, dtype: object

## 3. Save stats and games to files

We now save the full dataframes (stats and games) in various formats: binary (pickle), csv, and Excel.

This will allow us to reload that data later and add more games to it more quickly.

In [8]:

import os

# Binary pickles first: these are the files that pd.read_pickle loads back
# when more games are appended in a later run.
for table, pickle_name in ((stats_df, "stats_df.pkl"), (games_df, "games_df.pkl")):
    table.to_pickle(os.path.join(DATA_DIR, pickle_name))

# Plain-text copies of both tables.
for table, csv_name in ((stats_df, "stats_df.csv"), (games_df, "games_df.csv")):
    table.to_csv(os.path.join(DATA_DIR, csv_name))

# One Excel workbook holding both tables, each on its own sheet ...
excel_workbook = os.path.join(DATA_DIR, 'stats_df.xlsx')
with pd.ExcelWriter(excel_workbook) as writer:
    for table, sheet in ((stats_df, 'STATS'), (games_df, 'GAMES')):
        table.to_excel(writer, sheet_name=sheet)
# ... plus a separate workbook with the games table only.
games_df.to_excel(os.path.join(DATA_DIR, "games_df.xlsx"))


## 4. Inspection & analysis

We use [dtale](https://pypi.org/project/dtale/) package for this.

In [7]:
# dtale is only needed for this inspection step, so it is imported here
# rather than with the other packages at the top.
import dtale

# Open the full stats table in dtale for interactive inspection.
dtale.show(stats_df)
# dtale.show(stats_df[['tno', 'stint', 'poss', 'ortg', 'drtg', "poss_opp", "ortg_opp", "drtg_opp"]])



## Web page analysis

In [8]:
# Try the game-info lookup on a single game id; the result is a dict with
# the game's 'venue' and 'date'.
tools.get_game_info(1976446)
# import datetime
# datetime.datetime.strptime("3/2/89", "%d/%m/%y")

{'venue': 'MyState Bank Arena', 'date': datetime.datetime(2021, 12, 3, 0, 0)}