In [1]:
#import numpy as np
import pandas as pd
#import psycopg
#from sqlalchemy import create_engine
#import dotenv
#import os
#import sqlite3

In [2]:
# Load the raw NBA export into the working DataFrame.
# low_memory=False has pandas read the file in a single pass instead of
# chunk by chunk, which avoids mixed-dtype inference within a column.
nba = pd.read_csv(
    'ASA All NBA Raw Data.csv',
    low_memory=False,
)

In [3]:
# Let pandas show up to 81 rows before it truncates a displayed frame.
pd.options.display.max_rows = 81

# Preview the first ten rows of the raw table.
nba.head(n=10)

Unnamed: 0,game_id,game_date,OT,H_A,Team_Abbrev,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_orb_pct,...,pf_per_minute,ts,last_60_minutes_per_game_starting,last_60_minutes_per_game_bench,PG%,SG%,SF%,PF%,C%,active_position_minutes
0,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.061538,9.0,31.716667,22.017778,1.0,36.0,60.0,4.0,0.0,46.253586
1,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.099119,7.44,34.324,18.475954,0.0,0.0,4.0,85.0,11.0,52.15259
2,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.0,7.0,29.82029,16.051693,0.0,32.0,67.0,0.0,0.0,47.021807
3,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.048387,7.88,29.920833,14.603922,90.0,10.0,0.0,0.0,0.0,27.603314
4,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.0,6.88,20.095833,14.538095,0.0,0.0,0.0,0.0,100.0,36.472537
5,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.097297,11.76,33.8125,23.658333,0.0,7.0,62.0,31.0,0.0,54.944529
6,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.109756,10.0,20.776017,11.183333,100.0,0.0,0.0,0.0,0.0,28.08385
7,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.036697,15.88,29.610183,15.938596,0.0,0.0,33.0,57.0,10.0,53.836511
8,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.267618,8.32,12.768456,6.873016,0.0,0.0,16.0,70.0,14.0,52.255931
9,202202170BRK,2022-02-17,0,A,WAS,117,94.5,0.627,13.5,22.9,...,0.231362,5.88,20.789815,4.35,0.0,0.0,0.0,0.0,100.0,36.472537


## Database Normalization
### First normal form:

1. **All tables must have a primary key**: In this table, `game_id` and `player_id` together are unique on every row, so together they form the primary key.

2. **All the data must be atomic**: The `Inactives` column is non-atomic, because a single cell lists several inactive players at once.

3. **No repeating groups problem**: We can't solve the non-atomicity problem by creating separate columns if this leads to arbitrary ordering language in the column names (for example, `Inactive1`, `Inactive2`, etc.) and if it leads to a lot of missing data (there would be an `Inactive7` which would be missing any time a team has fewer than 7 inactive players).

In [4]:
# `Inactives` is the non-atomic column identified under first normal form,
# so it is removed from the working table.
# errors='ignore' keeps this cell safe to re-run: `nba` is reassigned in place,
# so on a second run the column is already gone and a plain drop would raise KeyError.
nba = nba.drop(columns=['Inactives'], errors='ignore')

### Functional Dependence
Let X and Y be columns in a data table. Y is functionally dependent on X if each value of X is associated with exactly one value of Y.

That's pretty abstract. So here are some guidelines that help me:

1. This use of "function" is the exact same as the concept of a function from algebra and pre-calculus. A correspondence f(x)=y is a function if each value of x has only one associated value of y.

2. X is either a primary key, or something that should be a primary key in another table.

For example, `game_date` (Y) is functionally dependent on `game_id` (X) because each game, identified by its `game_id`, takes place on exactly one date.

### Second normal form:
In this table the primary key is a composite key consisting of two columns: `game_id` and `player_id`.

2NF is violated if any columns are functionally dependent on part of the primary key but not the entire primary key. This can only happen if the primary key is a composite key (that is, made up of more than one column).

In [5]:
# Split off the columns that violate 2NF because they depend on `game_id`
# alone: keep one row per distinct combination of these columns.
game_columns = ['game_id', 'game_date', 'OT', 'season']
games = nba.loc[:, game_columns].drop_duplicates()
games

Unnamed: 0,game_id,game_date,OT,season
0,202202170BRK,2022-02-17,0,2022
26,202202170CHO,2022-02-17,2,2022
48,202202170LAC,2022-02-17,0,2022
71,202202170MIL,2022-02-17,0,2022
95,202202170NOP,2022-02-17,0,2022
...,...,...,...,...
108259,202001080GSW,2020-01-08,0,2020
108887,202008020HOU,2020-08-02,0,2020
109683,201911060HOU,2019-11-06,0,2020
110125,201912250GSW,2019-12-25,0,2020


In [6]:
# Player lookup table: one row per distinct (player_id, player) pair.
player_columns = ['player_id', 'player']
players = nba.loc[:, player_columns].drop_duplicates()
players

Unnamed: 0,player_id,player
0,kispeco01,Corey Kispert
1,kuzmaky01,Kyle Kuzma
2,caldwke01,Kentavious Caldwell-Pope
3,netora01,Raul Neto
4,bryanth01,Thomas Bryant
...,...,...
109702,frazimi01,Michael Frazier
110441,howarwi01,William Howard
110913,mbahalu01,Luc Mbah a Moute
111399,bowmaky01,Ky Bowman


In [7]:
# These columns now live in the `games` and `players` tables, so remove them
# from the working table; `game_id` and `player_id` stay behind as the keys.
# errors='ignore' keeps this cell safe to re-run: `nba` is reassigned in place,
# so on a second run the columns are already gone and a plain drop would raise KeyError.
nba = nba.drop(columns=['game_date', 'OT', 'season', 'player'], errors='ignore')
nba

Unnamed: 0,game_id,H_A,Team_Abbrev,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_orb_pct,Team_ft_rate,Team_off_rtg,...,pf_per_minute,ts,last_60_minutes_per_game_starting,last_60_minutes_per_game_bench,PG%,SG%,SF%,PF%,C%,active_position_minutes
0,202202170BRK,A,WAS,117,94.5,0.627,13.5,22.9,0.157,123.8,...,0.061538,9.00,31.716667,22.017778,1.0,36.0,60.0,4.0,0.0,46.253586
1,202202170BRK,A,WAS,117,94.5,0.627,13.5,22.9,0.157,123.8,...,0.099119,7.44,34.324000,18.475954,0.0,0.0,4.0,85.0,11.0,52.152590
2,202202170BRK,A,WAS,117,94.5,0.627,13.5,22.9,0.157,123.8,...,0.000000,7.00,29.820290,16.051693,0.0,32.0,67.0,0.0,0.0,47.021807
3,202202170BRK,A,WAS,117,94.5,0.627,13.5,22.9,0.157,123.8,...,0.048387,7.88,29.920833,14.603922,90.0,10.0,0.0,0.0,0.0,27.603314
4,202202170BRK,A,WAS,117,94.5,0.627,13.5,22.9,0.157,123.8,...,0.000000,6.88,20.095833,14.538095,0.0,0.0,0.0,0.0,100.0,36.472537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112118,202003070GSW,H,GSW,118,90.9,0.606,7.0,18.9,0.263,129.9,...,0.107914,13.08,33.110667,19.232562,0.0,2.0,77.0,21.0,0.0,57.207786
112119,202003070GSW,H,GSW,118,90.9,0.606,7.0,18.9,0.263,129.9,...,0.036079,6.00,25.470833,20.228571,5.0,45.0,43.0,7.0,0.0,58.202391
112120,202003070GSW,H,GSW,118,90.9,0.606,7.0,18.9,0.263,129.9,...,0.150943,4.00,24.083333,13.228788,0.0,0.0,0.0,9.0,91.0,49.630640
112121,202003070GSW,H,GSW,118,90.9,0.606,7.0,18.9,0.263,129.9,...,0.094340,12.64,34.783333,27.691667,0.0,44.0,48.0,8.0,0.0,58.923515


### Third normal form:
3NF is violated if there are "transitive dependencies", that is, a functional dependence between two columns, neither of which is part of the primary key.

In [8]:
# List the columns still present in `nba` after the 1NF/2NF steps, so the
# remaining non-key columns can be inspected for transitive dependencies.
nba.columns

Index(['game_id', 'H_A', 'Team_Abbrev', 'Team_Score', 'Team_pace',
       'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct', 'Team_ft_rate',
       'Team_off_rtg', 'Opponent_Abbrev', 'Opponent_Score', 'Opponent_pace',
       'Opponent_efg_pct', 'Opponent_tov_pct', 'Opponent_orb_pct',
       'Opponent_ft_rate', 'Opponent_off_rtg', 'player_id', 'starter', 'mp',
       'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct',
       'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
       'plus_minus', 'did_not_play', 'is_inactive', 'ts_pct', 'efg_pct',
       'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'off_rtg',
       'def_rtg', 'bpm', 'minutes', 'double_double', 'triple_double', 'DKP',
       'FDP', 'SDP', 'DKP_per_minute', 'FDP_per_minute', 'SDP_per_minute',
       'pf_per_minute', 'ts', 'last_60_minutes_per_game_starting',
       'last_60_minutes_per_game_bench', 'PG%', '

In [9]:
# Team-level and opponent-level columns, defined once so the selection and the
# drop below cannot drift apart (the original drop list repeated
# 'Opponent_Abbrev' and re-typed the team columns by hand).
team_cols = ['H_A', 'Team_Abbrev', 'Team_Score', 'Team_pace',
             'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct', 'Team_ft_rate',
             'Team_off_rtg']
opponent_cols = ['Opponent_Abbrev', 'Opponent_Score', 'Opponent_pace',
                 'Opponent_efg_pct', 'Opponent_tov_pct', 'Opponent_orb_pct',
                 'Opponent_ft_rate', 'Opponent_off_rtg']

# Team-per-game table: the team columns plus the opponent's abbreviation,
# de-duplicated so repeated player rows collapse into one.
team_game_table = nba[['game_id'] + team_cols + ['Opponent_Abbrev']].drop_duplicates()

# Player-per-game table: everything left once the team and opponent columns are removed.
# NOTE(review): `Team_Abbrev` is dropped here too, so `player_game` carries no key
# that links a row back to `team_game_table` -- confirm whether it should be kept
# as a foreign key.
player_game = nba.drop(columns=team_cols + opponent_cols)

In [10]:
# Display the player-per-game table left after the team and opponent columns were split off.
player_game

Unnamed: 0,game_id,player_id,starter,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,...,pf_per_minute,ts,last_60_minutes_per_game_starting,last_60_minutes_per_game_bench,PG%,SG%,SF%,PF%,C%,active_position_minutes
0,202202170BRK,kispeco01,1,32:30,6,9,0.667,4,6,0.667,...,0.061538,9.00,31.716667,22.017778,1.0,36.0,60.0,4.0,0.0,46.253586
1,202202170BRK,kuzmaky01,1,30:16,2,7,0.286,0,3,0.000,...,0.099119,7.44,34.324000,18.475954,0.0,0.0,4.0,85.0,11.0,52.152590
2,202202170BRK,caldwke01,1,25:26,3,7,0.429,1,3,0.333,...,0.000000,7.00,29.820290,16.051693,0.0,32.0,67.0,0.0,0.0,47.021807
3,202202170BRK,netora01,1,20:40,5,7,0.714,1,1,1.000,...,0.048387,7.88,29.920833,14.603922,90.0,10.0,0.0,0.0,0.0,27.603314
4,202202170BRK,bryanth01,1,14:04,5,6,0.833,0,1,0.000,...,0.000000,6.88,20.095833,14.538095,0.0,0.0,0.0,0.0,100.0,36.472537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112118,202003070GSW,wiggian01,1,37:04:00,3,10,0.300,0,0,0.000,...,0.107914,13.08,33.110667,19.232562,0.0,2.0,77.0,21.0,0.0,57.207786
112119,202003070GSW,toscaju01,1,27:43:00,3,6,0.500,0,2,0.000,...,0.036079,6.00,25.470833,20.228571,5.0,45.0,43.0,7.0,0.0,58.202391
112120,202003070GSW,bendedr01,0,13:15,4,4,1.000,2,2,1.000,...,0.150943,4.00,24.083333,13.228788,0.0,0.0,0.0,9.0,91.0,49.630640
112121,202003070GSW,muldemy01,1,31:48:00,5,10,0.500,3,7,0.429,...,0.094340,12.64,34.783333,27.691667,0.0,44.0,48.0,8.0,0.0,58.923515
