In [19]:
# Import the dependencies.
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sqlalchemy import create_engine, text

In [20]:
# Path to the SQLite database file; it is relative, so it resolves against the
# notebook's current working directory.
database_path = Path("Resources/nba_db.db")

In [21]:
# Build the SQLite URL from the database path, then create the SQLAlchemy engine
# and open the connection that the query cells read from.
sqlite_url = f"sqlite:///{database_path}"
engine = create_engine(sqlite_url)
conn = engine.connect()

In [22]:
# Query all records in the Basketball table.
# The SQL is wrapped in sqlalchemy.text() so the statement is an explicit SQLAlchemy
# executable: SQLAlchemy 2.x connections do not execute plain strings, so passing a
# raw string only works when the installed pandas version wraps it on our behalf.
data = pd.read_sql(text("SELECT * FROM Basketball"), conn)
data.head()

Unnamed: 0,id,rk,player,pos,age,tm,g,gs,mp,fg,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,year
0,0,1,Mahmoud Abdul-Rauf,PG,28,SAC,31,0,17.1,3.3,...,0.2,1.0,1.2,1.9,0.5,0.0,0.6,1.0,7.3,1997-1998
1,1,2,Tariq Abdul-Wahad,SG,23,SAC,59,16,16.3,2.4,...,0.7,1.2,2.0,0.9,0.6,0.2,1.1,1.4,6.4,1997-1998
2,2,3,Shareef Abdur-Rahim,SF,21,VAN,82,82,36.0,8.0,...,2.8,4.3,7.1,2.6,1.1,0.9,3.1,2.5,22.3,1997-1998
3,3,4,Cory Alexander,PG,24,TOT,60,22,21.6,2.9,...,0.3,2.2,2.4,3.5,1.2,0.2,1.9,1.6,8.1,1997-1998
4,4,4,Cory Alexander,PG,24,SAS,37,3,13.5,1.6,...,0.2,1.1,1.3,1.9,0.7,0.1,1.3,1.4,4.5,1997-1998


In [23]:
# Build the working frame from the query result without the "id" and "rk" columns.
# drop() returns a new DataFrame, so `data` itself is left unchanged.
nba_data_df = data.drop(["id", "rk"], axis="columns")
nba_data_df.head()

Unnamed: 0,player,pos,age,tm,g,gs,mp,fg,fga,fg%,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,year
0,Mahmoud Abdul-Rauf,PG,28,SAC,31,0,17.1,3.3,8.8,0.377,...,0.2,1.0,1.2,1.9,0.5,0.0,0.6,1.0,7.3,1997-1998
1,Tariq Abdul-Wahad,SG,23,SAC,59,16,16.3,2.4,6.1,0.403,...,0.7,1.2,2.0,0.9,0.6,0.2,1.1,1.4,6.4,1997-1998
2,Shareef Abdur-Rahim,SF,21,VAN,82,82,36.0,8.0,16.4,0.485,...,2.8,4.3,7.1,2.6,1.1,0.9,3.1,2.5,22.3,1997-1998
3,Cory Alexander,PG,24,TOT,60,22,21.6,2.9,6.7,0.428,...,0.3,2.2,2.4,3.5,1.2,0.2,1.9,1.6,8.1,1997-1998
4,Cory Alexander,PG,24,SAS,37,3,13.5,1.6,3.9,0.414,...,0.2,1.1,1.3,1.9,0.7,0.1,1.3,1.4,4.5,1997-1998


In [24]:
# Map the abbreviated stat column names to descriptive labels.
# Columns not listed here (e.g. "player", "age", "year") keep their original names.
# NOTE(review): "gs" is labelled "games scored", but in the sample rows it never
# exceeds games played, which looks like "games started" -- confirm before renaming,
# since later cells and the exported CSV may depend on the current label.
readable_column_names = {
    # identity / participation
    "pos": "position",
    "tm": "team",
    "g": "games played",
    "gs": "games scored",
    "mp": "avg minutes played",
    # field goals
    "fg": "avg field goals made",
    "fga": "avg field goals attempted",
    "fg%": "field goal made %",
    # three-point shots
    "3p": "avg three-point shots made",
    "3pa": "avg three-point shots attempted",
    "3p%": "three-point shots made %",
    # two-point shots
    "2p": "avg two-point shots made",
    "2pa": "avg two-point shots attempted",
    "2p%": "two-point shots made %",
    "efg%": "effective goal %",
    # free throws
    "ft": "avg free-throw shots made",
    "fta": "avg free-throw shots attempted",
    "ft%": "free-throw shots made %",
    # rebounds
    "orb": "avg offensive rebounds",
    "drb": "avg defensive rebounds",
    "trb": "avg total rebounds",
    # remaining box-score stats
    "ast": "avg assists",
    "stl": "avg steals",
    "blk": "avg blocks",
    "tov": "avg turnovers",
    "pf": "avg personal fouls",
    "pts": "avg points scored",
}
nba_data_df = nba_data_df.rename(columns=readable_column_names)
nba_data_df.head()

Unnamed: 0,player,position,age,team,games played,games scored,avg minutes played,avg field goals made,avg field goals attempted,field goal made %,...,avg offensive rebounds,avg defensive rebounds,avg total rebounds,avg assists,avg steals,avg blocks,avg turnovers,avg personal fouls,avg points scored,year
0,Mahmoud Abdul-Rauf,PG,28,SAC,31,0,17.1,3.3,8.8,0.377,...,0.2,1.0,1.2,1.9,0.5,0.0,0.6,1.0,7.3,1997-1998
1,Tariq Abdul-Wahad,SG,23,SAC,59,16,16.3,2.4,6.1,0.403,...,0.7,1.2,2.0,0.9,0.6,0.2,1.1,1.4,6.4,1997-1998
2,Shareef Abdur-Rahim,SF,21,VAN,82,82,36.0,8.0,16.4,0.485,...,2.8,4.3,7.1,2.6,1.1,0.9,3.1,2.5,22.3,1997-1998
3,Cory Alexander,PG,24,TOT,60,22,21.6,2.9,6.7,0.428,...,0.3,2.2,2.4,3.5,1.2,0.2,1.9,1.6,8.1,1997-1998
4,Cory Alexander,PG,24,SAS,37,3,13.5,1.6,3.9,0.414,...,0.2,1.1,1.3,1.9,0.7,0.1,1.3,1.4,4.5,1997-1998


In [25]:
# Derive a "total ..." column for every "avg ..." column by multiplying the average
# by "games played". The products are floats (e.g. 7.3 * 31 = 226.3), not exact counts.
# Keys are the existing average columns; values are the new columns, added in this order.
season_total_columns = {
    "avg minutes played": "total minutes played",
    "avg field goals made": "total field goals made",
    "avg field goals attempted": "total field goals attempted",
    "avg three-point shots made": "total three-point shots made",
    "avg three-point shots attempted": "total three-point shots attempted",
    "avg two-point shots made": "total two-point shots made",
    "avg two-point shots attempted": "total two-point shots attempted",
    "avg free-throw shots made": "total free-throw shots made",
    "avg free-throw shots attempted": "total free-throw shots attempted",
    "avg offensive rebounds": "total offensive rebounds",
    "avg defensive rebounds": "total defensive rebounds",
    "avg total rebounds": "total rebounds",
    "avg assists": "total assists",
    "avg steals": "total steals",
    "avg blocks": "total blocks",
    "avg turnovers": "total turnovers",
    "avg personal fouls": "total personal fouls",
    "avg points scored": "total points scored",
}
for average_column, total_column in season_total_columns.items():
    nba_data_df[total_column] = nba_data_df[average_column] * nba_data_df["games played"]
display(nba_data_df.head(5))

Unnamed: 0,player,position,age,team,games played,games scored,avg minutes played,avg field goals made,avg field goals attempted,field goal made %,...,total free-throw shots attempted,total offensive rebounds,total defensive rebounds,total rebounds,total assists,total steals,total blocks,total turnovers,total personal fouls,total points scored
0,Mahmoud Abdul-Rauf,PG,28,SAC,31,0,17.1,3.3,8.8,0.377,...,15.5,6.2,31.0,37.2,58.9,15.5,0.0,18.6,31.0,226.3
1,Tariq Abdul-Wahad,SG,23,SAC,59,16,16.3,2.4,6.1,0.403,...,123.9,41.3,70.8,118.0,53.1,35.4,11.8,64.9,82.6,377.6
2,Shareef Abdur-Rahim,SF,21,VAN,82,82,36.0,8.0,16.4,0.485,...,639.6,229.6,352.6,582.2,213.2,90.2,73.8,254.2,205.0,1828.6
3,Cory Alexander,PG,24,TOT,60,22,21.6,2.9,6.7,0.428,...,102.0,18.0,132.0,144.0,210.0,72.0,12.0,114.0,96.0,486.0
4,Cory Alexander,PG,24,SAS,37,3,13.5,1.6,3.9,0.414,...,37.0,7.4,40.7,48.1,70.3,25.9,3.7,48.1,51.8,166.5


In [50]:
# Replace missing values with 0 across the whole frame (returns a new DataFrame).
cleaned_nba_data_df = nba_data_df.fillna(0)

# The percentage columns may hold the literal string 'NULL' where no value exists.
# Swapping that string for 0 with replace() leaves the resulting dtype dependent on
# pandas' (deprecated) silent downcasting, so the column can remain `object`.
# Converting explicitly guarantees numeric columns: anything that cannot be parsed
# as a number (including 'NULL') becomes NaN and is then filled with 0, matching the
# "missing -> 0" rule applied above.
percentage_columns = [
    'field goal made %',
    'three-point shots made %',
    'two-point shots made %',
    'free-throw shots made %',
    'effective goal %',
]
for column in percentage_columns:
    cleaned_nba_data_df[column] = pd.to_numeric(
        cleaned_nba_data_df[column], errors="coerce"
    ).fillna(0)

# Persist the cleaned frame (no index column, header row included) and preview it.
cleaned_nba_data_df.to_csv("Resources/cleaned_nba_data.csv", index=False, header=True)
cleaned_nba_data_df.head()

Unnamed: 0,player,position,age,team,games played,games scored,avg minutes played,avg field goals made,avg field goals attempted,field goal made %,...,total free-throw shots attempted,total offensive rebounds,total defensive rebounds,total rebounds,total assists,total steals,total blocks,total turnovers,total personal fouls,total points scored
0,Mahmoud Abdul-Rauf,PG,28,SAC,31,0,17.1,3.3,8.8,0.377,...,15.5,6.2,31.0,37.2,58.9,15.5,0.0,18.6,31.0,226.3
1,Tariq Abdul-Wahad,SG,23,SAC,59,16,16.3,2.4,6.1,0.403,...,123.9,41.3,70.8,118.0,53.1,35.4,11.8,64.9,82.6,377.6
2,Shareef Abdur-Rahim,SF,21,VAN,82,82,36.0,8.0,16.4,0.485,...,639.6,229.6,352.6,582.2,213.2,90.2,73.8,254.2,205.0,1828.6
3,Cory Alexander,PG,24,TOT,60,22,21.6,2.9,6.7,0.428,...,102.0,18.0,132.0,144.0,210.0,72.0,12.0,114.0,96.0,486.0
4,Cory Alexander,PG,24,SAS,37,3,13.5,1.6,3.9,0.414,...,37.0,7.4,40.7,48.1,70.3,25.9,3.7,48.1,51.8,166.5
