In [2]:
#Start of the data cleaning process

In [3]:
#start with imports
import numpy as np
import pandas as pd
import matplotlib as mpl

In [4]:
Advanced_df = pd.read_csv('Assets/NBA_Player_Advanced_Stats.csv')
Totals_df = pd.read_csv('Assets/NBA_Player_Total_Stats.csv')
Contracts_df = pd.read_csv('Assets/NBA_Contracts.csv')

In [45]:
Contracts_df.head()

Unnamed: 0,Signing Year,Rank,Player,Position,Team,Age,Contract Length,Total Value,Avg. Value,Guaranteed Money,Contract Type
0,2011,1,Kevin Durant,SF,OKC,22,5,"$89,163,134","$17,832,627","$89,163,134",rookie-maximum-extension
1,2011,2,Kobe Bryant,SF,LAL,32,3,"$83,547,447","$27,849,149","$83,547,447",Veteran Extension
2,2011,3,Zach Randolph,PF,MEM,29,4,"$66,000,000","$16,500,000","$66,000,000",Veteran Extension
3,2011,T4,Al Horford,C,ATL,24,5,"$60,000,000","$12,000,000","$60,000,000",Rookie Extension
4,2011,T4,Joakim Noah,C,CHI,25,5,"$60,000,000","$12,000,000","$60,000,000",Rookie Extension


In [50]:
#Totals_df.info(), Advanced_df.info()

In [7]:
player_names = Contracts_df['Player'].unique()
unique_players = len(player_names)
total_contracts = len(Contracts_df)

print(f'There are {unique_players} unique players of the {total_contracts} contracts')

There are 94 unique players of the 106 contracts


In [8]:
#Some of our players in our contracts_df have different names than what show in the stats_dfs
adv_players_unq = set(Advanced_df['Player'].unique())
total_players_unq = set(Totals_df['Player'].unique())
missing_players = [name for name in player_names if name not in adv_players_unq and name not in total_players_unq]
print(missing_players)
# Missing Players: ['Nikola Vucevic', 'Anderson Varejao', 'Marcus Morris Sr.', 'Jonas Valanciunas', 'C.J. McCollum', 'Dennis Schröder', 'Louis Williams']
#These missing players are most likely due to either nicknames(Lou Williams) or special characters(Jonas Valančiūnas)

['Nikola Vucevic', 'Anderson Varejao', 'Marcus Morris Sr.', 'Jonas Valanciunas', 'C.J. McCollum', 'Dennis Schröder', 'Louis Williams']


In [13]:
#Let's figure out our spellings within the player stats datasets
#The following code will print out a few extra names than what we need, but it will allow us to see the spellings within the total_players_unq and likely the adv_players_unq aswell
for player in total_players_unq:
    if 'Nikola Vu' in player or 'Anderson Var' in player or 'Marcus Morris' in player or 'Jonas Val' in player or 'Dennis Sch' in player or 'Lou Williams' in player or 'McCollum' in player:
        print(player)

#Sinec some of the names in the total_players_unq have some odd spelling (Schroder and Varejao) we will set those names to the name found in the COntracts_df

Dennis SchrÃ¶der
Anderson VarejÃ£o
Nikola VuÄeviÄ
CJ McCollum
Lou Williams
Jonas ValanÄiÅ«nas
Marcus Morris


In [14]:
name_corrections = {
    'Nikola VuÄeviÄ': 'Nikola Vucevic',
    'Jonas ValanÄiÅ«nas': 'Jonas Valanciunas',
    'Anderson VarejÃ£o': 'Anderson Varejao',
    'Marcus Morris': 'Marcus Morris Sr.',
    'CJ McCollum': 'C.J. McCollum',
    'Dennis SchrÃ¶der': 'Dennis Schröder',
    'Lou Williams': 'Louis Williams'
}

Advanced_df['Player'] = Advanced_df['Player'].replace(name_corrections)
Totals_df['Player'] = Totals_df['Player'].replace(name_corrections)

#Let's make sure that worked though, and make sure that both of our Basketball Reference CSVs had the same names used
advanced_players = set(Advanced_df["Player"].unique())
totals_players = set(Totals_df["Player"].unique())
contract_players = set(player_names)

unmatched_totals = contract_players - totals_players
print('Players not in totals', unmatched_totals)
unmatched_adv = contract_players - advanced_players
print('Players not in advanced', unmatched_adv)

#Our print outs show empty lists which means our originally "missing" (misspelled) names now match those used in our Contract database. 

Players not in totals set()
Players not in advanced set()


In [36]:
#Now we can filter down our stats satasets to only include the players that are in our contract data:
Advanced_df_filtered = Advanced_df[Advanced_df["Player"].isin(player_names)]
print(f"Advanced_df: {Advanced_df.shape} → {Advanced_df_filtered.shape}")

Totals_df_filtered = Totals_df[Totals_df["Player"].isin(player_names)]
print(f"Totals_df: {Totals_df.shape} → {Totals_df_filtered.shape}")

Advanced_df: (9262, 30) → (1179, 30)
Totals_df: (9262, 33) → (1179, 33)


# More Filtering!
### Here we will filter down our dataframe some more to filter out any rows that are not included in our Contracts_df

In [55]:
# Let's create a dict obj that will contain all the players in our contract_df with there give contract years
contract_years_dict = {}

for _,row in Contracts_df.iterrows():
    contract_years_dict[row['Player']] = list(range(row['Signing Year'], row['Signing Year'] + row['Contract Length']))

#print(contract_years_dict)
#Our output looks like: {'Kevin Durant': [2011, 2012, 2013, 2014, 2015],
#                        'Kobe Bryant': [2014, 2015], 
#                        'Zach Randolph': [2015, 2016], ....

#Now that we have each player, and their contract years, we can filter our data even more to only contain their years 
#that match with the contract extensions from our Contracts_df 
filtered_totals = []
filtered_advanced = []

#For each row in our stats dataframes, we will grab the player name and season of play,
#we then check if this name and season/year is within our contract_dict. If it is we add it to our filter
for index, row in Totals_df_filtered.iterrows():
    player, year = row['Player'], row['Season']
    if player in contract_years_dict and year in contract_years_dict[player]:
        filtered_totals.append(row)

for index, row in Advanced_df_filtered.iterrows():
    player, year = row['Player'], row['Season']
    if player in contract_years_dict and year in contract_years_dict[player]:
        filtered_advanced.append(row)
        

In [59]:
#Now that we have our filter from above(as a list of lists) we can turn it back into a dataframe
#this will let us do more statistical analysis
Totals_df_years_filtered = pd.DataFrame(filtered_totals, columns=Totals_df.columns)
print(f'Totals_df: {Totals_df_years_filtered.shape}')

Advanced_df_years_filtered = pd.DataFrame(filtered_advanced, columns=Advanced_df.columns)
print(f'Advanced_df: {Advanced_df_years_filtered.shape}')

Totals_df: (444, 33)
Advanced_df: (444, 30)


In [60]:
#Let's take a look at an example. Kevin Durant is the first row in our Contracts_df with a Contract_length=5 
print(Advanced_df_years_filtered[Advanced_df_years_filtered["Player"] == 'Kevin Durant'])

      Season   Rank        Player   Age Team Pos     G    GS      MP   PER  \
0       2011    1.0  Kevin Durant  23.0  OKC  SF  66.0  66.0  2546.0  26.2   
553     2012    2.0  Kevin Durant  24.0  OKC  SF  81.0  81.0  3119.0  28.3   
1126    2013    1.0  Kevin Durant  25.0  OKC  SF  81.0  81.0  3122.0  29.8   
2116    2014  290.0  Kevin Durant  26.0  OKC  SF  27.0  27.0   913.0  27.6   
2421    2015   30.0  Kevin Durant  27.0  OKC  SF  72.0  72.0  2578.0  28.2   

      ...  USG%   OWS  DWS    WS  WS/48  OBPM  DBPM   BPM  VORP         Honors  
0     ...  31.3   8.5  3.7  12.2  0.230   6.2   0.8   7.0   5.8  MVP-2,AS,NBA1  
553   ...  29.8  13.6  5.3  18.9  0.291   7.6   1.7   9.3   8.9  MVP-2,AS,NBA1  
1126  ...  33.0  14.8  4.4  19.2  0.295   8.8   1.4  10.2   9.6  MVP-1,AS,NBA1  
2116  ...  29.1   3.8  1.0   4.8  0.252   8.6   1.4  10.0   2.8             AS  
2421  ...  30.6  11.0  3.5  14.5  0.270   8.4   1.6   9.9   7.8  MVP-5,AS,NBA2  

[5 rows x 30 columns]


In [61]:
#If we can now add a column variable named Age_of_contract (1,2,3,4,5) we could use this to help with
#future filtering or analysis... Then we could create new CSV's of our filtered stats 
#to load into a notebook for exploration
