In [1]:
import pandas as pd
import numpy as np

In [40]:
### Loop through the 2000-2020 seasons (21 seasons) to get MVP and team statistics

# One frame per season is collected in a list and concatenated once after the
# loop. `DataFrame.append` was removed in pandas 2.0, and growing a frame inside
# the loop re-copies every earlier row on each iteration.
# Columns expected once the positional drops below have been applied:
#   MVP : Rank, Player, Age, Tm, Share, G, MP, PTS, TRB, AST, STL, BLK,
#         FG%, 3P%, FT%, WS, WS/48, Year
#   team: Team Name, W/L%, SRS, Playoffs, Year
# NOTE(review): every season issues two HTTP requests back to back; if the site
# throttles clients, a pause between iterations may be needed - confirm.
mvp_frames = []
team_frames = []

for year in range(2000,2021):
    ## MVP Data
    # Create url
    mvp_url = f'https://www.basketball-reference.com/awards/awards_{year}.html'

    # Read in html tables; the first table on the page is used as the MVP table
    mvp = pd.read_html(mvp_url)[0]

    # The table has a multi-level header: keep only level 1 as flat column names
    mvp.columns = mvp.columns.get_level_values(1)
    mvp['Year'] = year

    # Drop unnecessary columns (positions 4-6, i.e. the three columns that sit
    # between 'Tm' and 'Share')
    mvp = mvp.drop(mvp.columns[[4,5,6]], axis=1)

    # Keep single year data for the final concat
    mvp_frames.append(mvp)

    ## Team Data
    # Create url
    team_url = f'https://www.basketball-reference.com/leagues/NBA_{year}_standings.html'

    # Read in html tables
    team_dfs = pd.read_html(team_url)

    # Pull and combine Eastern and Western Conference tables (tables 0 and 1).
    # Each table's first column is renamed to a shared 'Team Name' so that the
    # two tables line up when stacked.
    conferences = [
        table.rename(columns={table.columns[0]: 'Team Name'})
        for table in (team_dfs[0], team_dfs[1])
    ]
    team = pd.concat(conferences, ignore_index=True)

    # Team names have an '*' if they made the playoffs
    # Create column to identify playoff appearance and remove '*' from names.
    # regex=False makes '*' a literal character: no escaping is needed and the
    # result does not depend on the pandas version's default for `regex`.
    team['Playoffs'] = team['Team Name'].str.contains('*', regex=False)
    team['Team Name'] = team['Team Name'].str.replace('*', '', regex=False)

    # Create year id
    team['Year'] = year

    # Drop unnecessary columns (positions 1-2 and 4-6), which leaves
    # 'Team Name', 'W/L%', 'SRS', 'Playoffs' and 'Year'
    team = team.drop(team.columns[[1,2,4,5,6]], axis=1)

    # Keep single year data for the final concat
    team_frames.append(team)

# Combine all seasons into the total datasets
mvp_data = pd.concat(mvp_frames, ignore_index=True)
team_data = pd.concat(team_frames, ignore_index=True)

In [43]:
### Additional Cleaning

# Convert 'W/L%' and 'SRS' to floats (cells that cannot be parsed become NaN)
# and drop rows of '...Division'. The drop is limited to the two coerced columns
# so that a missing value in any other column cannot silently remove a team.
team_data = team_data.assign(**{
    'W/L%': pd.to_numeric(team_data['W/L%'], errors='coerce'),
    'SRS': pd.to_numeric(team_data['SRS'], errors='coerce'),
}).dropna(subset=['W/L%','SRS'])

# MVP table, in one chain:
#   1. Create 'MVP' column based on 'Rank' column: 'Rank' is parsed as a number
#      in a single vectorized call (unparseable ranks become NaN, which never
#      equals 1), then 'Rank' is dropped
#   2. Drop players listed under team 'TOT'
#   3. Fill in 3P% NaNs with 0
mvp_data = (
    mvp_data
    .assign(MVP=lambda df: pd.to_numeric(df['Rank'], errors='coerce').eq(1))
    .drop(columns=['Rank'])
    .loc[lambda df: df['Tm'] != 'TOT']
    .assign(**{'3P%': lambda df: df['3P%'].fillna(0)})
)

In [45]:
## Merge two datasets together
# Create key to connect team name with abbreviation.
# The pairing is positional: the i-th alphabetically sorted team name receives
# the i-th entry of `abbr`, so `abbr` must hold exactly one abbreviation per
# distinct team name, listed in sorted-name order.
names = sorted(team_data['Team Name'].unique())
abbr = ['ATL','BOS','BRK','CHA','CHO','CHI','CLE','DAL','DEN','DET','GSW','HOU','IND','LAC','LAL','MEM','MIA','MIL','MIN','NJN','NOH','NOP','NOK','NYK','OKC','ORL','PHI','PHO','POR','SAC','SAS','SEA','TOR','UTA','VAN','WAS']

# zip() stops at the shorter input, so without this check a count mismatch
# would go unnoticed: some teams would get no abbreviation, and an extra or
# missing name would shift the pairing of every name after it.
if len(names) != len(abbr):
    raise ValueError(
        f"Found {len(names)} distinct team names but {len(abbr)} abbreviations; "
        "update `abbr` so it matches the sorted team names one-to-one."
    )

key = pd.DataFrame(list(zip(names,abbr)), columns=['Team Name','Tm'])

# Add key to team_data
team_data = team_data.join(key.set_index('Team Name'), on='Team Name')

# Charlotte Hornets during 2000-2002 had a different abbreviation
team_data.loc[(team_data['Team Name']=='Charlotte Hornets') & (team_data['Year'].isin([2000,2001,2002])),'Tm'] = 'CHH'

# Drop team names
team_data = team_data.drop(columns=['Team Name'])

# Combine datasets: every player row keeps its place (left join) and picks up
# the statistics of its team for that season. validate='many_to_one' raises if
# a (Tm, Year) pair occurs more than once in team_data, which would otherwise
# duplicate player rows silently.
data = pd.merge(mvp_data, team_data, how='left', on=['Tm','Year'], validate='many_to_one')

In [46]:
# Write the merged table to a csv file; the DataFrame index is left out of the file
data.to_csv(path_or_buf='Cleaned Data.csv', index=False)

In [47]:
# Display only the rows whose 'Year' is 2020
data.loc[data['Year'].eq(2020)]

Unnamed: 0,Player,Age,Tm,Share,G,MP,PTS,TRB,AST,STL,...,FG%,3P%,FT%,WS,WS/48,Year,MVP,W/L%,SRS,Playoffs
284,Giannis Antetokounmpo,25,MIL,0.952,63,30.4,29.5,13.6,5.6,1.0,...,0.553,0.304,0.633,11.1,0.279,2020,True,0.767,9.41,True
285,LeBron James,35,LAL,0.746,67,34.6,25.3,7.8,10.2,1.2,...,0.493,0.348,0.693,9.8,0.204,2020,False,0.732,6.28,True
286,James Harden,30,HOU,0.363,68,36.5,34.3,6.6,7.5,1.8,...,0.444,0.355,0.865,13.1,0.254,2020,False,0.611,3.13,True
287,Luka Dončić,20,DAL,0.198,61,33.6,28.8,9.4,8.8,1.0,...,0.463,0.316,0.758,8.8,0.207,2020,False,0.573,4.87,True
288,Kawhi Leonard,28,LAC,0.166,57,32.4,27.1,7.1,4.9,1.8,...,0.47,0.378,0.886,8.7,0.226,2020,False,0.681,6.66,True
289,Anthony Davis,26,LAL,0.081,62,34.4,26.1,9.3,3.2,1.5,...,0.503,0.33,0.846,11.1,0.25,2020,False,0.732,6.28,True
290,Chris Paul,34,OKC,0.026,70,31.5,17.6,5.0,6.7,1.6,...,0.489,0.365,0.907,8.9,0.193,2020,False,0.611,2.33,True
291,Damian Lillard,29,POR,0.023,66,37.5,30.0,4.3,8.0,1.1,...,0.463,0.401,0.888,11.6,0.225,2020,False,0.473,-0.61,True
292,Nikola Jokić,24,DEN,0.018,73,32.0,19.9,9.7,7.0,1.2,...,0.528,0.314,0.817,9.8,0.202,2020,False,0.63,2.35,True
293,Pascal Siakam,25,TOR,0.017,60,35.2,22.9,7.3,3.5,1.0,...,0.453,0.359,0.792,5.4,0.123,2020,False,0.736,5.97,True
