## Web Scraping for MVP Prediction

### Loading Packages

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Lift pandas' display limits so long cell values and wide tables are shown
# in full when a DataFrame is rendered.
for _display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(_display_option, None)

### Scraping MVP Voting

In [2]:
import os
import time

# Seasons to download; later cells reuse `years` to read the saved pages back.
years = list(range(1992, 2023))

award_url = 'https://www.basketball-reference.com/awards/awards_{}.html'

# open() does not create missing folders, so make sure the cache folder exists.
os.makedirs("Basketball Reference", exist_ok=True)

# Download each season's awards page and cache the raw HTML on disk so the
# parsing cell can run without hitting the network again.
for year in years:
    url = award_url.format(year)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page that the
    # parsing cell would later trip over.
    data.raise_for_status()

    with open("Basketball Reference/{}_MVP.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to avoid hammering the server.
    # NOTE(review): confirm the delay against the site's published bot policy.
    time.sleep(4)

### Create MVP Dataframe

In [3]:
from io import StringIO

# Parse every cached awards page into one DataFrame per season, tagging each
# row with the season it came from.
dfs = []
for year in years:
    with open("Basketball Reference/{}_MVP.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first `tr.over_header` row before parsing (presumably the
    # grouped-header row that would otherwise become an extra header level --
    # confirm). Guarded so a page without that row does not crash on None.
    over_header = soup.find('tr', class_="over_header")
    if over_header is not None:
        over_header.decompose()

    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        raise ValueError(
            "No element with id 'mvp' found in the cached {} awards page".format(year)
        )

    # read_html expects a path/URL or a file-like object; passing literal HTML
    # is deprecated in recent pandas, so wrap the markup in StringIO.
    df = pd.read_html(StringIO(str(mvp_table)))[0]
    df["Year"] = year
    dfs.append(df)

In [4]:
# Stack the per-season tables row-wise into one frame; each season keeps its
# own row labels.
mvp_df = pd.concat(dfs, axis=0, ignore_index=False)
# Preview the first five rows.
mvp_df.head(5)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,28,CHI,80.0,900.0,960,0.938,80,38.8,30.1,6.4,6.1,2.3,0.9,0.519,0.27,0.832,17.7,0.274,1992
1,2,Clyde Drexler,29,POR,12.0,561.0,960,0.584,76,36.2,25.0,6.6,6.7,1.8,0.9,0.47,0.337,0.794,12.8,0.223,1992
2,3,David Robinson,26,SAS,2.0,337.0,960,0.351,68,37.7,23.2,12.2,2.7,2.3,4.5,0.551,0.125,0.701,13.9,0.26,1992
3,4,Karl Malone,28,UTA,1.0,262.0,960,0.273,81,37.7,28.0,11.2,3.0,1.3,0.6,0.526,0.176,0.778,15.1,0.237,1992
4,5,Patrick Ewing,29,NYK,0.0,100.0,960,0.104,82,38.4,24.0,11.2,1.9,1.1,3.0,0.522,0.167,0.738,13.0,0.198,1992


In [5]:
from pathlib import Path

# Build the path from parts rather than a backslash literal: a backslash is
# only a path separator on Windows; elsewhere it would become part of the
# file name. The resulting location on Windows is unchanged.
mvp_csv_path = Path("Documents") / "MVP Voting.csv"
# to_csv does not create missing folders.
mvp_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row labels are not written to the file.
mvp_df.to_csv(mvp_csv_path, index=False)

### Scraping Player Stats

In [6]:
import os
import time

stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

# open() does not create missing folders, so make sure the cache folder exists.
os.makedirs("Basketball Reference", exist_ok=True)

# Download each season's per-game stats page and cache the raw HTML on disk.
for year in years:
    url = stats_url.format(year)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page that the
    # parsing cell would later trip over.
    data.raise_for_status()

    with open("Basketball Reference/{}_Player_Stats.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to avoid hammering the server.
    # NOTE(review): confirm the delay against the site's published bot policy.
    time.sleep(4)

### Create Player Stats Dataframe

In [7]:
from io import StringIO

# Parse every cached per-game stats page into one DataFrame per season,
# tagging each row with the season it came from.
dfs = []
for year in years:
    with open("Basketball Reference/{}_Player_Stats.html".format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')
    stats_table = soup.find(id="per_game_stats")
    if stats_table is None:
        raise ValueError(
            "No element with id 'per_game_stats' found in the cached {} stats page".format(year)
        )

    # read_html expects a path/URL or a file-like object; passing literal HTML
    # is deprecated in recent pandas, so wrap the markup in StringIO.
    df = pd.read_html(StringIO(str(stats_table)))[0]
    df["Year"] = year
    dfs.append(df)

In [8]:
# Stack the per-season tables row-wise into one frame; each season keeps its
# own row labels.
stats_df = pd.concat(dfs, axis=0, ignore_index=False)
# Preview the first five rows.
stats_df.head(5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,23,POR,71,1,13.2,2.5,5.1,0.493,0.0,0.0,,2.5,5.1,0.493,0.493,1.1,1.4,0.752,1.1,2.5,3.7,0.4,0.4,0.2,0.9,1.9,6.1,1992
1,2,Mahmoud Abdul-Rauf,PG,22,DEN,81,11,19.0,4.4,10.4,0.421,0.4,1.2,0.33,4.0,9.3,0.433,0.44,1.2,1.3,0.87,0.3,1.1,1.4,2.4,0.5,0.0,1.4,1.6,10.3,1992
2,3,Mark Acres,C,29,ORL,68,6,13.6,1.1,2.2,0.517,0.0,0.0,0.333,1.1,2.2,0.52,0.52,0.8,1.0,0.761,1.4,2.3,3.7,0.3,0.4,0.2,0.5,2.1,3.1,1992
3,4,Michael Adams,PG,29,WSB,78,78,35.8,6.2,15.8,0.393,1.6,4.9,0.324,4.6,10.9,0.425,0.444,4.0,4.6,0.869,0.7,3.2,4.0,7.6,1.9,0.1,2.7,2.1,18.1,1992
4,5,Rafael Addison,SF,27,NJN,76,8,15.5,2.5,5.7,0.433,0.2,0.6,0.286,2.3,5.0,0.452,0.449,0.7,1.0,0.737,0.9,1.3,2.2,0.9,0.4,0.4,0.6,1.4,5.8,1992


In [9]:
from pathlib import Path

# Build the path from parts rather than a backslash literal: a backslash is
# only a path separator on Windows; elsewhere it would become part of the
# file name. The resulting location on Windows is unchanged.
stats_csv_path = Path("Documents") / "Player Stats for MVP.csv"
# to_csv does not create missing folders.
stats_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row labels are not written to the file.
stats_df.to_csv(stats_csv_path, index=False)

### Scraping Team Records

In [10]:
import os
import time

record_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

# open() does not create missing folders, so make sure the cache folder exists.
os.makedirs("Basketball Reference", exist_ok=True)

# Download each season's standings page and cache the raw HTML on disk.
for year in years:
    url = record_url.format(year)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page that the
    # parsing cell would later trip over.
    data.raise_for_status()

    with open("Basketball Reference/{}_Records.html".format(year), "w", encoding="utf-8") as f:
        f.write(data.text)

    # Pause between requests to avoid hammering the server.
    # NOTE(review): confirm the delay against the site's published bot policy.
    time.sleep(4)

In [11]:
from io import StringIO

# (table element id, name of that table's team column) for each conference.
conference_tables = (
    ("divs_standings_E", "Eastern Conference"),
    ("divs_standings_W", "Western Conference"),
)

# Parse every cached standings page into two DataFrames per season (East, then
# West), each with a "Year" column and the team name moved to a "Team" column.
dfs = []
for year in years:
    with open("Basketball Reference/{}_Records.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # Parse once and reuse the tree for both conferences; re-parsing the page
    # for the second table would only repeat the same removal below.
    soup = BeautifulSoup(page, 'html.parser')

    # Only the FIRST `tr.thead` row in the document is removed.
    # NOTE(review): any further `tr.thead` rows stay in the parsed tables as
    # ordinary rows -- confirm the downstream cleaning step accounts for them.
    first_thead_row = soup.find('tr', class_='thead')
    if first_thead_row is not None:
        first_thead_row.decompose()

    for table_id, team_column in conference_tables:
        record_table = soup.find(id=table_id)
        if record_table is None:
            raise ValueError(
                "No element with id '{}' found in the cached {} standings page".format(table_id, year)
            )

        # read_html expects a path/URL or a file-like object; passing literal
        # HTML is deprecated in recent pandas, so wrap the markup in StringIO.
        df = pd.read_html(StringIO(str(record_table)))[0]
        df["Year"] = year
        # Move the conference-named column to a shared "Team" column; assigning
        # after "Year" keeps the column order ..., Year, Team.
        df["Team"] = df.pop(team_column)
        dfs.append(df)

In [12]:
# Stack the per-season, per-conference tables row-wise into one frame; each
# table keeps its own row labels.
record_df = pd.concat(dfs, axis=0, ignore_index=False)

In [13]:
# Preview the first five rows of the combined standings.
record_df.head(5)

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,51,31,0.622,—,106.6,103.0,3.41,1992,Boston Celtics*
1,51,31,0.622,—,101.6,97.7,3.67,1992,New York Knicks*
2,40,42,0.488,11.0,105.4,107.1,-1.54,1992,New Jersey Nets*
3,38,44,0.463,13.0,105.0,109.2,-3.94,1992,Miami Heat*
4,35,47,0.427,16.0,101.9,103.2,-1.34,1992,Philadelphia 76ers


In [14]:
from pathlib import Path

# Build the path from parts rather than a backslash literal: a backslash is
# only a path separator on Windows; elsewhere it would become part of the
# file name. The resulting location on Windows is unchanged.
record_csv_path = Path("Documents") / "NBA Team Records.csv"
# to_csv does not create missing folders.
record_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row labels are not written to the file.
record_df.to_csv(record_csv_path, index=False)