# MVP Scraping

This is part 1 of 3 of the Machine Learning NBA MVP Prediction project. In this part I scrape data from Basketball Reference that will be used in the later parts of the project. There are three key web scrapes of historical data: MVP voting history, individual players' stats, and individual team records.

In [1]:
# Seasons to scrape: 1995 through 2023 (the end of the range is exclusive).
years = [*range(1995, 2024)]

In [2]:
# URL template for one season's awards-voting page on Basketball Reference;
# the "{}" placeholder is filled in with the season year.
url_start = (
    "https://www.basketball-reference.com"
    "/awards/awards_{}.html"
)

In [3]:
import requests

# Download each season's awards page and cache the raw HTML under Data/ so
# the parsing cells can be re-run without requesting the pages again.
import os
import time

# open() does not create missing directories, so make sure Data/ exists.
os.makedirs("Data", exist_ok=True)

for year in years:
    url = url_start.format(year)
    data = requests.get(url)
    # Fail loudly on an HTTP error (for example a rate-limit response)
    # instead of silently caching the error page as if it were real data.
    data.raise_for_status()

    with open("Data/{}.html".format(year), "w+") as f:
        f.write(data.text)

    # Space the requests out rather than sending them back to back.
    time.sleep(3)

In [4]:
from bs4 import BeautifulSoup

#### MVP Dataframe

In [5]:
# Load the cached 1995 page so the parsing steps can be tried on one season.
with open("Data/1995.html") as html_file:
    page = html_file.read()

In [6]:
# Parse the cached HTML text using the "html.parser" backend.
soup = BeautifulSoup(markup=page, features="html.parser")

In [7]:
# Remove the first <tr class="over_header"> row from the parsed page in place
# (presumably the grouping row above the real column names, so the table
# later parses with a single header row).
over_header_row = soup.find("tr", class_="over_header")
over_header_row.decompose()

In [8]:
# Collect every element whose id attribute is "mvp"; find_all returns a
# list-like result even when there is only one match.
mvp_table = soup.find_all(attrs={"id": "mvp"})

In [9]:
import pandas as pd

In [10]:
# Convert the MVP table markup into a DataFrame. read_html returns a list of
# tables, so take the first one. The markup is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in pandas 2.1+.
from io import StringIO

mvp_1995 = pd.read_html(StringIO(str(mvp_table)))[0]
mvp_1995

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,David Robinson,29,SAS,73.0,901.0,1050,0.858,81,38.0,27.6,10.8,2.9,1.7,3.2,0.53,0.3,0.774,17.5,0.273
1,2,Shaquille O'Neal,22,ORL,12.0,605.0,1050,0.576,79,37.0,29.3,11.4,2.7,0.9,2.4,0.583,0.0,0.533,14.0,0.23
2,3,Karl Malone,31,UTA,14.0,532.0,1050,0.507,82,38.1,26.7,10.6,3.5,1.6,1.0,0.536,0.268,0.742,13.8,0.212
3,4,Patrick Ewing,32,NYK,2.0,230.0,1050,0.219,79,37.0,23.9,11.0,2.7,0.9,2.0,0.503,0.286,0.75,9.6,0.157
4,5,Hakeem Olajuwon,32,HOU,1.0,147.0,1050,0.14,72,39.6,27.8,10.8,3.5,1.8,3.4,0.517,0.188,0.756,10.7,0.181
5,6,Charles Barkley,31,PHO,1.0,96.0,1050,0.091,68,35.0,23.0,11.1,4.1,1.6,0.7,0.486,0.338,0.748,10.6,0.214
6,7,Scottie Pippen,29,CHI,1.0,83.0,1050,0.079,79,38.2,21.4,8.1,5.2,2.9,1.1,0.48,0.345,0.716,11.8,0.188
7,8,John Stockton,32,UTA,1.0,47.0,1050,0.045,82,35.0,14.7,3.1,12.3,2.4,0.3,0.542,0.449,0.804,13.9,0.233
8,9,Gary Payton,26,SEA,0.0,34.0,1050,0.032,82,36.8,20.6,3.4,7.1,2.5,0.2,0.509,0.302,0.716,11.7,0.187
9,10,Anfernee Hardaway,23,ORL,0.0,23.0,1050,0.022,77,37.7,20.9,4.4,7.2,1.7,0.3,0.512,0.349,0.769,10.7,0.177


In [11]:
# Build one DataFrame per season from the cached pages in Data/, tagging each
# row with its season, and collect the frames in `dfs`.
from io import StringIO

dfs = []
for year in years:
    with open("Data/{}.html".format(year)) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    # Drop the first over_header row so the table parses with one header row.
    soup.find("tr", class_="over_header").decompose()
    # find() returns the single element with this id rather than a list.
    mvp_table = soup.find(id="mvp")
    # Wrapped in StringIO: literal HTML strings are deprecated in read_html
    # since pandas 2.1.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp["Year"] = year

    dfs.append(mvp)

In [12]:
# Stack the per-season frames into one table. Each season keeps its own
# 0-based row labels, so index values repeat across seasons.
mvps = pd.concat(objs=dfs)
mvps

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,David Robinson,29,SAS,73.0,901.0,1050,0.858,81,38.0,...,10.8,2.9,1.7,3.2,0.530,0.300,0.774,17.5,0.273,1995
1,2,Shaquille O'Neal,22,ORL,12.0,605.0,1050,0.576,79,37.0,...,11.4,2.7,0.9,2.4,0.583,0.000,0.533,14.0,0.230,1995
2,3,Karl Malone,31,UTA,14.0,532.0,1050,0.507,82,38.1,...,10.6,3.5,1.6,1.0,0.536,0.268,0.742,13.8,0.212,1995
3,4,Patrick Ewing,32,NYK,2.0,230.0,1050,0.219,79,37.0,...,11.0,2.7,0.9,2.0,0.503,0.286,0.750,9.6,0.157,1995
4,5,Hakeem Olajuwon,32,HOU,1.0,147.0,1050,0.140,72,39.6,...,10.8,3.5,1.8,3.4,0.517,0.188,0.756,10.7,0.181,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,9,Stephen Curry,34,GSW,0.0,5.0,1000,0.005,56,34.7,...,6.1,6.3,0.9,0.4,0.493,0.427,0.915,7.8,0.192,2023
9,10,Jimmy Butler,33,MIA,0.0,3.0,1000,0.003,64,33.4,...,5.9,5.3,1.8,0.3,0.539,0.350,0.850,12.3,0.277,2023
10,11,De'Aaron Fox,25,SAC,0.0,2.0,1000,0.002,73,33.4,...,4.2,6.1,1.1,0.3,0.512,0.324,0.780,7.4,0.146,2023
11,12T,Jalen Brunson,26,NYK,0.0,1.0,1000,0.001,68,35.0,...,3.5,6.2,0.9,0.2,0.491,0.416,0.829,8.7,0.175,2023


In [13]:
# Write the combined MVP voting table (row index included) to disk.
mvps.to_csv(path_or_buf="mvps.csv")

  values = values.astype(str)


#### Player Stats DF

In [14]:
# URL template for one season's per-game player stats page; "{}" is the year.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

# Trial download of the 1995 page using plain requests.
# NOTE(review): this writes to Data/1995.html, the same path the awards
# download loop uses for the 1995 awards page, so that cached file is
# replaced here.
url = player_stats_url.format(1995)
data = requests.get(url)
# Fail loudly on an HTTP error instead of caching the error page.
data.raise_for_status()
with open("Data/1995.html","w+") as f:
    f.write(data.text)

In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

In [16]:
# Start a Chrome browser session driven by the chromedriver binary at this
# path (the path is machine-specific).
s = Service(executable_path='C:/Users/.../chromedriver.exe')
driver = webdriver.Chrome(service=s)

In [17]:
import time

# Fetch a single season (1995) through the browser to try out the approach.
year = 1995
url = player_stats_url.format(year)

# Open the page, scroll down, then wait 3 seconds before reading the HTML
# (presumably so content that loads on scroll has time to render).
driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")
time.sleep(3)

# HTML of the page as the browser currently has it.
html = driver.page_source

In [18]:
# Cache the browser-rendered HTML for the current `year` under Data/.
with open("Data/{}.html".format(year), "w+") as out_file:
    out_file.write(html)

In [19]:
# Render every season's per-game stats page in the browser and cache it.
for year in years:
    url = player_stats_url.format(year)

    # Open the page, scroll down, and wait before reading the HTML.
    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")
    time.sleep(3)

    html = driver.page_source
    # One cached file per season, named after the year.
    with open("Data/{}.html".format(year), "w+") as out_file:
        out_file.write(html)

In [20]:
# Build one DataFrame of per-game player stats per season from the cached
# pages in Data/, tagging each row with its season, and collect them in `df`.
from io import StringIO

df = []
for year in years:
    with open("Data/{}.html".format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    # NOTE(review): find() removes only the first <tr class="thead"> row; if
    # the table repeats that row further down, the later copies stay in the
    # parsed data. Confirm against a cached page.
    soup.find("tr", class_="thead").decompose()
    player_table = soup.find(id="per_game_stats")
    # Wrapped in StringIO: literal HTML strings are deprecated in read_html
    # since pandas 2.1.
    player = pd.read_html(StringIO(str(player_table)))[0]
    player["Year"] = year

    df.append(player)

In [21]:
# Stack the per-season player frames into one table and preview the top rows.
players = pd.concat(objs=df)
players.head(n=5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,26,TOT,54,0,9.4,2.2,4.3,...,0.7,1.4,2.1,0.2,0.3,0.2,0.8,1.9,4.7,1995
1,1,Alaa Abdelnaby,PF,26,SAC,51,0,9.3,2.3,4.3,...,0.7,1.4,2.1,0.3,0.3,0.2,0.8,2.0,5.0,1995
2,1,Alaa Abdelnaby,PF,26,PHI,3,0,10.0,0.3,3.7,...,1.0,1.7,2.7,0.0,0.0,0.0,1.7,0.7,0.7,1995
3,2,Mahmoud Abdul-Rauf,PG,25,DEN,73,43,28.5,6.5,13.8,...,0.4,1.4,1.9,3.6,1.1,0.1,1.6,1.7,16.0,1995
4,3,Michael Adams,PG,32,CHH,29,0,15.3,2.3,5.1,...,0.2,0.8,1.0,3.3,0.8,0.0,0.9,1.4,6.5,1995


In [22]:
# Write the combined player stats table (row index included) to disk.
players.to_csv(path_or_buf="players.csv")

#### Team Records DF

In [23]:
# URL template for one season's standings page; "{}" is the season year.
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

import os
import time

# open() does not create missing directories, so make sure Team/ exists.
os.makedirs("Team", exist_ok=True)

# Download each season's standings page and cache the raw HTML under Team/.
for year in years:
    url = team_stats_url.format(year)

    data = requests.get(url)
    # Fail loudly on an HTTP error (for example a rate-limit response)
    # instead of silently caching the error page as if it were real data.
    data.raise_for_status()

    with open("Team/{}.html".format(year), "w+") as f:
        f.write(data.text)

    # Space the requests out rather than sending them back to back.
    time.sleep(3)

In [24]:
# Build one DataFrame of team records per conference per season from the
# cached pages in Team/, then stack them all into `teams`.
from io import StringIO

# (id of the standings table, name of that table's team-name column)
conference_tables = (
    ("divs_standings_E", "Eastern Conference"),
    ("divs_standings_W", "Western Conference"),
)

dfs = []
for year in years:
    with open("Team/{}.html".format(year)) as f:
        page = f.read()

    # Parse once per season. Removing the first <tr class="thead"> row from a
    # single parse leaves both tables in the same state as re-parsing the
    # page and repeating the removal for each conference.
    # NOTE(review): only the first such row on the page is removed; any other
    # thead rows inside the tables stay in the parsed data. Confirm.
    soup = BeautifulSoup(page, "html.parser")
    soup.find("tr", class_="thead").decompose()

    for table_id, name_column in conference_tables:
        team_table = soup.find(id=table_id)
        # Wrapped in StringIO: literal HTML strings are deprecated in
        # read_html since pandas 2.1.
        team = pd.read_html(StringIO(str(team_table)))[0]
        team["Year"] = year
        # Move the conference-named column into a shared "Team" column
        # (added last) so both conferences have the same columns.
        team["Team"] = team.pop(name_column)
        dfs.append(team)

teams = pd.concat(dfs)

In [25]:
# Preview the first five rows of the combined team records table.
teams.head(n=5)

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,57,25,0.695,—,110.9,103.8,6.44,1995,Orlando Magic*
1,55,27,0.671,2.0,98.2,95.1,2.78,1995,New York Knicks*
2,35,47,0.427,22.0,102.8,104.7,-1.92,1995,Boston Celtics*
3,32,50,0.39,25.0,101.1,102.8,-1.85,1995,Miami Heat
4,30,52,0.366,27.0,98.1,101.2,-3.28,1995,New Jersey Nets


In [26]:
# Write the combined team records table (row index included) to disk.
teams.to_csv(path_or_buf="teams.csv")