## Web Scraping
### I am doing web scraping in orfer to get info about the NBA stats. The tables I need to scrape are 
### 1- the most important player table
### 2- all player stats table
### 3- team data tables

In [1]:
#!pip install requests #Requests allows you to send HTTP/1.1 requests extremely easily.

In [1]:
import requests #allows you to send HTTP requests using Python
import os #a portable way of using operating system dependent functionality. If you just want to read or write a file, manipulate paths use this.
import shutil #offers a number of high-level operations on files and collections of files. In particular, functions are provided which support file copying and removal. 

#### Get the data

In [2]:
#Define the years we want to scrape data from
years= list(range(1991,2022)) # range function doesn't include 2022
print(years)

[1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [5]:
#Define the url we will scrape from
url_start= "https://www.basketball-reference.com/awards/awards_{}.html"
 # {} => is a placeholder.It allows us to replace that part with the element of the years list.

for year in years:
    url = url_start.format(year) # format(x) function replaces the place holder with the x value.
    data = requests.get(url)
    #Save the data
    with open("mvp/{}.html".format(year),"w+") as my_file: # w+ => Read/Write. Creates a new file. Returns FALSE and an error if file already exists.
        my_file.write(data.text)

#.text gives us the actual html page from the response.
#print(data.text)# print the html page of year 2021

### Parsing the votes table with beautifulsoup

In [5]:
#!pip install beautifulsoup4 #a Python library for pulling data out of HTML and XML files
                             # We need it to pull the mvp (most valuable player) table from the html files.

In [4]:
from bs4 import BeautifulSoup

### Extracting the mvp table from year 2020

In [14]:
with open("mvp/2020.html") as f:
    page = f.read()

soup = BeautifulSoup(page, 'html.parser') # creates a parser class that we can use to extract a table from the page
soup.find("tr",class_="over_header").decompose() #decompose() funtion removes the componet.
mvp_table = soup.find_all(id="mvp")[0]

#The find method is used for finding out the first tag with the specified name or id and returning an object of type bs4.
# The find_all method is used for finding out all tags with the specified tag name or id and returning them as a list of type bs4.


In [15]:
type(mvp_table)

bs4.element.Tag

In [16]:
mvp_table

<table class="sortable stats_table" data-cols-to-freeze=",2" id="mvp">
<caption>Most Valuable Player (Michael Jordan Trophy) Table</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>

<tr>
<th aria-label="rank" class="poptip center" data-stat="rank" scope="col">Rank</th>
<th aria-label="Player" class="poptip sort_default_asc left" data-stat="player" scope="col">Player</th>
<th aria-label="Age" class="poptip sort_default_asc center" data-stat="age" data-tip="Player's age on February 1 of the season" scope="col">Age</th>
<th aria-label="Tm" class="poptip sort_default_asc left" data-stat="team_id" data-tip="Team" scope="col">Tm</th>
<th aria-label="First" class="poptip right" data-over-header="Voting" data-stat="votes_first" data-tip="First Place Votes" scope="col">First</th>
<th aria-label="Pts Won" class="poptip right" data-over-header="Voting" data-stat="points_won" scope="col">P

In [6]:
import pandas as pd

In [18]:
mvp_2020 = pd.read_html(str(mvp_table))[0]
mvp_2020.head(1)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,1,Giannis Antetokounmpo,25,MIL,85.0,962.0,1010,0.952,63,30.4,29.5,13.6,5.6,1.0,1.0,0.553,0.304,0.633,11.1,0.279


### Extracting the mvp tables from every year

In [21]:
dataframes=[]
for year in years:
    with open("mvp/{}.html".format(year))as f:
        page=f.read()

    soup= BeautifulSoup(page, "html.parser")
    soup.find("tr",class_="over_header").decompose()
    my_table=soup.find_all(id="mvp")[0]

    mvp_df=pd.read_html(str(my_table))[0]
    mvp_df['Year']=year #after extractig the table I add a year column to the table and fill it with the year value.
    dataframes.append(mvp_df) # add each extracted df to  the dataframes list

AttributeError: 'NoneType' object has no attribute 'decompose'

In [None]:
len(dataframes)

31

In [None]:
dataframes[0].head(2)

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
0,1,Michael Jordan,27,CHI,77.0,891.0,960,0.928,82,37.0,...,6.0,5.5,2.7,1.0,0.539,0.312,0.851,20.3,0.321,1991
1,2,Magic Johnson,31,LAL,10.0,497.0,960,0.518,79,37.1,...,7.0,12.5,1.3,0.2,0.477,0.32,0.906,15.4,0.251,1991


In [None]:
dataframes

[   Rank             Player  Age   Tm  First  Pts Won  Pts Max  Share   G  \
 0     1     Michael Jordan   27  CHI   77.0    891.0      960  0.928  82   
 1     2      Magic Johnson   31  LAL   10.0    497.0      960  0.518  79   
 2     3     David Robinson   25  SAS    6.0    476.0      960  0.496  82   
 3     4    Charles Barkley   27  PHI    2.0    222.0      960  0.231  67   
 4     5        Karl Malone   27  UTA    0.0    142.0      960  0.148  82   
 5     6      Clyde Drexler   28  POR    1.0     75.0      960  0.078  82   
 6     7      Kevin Johnson   24  PHO    0.0     32.0      960  0.033  77   
 7     8  Dominique Wilkins   31  ATL    0.0     29.0      960  0.030  81   
 8    9T         Larry Bird   34  BOS    0.0     25.0      960  0.026  60   
 9    9T       Terry Porter   27  POR    0.0     25.0      960  0.026  81   
 10   11      Patrick Ewing   28  NYK    0.0     20.0      960  0.021  81   
 11   12      John Stockton   28  UTA    0.0     15.0      960  0.016  82   

#### Combining mvp votes with pandas

In [None]:
#concatenate all dataframes elements in order to get a single dataframe(mvps)

mvps = pd.concat(dataframes)

mvps.tail()
#mvps.shape

Unnamed: 0,Rank,Player,Age,Tm,First,Pts Won,Pts Max,Share,G,MP,...,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Year
10,11,Russell Westbrook,32,WAS,0.0,5.0,1010,0.005,65,36.4,...,11.5,11.7,1.4,0.4,0.439,0.315,0.656,3.7,0.075,2021
11,12,Ben Simmons,24,PHI,0.0,3.0,1010,0.003,58,32.4,...,7.2,6.9,1.6,0.6,0.557,0.3,0.613,6.0,0.153,2021
12,13T,James Harden,31,TOT,0.0,1.0,1010,0.001,44,36.6,...,7.9,10.8,1.2,0.8,0.466,0.362,0.861,7.0,0.208,2021
13,13T,LeBron James,36,LAL,0.0,1.0,1010,0.001,45,33.4,...,7.7,7.8,1.1,0.6,0.513,0.365,0.698,5.6,0.179,2021
14,13T,Kawhi Leonard,29,LAC,0.0,1.0,1010,0.001,52,34.1,...,6.5,5.2,1.6,0.4,0.512,0.398,0.885,8.8,0.238,2021


In [None]:
#Write object to a comma-separated values (csv) file.
mvps.to_csv("mvps.csv")

### Download all the players data
In order to make an assumption about the most valuable player in a season we need to know th stats of all the players and then make a comparison.

In [None]:
#!pip install selenium #The selenium package is used to automate web browser interaction from Python.

In [7]:
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

# for year in years:
#     url = player_stats_url.format(year)
    
#     data = requests.get(url)
    
#     with open("player/{}.html".format(year), "w+") as f:
#         f.write(data.text)

In [8]:
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

In [10]:
driver = webdriver.Chrome()

#### Download all players data for 1991

In [11]:
year=1991
url = player_stats_url.format(year)
    
driver.get(url)
driver.execute_script("window.scrollTo(1,10000)")# this is a javascript command 
                                                 #window.scrollTo(1,10000)=> tells driver toscroll the whole page
                                                #Scroll the document to position 1 horizontally and 1000 vertically
time.sleep(2)
    
html=driver.page_source #driver. page_source retrieves the HTML of the url you passed on driver

In [None]:
html



#### Download all players data for each year

In [12]:
for year in years:
    url = player_stats_url.format(year)
        
    driver.get(url)
    driver.execute_script("window.scrollTo(1,10000)")# this is a javascript command 
                                                    #window.scrollTo(1,10000)=> tells driver toscroll the whole page
    time.sleep(2)

    with open("player/{}.html".format(year), "w+") as f:
        f.write(driver.page_source)

#### Parsing the all player stats with BeautifulSoup

In [None]:
dataframes=[]
for year in years:
    with open("player/{}.html".format(year)) as f:
        page= f.read()
    soup= BeautifulSoup(page,"html.parser")
    soup.find('tr', class_="thead").decompose() # the row names show up sevral times as we scroll down through the table
                                                # since it can cause a problem in the dataframe we remove it.
    html=soup.find_all(id="per_game_stats")
    df=pd.read_html(str(html))[0]
    df["Year"]=year
    dataframes.append(df)

In [None]:
dataframes[0].head(1)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,1,Alaa Abdelnaby,PF,22,POR,43,0,6.7,1.3,2.7,...,0.6,1.4,2.1,0.3,0.1,0.3,0.5,0.9,3.1,1991


#### Combining player stats with pandas

In [None]:
players= pd.concat(dataframes)
players.tail()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
725,536,Delon Wright,PG,28,SAC,27,8,25.8,3.9,8.3,...,1.0,2.9,3.9,3.6,1.6,0.4,1.3,1.1,10.0,2021
726,537,Thaddeus Young,PF,32,CHI,68,23,24.3,5.4,9.7,...,2.5,3.8,6.2,4.3,1.1,0.6,2.0,2.2,12.1,2021
727,538,Trae Young,PG,22,ATL,63,63,33.7,7.7,17.7,...,0.6,3.3,3.9,9.4,0.8,0.2,4.1,1.8,25.3,2021
728,539,Cody Zeller,C,28,CHO,48,21,20.9,3.8,6.8,...,2.5,4.4,6.8,1.8,0.6,0.4,1.1,2.5,9.4,2021
729,540,Ivica Zubac,C,23,LAC,72,33,22.3,3.6,5.5,...,2.6,4.6,7.2,1.3,0.3,0.9,1.1,2.6,9.0,2021


In [None]:
players.shape

(18044, 31)

In [None]:
#Write object to a comma-separated values (csv) file.
players.to_csv("players.csv")

#### Download team data for each year

In [13]:
team_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_standings.html"

for year in years:
    data= requests.get(team_stats_url.format(year))
    with open("team/{}.html".format(year),"w+") as f:
        f.write(data.text)

#### Parsing the team data with BeautifulSoup
Here we need to scrape two tables DivisionStandings_E and DivisionStandings_W

In [14]:
dataframes=[]
for year in range(1991,2008):    
    with open("team/{}.html".format(year)) as f:
        page=f.read()

    soup=BeautifulSoup(page,"html.parser")

    scripts=soup.find_all("tr", class_="thead")
    for script in scripts:
        script.decompose()
    #soup.find('tr', class_="thead").decompose()
    #print(soup.find("tr", class_="thead"))

    html_e= soup.find_all(id="divs_standings_E")[0]
    table_e = pd.read_html(str(html_e))[0]
    table_e["Year"]=year
    table_e["Team"] = table_e["Eastern Conference"]
    del table_e["Eastern Conference"]
    dataframes.append(table_e)

    html_w= soup.find_all(id="divs_standings_W")[0]
    table_w = pd.read_html(str(html_w))[0]
    table_w["Year"]=year
    table_w["Team"] = table_w["Western Conference"]
    del table_w["Western Conference"]
    dataframes.append(table_w)


In [16]:
dataframes[0].head(1)

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
0,56,26,0.683,—,111.5,105.7,5.22,1991,Boston Celtics*


In [17]:
teams= pd.concat(dataframes)
teams.tail()

Unnamed: 0,W,L,W/L%,GB,PS/G,PA/G,SRS,Year,Team
10,67,15,0.817,—,100.0,92.8,7.28,2007,Dallas Mavericks*
11,58,24,0.707,9.0,98.5,90.1,8.35,2007,San Antonio Spurs*
12,52,30,0.634,15.0,97.0,92.1,5.04,2007,Houston Rockets*
13,39,43,0.476,28.0,95.5,97.1,-1.19,2007,New Orleans/Oklahoma City Hornets
14,22,60,0.268,45.0,101.6,106.7,-4.44,2007,Memphis Grizzlies


In [18]:
teams.to_csv("teams.csv")