# Project Luther

In [1]:
# Relevant libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import time

## List of Functions

In [11]:
# Running list of functions

def bball_get_col(table, col_tag='th'):
    '''
    Return the column names of a table as a list of strings.

    Only the first 'tr' tag inside the table is inspected.  The text of every
    `col_tag` cell in that row is collected, in the order find_all yields them.
    Header cells are expected to be 'th' tags (the default) or 'td' tags.
    '''
    header_row = table.find('tr')
    return [cell.get_text() for cell in header_row.find_all(col_tag)]

def bball_get_data(table, data_tag='td'):
    '''
    Return the body of a table as a list of rows, each a list of cell strings.

    The data within a table should be the second through last 'tr' tag within
    the table tag; the first 'tr' is skipped because it holds the column names.
    Within each row the 'th' cells are collected first, followed by the
    `data_tag` cells.  The data should be found as 'td' tags, so the default
    data tag is set to 'td'.
    '''
    all_data = []

    # [1:] drops the header row.
    for row in table.find_all('tr')[1:]:
        # Copy into a plain list so extending it never touches the object
        # returned by find_all.
        cells = list(row.find_all('th'))
        # Honour the caller's data_tag instead of a hard-coded 'td'.
        cells.extend(row.find_all(data_tag))
        all_data.append([cell.get_text() for cell in cells])

    return all_data

def bball_scrape_data(url_template, start_year, end_year, delay=5):
    '''
    Scrape one table per year and stack the results into a single DataFrame.

    The url_template should be a string with a '{year}' placeholder; it is
    filled in with str.format(year=...) for every year that is cycled through.
    The start_year and end_year should be integers; both ends are included.
    The delay, which delays each run through of the for loop, is set to 5
    seconds as default.  The delay prevents the scraper from being blocked due
    to frequent scraping requests.

    Returns a DataFrame holding the scraped columns plus a 'Yr' column with the
    year each row came from (an empty DataFrame if nothing was scraped).
    Returns None, after printing a message, when start_year > end_year or when
    the number of column names does not match the length of the first data row.
    Years whose request fails, or whose page has no table or no data rows, are
    reported and skipped.
    '''
    if start_year > end_year:
        print('Enter in valid end year.')
        return None

    # Per-year frames are gathered here and concatenated once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    frames = []

    for year in range(start_year, end_year+1):
        url = url_template.format(year=year)
        print(url)

        try:
            link = requests.get(url)
        except requests.RequestException:
            # Skip this year.  Falling through would either hit an undefined
            # `link` or re-parse the previous year's response under this year.
            print('Check to make sure the URL is correct!')
            time.sleep(delay)
            continue

        soup = BeautifulSoup(link.text, 'lxml')
        table = soup.find('table')

        # soup.find gives None when the page has no table at all.
        if table is None:
            col_list, all_data = [], []
        else:
            col_list = bball_get_col(table)
            all_data = bball_get_data(table)

        if not col_list or not all_data:
            print('Webpage may be empty.')
        elif len(col_list) != len(all_data[0]):
            print('Column List: \n', col_list)
            print('Data Row: \n', all_data)
            print('Make sure the length of columns and data are consistent!')
            return None
        else:
            temp_df = pd.DataFrame(all_data, columns=col_list)
            frames.append(temp_df.assign(Yr = year))
        time.sleep(delay)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)




def sal_get_col(table_sal):
    '''
    Return the column names of a salary table as a list of strings.

    The names are taken from the 'td' cells of the first 'tr' tag in the table.
    '''
    first_row = table_sal.find('tr')
    return [cell.get_text() for cell in first_row.find_all('td')]

def sal_get_data(table_sal):
    '''
    Return the salary rows of a table as a list of rows of cell strings.

    Only 'tr' tags whose class is 'evenrow' or 'oddrow' are used; each one
    contributes the text of its 'td' cells.
    '''
    salary_rows = table_sal.find_all('tr', class_ = ['evenrow', 'oddrow'])
    return [
        [cell.get_text() for cell in tr.find_all('td')]
        for tr in salary_rows
    ]

def sal_scrape_data(url_template, start_year, end_year, start_page, end_page, delay=5):
    '''
    Scrape paginated salary tables for a range of years into one DataFrame.

    The url_template should be a string with '{year}' and '{page}'
    placeholders; it is filled in with str.format(year=..., page=...) for every
    year/page combination.
    The start_year and end_year should be integers; both ends are included.
    The start_page and end_page should be integers; both ends are included and
    the same page range is requested for every year.
    The delay (5 seconds by default) is slept after each request so the
    scraper is not blocked due to frequent scraping requests.

    Returns a DataFrame holding the scraped columns plus a 'Yr' column with the
    year each row came from (an empty DataFrame if nothing was scraped).
    Returns None, after printing a message, when start_year > end_year, when
    start_page > end_page, or when the number of column names does not match
    the length of the first data row.  Pages whose request fails, or that have
    no table or no rows, are reported and skipped.
    '''
    if start_year > end_year:
        print('Enter in valid end year.')
        return None
    if start_page > end_page:
        print('Enter in a valid end page.')
        return None

    # Per-page frames are gathered here and concatenated once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    frames = []

    for year in range(start_year, end_year+1):
        for page in range(start_page, end_page+1):
            url = url_template.format(year=year, page=page)
            print(url)

            try:
                link = requests.get(url)
            except requests.RequestException:
                # Skip this page.  Falling through would either hit an
                # undefined `link` or re-parse the previous response.
                print('Check to make sure the URL is correct!')
                time.sleep(delay)
                continue

            # The response text is passed straight to the parser so the loop
            # variable `page` keeps meaning "page number".
            soup = BeautifulSoup(link.text, 'lxml')
            table = soup.find('table')

            # soup.find gives None when the page has no table at all.
            if table is None:
                col_list, all_data = [], []
            else:
                col_list = sal_get_col(table)
                all_data = sal_get_data(table)

            if not col_list or not all_data:
                print('Webpage may be empty.')
            elif len(col_list) != len(all_data[0]):
                print('Column List: \n', col_list)
                print('Data Row: \n', all_data)
                print('Make sure the length of columns and data are consistent!')
                return None
            else:
                temp_df = pd.DataFrame(all_data, columns=col_list)
                frames.append(temp_df.assign(Yr = year))
            time.sleep(delay)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

## Basketball-reference Web Scrape

In [4]:
# Test the web scrape
# Fetch a single season's totals page (2016) and keep the first table found in
# the parsed HTML; the cells below pull the headers and rows out of table_bball.
url_bball = 'http://www.basketball-reference.com/leagues/NBA_2016_totals.html'
link_bball = requests.get(url_bball)
page_bball = link_bball.text

# Parse with the lxml parser and grab the first 'table' tag on the page
soup_bball = BeautifulSoup(page_bball, 'lxml')
table_bball = soup_bball.find('table')

### Get column headers

In [5]:
# Pull column names from the table
# The 'th' header cells sit inside the first 'tr' tag
col_loc_bball = table_bball.find('tr')
cols_bball = col_loc_bball.find_all('th')

# Keep only the text of each header cell
cols_list_bball = [header.get_text() for header in cols_bball]

### Get data

In [6]:
# Pull data from the table
# Every 'tr' tag after the first is a data row; only its 'td' cells are read
all_rows_bball = table_bball.find_all('tr')[1:]

# One inner list of cell texts per table row
all_data_bball = [
    [cell.get_text() for cell in tr.find_all('td')]
    for tr in all_rows_bball
]

# Skip the first column name since it is just another index of the player list
df_2016 = pd.DataFrame(all_data_bball, columns=cols_list_bball[1:])

### Combine the above three steps into one function

In [8]:
# Combine the above steps into one function that can run through multiple urls

# Get the data from 2000 through 2017 (bball_scrape_data includes end_year)
# Note: this template points at the '_advanced' pages, not the '_totals' page
# used in the single-season test above.

bball_ref_url = 'http://www.basketball-reference.com/leagues/NBA_{year}_advanced.html'
start_year = 2000
end_year = 2017

# One request per year, with the function's default delay between requests
df_bball = bball_scrape_data(bball_ref_url, start_year, end_year)
df_bball.head()
        

http://www.basketball-reference.com/leagues/NBA_2000_advanced.html
http://www.basketball-reference.com/leagues/NBA_2001_advanced.html
http://www.basketball-reference.com/leagues/NBA_2002_advanced.html
http://www.basketball-reference.com/leagues/NBA_2003_advanced.html
http://www.basketball-reference.com/leagues/NBA_2004_advanced.html
http://www.basketball-reference.com/leagues/NBA_2005_advanced.html
http://www.basketball-reference.com/leagues/NBA_2006_advanced.html
http://www.basketball-reference.com/leagues/NBA_2007_advanced.html
http://www.basketball-reference.com/leagues/NBA_2008_advanced.html
http://www.basketball-reference.com/leagues/NBA_2009_advanced.html
http://www.basketball-reference.com/leagues/NBA_2010_advanced.html
http://www.basketball-reference.com/leagues/NBA_2011_advanced.html
http://www.basketball-reference.com/leagues/NBA_2012_advanced.html
http://www.basketball-reference.com/leagues/NBA_2013_advanced.html
http://www.basketball-reference.com/leagues/NBA_2014_advanced.

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,...,OWS,DWS,WS,WS/48,Unnamed: 16,OBPM,DBPM,BPM,VORP,Yr
0,1,Tariq Abdul-Wahad,SG,25,TOT,61,1578,13.6,0.477,0.036,...,0.4,1.8,2.2,0.068,,-1.6,0.1,-1.5,0.2,2000
1,1,Tariq Abdul-Wahad,SG,25,ORL,46,1205,14.4,0.484,0.041,...,0.4,1.6,2.0,0.082,,-1.3,0.4,-0.8,0.4,2000
2,1,Tariq Abdul-Wahad,SG,25,DEN,15,373,10.8,0.448,0.015,...,0.0,0.2,0.2,0.023,,-2.8,-1.0,-3.8,-0.2,2000
3,2,Shareef Abdur-Rahim,SF,23,VAN,82,3223,20.2,0.547,0.075,...,6.2,2.6,8.8,0.132,,1.8,0.5,2.3,3.5,2000
4,3,Cory Alexander,PG,26,DEN,29,329,8.8,0.381,0.357,...,-0.5,0.4,-0.1,-0.012,,-3.6,0.1,-3.5,-0.1,2000


### Save dataframe to csv file

In [9]:
# Save the dataframe to a csv file
# Written to the current working directory; to_csv also writes the index
# column unless index=False is passed.
df_bball.to_csv('bball_ref_player_database.csv')

## NBA Player Salaries Web Scrape

In [23]:
# Goal: get the salary data for multiple seasons (done further below)

# Test the web scrape
# This cell only fetches page 1 of the year-2000 salary list and keeps the
# first table found in the parsed HTML.
url_sal = 'http://www.espn.com/nba/salaries/_/year/2000/page/1/seasontype/3'
link_sal = requests.get(url_sal)
page_sal = link_sal.text

# Parse with the lxml parser and grab the first 'table' tag on the page
soup_sal = BeautifulSoup(page_sal, 'lxml')
table_sal = soup_sal.find('table')

In [26]:
# Pull column names from the table
# Here the column names are 'td' cells inside the first 'tr' tag
col_loc_sal = table_sal.find('tr')
cols_sal = col_loc_sal.find_all('td')

# Keep only the text of each header cell
cols_list_sal = [header.get_text() for header in cols_sal]

In [30]:
# Pull data from the table
# Salary rows are the 'tr' tags with class 'evenrow' or 'oddrow'; read their 'td' cells
all_rows_sal = table_sal.find_all('tr', class_ = ['evenrow', 'oddrow'])

# One inner list of cell texts per salary row
all_data_sal = [
    [cell.get_text() for cell in tr.find_all('td')]
    for tr in all_rows_sal
]

In [32]:
# Create the dataframe
# Uses the rows and column names pulled from the single test page above;
# every row must have as many cells as there are column names.
df_sal_2000 = pd.DataFrame(all_data_sal, columns=cols_list_sal)

In [12]:
# Scrape the salary pages for every year/page combination.
# sal_scrape_data includes both end_year2 and end_page, so this requests
# pages 1 through 4 for each year from 2000 through 2017.
salary_url = 'http://www.espn.com/nba/salaries/_/year/{year}/page/{page}/seasontype/3'
start_year2 = 2000
end_year2 = 2017
start_page = 1
end_page = 4

# One request per (year, page), with the function's default delay in between
df_sal = sal_scrape_data(salary_url, start_year2, end_year2, start_page, end_page)
df_sal.head()

http://www.espn.com/nba/salaries/_/year/2000/page/1/seasontype/3
http://www.espn.com/nba/salaries/_/year/2000/page/2/seasontype/3
http://www.espn.com/nba/salaries/_/year/2000/page/3/seasontype/3
http://www.espn.com/nba/salaries/_/year/2000/page/4/seasontype/3
http://www.espn.com/nba/salaries/_/year/2001/page/1/seasontype/3
http://www.espn.com/nba/salaries/_/year/2001/page/2/seasontype/3
http://www.espn.com/nba/salaries/_/year/2001/page/3/seasontype/3
http://www.espn.com/nba/salaries/_/year/2001/page/4/seasontype/3
http://www.espn.com/nba/salaries/_/year/2002/page/1/seasontype/3
http://www.espn.com/nba/salaries/_/year/2002/page/2/seasontype/3
http://www.espn.com/nba/salaries/_/year/2002/page/3/seasontype/3
http://www.espn.com/nba/salaries/_/year/2002/page/4/seasontype/3
http://www.espn.com/nba/salaries/_/year/2003/page/1/seasontype/3
http://www.espn.com/nba/salaries/_/year/2003/page/2/seasontype/3
http://www.espn.com/nba/salaries/_/year/2003/page/3/seasontype/3
http://www.espn.com/nba/s

Unnamed: 0,RK,NAME,TEAM,SALARY,Yr
0,1,"Shaquille O'Neal, C",Los Angeles Lakers,"$17,142,000",2000
1,2,"Kevin Garnett, PF",Minnesota Timberwolves,"$16,806,000",2000
2,3,"Alonzo Mourning, C",Miami Heat,"$15,004,000",2000
3,4,"Juwan Howard, PF",Washington Wizards,"$15,000,000",2000
4,5,"Scottie Pippen, SF",Portland Trail Blazers,"$14,795,000",2000


In [13]:
# Save the dataframe to a csv file
# Written to the current working directory; to_csv also writes the index
# column unless index=False is passed.
df_sal.to_csv('salary_database.csv')

In [6]:
# TEST CELL
# Scratch area: everything below is commented-out experimentation kept for
# reference.  The only live statement is the bare `df_sal` at the end, which
# displays the salary dataframe as the cell output.

# print(rows[:3])
# len(rows)

# cols_list

# df_2016.head()

# url2 = url_template.format(year=2015)
# url2

# url_template = 'http://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
# start_year = 2015
# end_year = 2016

# col_list = get_col()
# col_list = col_list[1:]
# df = scrape_data(url_template, start_year, end_year, col_list)
# df.head()

# try:
#     url3 = 'http://www.basketball-reference.com/leagues/NBA_2016_total'
#     link3 = requests.get(url3)
# except

# len(bball_col_list)
# print(len(all_data[0]))

# a = 0
# if a:
#     print(True)
# else:
#     print(False)

# cols_list_sal
# all_data_sal
df_sal

Unnamed: 0,RK,NAME,TEAM,SALARY,Yr
0,1,"Kobe Bryant, SF",Los Angeles Lakers,"$25,000,000",2016
1,2,"LeBron James, SF",Cleveland Cavaliers,"$22,970,500",2016
2,3,"Carmelo Anthony, SF",New York Knicks,"$22,875,000",2016
3,4,"Dwight Howard, C",Houston Rockets,"$22,359,364",2016
4,5,"Chris Bosh, PF",Miami Heat,"$22,192,730",2016
5,6,"Chris Paul, PG",LA Clippers,"$21,468,695",2016
6,7,"Kevin Durant, SF",Oklahoma City Thunder,"$20,158,622",2016
7,8,"Derrick Rose, PG",Chicago Bulls,"$20,093,064",2016
8,9,"Dwyane Wade, SG",Miami Heat,"$20,000,000",2016
9,10,"LaMarcus Aldridge, PF",San Antonio Spurs,"$19,689,000",2016
