Capstone Project: Basketball Model Win Probability
 
- Objective: to predict whether a team wins or loses a game
- Outcome: Binary
- Duration/Data Points: regular season only. 30 teams, 82 games a season, 25 seasons = ~ 60k data points
- Types of features:
        - For each team and its opponent
        - Season Stats: % of total games completed, win pct, Offensive Rating, Defensive Rating, Pace        
        - Rolling 10 game stats:
        - Misc stats: current team home/away flag
        - Team Stats: Home or Away,   
        - Team Record Stats: % of games played this season, Season Win Percentage, Rolling 10 game win %, Rolling 10 game
        Opponent win %, Win Percentage (based on home/away flag) 
        - Opponent Stats: Home or Away, Offensive Efficiency, Defensive Efficiency, Pace 
        - Opponent Record Stats: Season Win Percentage, Rolling 10 game win %, Rolling 10 game Opponent win %,
                                 Win Percentage (based on home/away flag)
        - Matchup Stats: Rolling 10 game win % against Current Opponent
   

In [45]:
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen
import string
import re

# Single schedule page, used below to work out the layout of the games table
# before scraping every team/season.
url_test = 'https://www.basketball-reference.com/teams/ATL/2005_games.html'
# URL template for a schedule page: the first {} is filled with the team code
# and the second with the season (see the scrape loop further down).
root_url = 'https://www.basketball-reference.com/teams/{}/{}_games.html'
# One row per (Team, Season) pair; this list drives which schedule pages get scraped.
team_list = pd.read_csv('Team and Season List.csv')
# Preview the first rows of the team/season list.
team_list.head()


Unnamed: 0,Team,Season
0,ATL,2002
1,ATL,2003
2,ATL,2004
3,ATL,2005
4,ATL,2006


In [53]:

# Fetch the sample schedule page and parse it. The HTTP response is closed as
# soon as parsing is done instead of being left open.
with urlopen(url_test) as html:
    soup = BeautifulSoup(html)
y = soup.find_all('table', {'id': 'games'})

#headers
# The first 15 <th> cells of the games table are taken as its header row.
headers = [th.getText() for th in y[0].find_all('th', limit=15)]
# Drop the leading header cell: get_schedule reads that column separately
# (from the <th scope="row"> cells) and uses headers2 to name the <td> columns.
headers2 = headers[1:]
headers2

['Date',
 'Start (ET)',
 '\xa0',
 '\xa0',
 '\xa0',
 'Opponent',
 '\xa0',
 '\xa0',
 'Tm',
 'Opp',
 'W',
 'L',
 'Streak',
 'Notes']

In [54]:

def get_schedule(link):
    """Scrape one schedule page into a DataFrame.

    Parameters
    ----------
    link : str
        URL of the schedule page to download.

    Returns
    -------
    pandas.DataFrame
        The first column (named 'Date') holds the text of every
        <th scope="row"> cell of the first <tbody> on the page. The remaining
        columns are the <td> cells of each row, named after the module-level
        ``headers2`` list. Rows without any <td> cell are dropped. The two
        parts are joined side by side on their positional index (inner join).

    Raises
    ------
    IndexError
        If the page contains no <tbody> element.
    """
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(link) as html:
        soup = BeautifulSoup(html)

    #locate table
    y = soup.find_all('tbody')
    body = y[0]

    #get dates
    # NOTE(review): the saved output of this notebook shows 1, 2, 3, ... in this
    # column and the real dates in a second 'Date' column, so the row-header cell
    # appears to be the game number. The label is kept so that downstream column
    # names do not change.
    dates = [th.getText() for th in body.find_all('th', {'scope': 'row'})]
    date_df = pd.DataFrame(dates, columns=['Date'])

    # One list of cell texts per table row; rows without <td> cells come back
    # empty and are filtered out.
    stats = [[td.getText() for td in tr.find_all('td')] for tr in body.find_all('tr')]
    rows_final = [cells for cells in stats if cells]

    final_df = pd.DataFrame(rows_final, columns=headers2)
    final_df2 = pd.concat([date_df, final_df], axis=1, join='inner')
    return final_df2

# Scrape the schedule page of every (team, season) pair listed in team_list
# and stack the per-page frames into one DataFrame.
df_lst = []
team = list(team_list.Team)
season = list(team_list.Season)
for team_name, season_number in zip(team, season):
    url = root_url.format(team_name, season_number)
    # progress trace: one URL per page requested
    print(url)
    df = get_schedule(url)
    # tag every row with the page it came from
    df['team'] = team_name
    df['season'] = season_number
    df_lst.append(df)

final_df = pd.concat(df_lst)


https://www.basketball-reference.com/teams/ATL/2002_games.html
https://www.basketball-reference.com/teams/ATL/2003_games.html
https://www.basketball-reference.com/teams/ATL/2004_games.html
https://www.basketball-reference.com/teams/ATL/2005_games.html
https://www.basketball-reference.com/teams/ATL/2006_games.html
https://www.basketball-reference.com/teams/ATL/2007_games.html
https://www.basketball-reference.com/teams/ATL/2008_games.html
https://www.basketball-reference.com/teams/ATL/2009_games.html
https://www.basketball-reference.com/teams/ATL/2010_games.html
https://www.basketball-reference.com/teams/ATL/2011_games.html
https://www.basketball-reference.com/teams/ATL/2012_games.html
https://www.basketball-reference.com/teams/ATL/2013_games.html
https://www.basketball-reference.com/teams/ATL/2014_games.html
https://www.basketball-reference.com/teams/ATL/2015_games.html
https://www.basketball-reference.com/teams/ATL/2016_games.html
https://www.basketball-reference.com/teams/ATL/2017_gam

https://www.basketball-reference.com/teams/DEN/2010_games.html
https://www.basketball-reference.com/teams/DEN/2011_games.html
https://www.basketball-reference.com/teams/DEN/2012_games.html
https://www.basketball-reference.com/teams/DEN/2013_games.html
https://www.basketball-reference.com/teams/DEN/2014_games.html
https://www.basketball-reference.com/teams/DEN/2015_games.html
https://www.basketball-reference.com/teams/DEN/2016_games.html
https://www.basketball-reference.com/teams/DEN/2017_games.html
https://www.basketball-reference.com/teams/DEN/2018_games.html
https://www.basketball-reference.com/teams/DEN/2019_games.html
https://www.basketball-reference.com/teams/DET/2002_games.html
https://www.basketball-reference.com/teams/DET/2003_games.html
https://www.basketball-reference.com/teams/DET/2004_games.html
https://www.basketball-reference.com/teams/DET/2005_games.html
https://www.basketball-reference.com/teams/DET/2006_games.html
https://www.basketball-reference.com/teams/DET/2007_gam

https://www.basketball-reference.com/teams/MEM/2015_games.html
https://www.basketball-reference.com/teams/MEM/2016_games.html
https://www.basketball-reference.com/teams/MEM/2017_games.html
https://www.basketball-reference.com/teams/MEM/2018_games.html
https://www.basketball-reference.com/teams/MEM/2019_games.html
https://www.basketball-reference.com/teams/MIA/2002_games.html
https://www.basketball-reference.com/teams/MIA/2003_games.html
https://www.basketball-reference.com/teams/MIA/2004_games.html
https://www.basketball-reference.com/teams/MIA/2005_games.html
https://www.basketball-reference.com/teams/MIA/2006_games.html
https://www.basketball-reference.com/teams/MIA/2007_games.html
https://www.basketball-reference.com/teams/MIA/2008_games.html
https://www.basketball-reference.com/teams/MIA/2009_games.html
https://www.basketball-reference.com/teams/MIA/2010_games.html
https://www.basketball-reference.com/teams/MIA/2011_games.html
https://www.basketball-reference.com/teams/MIA/2012_gam

https://www.basketball-reference.com/teams/PHI/2005_games.html
https://www.basketball-reference.com/teams/PHI/2006_games.html
https://www.basketball-reference.com/teams/PHI/2007_games.html
https://www.basketball-reference.com/teams/PHI/2008_games.html
https://www.basketball-reference.com/teams/PHI/2009_games.html
https://www.basketball-reference.com/teams/PHI/2010_games.html
https://www.basketball-reference.com/teams/PHI/2011_games.html
https://www.basketball-reference.com/teams/PHI/2012_games.html
https://www.basketball-reference.com/teams/PHI/2013_games.html
https://www.basketball-reference.com/teams/PHI/2014_games.html
https://www.basketball-reference.com/teams/PHI/2015_games.html
https://www.basketball-reference.com/teams/PHI/2016_games.html
https://www.basketball-reference.com/teams/PHI/2017_games.html
https://www.basketball-reference.com/teams/PHI/2018_games.html
https://www.basketball-reference.com/teams/PHI/2019_games.html
https://www.basketball-reference.com/teams/PHO/2002_gam

https://www.basketball-reference.com/teams/WAS/2010_games.html
https://www.basketball-reference.com/teams/WAS/2011_games.html
https://www.basketball-reference.com/teams/WAS/2012_games.html
https://www.basketball-reference.com/teams/WAS/2013_games.html
https://www.basketball-reference.com/teams/WAS/2014_games.html
https://www.basketball-reference.com/teams/WAS/2015_games.html
https://www.basketball-reference.com/teams/WAS/2016_games.html
https://www.basketball-reference.com/teams/WAS/2017_games.html
https://www.basketball-reference.com/teams/WAS/2018_games.html
https://www.basketball-reference.com/teams/WAS/2019_games.html


In [55]:
# Sanity check on the scrape: total number of rows collected, then a preview.
print(final_df.shape[0])
final_df.head()

43306


Unnamed: 0,Date,Date.1,Start (ET),Unnamed: 4,Unnamed: 5,Unnamed: 6,Opponent,Unnamed: 8,Unnamed: 9,Tm,Opp,W,L,Streak,Notes,team,season
0,1,"Tue, Oct 30, 2001",8:30p,,Box Score,@,Houston Rockets,L,OT,84,89,0,1,L 1,,ATL,2002
1,2,"Thu, Nov 1, 2001",8:00p,,Box Score,,Washington Wizards,L,,88,98,0,2,L 2,,ATL,2002
2,3,"Sat, Nov 3, 2001",7:00p,,Box Score,,Miami Heat,W,,90,83,1,2,W 1,,ATL,2002
3,4,"Mon, Nov 5, 2001",10:30p,,Box Score,@,Los Angeles Clippers,L,,86,109,1,3,L 1,,ATL,2002
4,5,"Wed, Nov 7, 2001",9:00p,,Box Score,@,Utah Jazz,L,,89,96,1,4,L 2,,ATL,2002


In [None]:



#alphabet list
alpha = list(string.ascii_lowercase)
# NOTE: this rebinds root_url (previously the schedule-page template) to a
# site-root template that takes a path such as '/players/...'.
root_url = "https://www.basketball-reference.com{}"
game_url = "https://www.basketball-reference.com/players/a"

# Collect the <strong> tags of every per-letter player index page.
# presumably <strong> marks the players of interest on those pages — TODO confirm
raw_result = []
for letter in alpha:
    # 'x' is deliberately skipped, as in the original scrape
    # (presumably no index page existed for it — TODO confirm).
    if letter == 'x':
        continue
    url = "https://www.basketball-reference.com/players/{}".format(letter)
    try:
        # Fetch and parse inside the try so that a failed request (HTTPError and
        # URLError are OSError subclasses) skips this letter instead of aborting
        # the whole loop; the response is closed once parsed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html)
    except OSError as err:
        print('skipping {}: {}'.format(url, err))
        continue
    raw = soup.find_all('strong')
    raw_result.append(raw)
                


def get_links(result_set):
    """Gather the href and the text of every <a> nested under the given tags.

    Parameters
    ----------
    result_set : iterable
        Elements exposing ``find_all`` (e.g. the <strong> tags of one page).

    Returns
    -------
    tuple of (list, list)
        Two parallel lists: the ``href`` attribute of each anchor and the
        anchor's text, in document order.
    """
    anchors = [anchor for tag in result_set for anchor in tag.find_all('a')]
    hrefs = []
    labels = []
    for anchor in anchors:
        hrefs.append(anchor.get('href'))
        labels.append(anchor.getText())
    return hrefs, labels

# Flatten the per-letter results into two parallel lists:
# player_links[i] is the href that goes with player_names[i].
player_names = []
player_links = []

for strong_tags in raw_result:
    links, names = get_links(strong_tags)
    player_links.extend(links)
    player_names.extend(names)
    
    

#getting the player gamelog stats
def modify(link, season=2019):
    """Turn a player-page href into that player's game-log path for one season.

    Parameters
    ----------
    link : str
        Player page path, e.g. '/players/a/abdelal01.html'.
    season : int or str, optional
        Season segment inserted into the game-log path (default 2019).

    Returns
    -------
    str
        ``link`` without a trailing '.html', followed by '/gamelog/<season>/'.
    """
    suffix = '.html'
    # Remove the extension as a literal suffix. The previous pattern used a
    # character class ([^.html]), which strips any trailing '.', 'h', 't', 'm'
    # or 'l' characters (so 'smith.html' became 'smi') and failed outright on
    # input with no other character.
    if link.endswith(suffix):
        link = link[:-len(suffix)]
    return link + '/gamelog/{}/'.format(season)
    
    
player_log_links = [modify(link) for link in player_links]
# NOTE(review): keyed by display name, so two players sharing a name would
# collapse into a single entry — confirm this is acceptable.
player_link_dict = dict(zip(player_names, player_log_links))

# Inspect one game-log page to work out the table layout. This cell previously
# reused `soup`, `y` and `headers` left over from earlier cells (a player index
# page and the schedule table), so it never looked at a game-log table.
# NOTE(review): the first scraped player is used as the sample — confirm that
# page actually has a game log for the season.
gamelog_headers = []
game_rows = []
player_stats = []
if player_log_links:
    with urlopen(root_url.format(player_log_links[0])) as html:
        soup = BeautifulSoup(html)
    y_gamelog = soup.find_all('table', {'id': 'pgl_basic'})
    if y_gamelog:
        #headers
        # first 30 <th> cells, minus the leading one (mirrors how headers2 is
        # built for the schedule table)
        gamelog_headers = [th.getText() for th in y_gamelog[0].find_all('th', limit=30)]
        gamelog_headers = gamelog_headers[1:]

        #stats
        # only rows whose id starts with 'pgl_basic.' are kept
        game_rows = y_gamelog[0].find_all('tr', id=lambda x: x and x.startswith('pgl_basic.'))
        player_stats = [[td.getText() for td in row.find_all('td')] for row in game_rows]

#define a function that given a player's gamelog page, find the table of game logs and scrape the rows where the player played
def get_gamelog(link):
    """Download a player's game-log page and return its game rows.

    Parameters
    ----------
    link : str
        Full URL of the player's game-log page.

    Returns
    -------
    pandas.DataFrame or str
        A DataFrame with one row per <tr> whose id starts with 'pgl_basic.',
        or the string 'no gamelogs' when the page has no table with id
        'pgl_basic' (callers test for ``str`` to detect this case).
    """
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(link) as html:
        soup = BeautifulSoup(html)

    #locate gamelog table
    # Test the table found on THIS page; the previous code tested and indexed
    # the unrelated module-level `y` (the schedule table).
    y_gamelog = soup.find_all('table', {'id': 'pgl_basic'})
    if not y_gamelog:
        return 'no gamelogs'

    table = y_gamelog[0]
    # Column names come from the game-log table itself rather than from the
    # schedule `headers`: the first 30 <th> cells minus the leading one, as in
    # the exploratory cell above this function.
    # NOTE(review): assumes that header layout still matches the <td> count of
    # the data rows — confirm against a live page.
    columns = [th.getText() for th in table.find_all('th', limit=30)][1:]

    game_rows = table.find_all('tr', id=lambda x: x and x.startswith('pgl_basic.'))
    player_stats = [[td.getText() for td in row.find_all('td')] for row in game_rows]

    final_df = pd.DataFrame(player_stats, columns=columns)
    return final_df
# Scrape the game log of every collected player and write the combined table.
player_log_df_lst = []

for key, val in player_link_dict.items():
    link = root_url.format(val)
    player_logs_df = get_gamelog(link)

    # get_gamelog returns a plain string when the page has no game-log table
    if isinstance(player_logs_df, str):
        continue
    # tag every row with the player it belongs to
    player_logs_df['PlayerName'] = key
    player_log_df_lst.append(player_logs_df)


final = pd.concat(player_log_df_lst)

final.to_csv('gamelog_data.csv', index=False)