# NBA Game Log Career Stats
Data acquisition method -> Web Scraping

The primary function of this script is to download an NBA players entire career game log (from 1994 and beyond), using a URL form ESPN NBA by searching the players name and selecting their 'game log' stats filter. Data collected consists of every game the player participated in during each year will be placed in CSV format for download with the players respective statistics during the time he was on the court. As of now, this script does not seperate stats from regular season and playoffs, which is a work in progress for the next iteration.

In [68]:
# Rafael Chavez
# 7/11/21
# Project 824, Owner

import requests as req
import pandas as pd
import re
from bs4 import BeautifulSoup

stats_list = []
playoff_headers = ['Conference Quarterfinals','Conference Semifinals','Conference Finals','Finals']
playoff_headers_index= []
yearly_career_stats = []

    
def request_url(player_stats_url):
    stats_request_object = req.get(player_stats_url)
        
    return stats_request_object
    
def request_to_text_object(request_object):
    stats_text_object = request_object.text
        
    return stats_text_object
    
    
def create_bs_object(stats_text_object):
    table_bs = BeautifulSoup(stats_text_object,'html5lib')
    
    return table_bs


# When parsing for values from 'option' tags, i ran into an issue where a value
# was being returned as 'Selected', which means i have to use RE to make sure its 
# not appended to my list of years played in the league. Meaning, i have to make sure
# the string is composed only of digits
def is_digits_only(dropdown_value):
    if bool(re.match(r'^([\s\d]+)$', dropdown_value)) == True:
        return True
    else:
        return False


# Need to create a fxn which parses for the years available for a players career
def get_career_years(bs_object):
    years_played = []
    list = bs_object.findAll('option')
    for l in list: 
        if l.get('value') != None and is_digits_only(l.get('value')) == True:
            years_played.append(int(l.get('value')))
            
    years_played.sort()
    print("Game Log for the following years will be written to CSV:")
    print("--------------------------------------------------------\n")
    print(years_played)
    
    return years_played


def create_table_rows_object(table_bs):
    stats_table_rows = table_bs.find_all('tr')
    
    return stats_table_rows


def create_stats_list(stats_table_rows,stats_list):
    i = 0
    while i in range(0,len(stats_table_rows)):
        stats_list.append([span.text for span in stats_table_rows[i]])
        i +=1
        
    return stats_list


def clean_career_stats(career_stats_list):
    parsed_stats_list = []
    all_player_games_stats = []
    for i in range(len(career_stats_list)):
        parsed_stats_list.append(re.search(r'(\w+ \d+/\d+)',career_stats_list[i][0]))

        if(parsed_stats_list[i] != None):
            all_player_games_stats.append(stats_list[i])
        
    return all_player_games_stats
    

def return_column_headers(stats_list):
    stat_column_headers = stats_list[0]
    
    return stat_column_headers


def create_player_stats_list(stats_list,yearly_career_stats):
    # Although i create an html object later, this one defaults to the players most recent season
    # where I am still able to obtain every year the athlete competed in the NBA
    player_stats_url = "https://www.espn.com/nba/player/gamelog/_/id/6583/type/nba/year/"
    request_object = request_url(player_stats_url)
    request_text_object = request_to_text_object(request_object)
    bs_object = create_bs_object(request_text_object)
    
    # List containing every year the athlete competed in, used to iterate through
    years_played = get_career_years(bs_object)
    
    for year in years_played:
        player_stats_url = "https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/"
        player_stats_url = player_stats_url + str(year)
        print(player_stats_url)
        request_object = request_url(player_stats_url)
        request_text_object = request_to_text_object(request_object)
        bs_object = create_bs_object(request_text_object)
        table_rows = create_table_rows_object(bs_object)
        season_stats = create_stats_list(table_rows,stats_list)
        column_headers = return_column_headers(season_stats)
        
        cleaned_stats = clean_career_stats(season_stats)
        df = pd.DataFrame(cleaned_stats, columns = column_headers)
        df.to_csv('RusselWestbrookYear'+str(year))
        stats_list.clear()
    print("/nData has been successfully written to CSV format")



# Running Script
Lets choose main method and run this shit


In [69]:
player_career_stats = create_player_stats_list(stats_list,yearly_career_stats,)
player_career_stats


Game Log for the following years will be written to CSV:
--------------------------------------------------------

[2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2013
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2014
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2015
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2016
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2017
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2018
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2019
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2020
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2021
https://www.espn.com/nba/player/gamelog/_/id/3468/type/nba/year/2022
/nData has been successfully written to CSV format


# Idea: Select top 15 players of all time, create data sets for each one, and use interesting analytics to calculate for each one and then create an easy to read, enjoyable format to send to twitter which makes good tweets for settling/igniting barber shop talk on who the goat is (kobe).

# Idea 2: Provide user input of a link to webscrape, meaning on the backend of the app we need to make sure its a valid URL from ESPN. What we can do is have the user be able to download the CSV formatted season statistics. 

# -------------

# Lets analyze player stats for once, still havent gotten that even STARTED

SIMPLE STATS, PPG, LAST 20 GAMES PPG, FIRST 20 GAMES PPG, MIDDLE 20 PPG

GOING TO CONTINUE THIS IN A DIFFERENT PAGE

In [63]:
# obtain dataframe
import statistics as stats
import math


def csv_to_df(csv):
    player_season_df = pd.read_csv(csv)
    return player_season_df

def player_season_dataframe(player_df):
    player_season = player_df
    
    return player_season

def player_points(player_season):
    points_list = player_season["PTS"].tolist()
    
    return points_list
    
def first_twenty(points_list):
    first_twenty = "{:.2f}".format(stats.mean(points_list[0:21]))
    print("First 20 PPG: " + str(first_twenty))
    
def middle_twenty(points_list):
    middle_index = math.ceil(len(points_list))
    middle_twenty = "{:.2f}".format(stats.mean(points_list[middle_index-10:middle_index+10]))
    print("Middle 20 PPG: " + str(middle_twenty))
    
def last_twenty(points_list):
    last_twenty = "{:.2f}".format(stats.mean(points_list[-20:-1]))
    print("Last 20 PPG: " + str(last_twenty))
    
# Top 10 scoring totals of season
def top_ten_games(points):
    n = 10
    ascending_list = sorted(points)
    top_10 = ascending_list[-10:]
    print(top_10)
    
    
# create points list    
p_df = csv_to_df('JamesHardenYear2019')
points = player_points(p_df)
print("Total Games Played: " + str(len(points)))

# First 20 game PPG avg
first_20_points = first_twenty(points)
first_20_points

# Middle 20 (rough estimate)
middle_twenty = middle_twenty(points)
middle_twenty

# Last 20 game PPG avg
last_twenty(points)
last_twenty

top_10 = top_ten_games(points)
top_10






Total Games Played: 95
First 20 PPG: 33.81
Middle 20 PPG: 25.70
Last 20 PPG: 28.84
[48, 50, 50, 54, 57, 57, 58, 58, 61, 61]
