In [None]:
# Libraries

import random
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import Comment

# Realistic Chrome User-Agent to avoid blocking
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    )
}

In [None]:
# Scraping Function

def get_game_log(season, advanced=False):
    url_type = "gamelog-advanced" if advanced else "gamelog"
    url = f"https://www.basketball-reference.com/players/h/hardeja01/{url_type}/{season}/"
    
    # FINAL correct table IDs
    table_id = "player_game_log_adv_reg" if advanced else "player_game_log_reg"

    try:
        print(f"Fetching {url}...")
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        table = soup.find("table", {"id": table_id})

        if not table:
            print(f"Table with id '{table_id}' not found for {season} - {url_type}")
            return pd.DataFrame()

        df = pd.read_html(str(table))[0]
        df["Season"] = f"{season-1}-{season}"
        return df

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return pd.DataFrame()


In [15]:
test_df = get_game_log(2015, advanced=False)
test_df.head()


Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog/2015/...


Unnamed: 0,Rk,Gcar,Gtm,Date,Team,Unnamed: 5,Opp,Result,GS,MP,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,Season
0,1,372,1,2014-10-28,HOU,@,LAL,"W, 108-90",1,30:46,...,1,6,1,0,0,1,32,27.6,12,2014-2015
1,2,373,2,2014-10-29,HOU,@,UTA,"W, 104-93",1,38:11,...,7,10,0,1,6,5,18,9.6,21,2014-2015
2,Rk,Gcar,Gtm,Date,Team,,Opp,Result,GS,MP,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,2014-2015
3,3,374,3,2014-11-01,HOU,,BOS,"W, 104-90",1,31:06,...,8,6,4,1,3,1,26,26.1,18,2014-2015
4,4,375,4,2014-11-03,HOU,@,PHI,"W, 104-93",1,38:12,...,9,5,1,1,4,2,35,28.3,18,2014-2015


In [16]:
get_game_log(2015, advanced=True).head()


Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog-advanced/2015/...


Unnamed: 0,Rk,Gcar,Gtm,Date,Team,Unnamed: 5,Opp,Result,GS,MP,...,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,GmSc,BPM,Season
0,1,372,1,2014-10-28,HOU,@,LAL,"W, 108-90",1,30:46,...,46.6,1.7,0.0,0.0,34.7,161,104,27.6,20.5,2014-2015
1,2,373,2,2014-10-29,HOU,@,UTA,"W, 104-93",1,38:11,...,38.7,0.0,2.0,22.5,33.1,97,108,9.6,-5.5,2014-2015
2,Rk,Gcar,Gtm,Date,Team,,Opp,Result,GS,MP,...,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,GmSc,BPM,2014-2015
3,3,374,3,2014-11-01,HOU,,BOS,"W, 104-90",1,31:06,...,38.1,6.3,2.1,13.0,32.3,130,80,26.1,19.0,2014-2015
4,4,375,4,2014-11-03,HOU,@,PHI,"W, 104-93",1,38:12,...,27.1,1.4,2.4,13.4,34.4,135,101,28.3,9.8,2014-2015


In [17]:
# Define Season Range + Empty Lists

seasons = list(range(2015, 2026))  # 2015 through 2025
basic_dfs = []
advanced_dfs = []

In [18]:
# Scrape Loop with Sleep + Logging

for season in seasons:
    basic_df = get_game_log(season, advanced=False)
    adv_df = get_game_log(season, advanced=True)

    if not basic_df.empty:
        basic_dfs.append(basic_df)
    if not adv_df.empty:
        advanced_dfs.append(adv_df)

    sleep_time = random.uniform(2.5, 5.0)
    print(f"Sleeping for {sleep_time:.2f} seconds...\n")
    time.sleep(sleep_time)


Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog/2015/...
Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog-advanced/2015/...
Sleeping for 2.71 seconds...

Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog/2016/...
Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog-advanced/2016/...
Sleeping for 3.12 seconds...

Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog/2017/...
Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog-advanced/2017/...
Sleeping for 2.86 seconds...

Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog/2018/...
Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog-advanced/2018/...
Sleeping for 3.31 seconds...

Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog/2019/...
Fetching https://www.basketball-reference.com/players/h/hardeja01/gamelog-advanced/2019/...
Sleep

In [19]:
# Concatenate + Preview Results

basic_all = pd.concat(basic_dfs, ignore_index=True)
advanced_all = pd.concat(advanced_dfs, ignore_index=True)

# Quick previews
display(basic_all.head())
display(advanced_all.head())


Unnamed: 0,Rk,Gcar,Gtm,Date,Team,Unnamed: 5,Opp,Result,GS,MP,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,Season
0,1,372,1,2014-10-28,HOU,@,LAL,"W, 108-90",1,30:46,...,1,6,1,0,0,1,32,27.6,12,2014-2015
1,2,373,2,2014-10-29,HOU,@,UTA,"W, 104-93",1,38:11,...,7,10,0,1,6,5,18,9.6,21,2014-2015
2,Rk,Gcar,Gtm,Date,Team,,Opp,Result,GS,MP,...,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-,2014-2015
3,3,374,3,2014-11-01,HOU,,BOS,"W, 104-90",1,31:06,...,8,6,4,1,3,1,26,26.1,18,2014-2015
4,4,375,4,2014-11-03,HOU,@,PHI,"W, 104-93",1,38:12,...,9,5,1,1,4,2,35,28.3,18,2014-2015


Unnamed: 0,Rk,Gcar,Gtm,Date,Team,Unnamed: 5,Opp,Result,GS,MP,...,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,GmSc,BPM,Season
0,1,372,1,2014-10-28,HOU,@,LAL,"W, 108-90",1,30:46,...,46.6,1.7,0.0,0.0,34.7,161,104,27.6,20.5,2014-2015
1,2,373,2,2014-10-29,HOU,@,UTA,"W, 104-93",1,38:11,...,38.7,0.0,2.0,22.5,33.1,97,108,9.6,-5.5,2014-2015
2,Rk,Gcar,Gtm,Date,Team,,Opp,Result,GS,MP,...,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg,GmSc,BPM,2014-2015
3,3,374,3,2014-11-01,HOU,,BOS,"W, 104-90",1,31:06,...,38.1,6.3,2.1,13.0,32.3,130,80,26.1,19.0,2014-2015
4,4,375,4,2014-11-03,HOU,@,PHI,"W, 104-93",1,38:12,...,27.1,1.4,2.4,13.4,34.4,135,101,28.3,9.8,2014-2015


In [None]:
# Save to CSV

basic_all.to_csv("harden_game_logs_basic.csv", index=False)
advanced_all.to_csv("harden_game_logs_advanced.csv", index=False)

# Full disclosure, I manually joined these two CSVs later on in Excel and then saved them as you can see in the repo as `james_harden_games_dataset.csv`
print("CSVs saved: 'harden_game_logs_basic.csv' and 'harden_game_logs_advanced.csv'") 


✅ CSVs saved: 'harden_game_logs_basic.csv' and 'harden_game_logs_advanced.csv'
