<a href="https://colab.research.google.com/github/shmoshe/nba-rookie-success/blob/main/Basketball_Stats_Crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os

# Install the Kaggle command-line client; a later cell uses it to download the dataset.
!pip install kaggle

# Kaggle API credentials. They are taken from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables when those are set, so the secret does not have to be
# saved inside the notebook; otherwise the empty placeholders must be filled in
# by hand before running this cell.
api_token = {
    "username": os.environ.get("KAGGLE_USERNAME", ""),
    "key": os.environ.get("KAGGLE_KEY", ""),
}

# Write the token to /root/.kaggle/kaggle.json. exist_ok=True keeps the cell
# re-runnable when the directory is already there.
kaggle_dir = '/root/.kaggle'
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
os.makedirs(kaggle_dir, exist_ok=True)
with open(kaggle_json, 'w') as file:
  json.dump(api_token, file)

# Owner read/write only, since the file holds an API key.
os.chmod(kaggle_json, 0o600)



In [None]:
!mkdir ./datasets
!mkdir ./datasets/nba-stats

!kaggle datasets download drgilermo/nba-players-stats -p ./datasets/nba-stats
!unzip ./datasets/nba-stats/*.zip  -d ./datasets/nba-stats/

Dataset URL: https://www.kaggle.com/datasets/drgilermo/nba-players-stats
License(s): unknown
Downloading nba-players-stats.zip to ./datasets/nba-stats
  0% 0.00/2.13M [00:00<?, ?B/s]
100% 2.13M/2.13M [00:00<00:00, 164MB/s]
Archive:  ./datasets/nba-stats/nba-players-stats.zip
  inflating: ./datasets/nba-stats/Players.csv  
  inflating: ./datasets/nba-stats/Seasons_Stats.csv  
  inflating: ./datasets/nba-stats/player_data.csv  


In [None]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import time

In [None]:
# Load the two player tables extracted from the Kaggle archive.
nba_players_stats_df = pd.read_csv('./datasets/nba-stats/Players.csv')
nba_player_data_df = pd.read_csv('./datasets/nba-stats/player_data.csv')

# Inner-join on the player's name ('Player' on the left, 'name' on the right),
# then discard the right-hand key column, which repeats 'Player' after the join.
nba_players_df = nba_players_stats_df.merge(
    nba_player_data_df,
    how='inner',
    left_on='Player',
    right_on='name',
).drop(columns='name')

# Names that will be looked up one by one by the scraper.
nba_player_list = nba_players_df['Player'].to_list()

In [None]:
# get player page of a given player
def get_player_page(player_url, timeout=30):
  """Download a page and return its body as text.

  Args:
    player_url: URL of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block the scrape indefinitely.

  Returns:
    The response body decoded to a string (``Response.text``).

  Raises:
    requests.exceptions.RequestException: on connection errors or when the
      timeout expires.
  """
  return requests.get(player_url, timeout=timeout).text

def _count_leaderboard_entries(soup, box_id):
    """Count the entries listed in one leaderboard box of a player page.

    The lookup is restricted to the box's own descendants. Searching the rest
    of the document instead (as ``find_next`` does) can pick up the
    ``only-child`` row or the button of a *later* box and report its value
    for this one.

    Args:
        soup: Parsed player page.
        box_id: ``id`` attribute of the ``<div>`` holding the leaderboard.

    Returns:
        0 when the box is absent, 1 when it contains a ``tr.only-child`` row,
        otherwise the integer at the start of the box's ``<button>`` text.

    Raises:
        ValueError: if the box has neither an ``only-child`` row nor a button
            whose text starts with an integer.
    """
    box = soup.find("div", id=box_id)
    if box is None:
        return 0

    if box.find("tr", class_="only-child"):
        return 1

    button = box.find("button")
    if button is None:
        raise ValueError(
            f"leaderboard box {box_id!r} has neither an only-child row nor a button"
        )

    # The button text is assumed to start with the count (e.g. "12 All-Star
    # Games"), which is what the previous fixed-width slicing relied on --
    # TODO confirm against a live page.
    words = button.text.split(None, 1)
    if not words:
        raise ValueError(f"leaderboard box {box_id!r} has an empty button label")
    return int(words[0])


# get number of awards for a player
def get_awards(soup):
    """Return the number of entries in the ``leaderboard_notable-awards`` box."""
    return _count_leaderboard_entries(soup, "leaderboard_notable-awards")


# get number of all-star games
def get_allstar(soup):
    """Return the number of entries in the ``leaderboard_allstar`` box."""
    return _count_leaderboard_entries(soup, "leaderboard_allstar")


# get number of championships
def get_championships(soup):
    """Return the number of entries in the ``leaderboard_championships`` box."""
    return _count_leaderboard_entries(soup, "leaderboard_championships")


# get number of all-nbas
def get_allnbas(soup):
    """Return the number of entries in the ``leaderboard_all_league`` box."""
    return _count_leaderboard_entries(soup, "leaderboard_all_league")


In [None]:
# get first player page link and hall of fame status in search result
def get_player(soup):
  """Extract the first search hit's page URL and Hall-of-Fame flag.

  Args:
    soup: Parsed search-results page.

  Returns:
    ``(player_link, hall_of_fame)``. ``player_link`` is the absolute
    basketball-reference URL of the first result, or ``''`` when the page has
    no ``div.search-item-name`` or the first result carries no link.
    ``hall_of_fame`` is True only when a ``search-badge search-hof`` span
    belongs to that first result.
  """
  player = soup.find("div", class_="search-item-name")

  if player is None:
    return '', False

  def belongs_to_first_result(tag):
    # find_next() walks the whole remainder of the document, so a match may
    # sit inside a later search result. Accept it only when the closest
    # preceding result header is the first result itself.
    if tag is None:
      return False
    return tag.find_previous("div", class_="search-item-name") is player

  href = player.find_next("a")
  hof = player.find_next("span", class_="search-badge search-hof")

  player_link = ''
  if belongs_to_first_result(href):
    player_link = "https://www.basketball-reference.com" + href["href"]

  return player_link, belongs_to_first_result(hof)

# scrape data from player page
def get_player_data(player, name, hof):
  """Fetch one player page and collect its honours into a record.

  Args:
    player: URL of the player's page.
    name: Player name stored under "Player".
    hof: Hall-of-Fame flag stored under "Hall_of_Fame".

  Returns:
    A dict with the keys "Player", "Hall_of_Fame", "Awards", "All_Star",
    "Championships" and "All_Nbas".
  """
  raw_html = get_player_page(player)
  # Drop the comment delimiters so that markup wrapped in <!-- --> is parsed
  # like the rest of the page.
  uncommented_html = raw_html.replace("<!--", "").replace("-->", "")
  soup = BeautifulSoup(uncommented_html, 'lxml')

  return {
    "Player": name,
    "Hall_of_Fame": hof,
    "Awards": get_awards(soup),
    "All_Star": get_allstar(soup),
    "Championships": get_championships(soup),
    "All_Nbas": get_allnbas(soup),
  }


In [None]:
# Search URL template; both `{name}` placeholders receive the player's search term.
nba_basketball_url = (
    'https://www.basketball-reference.com/search/search.fcgi'
    '?hint={name}&search={name}&pid=&idx='
)
# Names for which the search yielded no player link; filled in while scraping.
unsuccessful_list = list()

from tqdm import tqdm

def chunk_list(lst, chunk_size):
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    The last slice holds the remainder and may be shorter. An empty ``lst``
    yields an empty list.
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        chunks.append(lst[start:stop])
    return chunks

def scrape_basketball_players(player_list, delay=5):
  """Look up every name on basketball-reference and collect its honours.

  For each name the site search is queried, the first result is taken via
  ``get_player`` and its page is scraped via ``get_player_data``. Names whose
  search gives no player link are appended to the module-level
  ``unsuccessful_list`` and skipped.

  Args:
    player_list: Iterable of player names.
    delay: Seconds to pause after every name. The pause also applies to
      names that could not be resolved, so failed lookups no longer trigger
      back-to-back requests.

  Returns:
    A list with one record (dict) per successfully scraped player.
  """
  # Local import: the notebook's import cell does not provide urllib.
  from urllib.parse import quote_plus

  players_data = []

  for name in tqdm(player_list):
    # quote_plus turns spaces into '+' and percent-encodes characters such as
    # '&' that would otherwise corrupt the query string.
    search_name = quote_plus(name.lower())

    search_url = nba_basketball_url.format(name=search_name)
    html = requests.get(search_url, timeout=30).text
    nba_soup = BeautifulSoup(html, 'html.parser')

    player, hof = get_player(nba_soup)

    if player == '':
      unsuccessful_list.append(name)
    else:
      players_data.append(get_player_data(player, name, hof))

    time.sleep(delay)

  return players_data

# Scrape the player list in batches; every batch is written to its own CSV so
# that finished work is already on disk if a later batch fails.
CHUNK_SIZE = 100           # names per batch / per intermediate CSV
CHUNK_PAUSE_SECONDS = 60   # extra pause between two batches

players_data = []
chunked_list = chunk_list(nba_player_list, CHUNK_SIZE)

for idx, chunk in enumerate(chunked_list):
  players = scrape_basketball_players(chunk)
  players_df = pd.DataFrame(players)
  players_df.to_csv(f'./datasets/nba_player_awards_data_{idx}.csv')
  players_data.extend(players)
  # Pause only between batches; there is nothing to wait for after the last one.
  if idx < len(chunked_list) - 1:
    time.sleep(CHUNK_PAUSE_SECONDS)

# Combined table of every scraped player, displayed as the cell's output.
players_df = pd.DataFrame(players_data)
players_df

100%|██████████| 100/100 [09:19<00:00,  5.60s/it]
100%|██████████| 100/100 [09:21<00:00,  5.62s/it]
100%|██████████| 100/100 [09:22<00:00,  5.62s/it]
100%|██████████| 100/100 [09:33<00:00,  5.74s/it]
100%|██████████| 100/100 [09:33<00:00,  5.74s/it]
100%|██████████| 100/100 [09:28<00:00,  5.69s/it]
100%|██████████| 100/100 [09:43<00:00,  5.83s/it]
100%|██████████| 100/100 [09:23<00:00,  5.63s/it]
100%|██████████| 100/100 [09:35<00:00,  5.75s/it]
100%|██████████| 100/100 [09:46<00:00,  5.87s/it]
100%|██████████| 100/100 [09:55<00:00,  5.95s/it]
100%|██████████| 100/100 [09:42<00:00,  5.83s/it]
100%|██████████| 100/100 [09:42<00:00,  5.83s/it]
100%|██████████| 100/100 [09:38<00:00,  5.79s/it]
100%|██████████| 100/100 [09:44<00:00,  5.85s/it]
100%|██████████| 100/100 [09:41<00:00,  5.82s/it]
100%|██████████| 100/100 [09:43<00:00,  5.84s/it]
100%|██████████| 100/100 [09:51<00:00,  5.91s/it]
100%|██████████| 100/100 [09:42<00:00,  5.82s/it]
100%|██████████| 100/100 [09:42<00:00,  5.83s/it]


Unnamed: 0,Player,Hall_of_Fame,Awards,All_Star,Championships,All_Nbas
0,Curly Armstrong,False,0,0,0,0
1,Cliff Barker,False,0,0,0,0
2,Leo Barnhorst,False,0,2,0,0
3,Ed Bartels,False,0,0,0,0
4,Ralph Beard,False,0,1,0,1
...,...,...,...,...,...,...
3796,Troy Williams,False,0,0,0,0
3797,Kyle Wiltjer,False,0,0,0,0
3798,Stephen Zimmerman,False,0,0,0,0
3799,Paul Zipser,False,0,0,0,0


In [None]:
# Show the names for which the search returned no player link during scraping.
unsuccessful_list

['Walt Hazzard',
 'John Johnson',
 'Mike Holton',
 'Wayne Englestad',
 'Jake Voskuhl',
 'Ruben Wolkowyski',
 'J.J. Redick',
 'Enes Kanter',
 'Dewayne Dedmon',
 'C.J. McCollum',
 'James Southerland',
 'Jakob Poeltl',
 'Taurean Waller-Prince']

In [None]:
# Save the combined awards table (all chunks together) to a single CSV file.

players_df.to_csv('./datasets/nba_player_awards_data.csv')