In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import requests
import re
import random
import time
import matplotlib.pyplot as plt
%matplotlib inline
import unidecode
from collections import OrderedDict

### Preprocessing, data organization, and HTML parsing to prepare for web scraping

In [3]:
# Index page listing every player whose last name starts with 'A'.
player_a_url = 'https://www.basketball-reference.com/players/a/'
# Template for an individual player page: {first letter of slug}/{slug}01.html
url = 'https://www.basketball-reference.com/players/{}/{}01.html'

# Players whose generated URL is broken; they are dropped before URLs are built.
BROKEN_URL_PLAYERS = ['D.J. Augustin', 'B.J. Armstrong', 'J.J. Anderson', 'Jeff Ayres']

response = requests.get(player_a_url)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')

# Extract player name from chart of all NBA players with last name starting with 'A'
chart = soup.find(attrs={'id': 'players'}).find('tbody').find_all('tr')
all_a_players = [row.find('th').text for row in chart]

# Drop repeated names while keeping the first-seen order.
all_a_players = list(OrderedDict.fromkeys(all_a_players))

# Remove broken URLs. Filtering (instead of list.remove) does not raise
# ValueError when one of the names is missing from the scraped chart.
all_a_players = [name for name in all_a_players if name not in BROKEN_URL_PLAYERS]

# Accounting for special characters, format player names to adhere to page url structure:
# first five characters of the second name token + first two of the first token.
# Assumes every name contains at least two space-separated tokens.
all_a_players_url = []
for player in all_a_players:
    a_split = player.lower().split(' ')
    all_a_players_url.append(unidecode.unidecode(a_split[1][:5] + a_split[0][:2]))

# List of all 'A' player pages, in the same order as all_a_players.
# NOTE(review): the template always uses the '01' suffix, so two players who
# share the same slug prefix would map to the same page - confirm.
a_scraped_urls = [url.format(slug[0], slug) for slug in all_a_players_url]

### Get HTML

In [4]:
# Raw HTML of every downloaded player page, filled in by toHTML().
player_info = []

def toHTML(list_of_urls):
    """Download each URL in ``list_of_urls`` and collect the response bodies.

    Parameters
    ----------
    list_of_urls : iterable of str
        URLs to fetch, in order.

    Returns
    -------
    list of str
        The module-level ``player_info`` list, to which the text of every
        response has been appended (one entry per URL, in input order).
    """
    # Iterate over the argument that was passed in rather than the global
    # ``a_scraped_urls`` so the function honours its parameter.
    for page_url in list_of_urls:
        player_info.append(requests.get(page_url).text)

    return player_info

# Fetch the raw HTML for every generated 'A' player URL.
HTML = toHTML(list_of_urls=a_scraped_urls)

### Parse HTML with BeautifulSoup

In [5]:
# Parsed BeautifulSoup documents, filled in by toListOfSoups().
soup_objs = []

def toListOfSoups(HTML):
    """Parse every HTML string in ``HTML`` into a BeautifulSoup object.

    Parameters
    ----------
    HTML : iterable of str
        Raw page sources, in order.

    Returns
    -------
    list
        The module-level ``soup_objs`` list, to which one parsed document per
        input string has been appended (in input order).
    """
    # Iterate over the argument that was passed in rather than the global
    # ``player_info`` so the function honours its parameter.
    for page_html in HTML:
        # Strip the HTML comment delimiters so markup wrapped in comments is
        # parsed as regular elements.
        # NOTE(review): presumably the site ships some tables commented out - confirm.
        uncommented = page_html.replace('-->', '').replace('<!--', '')
        soup_objs.append(BeautifulSoup(uncommented, 'lxml'))

    return soup_objs

# Parse every downloaded page into a BeautifulSoup document.
soups = toListOfSoups(HTML=HTML)

### Collect player statistics from all NBA seasons

In [6]:
# Attributes identifying the per-game statistics container on a player page.
per_game_attrs = {'class': 'overthrow table_container', 'id': 'div_per_game'}

# One entry per parsed page; find() yields None when a page has no such container.
per_game_tables = [page_soup.find(attrs=per_game_attrs) for page_soup in soups]

### Rookie Year Statistics

In [7]:
# First 'full_table' row of each player's per-game table (the rookie season).
table_rows = []

for table in per_game_tables:
    try:
        rookie_row = table.find(attrs={'class': 'full_table'})
    except AttributeError:
        # No per-game container was found for this player (table is None).
        # Store a None placeholder instead of skipping the player so that
        # table_rows stays index-aligned with the player list; the stat
        # cells that read table_rows turn it into their fallback value.
        rookie_row = None
    table_rows.append(rookie_row)

#### Points per game as a rookie

In [8]:
# Text of the 'pts_per_g' cell from each player's rookie row.
points_rookie = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'pts_per_g'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    points_rookie.append(stat_text)

#### Free throw percentage as a rookie

In [9]:
# Text of the 'ft_pct' cell from each player's rookie row.
ft_pct = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'ft_pct'}).text
    except AttributeError:
        # Only the "row or cell is missing" case is treated as N/A; any other
        # error (including KeyboardInterrupt) is no longer silently swallowed.
        stat_text = 'N/A'
    ft_pct.append(stat_text)

#### 3P percentage as a rookie

In [10]:
# Text of the 'fg3_pct' cell from each player's rookie row.
three_pt_pct = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'fg3_pct'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    three_pt_pct.append(stat_text)

#### 3P attempts per game as a rookie

In [11]:
# Text of the 'fg3a_per_g' cell from each player's rookie row.
three_pt_attempts = []

for rookie_row in table_rows:
    try:
        stat_value = rookie_row.find(attrs={'data-stat': 'fg3a_per_g'}).text
    except AttributeError:
        # Missing row/cell is recorded as the integer 0 (not 'N/A') here.
        stat_value = 0
    three_pt_attempts.append(stat_value)

#### Minutes per game as a rookie

In [12]:
# Text of the 'mp_per_g' cell from each player's rookie row.
mpg = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'mp_per_g'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    mpg.append(stat_text)

#### Field goal attempts per game as a rookie

In [13]:
# Text of the 'fga_per_g' cell from each player's rookie row.
fga = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'fga_per_g'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    fga.append(stat_text)

#### Field goal percentage as a rookie

In [14]:
# Text of the 'fg_pct' cell from each player's rookie row.
fg_pct = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'fg_pct'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    fg_pct.append(stat_text)

#### Assists per game as a rookie

In [15]:
# Text of the 'ast_per_g' cell from each player's rookie row.
assists = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'ast_per_g'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    assists.append(stat_text)

#### Games played as a rookie (out of 82)

In [16]:
# Text of the 'g' (games) cell from each player's rookie row.
games_played = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'g'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    games_played.append(stat_text)

#### Total rebounds per game as a rookie

In [17]:
# Text of the 'trb_per_g' cell from each player's rookie row.
rebs = []

for rookie_row in table_rows:
    try:
        stat_text = rookie_row.find(attrs={'data-stat': 'trb_per_g'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        stat_text = 'N/A'
    rebs.append(stat_text)

#### Steals per game as a rookie

In [18]:
# Text of the 'stl_per_g' cell from each player's rookie row.
steals = []

for rookie_row in table_rows:
    try:
        stat_value = rookie_row.find(attrs={'data-stat': 'stl_per_g'}).text
    except AttributeError:
        # Missing row/cell is recorded as the integer 0 (not 'N/A') here.
        stat_value = 0
    steals.append(stat_value)

#### Player's position

In [19]:
# Text of the 'pos' cell from each player's rookie row.
position = []

for rookie_row in table_rows:
    try:
        pos_text = rookie_row.find(attrs={'data-stat': 'pos'}).text
    except AttributeError:
        # e.g. the row is None or it has no such cell
        pos_text = 'N/A'
    position.append(pos_text)

### Sophomore Year Table

In [20]:
# Every 'full_table' row of each player's per-game table.
all_tables = []
# Second 'full_table' row per player, or 'N/A' when there is none.
soph_tables = []

for table in per_game_tables:
    try:
        all_tables.append(table.find_all(attrs={'class': 'full_table'}))
    except AttributeError:
        # No per-game container for this player (table is None). Keep an
        # empty placeholder rather than skipping, so the list stays
        # index-aligned with the player list.
        all_tables.append([])

for player_table in all_tables:
    try:
        soph_tables.append(player_table[1])  # 2nd row - sophomore statistics
    except IndexError:
        # Fewer than two season rows for this player.
        soph_tables.append('N/A')

#### Points per game as a sophomore (Target var)

In [21]:
# Text of the 'pts_per_g' cell from each player's sophomore row (target variable).
points_sophomore = []

for soph_row in soph_tables:
    try:
        soph_points = soph_row.find(attrs={'data-stat': 'pts_per_g'}).text
    except (TypeError, AttributeError):
        # TypeError: the entry is the 'N/A' placeholder string, whose
        # str.find() rejects keyword arguments.
        # AttributeError: the row exists but has no 'pts_per_g' cell (or the
        # entry is None); previously this case crashed the cell.
        soph_points = 0
    points_sophomore.append(soph_points)

### DataFrame

In [22]:
# Column labels, listed in the same order as the per-player lists zipped below.
column_names = [
    'Name', 'Position', 'Rookie PPG', 'FGA', 'FG_Pct',
    'Games Played', 'MPG', 'Rebounds', 'Assists', 'Steals',
    '3 Pt. Att', '3 Pt. Pct', 'FT Pct', 'Sophomore PPG',
]

# One tuple per player. zip() stops at the shortest input, so every list is
# expected to have one entry per player.
zipped = list(zip(
    all_a_players,
    position,
    points_rookie,
    fga,
    fg_pct,
    games_played,
    mpg,
    rebs,
    assists,
    steals,
    three_pt_attempts,
    three_pt_pct,
    ft_pct,
    points_sophomore,
))

df_A = pd.DataFrame(zipped, columns=column_names)

### Export to CSV

In [26]:
# Write the frame to 'A.csv' in the working directory. When a path is given,
# to_csv() returns None, so A_csv holds None rather than the CSV text.
A_csv = df_A.to_csv(path_or_buf='A.csv')