# In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def load_data():
    '''
    Loads the data set found on
    'https://www.basketball-reference.com/contracts/players.html'.

    Every <tr> of every <table> on the page is converted into a dictionary
    mapping each cell's 'data-stat' attribute to the cell's text. Only rows
    that yield exactly 11 entries are kept. The first kept row is used as the
    header: its keys form the header list, and every row equal to it
    (presumably the header repeated further down the table) is dropped from
    the returned data.

    Returns a header list and a list of dictionaries as
    (header, list_of_dictionaries). If no row on the page qualifies,
    ([], []) is returned.

    Any error raised by urlopen while fetching the page is propagated.
    '''
    # specify the url
    quote_page = 'https://www.basketball-reference.com/contracts/players.html'

    # number of entries a row must produce to be kept
    # NOTE(review): presumably the column count of the contracts table -
    # confirm against the live page if its layout changes
    expected_columns = 11

    # query the website and parse the html; the `with` block guarantees the
    # HTTP response is closed once BeautifulSoup has consumed it
    with urlopen(quote_page) as page:
        soup = BeautifulSoup(page, 'lxml')

    # iterates through the rows of every table in the soup
    data = []
    for table in soup.find_all('table'):
        for table_row in table.find_all('tr'):
            row = {}  # stores the row data for each row

            # gets all row information (header cells first, then data cells)
            for cell in table_row.find_all('th') + table_row.find_all('td'):
                stat = cell.get('data-stat')
                # a cell without a 'data-stat' attribute has no key to be
                # stored under, so it is skipped instead of raising KeyError
                if stat is not None:
                    row[stat] = cell.get_text()

            # checks to make sure the row is valid
            if len(row) == expected_columns:
                data.append(row)

    # nothing usable was found: return the documented shape, just empty
    if not data:
        return [], []

    # the first valid row supplies the header information
    top = data[0]
    header = list(top)

    # removes every row identical to the header row (including itself)
    revised_data = [row for row in data if row != top]

    return header, revised_data

def _parse_place_count(paragraph_text):
    '''
    Splits one birthplace paragraph into (name, count).

    The text is split on non-breaking spaces. The last piece is the count,
    with any parentheses removed; all preceding pieces are re-joined with
    ordinary spaces to form the name. When there is only a single piece the
    name is the empty string.

    Raises ValueError if the last piece is not an integer once the
    parentheses are removed.
    '''
    parts = paragraph_text.split('\xa0')
    name = ' '.join(parts[:-1])
    count = int(parts[-1].replace('(', '').replace(')', ''))
    return name, count


def load_data2():
    '''
    Loads the data set found on
    'https://www.basketball-reference.com/friv/birthplaces.fcgi'.

    Each <p> inside the div with id 'birthplace_1' becomes an entry whose
    country is 'United States of America' and whose 'city' is the name parsed
    from the paragraph. Each <p> inside the div with id 'birthplace_2'
    becomes an entry whose 'country' is the parsed name and whose 'city' is
    empty. In both cases 'num' is the integer count parsed from the
    paragraph.

    Returns a header list and a list of dictionaries as
    (header, list_of_dictionaries), where header is
    ['country', 'city', 'num'].

    Any error raised by urlopen while fetching the page is propagated, as is
    the ValueError raised when a paragraph's count is not an integer.
    '''
    # specify the url
    quote_page = 'https://www.basketball-reference.com/friv/birthplaces.fcgi'

    # query the website and parse the html; the `with` block guarantees the
    # HTTP response is closed once BeautifulSoup has consumed it
    with urlopen(quote_page) as page:
        soup = BeautifulSoup(page, 'lxml')

    players_data = []
    for div in soup.find_all('div'):
        # divs without an id yield None here and match neither branch
        id_name = div.get('id')

        # gets info for places in the US
        if id_name == 'birthplace_1':
            for p in div.find_all('p'):
                place, num = _parse_place_count(p.get_text())
                players_data.append({'country': 'United States of America', 'city': place, 'num': num})

        # gets info for all other countries
        elif id_name == 'birthplace_2':
            for p in div.find_all('p'):
                country, num = _parse_place_count(p.get_text())
                players_data.append({'country': country, 'city': '', 'num': num})

    return ['country', 'city', 'num'], players_data

# to get salary data: call load_data()
# to get country data: call load_data2()
