In [2]:
%matplotlib inline
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display


In [3]:
def _fetch_draft_stats_table(url, timeout):
    """Download ``url`` and return the element with id="stats" from the parsed page.

    Raises requests.HTTPError on a non-2xx response and ValueError when the
    page contains no element with that id.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    table = BeautifulSoup(response.text, 'lxml').find(id="stats")
    if table is None:
        raise ValueError("no element with id='stats' found at %s" % url)
    return table


def _extract_draft_rows(table):
    """Return the <td> texts of every <tbody> row that has at least one <td>."""
    data_rows = []
    for tr in table.find('tbody').find_all('tr'):
        cells = [td.get_text() for td in tr.find_all('td')]
        # Rows without any <td> would otherwise become all-NaN rows in the frame.
        if cells:
            data_rows.append(cells)
    return data_rows


def getDraftees(max_offset=1600, page_size=100, timeout=30):
    """Scrape the basketball-reference draft finder (drafts from 1989 on) into a DataFrame.

    The first results page supplies the column names and the first batch of
    rows; the remaining pages are requested with ``offset=page_size``,
    ``2 * page_size``, ... up to and including ``max_offset`` and contribute
    data rows only.

    Parameters
    ----------
    max_offset : int
        Largest ``offset`` query value to request (default 1600).
    page_size : int
        Step between consecutive offsets (default 100).
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row; columns that parse entirely as numbers
        are converted to numeric dtypes, all others are left as text.

    Raises
    ------
    requests.HTTPError
        If any page answers with an error status.
    ValueError
        If a page has no element with id="stats".
    """
    first_url = "https://www.basketball-reference.com/play-index/draft_finder.cgi?request=1&year_min=1989&college_id=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=year_id"
    offset_url = "https://www.basketball-reference.com/play-index/draft_finder.cgi?request=1&year_min=1989&year_max=&round_min=&round_max=&pick_overall_min=&pick_overall_max=&franch_id=&college_id=0&is_active=&is_hof=&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=year_id&order_by_asc=&offset=%d"

    first_table = _fetch_draft_stats_table(first_url, timeout)

    # Column names come from every header row except the first one.
    header_rows = first_table.find('thead').find_all('tr')[1:]
    colNames = [th.get_text() for tr in header_rows for th in tr.find_all('th')]
    # The first label is dropped because data rows are built from <td> cells only;
    # presumably the leading column is rendered as <th> in the body -- confirm
    # against the page.
    colNames = colNames[1:]

    stats = _extract_draft_rows(first_table)

    # Remaining pages: data rows only.
    for offset in range(page_size, max_offset + 1, page_size):
        page_table = _fetch_draft_stats_table(offset_url % (offset), timeout)
        stats.extend(_extract_draft_rows(page_table))

    draftDF = pd.DataFrame(stats, columns=colNames)

    def _numeric_or_unchanged(column):
        # Explicit replacement for the deprecated pd.to_numeric(errors='ignore'):
        # convert when the whole column parses, otherwise keep it as-is.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return draftDF.apply(_numeric_or_unchanged)

# One-time download: call getDraftees(), clean the result, and save it as a CSV. Later runs use the CSV instead of scraping again.
# draft = getDraftees()
# print(draft.shape)
# draftdf = draft[np.isfinite(draft['Year'])]
# print (draftdf.shape)
# draftdf[['Year','Rd','Pk']] = draftdf[['Year','Rd','Pk']].apply(pd.to_numeric,errors="coerce",downcast="integer")
# draftdf.to_csv("nba-draft-1989-to-2017.csv")

In [63]:
def _fetch_cbb_player_page(player, timeout):
    """Download and parse the sports-reference college page for one player slug."""
    url = "http://www.sports-reference.com/cbb/players/%s-1.html" % (player)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _collect_cbb_awards(soup):
    """Return one ', '-joined string of all award texts found on a player page."""
    award_cells = soup.find_all('td', {'data-stat': 'awards'})
    collected = []

    # Visible cell text: ';'-separated entries, with any "..." marker removed.
    for cell in award_cells:
        for entry in cell.get_text().strip().split(";"):
            collected.append(entry.replace("...", "").strip())

    # Extra entries held in the 'tip' attribute of a <span> inside the cell.
    for cell in award_cells:
        span = cell.find("span")
        tip_text = span.get('tip') if span else None
        if not tip_text:
            continue
        tip_parts = tip_text.split(":")
        if len(tip_parts) < 2:
            # No "label: items" structure, so there is nothing to extract.
            continue
        for entry in tip_parts[1].strip().split(";"):
            collected.append(entry.strip())

    # Drop empty strings left over from splitting.
    return ', '.join(entry for entry in collected if entry)


def getCBBStats(players, timeout=30):
    """Scrape career per-game college stats and awards for each player slug.

    Each player's page is downloaded exactly once; the column names are taken
    from the header of the first player's ``players_per_game`` table.

    Parameters
    ----------
    players : iterable of str
        Player slugs as used in the sports-reference URL
        (e.g. "doug-mcdermott").
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per player, built from the table footer cells, with the last
        cell replaced by the joined awards text. The second-to-last column and
        the "Season" and "Conf" columns are removed. An empty DataFrame is
        returned when ``players`` is empty.

    Raises
    ------
    requests.HTTPError
        If a player page answers with an error status.
    ValueError
        If a page has no element with id="players_per_game".
    """
    players = list(players)
    if not players:
        # Without a page there is no header to read column names from.
        return pd.DataFrame()

    colNames = None
    stats = []
    for p in players:
        soup = _fetch_cbb_player_page(p, timeout)
        per_game = soup.find(id='players_per_game')
        if per_game is None:
            raise ValueError("no 'players_per_game' table found for player %r" % (p,))

        if colNames is None:
            # Headers are read once, from the first player's page.
            colNames = [col.get_text() for col in per_game.find('thead').find_all('th')]

        # Footer cells, prefixed with a 'Career' label for the leading column.
        statRow = ['Career']
        statRow.extend(cell.get_text() for cell in per_game.find('tfoot').find_all('td'))

        # The last cell is overwritten with the full awards string.
        statRow[-1] = _collect_cbb_awards(soup)
        stats.append(statRow)

    def _numeric_or_unchanged(column):
        # Explicit replacement for the deprecated pd.to_numeric(errors='ignore'):
        # convert when the whole column parses, otherwise keep it as-is.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    df = pd.DataFrame(stats, columns=colNames).apply(_numeric_or_unchanged)
    df = df.drop(columns=[df.columns[-2]])
    df = df.drop(columns=["Season", "Conf"])
    return df

# Try getCBBStats on two player slugs; the bare `myDf` on the last line makes
# the notebook render the resulting frame as the cell output.
myDf = getCBBStats(["doug-mcdermott", "tyler-hansbrough"])
myDf

Unnamed: 0,School,G,GS,MP,FG,FGA,FG%,2P,2PA,2P%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,SOS,Awards
0,Creighton,145,145,31.5,7.9,14.3,0.55,6.0,10.2,0.587,...,5.6,7.5,1.3,0.2,0.1,2.2,1.8,21.7,4.29,"MVC POY, MVC POY, AP POY, NABC POY, TSN POY, U..."
1,UNC,142,141,30.9,6.6,12.3,0.536,6.5,12.1,0.541,...,5.2,8.6,1.1,1.3,0.5,2.1,2.5,20.2,8.39,"AP POY, NABC POY, TSN POY, NCAA All-Tournament..."


In [None]:
# Load the three honors tables and give each table's Rk/From/To/Count columns
# a table-specific name as part of the same expression.
hof = pd.read_csv("NBA-HOF.csv").rename(
    columns={"Rk": "HOFRk", "From": "HOFFrom", "To": "HOFTo", "Count": "HOFSeasons"}
)
stars = pd.read_csv("NBA-ALLSTARS.csv").rename(
    columns={"Rk": "AllStarsRk", "From": "AllStarsFrom", "To": "AllStarsTo", "Count": "AllStarsCount"}
)
nbaTeam = pd.read_csv("nba-ALLNBA.csv").rename(
    columns={"Rk": "AllNBARk", "From": "AllNBAFrom", "To": "AllNBATo", "Count": "AllNBACount"}
)

# Preview the first rows of each table, in the same order as before.
display(hof.head(), stars.head(), nbaTeam.head())

In [None]:
# Left-join the All-Star and Hall-of-Fame tables onto the All-NBA table by
# player, keep only the All-NBA table's Tm/Lg columns (restored to their plain
# names), and cut each Player value at the first backslash.
nbaHonors = (
    nbaTeam
    .merge(stars, how='left', on=["Player"])
    .merge(hof, how="left", on=["Player"])
    .drop(columns=["Tm_y", "Lg_y", "Tm", "Lg"])
    .rename(columns={"Tm_x": "Tm", "Lg_x": "Lg"})
    .assign(Player=lambda merged: merged['Player'].str.split("\\").str[0])
)

# Show the dimensions first, then the combined table.
display(nbaHonors.shape, nbaHonors)