### Collect and Explore NBA Draft Data

# Exploratory analysis

In [38]:
import pandas as pd
import numpy as np

## Example

In [31]:
# Worked example: the 2009 draft class.
# header=[1] asks pandas to build the column names from row 1 of the table.
html_2009 = pd.read_html(
    'https://www.basketball-reference.com/draft/NBA_2009.html#stats',
    header=[1],
)

df_2009_draft = (
    html_2009[0]
    # Keep only the identity columns and the stats of interest
    .loc[:, ['Player', 'College', 'G', 'MP.1', 'PTS.1',
             'TRB.1', 'AST.1', 'FG%', '3P%', 'FT%']]
    # Discard any player with a missing value in one of those columns
    .dropna()
    # Row labels 30 and 31 hold the second-round headers, not players
    .drop(index=[30, 31])
)
df_2009_draft

Unnamed: 0,Player,College,G,MP.1,PTS.1,TRB.1,AST.1,FG%,3P%,FT%
0,Blake Griffin,Oklahoma,721,33.0,19.9,8.2,4.1,0.493,0.327,0.696
2,James Harden,Arizona State,932,34.6,25.0,5.6,6.8,0.443,0.362,0.859
3,Tyreke Evans,Memphis,594,30.7,15.7,4.6,4.8,0.44,0.323,0.757
5,Jonny Flynn,Syracuse,163,22.9,9.2,1.9,3.9,0.4,0.338,0.809
6,Stephen Curry,Davidson,826,34.3,24.3,4.6,6.5,0.473,0.428,0.908
7,Jordan Hill,Arizona,409,18.8,7.9,5.8,0.8,0.497,0.136,0.699
8,DeMar DeRozan,USC,947,34.2,20.7,4.4,3.9,0.464,0.287,0.836
10,Terrence Williams,Louisville,153,19.1,7.1,3.6,2.4,0.412,0.317,0.659
11,Gerald Henderson,Duke,535,25.9,11.2,3.2,1.9,0.44,0.327,0.793
12,Tyler Hansbrough,UNC,428,16.9,6.7,4.2,0.4,0.439,0.136,0.738


## Filtering and Cleaning

### Process Function

In [85]:
def processNBADraftHTML(df_html):
    """Clean one scraped NBA draft table into a typed, filtered DataFrame.

    Parameters
    ----------
    df_html : list of pandas.DataFrame
        Result of ``pd.read_html`` for a draft page; only the first table
        (``df_html[0]``) is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Player``, ``College``, ``G``, ``MP.1``, ``PTS.1``,
        ``TRB.1``, ``AST.1``, ``FG%``, ``3P%``, ``FT%``; stat columns are
        numeric. Only rows with ``G >= 25`` and ``MP.1 >= 10`` are kept.
        The original row labels are preserved.

    Raises
    ------
    ValueError
        If a row with a numeric ``G`` has a value in another stat column
        that cannot be parsed as a number.
    """
    keep_cols = ['Player', 'College', 'G', 'MP.1', 'PTS.1',
                 'TRB.1', 'AST.1', 'FG%', '3P%', 'FT%']
    stat_cols = keep_cols[2:]

    # Select the columns of interest and discard rows with any missing value
    ret_df = df_html[0][keep_cols].dropna()

    # Round-divider / repeated-header rows carry text in the stat columns.
    # Detect them by content rather than by the fixed labels 30 and 31, so
    # the function also works when those rows sit at other positions or are
    # absent altogether.
    is_player_row = pd.to_numeric(ret_df['G'], errors='coerce').notna()
    ret_df = ret_df[is_player_row].copy()

    # Convert to numeric types (strict: malformed player rows still raise)
    for col in stat_cols:
        ret_df[col] = pd.to_numeric(ret_df[col])

    # Keep players with at least 25 games and at least 10 in the MP.1 column
    ret_df = ret_df[ret_df['G'] >= 25]
    ret_df = ret_df[ret_df['MP.1'] >= 10]

    return ret_df

## Apply filtering and cleaning

In [86]:
# Draft classes to collect. Every page follows the same URL pattern, so the
# eight read/process pairs are expressed once as a loop.
DRAFT_YEARS = range(2014, 2022)
DRAFT_URL_TEMPLATE = 'https://www.basketball-reference.com/draft/NBA_{year}.html#stats'

# Raw read_html results and cleaned frames, keyed by draft year
html_by_year = {}
draft_df_by_year = {}
for year in DRAFT_YEARS:
    html_by_year[year] = pd.read_html(DRAFT_URL_TEMPLATE.format(year=year),
                                      header=[1])
    draft_df_by_year[year] = processNBADraftHTML(html_by_year[year])

# Keep the per-year variable names so cells that refer to them still work
(html_2014, html_2015, html_2016, html_2017,
 html_2018, html_2019, html_2020, html_2021) = (
    html_by_year[year] for year in DRAFT_YEARS)

(df_2014_draft, df_2015_draft, df_2016_draft, df_2017_draft,
 df_2018_draft, df_2019_draft, df_2020_draft, df_2021_draft) = (
    draft_df_by_year[year] for year in DRAFT_YEARS)

In [93]:
# Stack the per-year frames into a single table; ignore_index=True discards
# the per-frame row labels and renumbers the combined rows.
draft_frames = [
    df_2014_draft, df_2015_draft, df_2016_draft, df_2017_draft,
    df_2018_draft, df_2019_draft, df_2020_draft, df_2021_draft,
]
full_df = pd.concat(draft_frames, ignore_index=True)

# Show the combined table, a separator line, then its (rows, columns) shape
print(full_df, "--------------------", full_df.shape, sep="\n")

                 Player         College    G  MP.1  PTS.1  TRB.1  AST.1  \
0        Andrew Wiggins          Kansas  588  35.1   19.3    4.4    2.3   
1         Jabari Parker            Duke  310  27.5   14.1    5.5    2.0   
2           Joel Embiid          Kansas  318  31.1   25.7   11.3    3.3   
3          Aaron Gordon         Arizona  519  28.8   12.9    6.2    2.5   
4          Marcus Smart  Oklahoma State  511  29.8   10.4    3.6    4.4   
..                  ...             ...  ...   ...    ...    ...    ...   
270     Kessler Edwards      Pepperdine   37  21.8    6.1    3.8    0.7   
271       Dalano Banton        Nebraska   59  11.4    3.4    2.0    1.6   
272  Brandon Boston Jr.        Kentucky   49  14.4    6.5    2.1    0.9   
273          Luka Garza            Iowa   30  11.4    5.2    2.9    0.5   
274       Aaron Wiggins        Maryland   43  23.5    7.6    3.5    1.2   

       FG%    3P%    FT%  
0    0.448  0.349  0.724  
1    0.494  0.326  0.743  
2    0.487  0.335 

## Get List of Colleges

In [95]:
# Distinct college names found in the combined draft table
college_column = full_df[['College']]
uniqueColleges = np.unique(college_column)

# Report how many distinct colleges there are, followed by the names
print(uniqueColleges.size, uniqueColleges, sep="\n")

# Spot-check a single entry by position
uniqueColleges[41]

84
['Alabama' 'Arizona' 'Arizona State' 'Arkansas' 'Auburn' 'Baylor'
 'Belmont' 'Boise State' 'Boston College' 'Bowling Green' 'California'
 'Cincinnati' 'Clemson' 'Colorado' 'Creighton' 'Dayton' 'Duke' 'Florida'
 'Florida State' 'Georgia' 'Georgia Tech' 'Gonzaga' 'Houston' 'Illinois'
 'Indiana' 'Iowa' 'Iowa State' 'Kansas' 'Kansas State' 'Kentucky' 'LSU'
 'Louisiana' 'Louisville' 'Loyola (MD)' 'Maryland' 'Memphis' 'Miami (FL)'
 'Michigan' 'Michigan State' 'Missouri' 'Murray State' 'NC State'
 'Nebraska' 'Nevada' 'New Mexico State' 'Notre Dame' 'Ohio State'
 'Oklahoma' 'Oklahoma State' 'Oregon' 'Pepperdine' 'Pitt' 'Providence'
 'SMU' "Saint Joseph's" 'San Diego State' 'Seton Hall' 'South Carolina'
 'Stanford' 'Syracuse' 'TCU' 'Tennessee' 'Texas' 'Texas A&M' 'Texas Tech'
 'UCLA' 'UConn' 'UNC' 'UNLV' 'USC' 'Utah' 'VCU' 'Vanderbilt' 'Villanova'
 'Virginia' 'Virginia Tech' 'Wake Forest' 'Washington' 'Washington State'
 'West Virginia' 'Wichita State' 'Wisconsin' 'Wyoming' 'Xavier']


'NC State'

## Example College Data

In [125]:
# Alabama players
html_alabama = pd.read_html(
    'https://www.basketball-reference.com/friv/draft.fcgi?college=alabama#drafted_nba_aba_players',
    header=[1],
)

df_alabama = (
    html_alabama[0]
    # Keep only the identity columns and the stats of interest
    .loc[:, ['Player', 'Year', 'G', 'MP.1', 'PTS.1',
             'TRB.1', 'AST.1', 'FG%', '3P%', 'FT%']]
    # Remove rows whose Player cell is the literal column label
    .loc[lambda frame: frame['Player'] != 'Player']
    # Drop na values
    .dropna()
    # Coerce types
    .assign(Year=lambda frame: pd.to_numeric(frame['Year']))
    # Restrict to rows with Year of 2013 or later
    .loc[lambda frame: frame['Year'] >= 2013]
)

df_alabama.head()

Unnamed: 0,Player,Year,G,MP.1,PTS.1,TRB.1,AST.1,FG%,3P%,FT%
23,Herbert Jones,2021,68,30.1,9.7,3.9,2.2,0.493,0.35,0.851
25,Kira Lewis Jr.,2020,78,16.0,6.2,1.4,2.2,0.391,0.298,0.841
32,Joshua Primo,2021,40,17.6,5.2,2.3,1.4,0.369,0.298,0.762
38,Collin Sexton,2018,218,32.9,20.0,3.0,3.3,0.458,0.378,0.827


### Function for college preprocessing

In [126]:
def processCollegeHTML(link):
    """Read a college's drafted-players table and return the cleaned rows.

    Parameters
    ----------
    link : str
        Location passed straight to ``pd.read_html``; only the first table
        found there is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Player``, ``Year``, ``G``, ``MP.1``, ``PTS.1``, ``TRB.1``,
        ``AST.1``, ``FG%``, ``3P%``, ``FT%``, restricted to rows with no
        missing values and ``Year >= 2013``. ``Year`` is numeric; the other
        columns are left as read.
    """
    html_df = pd.read_html(link, header=[1])
    # The first table comes from the read_html result. The previous code
    # indexed the not-yet-assigned local `df`, which raised UnboundLocalError.
    df = html_df[0]

    df = df[['Player', 'Year', 'G', 'MP.1', 'PTS.1',
             'TRB.1', 'AST.1', 'FG%', '3P%', 'FT%']]
    # Remove rows whose Player cell is the literal column label
    df = df[df['Player'] != 'Player']

    # Drop na values; copy so the column assignment below acts on an
    # independent frame
    df = df.dropna().copy()

    # Coerce types
    df['Year'] = pd.to_numeric(df['Year'])

    df = df[df['Year'] >= 2013]

    # Hand the cleaned frame back to the caller (previously the function
    # ended with a bare `df.head()` and implicitly returned None).
    return df

#### Robbie Colleges

#### Vidhan Colleges