In [14]:
import pandas as pd
import numpy as np
#one label per season, named by the year in which the season ends: '1991' ... '2019'
seasons = list(map(str, range(1991, 2020)))

#lookup from the 'YYYY–YY' season label (the separator is an en dash, U+2013, not a hyphen)
#to the matching ending-year label, e.g. '1999–00' -> '2000'
yearChange = {'{}–{:02d}'.format(endYear - 1, endYear % 100) : str(endYear)
              for endYear in range(1991, 2020)}

In [20]:
def getDF(frames):
    """Build one tidy table of All-NBA selections for the 1990–91 to 2018–19 seasons.

    Parameters
    ----------
    frames : list of pandas.DataFrame
        Tables scraped from the All-NBA Team page. The selections table is
        taken from index 7 and must have two-level column headers, with the
        season stored under ('Season', 'Season') and 'Teams' as the
        second-level label of every column to discard. Exactly four columns
        must remain once the 'Teams' columns are dropped.

    Returns
    -------
    pandas.DataFrame
        All-string columns 'Season', 'First Team', 'Second Team' and
        'Third Team' on a fresh 0..n-1 index. Season labels are translated
        through the module-level ``yearChange`` mapping (e.g. '1990–91' ->
        '1991'); labels missing from that mapping are left unchanged.
        Annotations are removed from the player cells, but the cells are not
        whitespace-trimmed, so a stripped suffix can leave a trailing space.
    """
    #scraping this page returns several dataframes, our table is the one at index 7;
    #that position is tied to the page layout at the time of writing
    table = frames[7]

    #keep the 1990-91 through 2018-19 seasons. Compare the numeric starting year instead of the raw
    #text, so the result does not depend on whether the page uses an en dash or a hyphen, nor on how
    #that character happens to sort. Labels that do not start with a number are dropped.
    startYear = pd.to_numeric(table['Season']['Season'].astype(str).str[:4], errors='coerce')
    table = table[startYear.between(1990, 2018)].copy()

    #remove the multi-level column headers, keeping only the second level
    table.columns = [column[1] for column in table.columns]

    #reset the index
    table = table.reset_index(drop=True)

    #teams are not a concern for us (the label is duplicated, so every 'Teams' column goes)
    table = table.drop('Teams', axis=1)

    #rename the columns for clarity and specificity
    table.columns = ['Season', 'First Team', 'Second Team', 'Third Team']

    #change the dataframe to strings, regex search the table and filter out the wikipedia annotations/footnotes/etc.
    table = table.astype(str)

    #removes '[x]' and '(x)' markers holding at most one character, then every remaining character
    #that is not an ASCII letter, a space, '|' or one of š đ č ć ž (either case)
    annotations = r'\[.?\]|\(.?\)|[^a-z A-Z|šđčćž|ŠĐČĆŽ]'
    for column in table.columns[1:]:
        #regex=True must be explicit: pandas 2.0+ treats the pattern as a literal string by default,
        #which would leave every annotation in place
        table[column] = table[column].str.replace(annotations, '', regex=True)

    #rename each season from the format eg: 1990–91 --> 1991
    table = table.replace({'Season' : yearChange})

    #return dataframe
    return table

In [21]:
def createCSVs(savePath='/Users/pranav/nba_allNBA_predictor/all_nba_teams/'):
    """Scrape the All-NBA Team page and write one CSV of selections per season.

    Each file is named '<season>_allNBAteams.csv' and holds that season's
    rows without the 'Season' column and without the index.

    Parameters
    ----------
    savePath : str, optional
        Directory that receives the files; it is created if it does not
        exist. Defaults to the location this notebook originally wrote to.

    Raises
    ------
    KeyError
        If any entry of the module-level ``seasons`` list has no rows in the
        scraped table. The check runs before any file is written.
    """
    #local import so this cell does not depend on an edit to the notebook's import cell
    import os

    #using wikipedia to get all our data in one table through pandas
    DFs = pd.read_html('https://en.wikipedia.org/wiki/All-NBA_Team')

    #run getDF on the frames to get our output and group the dataframe into seasons
    teams = getDF(DFs)
    yearly = teams.groupby(teams.Season)

    #fail up front if a season is missing, instead of part-way through the export with some files written
    missing = [season for season in seasons if season not in yearly.groups]
    if missing:
        raise KeyError('no All-NBA rows found for season(s): ' + ', '.join(missing))

    os.makedirs(savePath, exist_ok=True)

    #for each entry of seasons (1991 through 2019), export that season's rows without the season column
    for season in seasons:
        selections = yearly.get_group(season).drop('Season', axis=1)
        #os.path.join copes with savePath given with or without a trailing separator
        selections.to_csv(os.path.join(savePath, season + '_allNBAteams.csv'), index=False)

In [22]:
#run the export: scrape the All-NBA Team page and write one CSV per season
createCSVs()