In [1]:
import pandas as pd
from glob import glob
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.kernel_ridge import KernelRidge
import warnings

In [2]:
def parse_conference_rank(row):
    """
    :return: Parse out conference rank and Team from standings on basketball reference.
    """
    team_split = row['Team'].split('(')
    row['conference_rank'] = int(team_split[1][0])
    if '*' in team_split[0]:
        row['Team'] = team_split[0][:-1]
    else:
        row['Team'] = team_split[0]

    return row

In [3]:
def get_standings(standings_df, year, i):
    """
    :param standings_df: Dataframe result from read_html of basketball reference
    :param year: between 2005 - 2016
    :param i: 1st df for each year is eastern conference, 2nd is western conference
    :return: Clean up data and add columns for conference, year, team and rank.
    """
    if i == 0:
        conf = 'Eastern'
        val = 1
    else:
        conf = 'Western'
        val = 0

    standings_df['eastern_conference'] = val
    standings_df['year'] = year
    standings_df.dropna(inplace=True)
    standings_df.rename(columns={'{} Conference'.format(conf): 'Team'}, inplace=True)
    standings_df['Team'] = standings_df['Team'].str.replace(u'\xa0', u'')
    standings_df.drop('PS/G', axis=1, inplace=True)

    return standings_df.apply(parse_conference_rank, axis=1)

In [4]:
def get_all_standings():
    """
    :return: Pull yearly standings data from basketball reference and return one
    concatenated DataFrame.
    """
    standing_dfs = []
    for year in range(2005, 2017):
        url = 'http://www.basketball-reference.com/leagues/NBA_{}.html'.format(year)
        eastern_western_standings = pd.read_html(url)
        for i, standings_df in enumerate(eastern_western_standings[:2]):
            standings_df = get_standings(standings_df, year, i)
            standing_dfs.append(standings_df)

    return pd.concat(standing_dfs)

In [5]:
def clean_up_team_df(folder):
    """
    :param folder: Data is manually downloaded from basketball reference because the web-scraping
     is too tedious.
    :return: Grab offensive and defensive stats for each team for each year and clean up the data
     and apply correct data types.
    """
    team_dfs = []
    files = glob(folder + '/*.csv')
    for file in files:
        team_df = pd.read_csv(file)
        off_def = file.split('season')
        team_df.dropna(inplace=True)
        team_df = team_df.convert_objects(convert_numeric=True)

        if 'opp' in off_def[1]:
            team_df.drop(['G', 'MP'], axis=1, inplace=True)
            team_df.rename(columns={'PA/G': 'PA_G'})
            team_df.columns = ['Def_Rank', 'Team'] + ['{}_opp'.format(col) for col in team_df.columns[2:]]

        else:
            team_df.rename(columns={'Rank': 'Off_Rank'})
            try:
                team_df['PS_G'] = team_df['PS_G'].str.replace("\\", '')
            except AttributeError:
                pass
            team_df['playoff_appearance'] = team_df.apply(lambda row: 1 if '*' in row['Team'] else 0, axis=1)

        team_df['year'] = int(file.split('yearly_data/')[-1][:4])
        # use better method than convert_objects
        team_df['Team'] = team_df['Team'].str.replace('*', '')
        team_df.columns = team_df.columns.str.replace('%', '_Perc')

        team_dfs.append(team_df)

    return pd.merge(team_dfs[0], team_dfs[1], how='left', on=['Team', 'year'])

In [6]:
def get_team_detailed_stats():
    """
    :return: Root folder of where all detailed stats live in hard drive. Concatenate
     each year of data into one single DataFrame.
    """
    root_folder = 'yearly_data/*'
    year_folders = glob(root_folder)

    return pd.concat([clean_up_team_df(folder) for folder in year_folders])

In [7]:
def get_all_stars():
    """
    :return: Additional data about all stars each year available on basketball reference.
    Loop through and pull that data.
    """
    all_star_dfs = []
    for year in range(2005, 2017):
        all_star_url = 'http://www.basketball-reference.com/allstar/NBA_{}.html'.format(year)
        west_east = pd.concat([df[['Unnamed: 0', 'Totals']] for df in pd.read_html(all_star_url)[1:]])
        west_east = west_east[west_east['Unnamed: 0'] != 'Reserves'].dropna()
        west_east['all_star_count'] = west_east.groupby('Totals')['Totals'].transform('count')
        west_east = west_east[['Totals', 'all_star_count']].drop_duplicates()
        west_east['year'] = year
        all_star_dfs.append(west_east)

    all_star_df = pd.concat(all_star_dfs)
    all_star_df.rename(columns={'Totals': 'Team Abbrev'}, inplace=True)
    team_mapping = pd.read_csv('team_mapping.csv')
    all_star_df = pd.merge(all_star_df, team_mapping, how='left', on='Team Abbrev')

    return all_star_df.drop('Team Abbrev', axis=1)

In [8]:
def get_all_nba():
    """
    :return: Manually downloaded data for each year's first all nba team to add to features.
    """
    first_all_nba = pd.read_csv('all_nba.csv')
    first_all_nba['all_nba_count'] = first_all_nba.groupby(['year', 'Team'])['Team'].transform('count')

    return first_all_nba.drop('player', axis=1)

In [9]:
def combine_standings_metrics(total_standings_df, team_detailed_df, all_star_df, first_all_nba):
    """
    :param total_standings_df:
    :param team_detailed_df:
    :param all_star_df:
    :param first_all_nba:
    :return: Combine standings, offensive + defense statistics, all star, and first team all nba
    data into a single DataFrame. Drop any columns that are highly correlated or not useful in the model.
    Normalize some statistics to be at a per game basis.
    """
    combined_df = pd.merge(total_standings_df, team_detailed_df, how='left', on=['Team', 'year'])
    combined_df = pd.merge(combined_df, all_star_df, how='left', on=['Team', 'year'])
    combined_df = pd.merge(combined_df, first_all_nba, how='left', on=['Team', 'year'])

    combined_df['GB'] = combined_df['GB'].str.replace('—', '0')
    for col in ['GB', 'PS_G']:
        combined_df[col] = combined_df[col].astype(float)

    for orig_val, new_val in [['%', '_Perc'], ['/', '_'], ['3', 'three'], ['2', 'two']]:
        combined_df.columns = combined_df.columns.str.replace(orig_val, new_val)
    corr_cols = ['Rk', 'SRS', 'W', 'L', 'GB', 'PA_G', 'conference_rank', 'Rank', 'MP', 'FG', 'FGA', 'FG_Perc',
                 'FG_Perc_opp', 'threeP', 'threePA', 'twoP', 'twoPA', 'FT', 'FTA', 'TRB', 'PTS', 'Def_Rank',
                 'FG_opp', 'FGA_opp', 'threePA_opp', 'threeP_opp', 'twoP_opp', 'twoPA_opp', 'FT_opp', 'FTA_opp',
                 'TRB_opp', 'PTS_opp', 'PS_G', 'PA_G_opp', 'STL_opp', 'TOV_opp', 'eastern_conference',
                 'FT_Perc_opp', 'BLK_opp', 'AST', 'BLK', 'DRB', 'DRB_opp', 'AST_opp']
    combined_df.drop(corr_cols, axis=1, inplace=True)

    normalize_cols = ['ORB', 'STL', 'TOV', 'PF', 'ORB_opp', 'PF_opp']
    for col in normalize_cols:
        combined_df['{}_G'.format(col)] = combined_df[col] / combined_df['G']

    combined_df.drop(normalize_cols + ['G'], axis=1, inplace=True)
    combined_df['next_year_wl_perc'] = combined_df.groupby(['Team'])['W_L_Perc'].shift(-1)
    
    for col in ['all_star_count', 'all_nba_count']:
        combined_df[col] = combined_df[col].fillna(0)

    return combined_df.drop(['Team', 'W_L_Perc'], axis=1).dropna()

In [10]:
def random_forest(combined_df):
    """
    :return: Attempt random forest regression as quick model.
    """
    model = RandomForestRegressor(random_state=0, n_estimators=50)
    model_data = combined_df.drop(['next_year_wl_perc', 'year'], axis=1)
    y = combined_df.next_year_wl_perc

    model.fit(model_data, y)
    scores = cross_val_score(model, model_data, y)
    print('Random Forest Results:')
    print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

In [11]:
def polynomial_reg(combined_df):
    """
    :return: Build pipeline that will create polynomial regression observations (degree 2)
    and use a Kernel Ridge regression to create a predictive model.  Test the data using
    each year as an test to evaluate mean squared error.
    """
    train_y = combined_df['next_year_wl_perc']
    train_X = combined_df.drop(['next_year_wl_perc', 'year'], axis=1)
    log_cols = train_X.columns.tolist()
    for i in [5, 5, 5]:
        log_cols.pop(i)
#     train_X[log_cols] = train_X[log_cols].apply(np.log10)

    polynomial_features = PolynomialFeatures(degree=2, include_bias=False)
    linear_regression = KernelRidge(alpha=1)

    pipeline = Pipeline([("polynomial_features", polynomial_features), ("clf", linear_regression)])
    pipeline.fit(train_X, train_y)

    # Evaluate the models using cross validation
    scores = cross_val_score(pipeline, train_X, train_y, scoring="neg_mean_squared_error", cv=10)
    print('\n')
    print('Polynomial Regression Results:')
    print("Train MSE = {:.2e}(+/- {:.2e})".format(-scores.mean(), scores.std()))
    full_preds = []
    print('\n')
    for year in range(2005, 2016):
        test_X = combined_df[combined_df['year'] == year]
        test_y = test_X['next_year_wl_perc']
        test_X.drop(['next_year_wl_perc', 'year'], axis=1, inplace=True)
        predicted = cross_val_predict(pipeline, test_X, test_y)
        predicted_scores = cross_val_score(pipeline, test_X, test_X, scoring="neg_mean_squared_error")

        last_season_wl = pd.DataFrame(predicted, columns=['Predict'.format(year + 1)])
        last_season_wl = pd.DataFrame(test_y).reset_index(drop=True).join(last_season_wl)
        last_season_wl['year'] = year

        for exist_col, new_col in zip(last_season_wl.columns, ['actual_games_won', 'predicted_games_won']):
            last_season_wl[new_col] = last_season_wl[exist_col] * 82
        full_preds.append(last_season_wl)

        print("{}: Test MSE = {:.2e}(+/- {:.2e})".format(year, -predicted_scores.mean(), predicted_scores.std()))

    preds_df = pd.concat(full_preds)
    preds_df['delta'] = abs(preds_df['actual_games_won'] - preds_df['predicted_games_won'])
    
    print('\n')
    print('2016 average prediction delta by games is {0:.2f}'.format(preds_df['delta'].mean()))

In [12]:
def main():

    warnings.filterwarnings('ignore')
    team_detailed_df = get_team_detailed_stats()
    total_standings_df = get_all_standings()
    all_star_df = get_all_stars()
    first_all_nba = get_all_nba()
    combined_df = combine_standings_metrics(total_standings_df, team_detailed_df, all_star_df, first_all_nba)

    # models
    random_forest(combined_df)
    polynomial_reg(combined_df)

In [13]:
if __name__ == '__main__':
    main()

Random Forest Results:
CV AUC [ 0.69536482  0.74823411  0.30539638], Average AUC 0.582998436645731


Polynomial Regression Results:
Train MSE = 8.67e-03(+/- 6.37e-03)


2005: Test MSE = 1.82e-01(+/- 1.04e-01)
2006: Test MSE = 2.71e-02(+/- 7.32e-03)
2007: Test MSE = 7.06e-02(+/- 2.84e-02)
2008: Test MSE = 2.66e-02(+/- 1.35e-02)
2009: Test MSE = 4.67e-02(+/- 2.75e-02)
2010: Test MSE = 4.22e-02(+/- 3.83e-02)
2011: Test MSE = 3.47e-02(+/- 7.66e-03)
2012: Test MSE = 4.22e-02(+/- 5.62e-03)
2013: Test MSE = 3.46e-02(+/- 8.32e-03)
2014: Test MSE = 2.49e-02(+/- 9.39e-03)
2015: Test MSE = 5.02e-02(+/- 3.86e-02)


2016 average prediction delta by games is 10.06
