# Applying Neural Networks to NBA Team Win Prediction

In [31]:
# import packages
import os
import numpy as np
import pandas as pd

### Data Cleaning and Aggregation

In [32]:
# gather the file name of every CSV found in the datasets folder
dataset_list = [
    entry for entry in os.listdir('./win_predictor_data/')
    if entry.endswith('.csv')
]

# load each CSV into a dictionary keyed by its 'Year Type' label, i.e. the
# part of the file name between the '- ' separator and the first '.'
datasets = {}
for dataset in dataset_list:
    data_file = pd.read_csv('./win_predictor_data/' + dataset)
    label_start = dataset.index('- ') + 1
    label_end = dataset.index('.')
    dataset_name = dataset[label_start:label_end].strip()
    print(dataset_name)
    datasets[dataset_name] = data_file

2018-2019 Team
2016-2017 Misc
2015-2016 Misc
2017-2018 Misc
2015-2016 Team
2017-2018 Team
2016-2017 Team
2018-2019 Misc


In [33]:
# function to remove asterisks from team names
def remove_asterisk(team):
    """
    Strips a single trailing asterisk from a team name
    @param team: the team name as a string
    @return the name without its last character when it ends with '*',
            otherwise the name unchanged
    """
    return team[:-1] if team.endswith("*") else team

In [42]:
def create_training_data(datasets):
    """
    Creates the training data using the basketball datasets.

    The datasets are visited in sorted key order. Datasets that share a year
    label (the part of the key before the first space) are merged on 'Team';
    the per-year frames are then concatenated and the unwanted columns dropped.
    The caller's DataFrames are NOT modified: asterisks are stripped from the
    'Team' column on a copy of each frame.

    @param datasets: A dictionary with year name & stat types as key (e.g. '2015-2016 Misc')
                     and value of a pandas dataframe containing all the data
    @return the training data (a list of 1-D numpy arrays, one per aggregated row
            except the first) and the training data labels (element 0 of each array)
    @raise ValueError: if datasets is empty
    @raise KeyError: if any of the unwanted columns is missing after aggregation
    """
    if not datasets:
        # pd.concat would raise ValueError on an empty list anyway; fail early
        # with a message that points at the actual cause.
        raise ValueError("datasets must contain at least one DataFrame")

    # holds one aggregated frame per year label (e.g. agg_datasets['2015-2016'])
    agg_datasets = {}

    # All the unwanted columns that won't be used in our neural network.
    # 'Rk_x' / 'Rk_y' are the suffixed names pd.merge gives to an 'Rk' column
    # that is present in both merged frames.
    unwanted_data = ['Rk_x', 'Team', 'Age', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'L', 'Arena', 'Attend.', 'Attend./G', 'Rk_y']

    # run through each dataset and add to dictionary
    for dataset in sorted(datasets):

        # separate year: everything before the first space in the key
        year_label = dataset[0 : dataset.index(' ')]

        # strip the trailing asterisk from team names on a copy, so that the
        # frames owned by the caller are left untouched
        original = datasets[dataset]
        cleaned = original.assign(Team=original['Team'].apply(remove_asterisk))

        # add dataset to agg_datasets dictionary, merging with the frame already
        # stored for this year when there is one
        if year_label in agg_datasets:
            agg_datasets[year_label] = pd.merge(agg_datasets[year_label], cleaned, on='Team')
        else:
            agg_datasets[year_label] = cleaned

    # Aggregate all the data into one data frame and drop the unwanted features
    agg_all_sets = pd.concat(list(agg_datasets.values()), ignore_index=True)
    agg_all_sets = agg_all_sets.drop(columns=unwanted_data)

    # Convert the aggregated data into a training set: one numpy array per row.
    # NOTE(review): the [1:] slice discards the first aggregated row; it is kept
    # to preserve existing behaviour, but the reason is not evident here --
    # confirm this row is really meant to be excluded.
    training_data = list(agg_all_sets.to_numpy())[1:]

    # Get the training data labels (Wins) from the training_data.
    # NOTE(review): this assumes the first remaining column is the win count, and
    # that value also stays inside each training_data row -- TODO confirm the
    # label should not be removed from the features before training.
    training_data_labels = [team[0] for team in training_data]

    return training_data, training_data_labels

    



121


[array([6.700e+01, 1.103e+02, 9.900e+01, 1.130e+01, 9.380e+01, 2.460e-01,
        2.230e-01, 5.640e-01, 5.260e-01, 1.240e+01, 2.300e+01, 1.970e-01,
        4.770e-01, 1.410e+01, 7.910e+01, 1.820e-01, 8.200e+01, 2.403e+02,
        4.010e+01, 8.290e+01, 4.840e-01, 7.000e+00, 1.850e+01, 3.750e-01,
        3.320e+01, 6.440e+01, 5.150e-01, 1.640e+01, 2.040e+01, 8.030e-01,
        9.400e+00, 3.450e+01, 4.390e+01, 2.450e+01, 8.300e+00, 5.900e+00,
        1.310e+01, 1.750e+01, 1.035e+02]),
 array([5.500e+01, 1.131e+02, 1.056e+02, 7.500e+00, 9.670e+01, 2.920e-01,
        2.750e-01, 5.650e-01, 5.240e-01, 1.400e+01, 3.110e+01, 2.280e-01,
        4.840e-01, 1.170e+01, 7.600e+01, 2.050e-01, 8.200e+01, 2.418e+02,
        4.110e+01, 8.640e+01, 4.760e-01, 8.300e+00, 2.370e+01, 3.490e-01,
        3.290e+01, 6.260e+01, 5.240e-01, 1.970e+01, 2.520e+01, 7.820e-01,
        1.310e+01, 3.560e+01, 4.860e+01, 2.300e+01, 7.400e+00, 5.900e+00,
        1.590e+01, 2.060e+01, 1.102e+02]),
 array([5.700e+01, 1.109e+

ToDos
- Convert the data into the proper format for the neural network
- Research neural network architectures for prediction (watch tutorials/videos)
- Set up a PyTorch/Keras framework to create the neural network
- EDA (optional)