In [None]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import torch
import time

curr_directory = os.getcwd()
#
#
#
# Create index for all players + roster dictionaries
#
#
# Player Roster Information (Copied over from player_roster.ipynb)
teams = ['BOS','BRK','NYK','PHI','TOR','CHI','CLE','DET','IND','MIL','ATL','CHO','MIA','ORL','WAS',
         'DEN','MIN','OKC','POR','UTA','GSW','LAC','LAL','PHO','SAC','DAL','HOU','MEM','NOP','SAS']

# Dictionary of roster
# Ex. The roster of Boston Celtics players for the 2019-2020 season can be accessed using roster['BOS']['2019']
# It does not include any players/rookies for which there is no season data
roster = {team: {} for team in teams}

# Set of every player name that appears on any roster (no repeats)
all_players = set()

roster_directory = os.path.join(curr_directory, 'data_sets/player_roster')

# Sorted so the files are always processed in the same order.
for filename in sorted(os.listdir(roster_directory)):
    roster_path = os.path.join(roster_directory, filename)

    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints, .DS_Store);
    # handing those to read_csv would abort the whole cell.
    if filename.startswith('.') or not os.path.isfile(roster_path):
        continue

    data = pd.read_csv(roster_path)
    # The first four characters of the file name are used as the season key.
    year = filename[0:4]

    for team in teams:
        raw_names = data.loc[data['Tm'] == team, 'Player']
        # Keep only the text before the first backslash of each 'Player' entry.
        names = (raw_name.split('\\', 1)[0] for raw_name in raw_names)
        # dict.fromkeys drops repeated names while keeping first-seen order.
        roster[team][year] = list(dict.fromkeys(names))

        all_players.update(roster[team][year])

num_players = len(all_players)

# Player dictionary that maps all players to index
# NOTE(review): the index follows the set's iteration order, which can differ
# between kernel sessions, so a saved theta is only meaningful together with the
# player_index built in the same session.
player_index = dict(zip(list(all_players), range(len(all_players))))
#
#
#
# Data Processing
#
#
#
# Game data from 2014 - 2015 season to 2017-2018 season
game_data = pd.read_csv(os.path.join(curr_directory,'data_sets/nba.games.stats.csv'))

# Sort all values by the Date
game_data = game_data.sort_values(by=['Date'])

# Rows are now in chronological order. The score differential (Team minus
# Opponent) is the regression target; the first entry of each team pair is
# "team 1" and the second entry is "team 2".
game_results = np.array(list(game_data['TeamPoints'] - game_data['OpponentPoints']))
teams = np.array(list(zip(game_data.Team, game_data.Opponent)))
dates = np.array(list(game_data['Date']))

# Index of every row that mirrors an earlier row of the same date, i.e. the
# same two teams with Team/Opponent swapped. The rows are walked once, in
# order, so the indexes come out ascending and the de-duplicated arrays stay
# chronological (iterating over set(dates) gave an arbitrary date order, which
# broke the season-by-position slicing done in later cells).
repeat_indexes = []
seen_listings = set()

for row, (date, (team, opponent)) in enumerate(zip(dates, teams)):
    if (date, opponent, team) in seen_listings:
        repeat_indexes.append(row)
    seen_listings.add((date, team, opponent))

# An explicit integer dtype keeps the fancy indexing valid even if nothing matched.
repeat_indexes = np.array(repeat_indexes, dtype=int)

# Make new unique game results, teams and dates arrays
unique_game_results = game_results[repeat_indexes]
unique_teams = teams[repeat_indexes]
unique_dates = dates[repeat_indexes]
#
#
#
# Game data from the 2018-2019 season and the 2019-2020 season
game_data_2018 = pd.read_csv(
    os.path.join(curr_directory,'data_sets/game_data_2018_2019.csv')
)
game_data_2019 = pd.read_csv(
    os.path.join(curr_directory,'data_sets/game_data_2019_2020.csv')
)

# Drop the first four characters of every Date (the day-of-week abbreviation
# and the separator that follows it).
game_data_2018['Date'] = game_data_2018['Date'].str.slice(start=4)
game_data_2019['Date'] = game_data_2019['Date'].str.slice(start=4)

# Date conversion functions
def monthToNum(shortMonth):
    """Return the two-digit month number for an English month abbreviation.

    Args:
        shortMonth: one of 'Jan', 'Feb', ..., 'Dec'.

    Returns:
        The month as a zero-padded string, '01' through '12'.

    Raises:
        KeyError: if shortMonth is not one of the twelve abbreviations.
    """
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    month_numbers = {
        name: '{:02d}'.format(number)
        for number, name in enumerate(month_names, start=1)
    }
    return month_numbers[shortMonth]

def convert_dates(dataframe):
    """Rewrite dataframe['Date'] in place from 'Mon D YYYY' to 'YYYY-MM-DD'.

    Each value is split into month / day / year tokens instead of being cut at
    fixed character offsets, and the column is converted as a whole, so the
    frame may carry any index (the earlier version looked rows up by the
    labels 0..n-1 and failed on anything but a default RangeIndex). Commas
    between tokens are tolerated and missing values are left untouched.

    Args:
        dataframe: pandas DataFrame with a 'Date' column of strings such as
            'Oct 16 2018' or 'Apr 1 2019'.

    Returns:
        None; the 'Date' column of the given frame is replaced.

    Raises:
        ValueError: if a value does not consist of exactly three tokens.
        KeyError: if the month token is not a known abbreviation.
    """
    def to_iso(text):
        month, day, year = text.replace(',', ' ').split()
        # zfill pads single-digit days so every date has the same width.
        return year + '-' + monthToNum(month) + '-' + day.zfill(2)

    dataframe['Date'] = dataframe['Date'].map(to_iso, na_action='ignore')

# Rewrite the Date column of both seasons in place.
convert_dates(game_data_2018)
convert_dates(game_data_2019)

# 2018-2019 season: (visitor, home) pairs, visitor-minus-home margin, dates
teams_2018 = np.array(list(zip(game_data_2018['Visitor'], game_data_2018['Home'])))
game_results_2018 = np.array(
    (game_data_2018['Visitor PTS'] - game_data_2018['Home PTS']).tolist()
)
dates_2018 = np.array(game_data_2018['Date'].tolist())

# 2019-2020 season: same three arrays
teams_2019 = np.array(list(zip(game_data_2019['Visitor'], game_data_2019['Home'])))
game_results_2019 = np.array(
    (game_data_2019['Visitor PTS'] - game_data_2019['Home PTS']).tolist()
)
dates_2019 = np.array(game_data_2019['Date'].tolist())
#
#
#
# Combine all data into one dataset
#
#
#
# Order along the first axis: de-duplicated earlier seasons, then 2018-2019, then 2019-2020.
results_all = np.concatenate([unique_game_results, game_results_2018, game_results_2019])
dates_all = np.concatenate([unique_dates, dates_2018, dates_2019])
teams_all = np.concatenate([unique_teams, teams_2018, teams_2019])

In [None]:
# Win/loss classifier: logistic regression (sigmoid + cross-entropy); the class keeps the name LinearRegression
class LinearRegression:
    """Binary win/loss classifier over roster-indicator features.

    The class name is kept so existing cells keep working, but the model is a
    logistic regression: ``predict`` applies a sigmoid to ``x @ theta`` and
    ``fit`` minimises the summed binary cross-entropy with full-batch
    gradient descent.

    Feature construction (``process_data`` / ``x_for_game``) reads the
    notebook-level globals ``num_players``, ``roster`` and ``player_index``;
    ``playoff_prediction`` also reads ``curr_directory``.

    Attributes:
        theta: parameter column vector, or None until ``fit`` creates it.
        step_size: gradient-descent step size.
        max_iter: upper bound on the number of gradient steps in ``fit``.
        eps: ``fit`` stops once the norm of an update drops below this value.
        verbose: when False, ``fit`` prints nothing.
        error_list: norm of the update applied at every iteration.
        training_acc: training accuracy after every iteration.
        dev_acc: dev accuracy after every iteration.
    """

    def __init__(self, step_size=1e-3, max_iter=10000, eps=1e-4, theta=None, 
                  verbose=True):
        
        self.theta = theta
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose
        self.error_list = []
        self.training_acc = []
        self.dev_acc = []

    def sigmoid(self, z): 
        """Return the element-wise logistic function of z."""
        return 1.0 / (1 + np.exp(-z))
        
    def predict(self, x):
        """Return sigmoid(x @ theta), the predicted probability that y = 1."""
        return self.sigmoid(x@self.theta)
    
    def loss_function_t(self, theta_t, x, y):
        """Return the summed binary cross-entropy as a torch scalar.

        Args:
            theta_t: torch tensor holding the parameters.
            x: feature matrix (numpy array or tensor).
            y: labels (numpy array or tensor).
        """
        EPS = 1e-8  # keeps log() finite when p is exactly 0 or 1
        # as_tensor avoids copying the whole design matrix on every iteration.
        x = torch.as_tensor(x)
        y = torch.as_tensor(y)
        p = torch.sigmoid(x @ theta_t)
        return -1.*((y * torch.log(p + EPS) + (1-y) * torch.log(1 - p + EPS)).sum())
    
    def pytorch_gradient(self, x, y):
        """Return the gradient of the loss at self.theta as a numpy array (autograd)."""
        theta_t = torch.tensor(self.theta, requires_grad=True)
        self.loss_function_t(theta_t, x, y).backward()
        return theta_t.grad.numpy()
    
    def gradBatchLossFunction(self, x_teams, y_teams):
        """Return the closed-form cross-entropy gradient X^T (sigmoid(X theta) - y).

        Args:
            x_teams: feature matrix, one game per row.
            y_teams: labels, one per row of x_teams.

        Returns:
            Column vector with one entry per feature.
        """
        x_teams = np.atleast_2d(np.asarray(x_teams, dtype=float))
        y_column = np.asarray(y_teams, dtype=float).reshape(-1, 1)
        residual = self.predict(x_teams).reshape(-1, 1) - y_column
        return x_teams.T @ residual
    
    def fit(self, x, y, dev_x, dev_y):
        """Run gradient descent on (x, y), tracking train and dev accuracy.

        Stops after max_iter steps or once the norm of an update falls below
        eps. The accuracy curves are plotted, shown and saved to
        '<step_size>_linear.png'.

        Args:
            x, y: training features (with intercept column) and labels.
            dev_x, dev_y: dev features and labels, used for monitoring only.
        """
        iterations = 0
        abs_error = 1
        
        if self.theta is None: 
            # One parameter per feature column of x.
            self.theta = np.zeros((x.shape[1], 1))
        
        while iterations < self.max_iter and abs_error >= self.eps:
            error = self.step_size*self.pytorch_gradient(x, y)
            abs_error = np.linalg.norm(error, 2)
            self.error_list.append(abs_error)

            self.theta = self.theta - error
            
            train_accuracy = self.training_predict(x, y)
            self.training_acc.append(train_accuracy)
            dev_accuracy = self.training_predict(dev_x, dev_y)
            self.dev_acc.append(dev_accuracy)

            if self.verbose and iterations % 100 == 0: 
                print('Error {}: {}'.format(iterations, abs_error))
                print('Training Accuracy: {}'.format(train_accuracy))
                print('Dev Accuracy: {}'.format(dev_accuracy))

            iterations += 1
        
        if self.verbose:
            if abs_error < self.eps:
                print('Convergence!')
            else:
                # The loop ran out of iterations; do not report convergence.
                print('Stopped after {} iterations without reaching eps = {}'.format(
                    iterations, self.eps))

        self._plot_accuracy_history()

    def _plot_accuracy_history(self):
        """Plot, save and show the recorded training/dev accuracy curves."""
        # The seaborn style sheets were renamed in newer Matplotlib releases;
        # use whichever name this installation provides.
        for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        plt.plot(self.training_acc, color = 'firebrick', label='Training Accuracy')
        plt.plot(self.dev_acc, color = 'teal', label='Dev Accuracy')
        plt.legend(loc='lower right')
        plt.xlabel('Iterations')
        plt.ylabel('Accuracy')
        plt.savefig('{}_linear.png'.format(self.step_size), dpi=300)
        plt.show()
        
    def process_data(self, teams, dates, results): 
        """Build the feature matrix and label vector for a set of games.

        Args:
            teams: array of (team 1, team 2) pairs, one per game.
            dates: array of 'YYYY-MM-DD' strings, one per game.
            results: score differentials (team 1 minus team 2), one per game.

        Returns:
            (x, y): x has an intercept column followed by the team-1 and
            team-2 roster indicators; y is a column of 1.0 where the
            differential is positive and 0.0 otherwise.
        """
        num_games = teams.shape[0]

        # Create x for all games
        # To access x for 0th game -- x[0, :] 
        x_without_intercept = np.zeros((num_games, 2*num_players))
        
        for i in range(num_games): 
            team_1_indicator, team_2_indicator = self.x_for_game(teams[i], dates[i])
            x_without_intercept[i, :] = np.vstack((team_1_indicator, team_2_indicator))[:, 0]
            
        x = self.add_intercept(x_without_intercept)
        
        # Create y for all games (if team A wins, y = 1; if team B wins, y = 0)
        y = (np.asarray(results)[:num_games] > 0).astype(float).reshape(-1, 1)
                
        return x, y
    
    def add_intercept(self, x): 
        """Return a copy of x with a leading column of ones."""
        return np.hstack((np.ones((x.shape[0], 1)), x))
        
    def x_for_game(self, teams, date): 
        """Return roster indicator columns for the two teams of one game.

        Args:
            teams: pair (team 1, team 2) of roster keys.
            date: 'YYYY-MM-DD' string of the game.

        Returns:
            Two (num_players, 1) arrays with a 1 at the index of every player
            on the respective team's roster for that season.
        """
        # Games played before September are looked up under the previous
        # calendar year's roster.
        if int(date[5:7]) < 9: 
            year = str(int(date[0:4]) - 1)
        else: 
            year = date[0:4]

        indicators = []
        for team in (teams[0], teams[1]):
            indicator = np.zeros((num_players, 1))
            for item in roster[team][year]: 
                indicator[player_index[item]] = 1
            indicators.append(indicator)

        return indicators[0], indicators[1]
    
    def training_predict(self, test_x, test_y): 
        """Return the accuracy of thresholded predictions on prepared features.

        Args:
            test_x: feature matrix including the intercept column.
            test_y: labels, one per row of test_x.

        Returns:
            Fraction of rows where (probability > 0.5) equals the label.
        """
        probabilities = self.predict(np.atleast_2d(test_x)).reshape(-1)
        predicted_y = (probabilities > 0.5).astype(int)
        return np.mean(predicted_y == np.asarray(test_y).reshape(-1))
    
    def general_predict(self, teams, dates, results): 
        """Build features for the given games and return the prediction accuracy."""
        test_x, test_y = self.process_data(teams, dates, results)
        return self.training_predict(test_x, test_y)
    
    def playoff_prediction(self, playoff_filename, playoff_date): 
        """Return the accuracy of predicting playoff series winners.

        Games are grouped by team pairing; each pairing's net tally is +1 for
        every game won by the team listed first when the pairing was first
        seen and -1 otherwise. A positive tally is treated as a win for that
        first-listed team.

        Args:
            playoff_filename: CSV path relative to curr_directory.
            playoff_date: 'YYYY-MM-DD' string used to pick the rosters for
                every series.

        Returns:
            Fraction of series whose winner is predicted correctly.
        """
        #Load playoff data
        playoff_data = pd.read_csv(os.path.join(curr_directory, playoff_filename))

        raw_playoff_results = np.array(list(playoff_data['PTS'] - playoff_data['PTS.1']))
        raw_playoff_team_pairs = np.array(list(zip(playoff_data['Visitor/Neutral'], playoff_data['Home/Neutral'])))

        playoff_pairs = {}

        for (team_1, team_2), margin in zip(raw_playoff_team_pairs, raw_playoff_results):
            first_listed_won = margin > 0
            if (team_1, team_2) not in playoff_pairs and (team_2, team_1) in playoff_pairs:
                # Pairing already stored in the opposite orientation: flip the sign.
                key = (team_2, team_1)
                delta = -1 if first_listed_won else 1
            else:
                key = (team_1, team_2)
                delta = 1 if first_listed_won else -1
            playoff_pairs[key] = playoff_pairs.get(key, 0) + delta

        playoff_teams = np.array([[key[0], key[1]] for key in playoff_pairs])
        playoff_results = np.array(list(playoff_pairs.values()))
        playoff_dates = np.array([playoff_date] * len(playoff_pairs))

        playoff_x, playoff_y = self.process_data(playoff_teams, playoff_dates, playoff_results)

        return self.training_predict(playoff_x, playoff_y)

In [None]:
teams_s, dates_s, results_s = shuffle(teams_all, dates_all, results_all, random_state=10)
# 64% training, 16% dev, 20% test of the first n_full shuffled games
n_full = 6500
n_train = n_full * 64 // 100   # 4160: end of the training slice (exclusive)
n_dev = n_full * 80 // 100     # 5200: end of the dev slice (exclusive)

# The slices are contiguous: Python slice ends are exclusive, so each split
# starts exactly where the previous one stopped. Starting at "end + 1" dropped
# one game at each boundary.
teams_train = teams_s[:n_train]
dates_train = dates_s[:n_train]
results_train = results_s[:n_train]

teams_dev = teams_s[n_train:n_dev]
dates_dev = dates_s[n_train:n_dev]
results_dev = results_s[n_train:n_dev]

teams_test = teams_s[n_dev:n_full]
dates_test = dates_s[n_dev:n_full]
results_test = results_s[n_dev:n_full]

In [None]:
# NOTE(review): "1e3" in these names is only a label; the step size used is 1e-5.
model_1e3 = LinearRegression(step_size=1e-5, eps=1e-4, max_iter=1000)

x_1e3, y_1e3= model_1e3.process_data(teams_train, dates_train, results_train)
dev_x_1e3, dev_y_1e3 = model_1e3.process_data(teams_dev, dates_dev, results_dev)
model_1e3.fit(x_1e3, y_1e3, dev_x_1e3, dev_y_1e3)

# Score on the feature matrices built above rather than re-featurizing the
# same games a second time through general_predict.
train_accuracy_1e3 = model_1e3.training_predict(x_1e3, y_1e3)
dev_accuracy_1e3 = model_1e3.training_predict(dev_x_1e3, dev_y_1e3)

print('Train Accuracy: {}'.format(train_accuracy_1e3))
print('Dev Accuracy: {}'.format(dev_accuracy_1e3))

# Persist the per-iteration curves recorded by fit().
# NOTE(review): the training-accuracy file says "1e2" while its siblings say
# "1e3"; left unchanged in case something reads that exact name - confirm.
np.savetxt('error_1e3_linear.txt', np.array(model_1e3.error_list), delimiter =',')
np.savetxt('training_acc_1e2_linear.txt', np.array(model_1e3.training_acc), delimiter =',')
np.savetxt('dev_acc_1e3_linear.txt', np.array(model_1e3.dev_acc), delimiter =',')

In [None]:
# Accuracy of the fitted model on the held-out test split.
test_accuracy_1e3 = model_1e3.general_predict(
    teams=teams_test,
    dates=dates_test,
    results=results_test,
)
print(test_accuracy_1e3)

In [None]:
# Train on the 2014-2015 season, test on the 2015-2016 season
train_2014 = LinearRegression(step_size=1e-5, max_iter = 1000)

# The slicing below treats every block of 1230 rows of teams_all as one season.
# Slice ends are exclusive, so boundaries are shared between neighbouring
# slices; the earlier 0:1229 / 984:1229 / 1230:2459 ranges each lost one game.
season_games = 1230
n_fit_14 = 984   # 80% of one season used for fitting, the rest for dev

# NOTE(review): this rebinds the *_train names produced by the shuffled split.
teams_train = teams_all[0:season_games]
dates_train = dates_all[0:season_games]
results_train = results_all[0:season_games]

teams_14, dates_14, results_14 = shuffle(teams_train, dates_train, results_train, random_state=0)
x_train_14, y_train_14 = train_2014.process_data(teams_14[0:n_fit_14], dates_14[0:n_fit_14], results_14[0:n_fit_14])
dev_x_14, dev_y_14 = train_2014.process_data(teams_14[n_fit_14:season_games], dates_14[n_fit_14:season_games], results_14[n_fit_14:season_games])
train_2014.fit(x_train_14, y_train_14, dev_x_14, dev_y_14)

dev_accuracy_14 = train_2014.training_predict(dev_x_14, dev_y_14)
print('Dev Accuracy: {}'.format(dev_accuracy_14))

# Held-out evaluation on the following block of season_games rows.
prediction_2014 = train_2014.general_predict(teams_all[season_games:2 * season_games], dates_all[season_games:2 * season_games], results_all[season_games:2 * season_games])
print('Test Accuracy: {}'.format(prediction_2014))

In [None]:
# Train on rows 0-6149 of teams_all (five blocks of 1230 games), test on rows 6150-6499
# NOTE(review): presumably seasons 2014-2015 through 2018-2019 versus the
# 2019-2020 games - confirm against the row counts of the source files.
train_2014 = LinearRegression(step_size=1e-5, max_iter = 1000)

# Slice ends are exclusive, so boundaries are shared between neighbouring
# slices; the earlier 0:6149 / 4920:6149 ranges each lost one game.
n_train_rows = 6150   # 5 * 1230
n_fit_rows = 4920     # 80% of n_train_rows used for fitting, the rest for dev
n_last_row = 6500

# NOTE(review): this rebinds the *_train names produced by earlier cells.
teams_train = teams_all[0:n_train_rows]
dates_train = dates_all[0:n_train_rows]
results_train = results_all[0:n_train_rows]

teams_14, dates_14, results_14 = shuffle(teams_train, dates_train, results_train, random_state=0)
x_train_14, y_train_14 = train_2014.process_data(teams_14[0:n_fit_rows], dates_14[0:n_fit_rows], results_14[0:n_fit_rows])
dev_x_14, dev_y_14 = train_2014.process_data(teams_14[n_fit_rows:n_train_rows], dates_14[n_fit_rows:n_train_rows], results_14[n_fit_rows:n_train_rows])
train_2014.fit(x_train_14, y_train_14, dev_x_14, dev_y_14)

dev_accuracy_14 = train_2014.training_predict(dev_x_14, dev_y_14)
print('Dev Accuracy: {}'.format(dev_accuracy_14))

# Held-out evaluation on the rows after the training block.
prediction_2014 = train_2014.general_predict(teams_all[n_train_rows:n_last_row], dates_all[n_train_rows:n_last_row], results_all[n_train_rows:n_last_row])
print('Test Accuracy: {}'.format(prediction_2014))

In [None]:
# Parameter Examination -- doesn't really make a ton of sense to do this with linear regression...
# Positions of the 20 largest entries of theta, in ascending order of weight.
indices_2014 = np.argsort(train_2014.theta, axis=0)[-20:]
print(indices_2014)

# Translate each theta position into a player name. theta[0] is the intercept,
# positions 1..num_players are the team-1 indicators and the following
# num_players positions are the team-2 indicators, so the player index is
# (position - 1) % num_players. The names are derived from this run's
# indices_2014 and player_index rather than from positions copied out of an
# earlier run.
player_by_index = {index: name for name, index in player_index.items()}

for theta_position in indices_2014.ravel():
    theta_position = int(theta_position)
    if theta_position == 0:
        print('(intercept)')
        continue
    slot = 'team 1' if theta_position <= num_players else 'team 2'
    print('{} ({})'.format(player_by_index[(theta_position - 1) % num_players], slot))