# Setup

## Google Drive

In [1]:
!pip install sportsreference
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os

# 1. Authenticate and create the PyDrive client.
# `auth` comes from google.colab, so this step presumably requires a Colab runtime.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials obtained through oauth2client.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the download cell uses to list and fetch the files.
drive = GoogleDrive(gauth)  

Collecting sportsreference
[?25l  Downloading https://files.pythonhosted.org/packages/4a/16/64f2181463018c00df5612cd3319a7cbf4403bd7b5c56ba8db1b9bf21a8d/sportsreference-0.4.7-py2.py3-none-any.whl (373kB)
[K     |▉                               | 10kB 19.6MB/s eta 0:00:01[K     |█▊                              | 20kB 3.1MB/s eta 0:00:01[K     |██▋                             | 30kB 4.5MB/s eta 0:00:01[K     |███▌                            | 40kB 3.0MB/s eta 0:00:01[K     |████▍                           | 51kB 3.7MB/s eta 0:00:01[K     |█████▎                          | 61kB 4.3MB/s eta 0:00:01[K     |██████▏                         | 71kB 5.0MB/s eta 0:00:01[K     |███████                         | 81kB 5.7MB/s eta 0:00:01[K     |███████▉                        | 92kB 6.3MB/s eta 0:00:01[K     |████████▊                       | 102kB 4.9MB/s eta 0:00:01[K     |█████████▋                      | 112kB 4.9MB/s eta 0:00:01[K     |██████████▌                     | 

## Import Packages

In [0]:
import pickle
import pandas as pd
import numpy as np
from sportsreference.nba.teams import Teams
from sportsreference.nba.roster import Roster
from sportsreference.nba.roster import Player
from sportsreference.nba.boxscore import Boxscore
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from statsmodels.formula.api import logit
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
# Global notebook configuration: silence every warning and fix NumPy's
# legacy global random seed so reruns draw the same random numbers.
warnings.filterwarnings(action="ignore")
np.random.seed(seed=123)

## Download files

In [0]:
# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('~/data/pickle/')
# exist_ok=True tolerates an already-existing directory (e.g. when this cell is
# re-run) but still raises on real failures such as permission errors, which a
# bare `except: pass` would silently hide.
os.makedirs(local_download_path, exist_ok=True)

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
# The query selects every file whose parent is the given Drive folder id.
file_list = drive.ListFile(
    {'q': "'1e8GS0L0xUXQDgiRorx__FQViQjHNza7c' in parents"}).GetList()

for f in file_list:
  # 3. Create & download by id.
  print('title: %s, id: %s' % (f['title'], f['id']))
  # Each file is saved under its Drive title inside the local directory.
  fname = os.path.join(local_download_path, f['title'])
  print('downloading to {}'.format(fname))
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)

title: nba19_schedule.pkl, id: 1dDDb1Z73WN-AdTQsM1uHz3I37J0-pkiO
downloading to /root/data/pickle/nba19_schedule.pkl
title: prob_df.pkl, id: 1hDp9GE3Bka9p9oiCbc8TrXFmnslxb2xz
downloading to /root/data/pickle/prob_df.pkl
title: Simulation_2019_20.xlsx, id: 1wUhiDOCQ1sYVab6GkbXzkBvXyBLbaCz4
downloading to /root/data/pickle/Simulation_2019_20.xlsx
title: logistic_model.pkl, id: 1r9655Pr_rfpAYjUj8Ev84OiNbhzc4TeB
downloading to /root/data/pickle/logistic_model.pkl
title: match_df_processed.pkl, id: 1ntpAAnbKO65Q_T7u-kec809CyURKXavp
downloading to /root/data/pickle/match_df_processed.pkl
title: all_players_data_all_season_processed.pkl, id: 1wuFPczmUxGikAqT-mKlu6uWhZXN9folv
downloading to /root/data/pickle/all_players_data_all_season_processed.pkl
title: all_players_data_all_season.pkl, id: 1Ngu5JnhvdPLE5VPo5LutFfZ2Q5uT9Lgg
downloading to /root/data/pickle/all_players_data_all_season.pkl
title: players_list.pkl, id: 17dLZoeKIEeAcS3ZffJf5kt_xEqUab6Gi
downloading to /root/data/pickle/players_l

# Load Data

## Load data from pickle

In [0]:
# retrieve players' stats data from pickle files
# Resolve the path from the same '~/data/pickle/' directory the download cell
# writes to, rather than hard-coding '/root/...', so the load also works when
# the home directory is not /root.
all_players_data_all_season_file = os.path.join(
    os.path.expanduser('~/data/pickle/'),
    'all_players_data_all_season_processed.pkl')
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(all_players_data_all_season_file, 'rb') as f:
  all_players_data = pickle.load(f)

# Data preparation

## Season data

In [0]:
# Keep only the rows whose 'season' value is one of the seasons listed here.
seasons_consider = [
    '2013-14', '2014-15', '2015-16', '2016-17',
    '2017-18', '2018-19', '2019-20',
]
all_players_data = all_players_data[all_players_data['season'].isin(seasons_consider)]

## Set index

In [0]:
# Key every row by (season, player_name, player_id); later cells select
# seasons and individual players through this MultiIndex.
all_players_data = all_players_data.set_index(
    keys=['season', 'player_name', 'player_id'],
)

## Select columns

In [0]:
# Performance columns kept as features. Commented-out entries are columns
# deliberately left out of the feature set; the order of the active entries
# determines the column order of the resulting frame.
perf_cols = [
    # 'assist_percentage',
    'assists',
    # 'block_percentage',
    # 'blocks',
    'box_plus_minus',
    'center_percentage',
    # 'defensive_box_plus_minus',
    # 'defensive_rebound_percentage',
    # 'defensive_rebounds',
    # 'defensive_win_shares',
    # 'dunks',
    'effective_field_goal_percentage',
    # 'field_goal_attempts',
    'field_goal_perc_sixteen_foot_plus_two_pointers',
    'field_goal_perc_ten_to_sixteen_feet',
    'field_goal_perc_three_to_ten_feet',
    # 'field_goal_perc_zero_to_three_feet',
    'field_goal_percentage',
    'field_goals',
    # 'free_throw_attempt_rate',
    'free_throw_attempts',
    'free_throw_percentage',
    'free_throws',
    'games_played',
    'games_started',
    # 'half_court_heaves',
    # 'half_court_heaves_made',
    'minutes_played',
    # 'offensive_box_plus_minus',
    # 'offensive_rebound_percentage',
    # 'offensive_rebounds',
    # 'offensive_win_shares',
    # 'percentage_field_goals_as_dunks',
    'percentage_of_three_pointers_from_corner',
    # 'percentage_shots_three_pointers',
    # 'percentage_shots_two_pointers',
    # 'percentage_sixteen_foot_plus_two_pointers',
    # 'percentage_ten_to_sixteen_footers',
    'percentage_three_to_ten_footers',
    # 'percentage_zero_to_three_footers',
    # 'personal_fouls',
    'player_efficiency_rating',
    'point_guard_percentage',
    'points',
    'power_forward_percentage',
    # 'shooting_distance',
    'shooting_guard_percentage',
    'small_forward_percentage',
    # 'steal_percentage',
    # 'steals',
    # 'three_point_attempt_rate',
    # 'three_point_attempts',
    # 'three_point_percentage',
    # 'three_point_shot_percentage_from_corner',
    # 'three_pointers',
    # 'three_pointers_assisted_percentage',
    # 'total_rebound_percentage',
    'total_rebounds',
    'true_shooting_percentage',
    'turnover_percentage',
    'turnovers',
    'two_point_attempts',
    # 'two_point_percentage',
    'two_pointers',
    'two_pointers_assisted_percentage',
    'usage_percentage',
    'win_shares',
    'win_shares_per_48_minutes',
]
all_players_data = all_players_data.loc[:, perf_cols]

## Train, validation, and test split

In [0]:
# Split by season using the first level of the MultiIndex.
# Train: the four earliest seasons.
train_seasons = ['2013-14', '2014-15', '2015-16', '2016-17']
Train = all_players_data.loc[train_seasons]
# Val: one held-out season each.
Val1 = all_players_data.loc[['2017-18']]
Val2 = all_players_data.loc[['2018-19']]
# Test: the latest season.
Test = all_players_data.loc[['2019-20']]

# KNN for MVP determination

## Scaling variables

In [0]:
# Fit the scaler on the training seasons only, then apply that same fitted
# transform to every split so validation/test data never influence the scaling.
scaler = StandardScaler().fit(Train)

# Standardised training, validation (x2) and test matrices, in that order.
Train_std, Val1_std, Val2_std, Test_std = (
    scaler.transform(split) for split in (Train, Val1, Val2, Test)
)

## Define Model

In [0]:
# Neighbour search that returns the 10 closest rows for each query point.
model = NearestNeighbors(n_neighbors=10)

## Calculating centroid of previous MVPs

In [0]:
# (season, player_name, player_id) keys of the previous MVPs in the training data.
past_mvp_keys = [
    ('2013-14', 'Kevin Durant', 'duranke01'),
    ('2014-15', 'Stephen Curry', 'curryst01'),
    ('2015-16', 'Stephen Curry', 'curryst01'),
    ('2016-17', 'Russell Westbrook', 'westbru01'),
]
# Translate each key into its row position, which also indexes Train_std.
index1, index2, index3, index4 = (
    Train.index.get_loc(key) for key in past_mvp_keys
)
# Column-wise mean of those rows in the standardised feature space.
centroid = Train_std[[index1, index2, index3, index4], :].mean(axis=0)

## MVP prediction for season 2017-18 (Validation 1)

In [0]:
# Fit the neighbour search on the 2017-18 rows and query it with the centroid
# (reshaped to a single-row 2-D array, as kneighbors expects).
distance, indices = model.fit(Val1_std).kneighbors(centroid.reshape(1, -1))

# Show which players the returned row positions correspond to.
print(Val1.iloc[indices[0]].index)

MultiIndex([('2017-18',          'James Harden', 'hardeja01'),
            ('2017-18',          'LeBron James', 'jamesle01'),
            ('2017-18',        'Damian Lillard', 'lillada01'),
            ('2017-18', 'Giannis Antetokounmpo', 'antetgi01'),
            ('2017-18',     'Russell Westbrook', 'westbru01'),
            ('2017-18',          'Kevin Durant', 'duranke01'),
            ('2017-18',         'DeMar DeRozan', 'derozde01'),
            ('2017-18',         'Anthony Davis', 'davisan02'),
            ('2017-18',        'Victor Oladipo', 'oladivi01'),
            ('2017-18',          'Kemba Walker', 'walkeke02')],
           names=['season', 'player_name', 'player_id'])


## MVP prediction for season 2018-19 (Validation 2)



In [0]:
# Fit the neighbour search on the 2018-19 rows and query it with the centroid
# (reshaped to a single-row 2-D array, as kneighbors expects).
distance, indices = model.fit(Val2_std).kneighbors(centroid.reshape(1, -1))

# Show which players the returned row positions correspond to.
print(Val2.iloc[indices[0]].index)

MultiIndex([('2018-19', 'Giannis Antetokounmpo', 'antetgi01'),
            ('2018-19',        'Damian Lillard', 'lillada01'),
            ('2018-19',          'James Harden', 'hardeja01'),
            ('2018-19',          'Kevin Durant', 'duranke01'),
            ('2018-19',           'Paul George', 'georgpa01'),
            ('2018-19',          'Nikola Jokić', 'jokicni01'),
            ('2018-19',          'Kemba Walker', 'walkeke02'),
            ('2018-19',         'Blake Griffin', 'griffbl01'),
            ('2018-19',          'Bradley Beal',  'bealbr01'),
            ('2018-19',    'Karl-Anthony Towns', 'townska01')],
           names=['season', 'player_name', 'player_id'])


## MVP prediction for current season (Testing)

In [0]:
# Fit the neighbour search on the 2019-20 rows and query it with the centroid
# (reshaped to a single-row 2-D array, as kneighbors expects).
distance, indices = model.fit(Test_std).kneighbors(centroid.reshape(1, -1))

# Show which players the returned row positions correspond to.
print(Test.iloc[indices[0]].index)

MultiIndex([('2019-20',          'James Harden', 'hardeja01'),
            ('2019-20', 'Giannis Antetokounmpo', 'antetgi01'),
            ('2019-20',           'Luka Dončić', 'doncilu01'),
            ('2019-20',          'LeBron James', 'jamesle01'),
            ('2019-20',         'Anthony Davis', 'davisan02'),
            ('2019-20',        'Damian Lillard', 'lillada01'),
            ('2019-20',            'Trae Young', 'youngtr01'),
            ('2019-20',          'Bradley Beal',  'bealbr01'),
            ('2019-20',    'Karl-Anthony Towns', 'townska01'),
            ('2019-20',      'Donovan Mitchell', 'mitchdo01')],
           names=['season', 'player_name', 'player_id'])


In [0]:
# distance_ind = (centroid - Val2_std[53,:])**2
# distance_ind_perc = distance_ind/distance_ind.sum()*100
# d_ = dict()
# for i, j in zip(Val2.columns, distance_ind_perc):
#   d_[i] = j
# pd.set_option("display.max_rows",None)
# print(pd.DataFrame(d_, index=[1]).T)
# pd.set_option("display.max_rows",60)