<a href="https://colab.research.google.com/github/pmcwhannel/NBA-analytics/blob/main/ShotChartML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [294]:
!wc -l  /content/box_score_links.txt

62097 /content/box_score_links.txt


In [293]:
!wget https://raw.githubusercontent.com/pmcwhannel/NBA-analytics/main/box_score_links.txt

--2020-12-14 03:59:31--  https://raw.githubusercontent.com/pmcwhannel/NBA-analytics/main/box_score_links.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4036305 (3.8M) [text/plain]
Saving to: ‘box_score_links.txt’


2020-12-14 03:59:31 (32.3 MB/s) - ‘box_score_links.txt’ saved [4036305/4036305]



### Load data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import timeit as tt
shot_chart_path = '/content/drive/MyDrive/CS631-Project/all_shot_chart_data/part-00*'
# [player key, qtr, make/miss, TOP_dist, LEFT_dist, time remaining, dist shot,
# game score, players_team ,team_1=away, team_2=home, year] + [opponent]
# Names of the variables for indexing.
PK = 0   # Player Key
QTR = 1  # Quarter
SHOT = 2 # make/miss
TD = 3   # distance from top (pixels)
LD = 4   # distance from left (pixels)
TR = 5   # time remaining
DS = 6   # distance of shot from bucket (ft)
GS = 7   # game score after the shot
TM = 8   # This data will be changed from city name to 3-letter abreviation. To match AW and HM value.
AW = 9   # Away team (3-letter abreviation)
HM = 10  # Home team (3-letter abreviation)
YR = 11  # Year (year the seasons ends in)
OPP = 12 # This will be added later

### Setup Spark

In [None]:
%%shell
apt-get update -qq > /dev/null
apt-get install openjdk-8-jdk-headless -qq > /dev/null
wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
tar xf spark-2.4.7-bin-hadoop2.7.tgz
pip install -q findspark



In [None]:
%%shell
pip install --upgrade pip
pip install lxml
git clone https://github.com/pmcwhannel/NBA-analytics.git
mv NBA-analytics NBAanalytics # So importing functions is easy

Cloning into 'NBA-analytics'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 47 (delta 20), reused 14 (delta 3), pack-reused 0[K
Unpacking objects: 100% (47/47), done.




In [None]:
# Have to rename drive to get rid of NBA-analytics -> NBAanalytics
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark import SparkContext
sc = SparkContext(appName="YourTest", master="local[*]")

ValueError: ignored

In [None]:
# Read in shooter data
def find_opp(x):
  '''
  Identify all shots against a single opposing team.
  '''
  if x[TM] == x[AW]: 
    opponent = x[HM] # home team is who shot is against
  else:
    opponent = x[AW] # away team is who shot is against

  return x + [opponent]

def ply_tm_2_abrv(shooter,tm_2_abrv):
  '''
  input ply_tm: the data of the shooter
  input tm_2_abrv: linkage from PLAYER TEAM to 3-letter abreviation(s)
  '''
  if shooter[TM].upper() in tm_2_abrv.keys():
    for abrv in tm_2_abrv[shooter[TM].upper()]: # {'PLAYER TEAM':[abrv,...,]}
      if abrv == shooter[AW]:
        temp = abrv
        return shooter[:TM] + [temp] + shooter[TM+1:]
      elif abrv == shooter[HM]:
        temp = abrv
        return shooter[:TM] + [temp] + shooter[TM+1:]
    # didn't find a 3-letter abrv match
    return shooter[:TM] + ['MISSING'] + shooter[TM+1:] 
  else:
    # didn't find city name
    return shooter[:TM] + ['UNKNOWN'] + shooter[TM+1:]

### Add opponent shooting FG pct (Grid weighting).

In [None]:
# Team name as pulled from shot chart of shooter
# and mapping to a list of the possible 3-letter abbreviations 
tm_2_abrv ={'ATLANTA':['ATL'],
            'BOSTON':['BOS'],
            'INDIANA':['IND'],
            'CHICAGO':['CHI'],
            'TORONTO':['TOR'],
            'BROOKLYN':['BKN','BRK'],
            'VANCOUVER':['VAN'],
            'NEW YORK':['NYK'],
            'HOUSTON':['HOU'],
            'NEW ORLEANS':['NOP','NOH'],
            'CHARLOTTE':['CHA','CHH','CHO'],
            'DALLAS':['DAL'],
            'GOLDEN STATE':['GSW'],
            'OKLAHOMA CITY':['OKC'],
            'CLEVELAND':['CLE'],
            'LA CLIPPERS':['LAC'],
            'DENVER':['DEN'],
            'DETROIT':['DET'],
            'HOUSTON':['HOU'],
            'LA LAKERS':['LAL'],
            'MINNESOTA':['MIN'],
            'MEMPHIS':['MEM'],
            'MILWAUKEE':['MIL'],
            'MIAMI':['MIA'],
            'ORLANDO':['ORL'],
            'PHILADELPHIA':['PHI'],
            'PORTLAND':['POR'],
            'SACRAMENTO':['SAC'],
            'TORONTO':['TOR'],
            'SAN ANTONIO':['SAS'],
            'UTAH':['UTA'],
            'PHOENIX':['PHX','PHO'],
            'WASHINGTON':['WSH','WSB','WAS'],
            'SEATTLE':['SEA'],
            'NEW ORLEANS/OKLAHOMA CITY':['NOK'],
            'NEW ORLEANS/OKLAHOMA':['NOK'],
            'NEW JERSEY':['NJN']
            }

In [None]:
start = tt.default_timer()
rdd = sc.textFile(shot_chart_path).map(lambda x: x.split(',')).map(lambda x: ply_tm_2_abrv(x, tm_2_abrv)).map(lambda x: find_opp(x)).filter(lambda x: int(x[LD]) <= 500 and int(x[TD]) <= 472).cache()
print(rdd.count())
print('Time it took {}'.format(tt.default_timer() - start))
# Number of shooters with a city match but not an abbreviation match: 449472 (Problems fixed)
# Number of shooters without a city match at all: 113419 (Problems fixed)

4750345
Time it took 22.292592416999923


In [None]:
# Get only NOK for working on a subset of data # (13079 shots)
small_rdd = rdd.filter(lambda x: x[OPP] == 'NOK').cache()

In [None]:
import math
def normpdf(x, mean, sd):
  # Calcualte 1-D gaussian
  var = float(sd)**2
  denom = (2*math.pi*var)**.5
  num = math.exp(-(float(x)-float(mean))**2 / (2*var))
  return num/denom

def calc_distance(shooter, pos):
  # Calculate distance between player and sample shot
  # sqrt((x-x_i)**2 + (y-y_i)**2)
  return ((int(shooter[LD]) - pos[0])**2 + (int(shooter[TD]) - pos[1])**2)**(0.5)

def calc_opp_fg_pct(shooter,x):
  d = calc_distance(shooter,(x[1],x[2]))
  sd = 4   # THIS NEEDS TO BE TUNED MOST LIKELY
  mean = 0 # THIS SHOULD REMAIN ZERO
  weight = normpdf(d,mean,sd)
  return (x[0], weight) # (make/miss, weight)

def create_grid(x, nx, ny):
  # nx: number of partitions to cut x into
  # ny: number of partitions to cut y into

  x_lim = 500 # horizontal pixel limit
  y_lim = 472 # vertical pixel limit
  x_size = 1 + x_lim // nx # number of pixels per x grid
  y_size = 1 + y_lim // ny # number of pixels per y grid
  avg_fg_pct = [] # [(make_total,miss_total,avg_fg),...m]

  for i in range(0,ny):
    avg_fg_pct.append([])
    for j in range(0,nx):
      avg_fg_pct[i].append([0,0,0])


  for shot, x_pos, y_pos in x[1]:
    xid = x_pos // x_size
    yid = y_pos // y_size
    print()
    temp = avg_fg_pct[yid][xid]
    if shot == 'make':
      avg_fg_pct[yid][xid] =  [temp[0] + 1, temp[1], (temp[0] + 1) / (temp[0] + temp[1] + 1)]
    else:
      avg_fg_pct[yid][xid] = [temp[0], temp[1] + 1, temp[0] / (temp[0] + temp[1] + 1)]
  return (x[0], avg_fg_pct)

def add_opp_pct(x, nx, ny):
  # Calculate shooting pct of players from shooter's position
  #(key, ([shooter],[list of shots]))
  # x[0] => shooter
  # x[1] => list of shots
  shooter = x[1][0]
  x_lim = 500 # horizontal pixel limit
  y_lim = 472 # vertical pixel limit
  x_size = 1 + x_lim // nx # number of pixels per x grid
  y_size = 1 + y_lim // ny # number of pixels per y grid

  return shooter + [(x[1][1][int(shooter[TD]) // y_size][int(shooter[LD]) // x_size])[-1]]

In [None]:
# Create list of shots against a team
start = tt.default_timer()
nx = 37
ny = 36 
# ('NOK2007', (make/miss, x, y))
# Get rid of shots that are
rdd_shot_against_all = rdd.map(lambda x: (x[OPP] + x[YR], [(x[SHOT], int(x[LD]), int(x[TD]))])).reduceByKey(lambda x,y:
                        x+y).map(lambda x: create_grid(x, nx, ny)).cache() # [(key,[avg_fg_pct for each grid])] run time ~ 100 seconds

# Add opposing FG pct to MJ data
final_rdd = rdd.map(lambda x: (x[OPP] + x[YR],(x))).join(rdd_shot_against_all).map(lambda x:add_opp_pct(x, nx, ny)).cache()

print('Size of final rdd: {}'.format(final_rdd.count()))
print('It took {} to create the shooting history'.format(tt.default_timer() - start))

Size of final rdd: 4750345
It took 112.31856945399977 to create the shooting history


#### Save to textFile

In [None]:
final_rdd.map(lambda x: ','.join([str(itm) for itm in x])).saveAsTextFile('/content/drive/MyDrive/CS631-Project/' + 'all_shot_chart_data_opp_fg_pct_added_3736')


### Add features to RDD

1.   List item
2.   List item



    -join on player average data
    -join on player metadata

#### Load in player metadata


In [None]:
player_meta.take(1)

['abdelal01,Alaa Abdelnaby,Right,6-10,240lb,1968-06-24,Egypt']

In [None]:
player_meta = sc.textFile('/content/drive/MyDrive/CS631-Project/player_stats/part-000*')

In [None]:
def imp_2_metric(x):
  feet, inches = x[2].split('-')
  weight = float(x[3].strip('lb'))
  return [x[0]] + [x[1]] + [30.48*int(feet) + 2.54*int(inches)] + [weight*0.453592] 

# (playerKey, [shooting hand, height (cm), weight (kg)])
player_meta2= player_meta.map(lambda x: x.split(',')).map(lambda x: [x[0], x[2], x[3], x[4]]).filter(lambda x:
                  x[-1] != 'None').map(lambda x: imp_2_metric(x)).map(lambda x: (x[0],x[1:]))

#### Load in player per game stats

In [None]:
player_per = sc.textFile('/content/drive/MyDrive/CS631-Project/per_game_stats/part-000*')

In [None]:
def custom_split(x):
  temp = x.split(',')
  if len(temp) == 31:
    return temp
  else:
    extra_pos = len(temp) - 31 # check for the number of extra positions
    arr = temp[:4]
    combine = []
    for i in range(0, extra_pos+1):
      combine.append(str(temp[4+i]))
    return arr + [';'.join(combine)] + temp[4 + extra_pos + 1:]

def check_4_dup(x,y):
  if x[0] == 'TOT':
    return x
  elif y[0] == 'TOT':
    return y
  else:
    return x
# PlayerKey, Season, Position, 2P%, 3P%, eFG%.
player_per2 = player_per.map(lambda x: custom_split(x)).map(lambda x: (x[-1] + x[0],(x[2], x))).reduceByKey(lambda x,y: 
      check_4_dup(x,y)).map(lambda x: x[1][1]).map(lambda x: [x[-1], x[0], x[4], x[16], x[13], x[17]]).map(lambda x: (x[0] + x[1], x[2:])) # key is now: playerKey+season

In [None]:
player_per2.filter(lambda x: 'gallida012011' == x[0]).collect()

[('gallida012011', ['PF;SF', '0.467', '0.352', '0.495'])]

#### Join meta data to shooter RDD (loading in RDD)

In [None]:
rdd_3736 = sc.textFile('/content/drive/MyDrive/CS631-Project/all_shot_chart_data_opp_fg_pct_added_3736/part-00*')

# rdd: [PlayerKey, qtr, shot, TD, LD, timeRemaining, distance, gamescore, team, away, home, year, opp, opp-fg%]
# player_meta2: [shooting hand, height (cm), weight (kg)]
# player_per: [position, 2P%, 3P%, eFG%]
# 4750345 number of shots
# .filter(lambda x: x[1][1] != None)
rdd_3736_meta_per = rdd_3736.map(lambda x: x.split(',')).map(lambda x: (x[0],x[1:])).join(player_meta2).map(lambda x:
                      [x[0]] + x[1][0] + x[1][1]).map(lambda x: (x[0] + x[11],x[:])).join(player_per2).map(lambda x: x[1][0] + x[1][1]).cache()

#### Save to textFile

In [None]:
rdd_3736_meta_per.map(lambda x: ','.join([str(itm) for itm in x])).saveAsTextFile('/content/drive/MyDrive/CS631-Project/' + 'all_shooter_3736')

### ML + Data vis and analysis

In [178]:
shooter_data = []
for i in range(0,20):
  if i <= 9:
    end_str = str(0) + str(i)
  else:
    end_str = str(i)
  lines = open('/content/drive/MyDrive/CS631-Project/' + 'all_shooter_3736/part-000' + end_str ).readlines()
  
  print('Reading in part-000{}'.format(end_str))
  for line in lines:
    shooter_data.append([feature.strip() for feature in line.split(',')])


Reading in part-00000
Reading in part-00001
Reading in part-00002
Reading in part-00003
Reading in part-00004
Reading in part-00005
Reading in part-00006
Reading in part-00007
Reading in part-00008
Reading in part-00009
Reading in part-00010
Reading in part-00011
Reading in part-00012
Reading in part-00013
Reading in part-00014
Reading in part-00015
Reading in part-00016
Reading in part-00017
Reading in part-00018
Reading in part-00019


In [262]:
import pandas as pd
import random
# Grab last 100,000 shots and make model
recent_shooters = random.sample(shooter_data,50000)
data = pd.DataFrame(recent_shooters, columns = ['PlayerKey','Quarter','FGM','TD','LD','TimeRemaining','Distance','GameScore','Team',
                                        'Away','Home','Season','Opponent','OPPFGPct','Hand','Height','Weight','Position','2P%','3P%','eFG%'])


In [263]:
# convert data to numerical representations for model compatibility
import numpy as np
def check_pct_type(x):
  if len(x) == 0:
    return 0.0
  else:
    return float(x)

def net_score(x):
  temp = x.split('-')
  return int(temp[0]) - int(temp[1])

def one_hot_pos(df):
  pos_list = df['Position'].tolist()
  pos_2_idx = {'PG':0,'SG':1,'SF':2,'PF':3,'C':4}
  temp = []
  for pos in pos_list:
    agg = [0,0,0,0,0]
    for p in pos.split(';'):
      agg += np.array([1 if i == pos_2_idx[p] else 0 for i in range(0,5)])
    temp.append(list(agg))
  return np.array(temp)

data['FGM'] = data['FGM'].apply(lambda x: int(x=='make'))
data['Distance'] = data['Distance'].apply(lambda x: int(x))
data['TD'] = data['TD'].apply(lambda x: int(x))
data['LD'] = data['LD'].apply(lambda x: int(x))
data['OPPFGPct'] = data['OPPFGPct'].apply(lambda x: float(x))
data['Quarter'] = data['Quarter'].apply(lambda x: int(x))
data['Hand'] = data['Hand'].apply(lambda x: int(x.lower() == 'right'))
data['2P%'] = data['2P%'].apply(lambda x: check_pct_type(x))
data['3P%'] = data['3P%'].apply(lambda x: check_pct_type(x))
data['eFG%'] = data['eFG%'].apply(lambda x: check_pct_type(x))
data['GameScore'] = data['GameScore'].apply(lambda x: net_score(x))
one_hot_mat = one_hot_pos(data)
data['PG'] = one_hot_mat[:,0]
data['SG'] = one_hot_mat[:,1]
data['SF'] = one_hot_mat[:,2]
data['PF'] = one_hot_mat[:,3]
data['C'] = one_hot_mat[:,4]

#### Data visualizations and analysis

In [264]:
import seaborn as sns
import matplotlib.pyplot as plt

In [265]:
fig = plt.figure(figsize=(20,20))
_ = sns.pairplot(data[['FGM', 'Distance', 'OPPFGPct','GameScore']], hue='FGM')

KeyboardInterrupt: ignored

<Figure size 1440x1440 with 0 Axes>

Error in callback <function flush_figures at 0x7f25eb4001e0> (for post_execute):


KeyboardInterrupt: ignored

#### OPPFGPct for each partition

In [None]:
_ = sns.displot(data, x='OPPFGPct', hue='FGM', kind='kde', fill=True)
_ = plt.title('37 Horizontal and 36 Vertical grids')
plt.grid()

#### OPPFGPct by position




In [None]:
data.head(1)

In [None]:
_ = sns.displot(data[data['PG'] == 1], x='OPPFGPct', hue='FGM', kind='kde', fill=True)
_ = plt.title('Point Guard')
plt.grid()

In [None]:
_ = sns.displot(data[data['PF'] == 1], x='OPPFGPct', hue='FGM', kind='kde', fill=True)
_ = plt.title('hello world')
plt.grid()

In [None]:
sns.displot(data[data['SF'] == 1], x='OPPFGPct',hue='FGM',kind='kde',fill=True)

In [None]:
_ = sns.displot(data[data['C'] == 1], x='OPPFGPct', hue='FGM', kind='kde',fill=True)
_ = plt.title('Center')
_= plt.grid()

#### Building Model

In [172]:
import sklearn
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.linear_model import LogisticRegression as lr
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

In [173]:
def accuracy(preds,targets):
  n = len(preds)
  return sum(preds == targets) / n

In [174]:
data.head()

Unnamed: 0,PlayerKey,Quarter,FGM,TD,LD,TimeRemaining,Distance,GameScore,Team,Away,Home,Season,Opponent,OPPFGPct,Hand,Height,Weight,Position,2P%,3P%,eFG%,PG,SG,SF,PF,C
0,paulch01,3,0,239,188,0:00.0,20,2,NOH,CHI,NOH,2011,CHI,0.285714,1,185.42,79.3786,PG,0.482,0.388,0.502,1,0,0,0,0
1,adamsst01,2,1,53,240,7:39.0,2,-3,OKC,OKC,GSW,2016,GSW,0.452555,1,210.82,120.20188,C,0.613,0.0,0.613,0,0,0,0,1
2,ilgauzy01,1,0,52,240,4:02.0,0,13,CLE,MIL,CLE,2007,MIL,0.733333,1,220.98,107.954896,C,0.486,0.0,0.485,0,0,0,0,1
3,robinna01,2,1,50,240,9:52.0,0,9,NYK,CHA,NYK,2009,CHA,0.573623,1,175.26,81.64656,PG,0.504,0.325,0.498,1,0,0,0,0
4,chandty01,3,1,50,240,10:15.0,0,0,NOH,PHI,NOH,2008,PHI,0.593182,1,213.36,106.59412,C,0.624,0.0,0.623,0,0,0,0,1


In [290]:
#temp = data[data['C'] == 1] 'OPPFGPct','Distance','eFG%'
temp = data
X_train, X_test, y_train, y_test = train_test_split(temp[['OPPFGPct','Distance','eFG%','GameScore']], temp['FGM'], test_size=0.33, random_state=9)
print('Training Samples:',len(X_train))
print('Test Samples:',len(X_test))

Training Samples: 33500
Test Samples: 16500


##### Logistic Regression

In [291]:
# OPPFGPct booster
# GameScore net diff is a booster
clf = lr(random_state=1).fit(X=X_train,y=y_train)
preds = clf.predict(X_test)
prop = sum(y_test == np.zeros(len(y_test))) / len(y_test)
print('The larger proportion is {:.2f}% of the dataset'.format(max(prop, 1-prop)*100))
print('The accuracy was {:.2f}%'.format(accuracy(preds, np.array(y_test))*100))

The larger proportion is 55.16% of the dataset
The accuracy was 66.22%


##### Random Forest

In [292]:
# Boostrapping ..
depth_list = [1,2,5,10,30,50,80,100,120,150]
acc_list = []

clf = rfc(n_estimators = 200, max_depth=3, bootstrap=True, max_samples=0.5).fit(X=X_train,y=y_train)

# No Bootstrap
#clf = rfc(n_estimators = 50, max_depth=8).fit(X=X_train,y=y_train)

preds = clf.predict(X_test)
prop = sum(y_test == np.zeros(len(y_test))) / len(y_test)
acc = accuracy(preds, np.array(y_test))*100
#acc_list.append(acc)
print('The larger proportion is {:.2f}% of the dataset'.format(max(prop,1-prop)*100))
print('The accuracy was {:.2f}%'.format(acc))

#plt.plot(depth_list, acc_list)
#plt.grid()

The larger proportion is 55.16% of the dataset
The accuracy was 66.24%


##### SVC (normal) too slow

In [None]:
clf = SVC().fit(X=X_train,y=y_train)
preds = clf.predict(X_test)
prop = sum(y_test == np.zeros(len(y_test))) / len(y_test)
print('The larger proportion is {:.2f}% of the dataset'.format(max(prop,1-prop)*100))
print('The accuracy was {:.2f}%'.format(accuracy(preds, np.array(y_test))*100))

The larger proportion is 54.92% of the dataset
The accuracy was 61.49%


##### SVC - Linear

In [None]:
clf = LinearSVC().fit(X=X_train,y=y_train)
preds = clf.predict(X_test)
prop = sum(y_test == np.zeros(len(y_test))) / len(y_test)
print('The larger proportion is {:.2f}% of the dataset'.format(max(prop,1-prop)*100))
print('The accuracy was {:.2f}%'.format(accuracy(preds, np.array(y_test))*100))

The larger proportion is 54.92% of the dataset
The accuracy was 64.35%


