<a href="https://colab.research.google.com/github/VinGuar/NBAStatsAndPredictions/blob/testing/MachineLearningWithGoogleColab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#importing all things needed
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import numpy as np
#Shared MinMaxScaler instance used to scale the data.
#makePos calls scaler.fit_transform on each position's frame, so this scaler is refit on every call.
scaler = MinMaxScaler()


In [2]:
#Load the two player CSV files
dfold = pd.read_csv("player12131415.csv")
dfnew = pd.read_csv("player161718.csv")

#Stack both files into a single frame and discard every row that contains an NA
frames = [dfold, dfnew]
allPlayer1 = pd.concat(frames).dropna()

#Keep only rows where playMin is above 24 (so the stats are accurate), restricted to the columns needed later
mask1 = allPlayer1['playMin'] > 24
neededColumns = [
    'teamRslt', 'playPos',
    "playPTS", "playAST", "playTO", "playSTL", "playBLK", "playPF",
    "playFGA", "playFGM", "play2PA", "play2PM", "play3PA", "play3PM",
    "playFTA", "playFTM", "playTRB",
]
allPlayer = allPlayer1.loc[mask1, neededColumns]


In [3]:
#Builds the frame for a single position: filters allPlayer to that position, drops the position
#column, renumbers the rows, encodes wins/losses as 1/0 and finally scales every column.
def makePos(pos):
  """Return the scaled stats frame for the rows of ``allPlayer`` whose ``playPos`` equals ``pos``.

  Args:
    pos: position label compared against the ``playPos`` column (the notebook
      calls this with "PG", "SG", "SF", "PF" and "C").

  Returns:
    A new DataFrame without the ``playPos`` column, with a fresh 0..n-1 index,
    ``teamRslt`` mapped from 'Win'/'Loss' to 1/0, and all columns (including
    ``teamRslt``) passed through the module-level ``scaler.fit_transform``.
  """
  isPos = allPlayer['playPos'] == pos
  posFrame = (
    allPlayer.loc[isPos]
    .drop(columns="playPos")
    .reset_index(drop=True)
  )

  #encode the result column numerically before scaling
  posFrame['teamRslt'] = posFrame['teamRslt'].map({'Win': 1, 'Loss': 0})

  #the shared scaler is refit here, on this position's rows only
  posFrame[posFrame.columns] = scaler.fit_transform(posFrame[posFrame.columns])

  return posFrame

In [4]:
#this cell fits the ridge model and extracts the per-stat coefficients
def findFeatures(df, alp):
  """Fit a Ridge regression of ``teamRslt`` on every other column and rank the coefficients.

  Args:
    df: frame containing a ``teamRslt`` column plus the stat columns.
    alp: regularisation strength passed to ``Ridge(alpha=...)``.

  Returns:
    A list of ``(column name, coefficient)`` tuples sorted from the lowest
    coefficient to the highest. Despite being built from a dict, the returned
    object is a list, not a dict.
  """
  #every column except the target is used as a predictor
  statNames = list(df)
  statNames.remove("teamRslt")
  isPredictor = df.columns.isin(statNames)
  predictors = df.columns[isPredictor]

  #fit the ridge model on the predictors against the win/loss column
  ridge = Ridge(alpha=alp)
  ridge.fit(df[predictors], df["teamRslt"])

  #pair each stat with its coefficient, then order the pairs by coefficient value
  weightByStat = {stat: weight for stat, weight in zip(predictors, ridge.coef_)}
  ranked = list(weightByStat.items())
  ranked.sort(key=lambda pair: pair[1])

  return ranked


In [5]:
#This cell finds the best alpha to use; the author reports that it ended up being 1.
def alphaSelect(df, alphas=(0.001, 0.01, 0.1, 1, 10, 100, 1000), cv=5):
  """Pick the Ridge ``alpha`` with the best cross-validated score.

  Runs a ``GridSearchCV`` over ``alphas`` with a default ``Ridge()`` estimator,
  using ``teamRslt`` as the target and every other column of ``df`` as input.
  No ``scoring`` is passed, so the search uses the estimator's own score method.

  Args:
    df: frame containing a ``teamRslt`` column plus the predictor columns.
    alphas: candidate alpha values to try. Defaults to the grid originally
      hard-coded in this notebook, so existing calls behave as before.
    cv: cross-validation setting forwarded to ``GridSearchCV`` (default 5,
      the value originally hard-coded).

  Returns:
    The alpha value from ``alphas`` selected by the grid search.
  """
  target = df["teamRslt"]
  features = df.drop(columns="teamRslt")

  #copy into a list so any iterable of candidates (tuple, array, ...) is accepted
  paramGrid = {'alpha': list(alphas)}

  search = GridSearchCV(Ridge(), paramGrid, cv=cv)
  search.fit(features, target)

  return search.best_params_['alpha']




In [6]:
#Point guards (PG): build the scaled frame, select alpha, then get the sorted stat coefficients
pointG = makePos(pos="PG")
paramPG = alphaSelect(df=pointG)
predPG = findFeatures(df=pointG, alp=paramPG)

In [7]:
#Shooting guards (SG): build the scaled frame, select alpha, then get the sorted stat coefficients
shootingG = makePos(pos="SG")
paramSG = alphaSelect(df=shootingG)
predSG = findFeatures(df=shootingG, alp=paramSG)

In [8]:
#Small forwards (SF): build the scaled frame, select alpha, then get the sorted stat coefficients
smallF = makePos(pos="SF")
paramSF = alphaSelect(df=smallF)
predSF = findFeatures(df=smallF, alp=paramSF)

In [9]:
#Power forwards (PF): build the scaled frame, select alpha, then get the sorted stat coefficients
powerF = makePos(pos="PF")
paramPF = alphaSelect(df=powerF)
predPF = findFeatures(df=powerF, alp=paramPF)

In [10]:
#Centers (C): build the scaled frame, select alpha, then get the sorted stat coefficients
center = makePos(pos="C")
paramC = alphaSelect(df=center)
predC = findFeatures(df=center, alp=paramC)

In [11]:
#The ridge coefficients for PG, as (stat, coefficient) pairs sorted from most negative to most positive.
#A negative coefficient means the model associates more of that stat with a loss, a positive one with a win.
#The higher the absolute value of a coefficient, the more predictive that stat is of a win or loss in this fit.
#All inputs were min-max scaled in makePos, so the coefficients are in scaled units, not raw stat units.
predPG


[('playFGA', -0.6041518924268755),
 ('play2PA', -0.5817563323153516),
 ('playFTA', -0.3978786078978085),
 ('play3PA', -0.35805094568665435),
 ('playTO', -0.30083741720748036),
 ('playPF', -0.2120115066906476),
 ('playBLK', 0.1496326164504983),
 ('play2PM', 0.1633828915566688),
 ('playSTL', 0.2380367995743921),
 ('playTRB', 0.33332406613842364),
 ('playFGM', 0.38022183452751585),
 ('play3PM', 0.38798203669720305),
 ('playAST', 0.4311028079096736),
 ('playFTM', 0.4737587834148403),
 ('playPTS', 0.5002416735349465)]

In [15]:
#The ridge coefficients for SG, as (stat, coefficient) pairs sorted from most negative to most positive.
#A negative coefficient means the model associates more of that stat with a loss, a positive one with a win.
#The higher the absolute value of a coefficient, the more predictive that stat is of a win or loss in this fit.
#All inputs were min-max scaled in makePos, so the coefficients are in scaled units, not raw stat units.
predSG

[('playFGA', -0.545613383596639),
 ('play3PA', -0.42511175850290206),
 ('play2PA', -0.3944754557695734),
 ('playTO', -0.3802723565114156),
 ('playFTA', -0.334015143594086),
 ('playPF', -0.18794302958516174),
 ('playSTL', 0.20677721449065647),
 ('playBLK', 0.21212180798089755),
 ('playTRB', 0.22596688506464213),
 ('play2PM', 0.2280612752318785),
 ('play3PM', 0.388046937769855),
 ('playAST', 0.38988739549638984),
 ('playFGM', 0.3987437747447201),
 ('playFTM', 0.40649217586804715),
 ('playPTS', 0.4395938153652634)]

In [14]:
#The ridge coefficients for SF, as (stat, coefficient) pairs sorted from most negative to most positive.
#A negative coefficient means the model associates more of that stat with a loss, a positive one with a win.
#The higher the absolute value of a coefficient, the more predictive that stat is of a win or loss in this fit.
#All inputs were min-max scaled in makePos, so the coefficients are in scaled units, not raw stat units.
predSF

[('play2PA', -0.6789656212548303),
 ('playFGA', -0.5398153263626682),
 ('playFTA', -0.32973388684884714),
 ('play3PA', -0.31532369907040514),
 ('playTO', -0.28720966537461357),
 ('playPF', -0.18689402527665935),
 ('playSTL', 0.21625507765439278),
 ('playBLK', 0.22130782402360266),
 ('play3PM', 0.25980838501071185),
 ('play2PM', 0.2811287790976505),
 ('playTRB', 0.3245960707766622),
 ('playFGM', 0.3696429657024721),
 ('playFTM', 0.4242212872038444),
 ('playAST', 0.4430524129368052),
 ('playPTS', 0.46668594501467564)]

In [13]:
#The ridge coefficients for PF, as (stat, coefficient) pairs sorted from most negative to most positive.
#A negative coefficient means the model associates more of that stat with a loss, a positive one with a win.
#The higher the absolute value of a coefficient, the more predictive that stat is of a win or loss in this fit.
#All inputs were min-max scaled in makePos, so the coefficients are in scaled units, not raw stat units.
predPF

[('playFGA', -0.47848160638654863),
 ('play2PA', -0.40517846771147875),
 ('playFTA', -0.3380128078041573),
 ('play3PA', -0.2363215785476092),
 ('playTO', -0.23059669817117978),
 ('playPF', -0.15199871346013683),
 ('playSTL', 0.09783137624084995),
 ('playTRB', 0.23370072750688123),
 ('playBLK', 0.24844339785159317),
 ('play3PM', 0.2716967358938224),
 ('play2PM', 0.28145503945232325),
 ('playFGM', 0.35988672879187833),
 ('playFTM', 0.36285419430181237),
 ('playAST', 0.4220211161024841),
 ('playPTS', 0.46338596076923094)]

In [12]:
#The ridge coefficients for C, as (stat, coefficient) pairs sorted from most negative to most positive.
#A negative coefficient means the model associates more of that stat with a loss, a positive one with a win.
#The higher the absolute value of a coefficient, the more predictive that stat is of a win or loss in this fit.
#All inputs were min-max scaled in makePos, so the coefficients are in scaled units, not raw stat units.
predC

[('playFGA', -0.4516003074885281),
 ('play3PA', -0.36150580372989805),
 ('play2PA', -0.32166653757477376),
 ('playTO', -0.25044110608202164),
 ('playPF', -0.06384746359282258),
 ('playFTA', 0.0005284484097624423),
 ('playSTL', 0.07939236919636329),
 ('playFTM', 0.13010716287939678),
 ('play2PM', 0.23006080401929851),
 ('playTRB', 0.2438799792877528),
 ('play3PM', 0.2748369907538867),
 ('playFGM', 0.310717857888903),
 ('playPTS', 0.3254765721176555),
 ('playBLK', 0.35985115572314524),
 ('playAST', 0.3785108251385527)]