In [13]:
# With help from tutorial @ Ken Jee Youtube: https://www.youtube.com/watch?v=7O4dpR9QMIM&t=308s&ab_channel=KenJee

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [15]:
# load in cleaned csv
df = pd.read_csv('cleaned_nba_2020.csv')
df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,3P,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Steven Adams,C,26,OKC,63,63,26.7,4.5,7.6,0.0,...,3.2,3.3,6.0,9.3,2.3,0.8,1.1,1.5,1.9,10.9
1,Bam Adebayo,PF,22,MIA,72,72,33.6,6.1,11.0,0.0,...,5.3,2.4,7.8,10.2,5.1,1.1,1.3,2.8,2.5,15.9
2,LaMarcus Aldridge,C,34,SAS,53,53,33.1,7.4,15.0,1.2,...,3.6,1.9,5.5,7.4,2.4,0.7,1.6,1.4,2.4,18.9
3,Kyle Alexander,C,23,MIA,2,0,6.5,0.5,1.0,0.0,...,0.0,1.0,0.5,1.5,0.0,0.0,0.0,0.5,0.5,1.0
4,Nickeil Alexander-Walker,SG,21,NOP,47,1,12.6,2.1,5.7,1.0,...,0.8,0.2,1.6,1.8,1.9,0.4,0.2,1.1,1.2,5.7


In [16]:
# choose relevant columns 
# as most the data revolves 
df.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', '3P', '3PA',
       '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS'],
      dtype='object')

In [17]:
# We want columns that will help determine the player's PTS
df_model = df[['Player', 'Pos', 'Age', 'Tm', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'PTS']]
df_model.head()

Unnamed: 0,Player,Pos,Age,Tm,MP,FG,FGA,3P,3PA,2P,2PA,FT,FTA,PTS
0,Steven Adams,C,26,OKC,26.7,4.5,7.6,0.0,0.0,4.5,7.5,1.9,3.2,10.9
1,Bam Adebayo,PF,22,MIA,33.6,6.1,11.0,0.0,0.2,6.1,10.8,3.7,5.3,15.9
2,LaMarcus Aldridge,C,34,SAS,33.1,7.4,15.0,1.2,3.0,6.2,12.0,3.0,3.6,18.9
3,Kyle Alexander,C,23,MIA,6.5,0.5,1.0,0.0,0.0,0.5,1.0,0.0,0.0,1.0
4,Nickeil Alexander-Walker,SG,21,NOP,12.6,2.1,5.7,1.0,2.8,1.1,2.8,0.5,0.8,5.7


In [18]:
# get dummy data
# used in regression analysis to represent subgroups of the sample in the study
# categorical data
df_dum = pd.get_dummies(df_model)
df_dum.head()

Unnamed: 0,Age,MP,FG,FGA,3P,3PA,2P,2PA,FT,FTA,...,Tm_ORL,Tm_PHI,Tm_PHO,Tm_POR,Tm_SAC,Tm_SAS,Tm_TOR,Tm_TOT,Tm_UTA,Tm_WAS
0,26,26.7,4.5,7.6,0.0,0.0,4.5,7.5,1.9,3.2,...,0,0,0,0,0,0,0,0,0,0
1,22,33.6,6.1,11.0,0.0,0.2,6.1,10.8,3.7,5.3,...,0,0,0,0,0,0,0,0,0,0
2,34,33.1,7.4,15.0,1.2,3.0,6.2,12.0,3.0,3.6,...,0,0,0,0,0,1,0,0,0,0
3,23,6.5,0.5,1.0,0.0,0.0,0.5,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,21,12.6,2.1,5.7,1.0,2.8,1.1,2.8,0.5,0.8,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# train test split
X = df_dum.drop('PTS', axis=1)
y = df_dum.PTS.values
X_train, X_test, y_train, t_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create a multiple linear regression
# utilize statsmodel OLS regresion to get the ordinary least squares

import statsmodels.api as sm
from sklearn import linear_model

# add constant so linear regression data can have intercept with slope of 1
X_sm = sm.add_constant(X)
model = sm.OLS(y, X_sm).fit()
predict = model.predict(X_sm)
model.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,5847.0
Date:,"Tue, 16 Feb 2021",Prob (F-statistic):,1.94e-109
Time:,21:31:03,Log-Likelihood:,1370.4
No. Observations:,632,AIC:,-1607.0
Df Residuals:,65,BIC:,915.7
Df Model:,566,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0386,0.044,0.884,0.380,-0.049,0.126
Age,-0.0005,0.002,-0.193,0.848,-0.005,0.004
MP,0.0012,0.007,0.178,0.860,-0.012,0.014
FG,1.6923,0.293,5.772,0.000,1.107,2.278
FGA,-0.0664,0.250,-0.266,0.791,-0.565,0.432
3P,1.2601,0.283,4.457,0.000,0.695,1.825
3PA,0.0862,0.255,0.338,0.736,-0.423,0.595
2P,0.2340,0.294,0.795,0.429,-0.354,0.822
2PA,0.0846,0.251,0.337,0.737,-0.416,0.585

0,1,2,3
Omnibus:,103.13,Durbin-Watson:,2.693
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1408.915
Skew:,-0.137,Prob(JB):,1.14e-306
Kurtosis:,10.309,Cond. No.,4.89e+18


In [40]:
# Cross validate linear regression model used with statsmodel using sklearn
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
lm = LinearRegression()
lm.fit(X_train, y_train)

# a cross validation score is used to estimate the accuracy of a ML model
# a "perfect" score would be 1.0
np.mean(cross_val_score(lm, X_train, y_train, scoring = "neg_mean_absolute_error", cv = 3))

-0.34742632514520516