In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

## Import Files

In [7]:
# Load the NBA salary table (one row per player, one column per season)
# and preview its first five rows
nba_salary = pd.read_csv(filepath_or_buffer='nba_salary2.csv')
nba_salary.head(n=5)

Unnamed: 0,Player,2021-22,2020-21,2019-20,2018-19,2017-18,2016-17,2015-16,2014-15,2013-14,...,2005-06,2004-05,2003-04,2002-03,2001-02,2000-01,1999-00,1998-99,1997-98,1996-97
0,Stephen Curry,45780966.0,45325029.0,42674629.0,40386520.0,38468739.0,13654081.0,12945956.0,12116632.0,11504868.0,...,,,,,,,,,,
1,James Harden,44310840.0,43479158.0,40518442.0,32960751.0,31388759.0,29918259.0,17896589.0,16750126.0,15904415.0,...,,,,,,,,,,
2,John Wall,44310840.0,43479158.0,40518442.0,20668989.0,20035826.0,17761997.0,16769198.0,15618561.0,8680071.0,...,,,,,,,,,,
3,Russell Westbrook,44211146.0,43588654.0,40844595.0,38454209.0,31388759.0,29918259.0,19063757.0,17918740.0,17097246.0,...,,,,,,,,,,
4,Kevin Durant,42018900.0,42271404.0,39457722.0,32346173.0,27729174.0,29918259.0,25015562.0,22795904.0,21843723.0,...,,,,,,,,,,


In [8]:
# Load the NBA player statistics table (each row carries a SEASON label)
# and preview its first five rows
nba_stats = pd.read_csv(filepath_or_buffer='nba_stats.csv')
nba_stats.head(n=5)

Unnamed: 0,PLAYER,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,...,REB,AST,TOV,STL,BLK,PF,DD2,TD3,+/-,SEASON
0,Trae Young,23,76,40,36,2652,2155,711,1544,46.0,...,284,737,303,72,7,128,42,0,159,2021-22
1,DeMar DeRozan,32,76,43,33,2743,2118,774,1535,50.4,...,392,374,181,68,24,178,6,0,77,2021-22
2,Joel Embiid,28,68,45,23,2296,2079,666,1334,49.9,...,796,284,214,77,99,181,46,2,368,2021-22
3,Jayson Tatum,24,76,49,27,2731,2046,708,1564,45.3,...,609,334,217,75,49,174,22,0,667,2021-22
4,Nikola Jokic,27,74,46,28,2476,2004,764,1311,58.3,...,1019,584,281,109,63,191,66,19,444,2021-22


In [None]:
# Show floats with two decimal places in every DataFrame display below
pd.options.display.float_format = '{:.2f}'.format

# Load the merged table that the rest of the notebook analyses.
# Every later cell reads `nba`, so this load has to actually run; with it
# commented out, the notebook only worked when `nba` happened to be left over
# in the kernel and failed on Restart Kernel -> Run All.
# NOTE(review): presumably this file is the stats table joined with an
# average-salary column ('AVG SALARY' is used below) -- confirm its provenance.
nba = pd.read_csv('nba_stats_merged.csv', encoding='utf-8')
nba.head()

## Data Cleaning

In [None]:
# Explore descriptive statistics of the variables
# (by default describe() summarises the numeric columns only:
# count, mean, std, min, quartiles and max)
nba.describe()

In [None]:
# List every column with its dtype and non-null count to check for missing values
nba.info()

The output of `nba.info()` indicates that there are no missing values.

In [None]:
# Remove the raw counting columns whose information is already carried by a
# column we keep (the shooting percentages, REB and MIN); the trimmed table
# is stored under a new name so the original `nba` stays untouched
nba2 = nba.drop(
    labels=['GP', 'W', 'L', 'FGM', 'FGA', '3PA', '3PM', 'FTM', 'FTA', 'OREB', 'DREB'],
    axis=1,
)
nba2

We will drop 'field goals made (FGM)' and 'field goals attempted (FGA)' and keep 'field goal percentage (FG%)' instead because FG% is the ratio of FGM to FGA.

We will do the same and drop '3 point field goals made (3PM)' and '3 point field goals attempted (3PA)' and keep '3 point field goals percentage (3P%)' for the equivalent reason as above.

We will do the same and drop 'free throws made (FTM)' and 'free throws attempted (FTA)' and keep 'free throw percentage (FT%)' for the equivalent reason as above.

We will drop 'offensive rebounds (OREB)' and 'defensive rebounds (DREB)' and keep 'rebounds (REB)' instead, as REB is the sum of OREB and DREB.

We will drop 'games played (GP)', 'wins (W)', 'losses (L)' and keep 'minutes played (MIN)' instead, since total minutes played on the court is a better, more accurate representation of how much a player is playing.



In [None]:
# Plot the pairwise relationships between the variables in nba2
sns.pairplot(data=nba2)

In [None]:
# View correlation among all variables using heatmap.
# nba2 still carries non-numeric columns (at least the PLAYER name), and
# DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them, so the frame is restricted to numeric columns first.
# select_dtypes works the same way on older pandas versions.
sns.heatmap(nba2.select_dtypes(include='number').corr(), cmap='seismic', annot=True)

In [None]:
# Correlation matrix of the numeric variables.
# Non-numeric columns (e.g. PLAYER) are excluded explicitly because
# DataFrame.corr() raises on them in pandas >= 2.0.
nba2.select_dtypes(include='number').corr()

Average salary is strongly correlated with average minutes played, points scored, rebounds, and turnovers.

In [None]:
# Inspect the distributions of the more highly correlated variables,
# starting with the target, average salary
sns.displot(data=nba2, x='AVG SALARY')

In [None]:
# Take the 99th percentile of average salary as a cut-off and summarise
# only the rows strictly below it (i.e. with the top 1% of salaries removed).
# NOTE(review): the later cells in view keep using nba2 rather than data_1 --
# confirm whether the trimmed frame was meant to be carried forward.
q = nba2['AVG SALARY'].quantile(q=0.99)
data_1 = nba2.loc[nba2['AVG SALARY'].lt(q)]
data_1.describe(include='all')

In [None]:
# Distribution of minutes played
sns.displot(data=nba2, x='MIN')

Distribution for 'MIN' looks great!

In [None]:
# Distribution of points scored
sns.displot(data=nba2, x='PTS')

In [None]:
# Distribution of turnovers
sns.displot(data=nba2, x='TOV')

In [None]:
# Average salary (x-axis) against points scored (y-axis)
plt.scatter(x=nba2['AVG SALARY'], y=nba2['PTS'])

In [None]:
# Average salary (x-axis) against minutes played (y-axis)
plt.scatter(x=nba2['AVG SALARY'], y=nba2['MIN'])

In [None]:
# Average salary (x-axis) against turnovers (y-axis)
plt.scatter(x=nba2['AVG SALARY'], y=nba2['TOV'])

In [None]:
# TODO: use NumPy (np.log) to calculate the natural log of average salary -- not implemented yet

In [None]:
# Check for multicollinearity with VIF

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

variables = nba2[['PTS', 'MIN', 'TOV', 'REB']]

# variance_inflation_factor regresses each column on the remaining columns
# exactly as given; it does not add an intercept on its own. Without a
# constant column the auxiliary regressions are forced through the origin and
# the reported VIFs are inflated, so an intercept is prepended here.
# has_constant='add' guarantees the constant always sits in column 0.
design = add_constant(variables, prepend=True, has_constant='add')

vif = pd.DataFrame()
vif['VIF'] = [
    variance_inflation_factor(design.values, i)
    for i in range(1, design.shape[1])  # start at 1: column 0 is the constant
]
vif['features'] = variables.columns

In [None]:
# Display the VIF computed for each candidate feature
vif

A VIF value between 1 and 5 is generally considered acceptable.

# Linear Regression Model

### Declare features and target

In [None]:
# Target: the column the model should predict
y = nba2['AVG SALARY']
# Features: every remaining column except PLAYER, which is a string
# and therefore cannot be standardized
X = nba2.drop(labels=['AVG SALARY', 'PLAYER'], axis=1)

###  Scale the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: learn each column's mean and standard deviation.
# NOTE(review): the scaler is fitted on all of X, before the train/test split
# further down, so statistics of the future test rows influence the scaling --
# confirm whether that is intended.
scaler = StandardScaler()
scaler.fit(X=X)

In [None]:
# Apply the fitted standardization to every feature column
X_scaled = scaler.transform(X=X)

### Train Test Split

In [None]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Partition the *unscaled* features into training and test sets first.
# The split size and seed are unchanged, so the same rows land in each set
# as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Fit the scaler on the training rows only, then apply it to both sets.
# Fitting on the full data before splitting leaks the test rows' mean and
# variance into training and makes the test evaluation optimistic.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

### Split training data into multiple folds

### Create the regression

In [None]:
# Ordinary least-squares regression fitted on the training partition
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the fitted model, measured on the training data it was fitted on
# (not on the held-out test set)
lm.score(X=X_train, y=y_train)

On the training data, our model explains 74% of the variability in average salary.

### Finding the weights and bias

In [None]:
# Bias (intercept) term of the fitted linear model
lm.intercept_

In [None]:
# Weights (coefficients) of the fitted linear model, one per feature column of X
lm.coef_

In [None]:
# Pair every feature name with the weight the regression assigned to it
lm_summary = pd.DataFrame({
    'Features': X.columns.values,
    'Weights': lm.coef_,
})
lm_summary

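A positive weight means that as the feature increases, the predicted "AVG SALARY" increases as well. Conversely, a negative weight means that as the feature increases, the predicted "AVG SALARY" decreases. Because the features were standardized, each weight is the change in predicted salary for a one-standard-deviation increase in that feature, with the other features held constant.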

### Testing

In [None]:
# Predict salaries for the held-out test rows
y_hat_test = lm.predict(X=X_test)

In [None]:
# Predicted vs. actual salary on the test set; the low alpha lets
# overlapping points show up as darker regions
fig, ax = plt.subplots()
ax.scatter(y_test, y_hat_test, alpha=0.25)
ax.set_xlabel('Target (y_test)', size=15)
ax.set_ylabel('Predictions (y_hat_test)', size=15)
plt.show()

The points are more concentrated at the lower end, which suggests that our model predicts lower salaries well but is less accurate for higher salaries.