<a href="https://colab.research.google.com/github/selenkelat/regression-analyses-on-NBA-data/blob/main/NBA_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the raw NBA player table into a DataFrame and display it.
import pandas as pd
# NOTE(review): "/content/..." looks like a Colab session path - presumably the CSV
# has to be uploaded to the runtime before this cell runs; confirm.
csv_path = "/content/NBA.csv"
dat = pd.read_csv(csv_path)
dat

In [None]:
# Remove the three text columns in one call so that only numeric variables remain.
dat = dat.drop(columns=["Position", "Team", "Player"])
# Continue on an independent copy; later cells only ever touch `dt`.
dt = dat.copy()

In [None]:
dt.info()

In [None]:
dt.describe().T

In [None]:
dt.isnull().values.any() #no missing value

In [None]:
dt.corr() #pairwise correlation values between the variables

In [None]:
#pair plot of the variables with regression lines fitted to the scatter plots
import seaborn as sbn
sbn.pairplot(dt, kind="reg");

In [None]:
#USG is the independent variable and PER is the dependent variable
# visualize the relationship between the variables "USG" and "PER" (Player Efficiency Rating)
import matplotlib.pyplot as plt
import seaborn as sbn
# kind="reg" draws the scatter with a fitted regression line plus the marginal distributions
jp=sbn.jointplot(x="USG", y="PER", data=dt, kind="reg")
# also write the figure to disk in the current working directory
jp.savefig("jointplot.png")
plt.show()


### LINEAR REGRESSION

In [None]:
# OLS regression of PER on USG with statsmodels
import statsmodels.api as sm

y = dt[["PER"]]
# sm.OLS fits no intercept by itself, so a constant column is added to the predictor frame
x = sm.add_constant(dt[["USG"]])
lin_mod = sm.OLS(y, x)
mod = lin_mod.fit()
mod.summary()  # R2 0.439
# author's note: the confidence intervals in the summary were judged significant

In [None]:
# Refit the same regression with scikit-learn.
from sklearn.linear_model import LinearRegression
rm=LinearRegression()
# x is the frame returned by sm.add_constant in the OLS cell, so whatever constant
# column it added is passed in here as an ordinary feature as well.
# The assignment rebinds `mod`, replacing the statsmodels result object.
mod=rm.fit(x,y)
mod.intercept_ #beta0

In [None]:
mod.coef_ #beta1

In [None]:
mod.score(x, y) # R^2

In [None]:

import seaborn as sbn
import matplotlib.pyplot as plt
# Scatter plot with regression line
pl = sbn.regplot(x=dt["USG"], y=dt["PER"], ci=None, scatter_kws={"color": "r", "s": 9})
pl.set_title("Regression Model")
plt.show()


model metrics

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
mse=mean_squared_error(y, mod.predict(x))
mse

In [None]:
import numpy as np
np.sqrt(mse)
#RMSE: the typical (root-mean-square) deviation of the model's predictions from the actual values; the author's run gave about 3.109 units.

### MULTIPLE LINEAR REGRESSION

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.30, random_state=42)

In [None]:
lin_mod=sm.OLS(y_train, x_train)
mod=lin_mod.fit()
mod.summary()

In [None]:
# Fit the scikit-learn regression on the training split only.
lin_mod=LinearRegression()
mod=lin_mod.fit(x_train,y_train)
# A cell displays only its last expression; with the intercept on a line of its own
# it was evaluated and silently discarded. Return both estimates together instead.
mod.intercept_, mod.coef_ #(beta0, beta1) - the author's run noted 3.73379212 and 0.55511165

In [None]:
preds=mod.predict(x_test) #predicted values of y for the test set

In [None]:
#graph of actual and predicted values
plt.scatter(y_test, preds)
plt.show()

In [None]:
np.sqrt(mean_squared_error(y_test, preds)) #RMSE between actual and predicted values on the test set

### Principal component regression (PCR)

In [None]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn import model_selection
from sklearn.model_selection import RepeatedKFold

In [None]:
#independent variables need to be scaled
pca=PCA()
# Standardise the training predictors, then fit the PCA on them and project them in one step.
# Despite its name, x_scaled therefore holds the principal-component scores, not just scaled columns.
x_scaled=pca.fit_transform(scale(x_train))

In [None]:
#cross validation
# NOTE(review): `cv` is not passed to any call in the visible notebook (the grid
# searches use integer cv=... arguments) - confirm whether it is still needed.
cv=RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
mod=LinearRegression()
pcr_mod=mod.fit(x_scaled, y_train)

In [None]:
pcr_mod.intercept_ #beta0

In [None]:
pcr_mod.coef_ # beta1

In [None]:
#y prediction values
fits=pcr_mod.predict(x_scaled)

In [None]:
plt.scatter(y_train, fits)
plt.show() #positive linear relationship

In [None]:
np.sqrt(mean_squared_error(y_train,fits)) #rmse

In [None]:
r2_score(y_train,fits)

In [None]:
#for test set and prediction values
# The test data has to go through the transformation that was learned on the
# training data. Re-scaling x_test with its own statistics and fitting a second PCA
# on it builds a different basis (component order and sign may differ), so the
# coefficients of pcr_mod would no longer line up with the columns they were
# estimated on, and test information would leak into the preprocessing.
from sklearn.preprocessing import StandardScaler
# StandardScaler fitted on x_train applies the same standardisation as scale(x_train)
train_scaler=StandardScaler().fit(x_train)
# project with the PCA that was fitted on the training split
x_scaled_test=pca.transform(train_scaler.transform(x_test))
preds=pcr_mod.predict(x_scaled_test)

In [None]:
np.sqrt(mean_squared_error(y_test,preds))

In [None]:
r2_score(y_test,preds)

This model is better than the model we established with the training set and fit values.

### RIDGE REGRESSION

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
ridge_mod=Ridge(alpha=0.1).fit(x_train,y_train)

In [None]:
#metrics between the training set and prediction values.
fits=ridge_mod.predict(x_train)
np.sqrt(mean_squared_error(y_train,fits)) # rmse

In [None]:
r2_score(y_train,fits)

In [None]:
#predictions and RMSE of the same ridge model on the test set
preds=ridge_mod.predict(x_test)
np.sqrt(mean_squared_error(y_test,preds)) #test RMSE (author's note: lower than the training RMSE)

In [None]:
r2_score(y_test,preds)
# author's note: the model scores a higher R^2 on the test set than on the training set (no separate model is fitted on the test set).

In [None]:
# try to find the best lambda value (scikit-learn calls this parameter alpha)
# Lambda values to try:
alphas = [0.1, 1, 10, 100, 1000]
# define the GridSearchCV object: every alpha is cross-validated with 5 folds
grid_search = GridSearchCV(ridge_mod, {'alpha': alphas}, cv=5)
# apply GridSearchCV on the training data
grid_search.fit(x_train, y_train)

In [None]:
print("the best lambda (alpha) value:", grid_search.best_params_['alpha'])

In [None]:
#regression model refitted with the best alpha value found by the grid search
# Read the value from the search result instead of hard-coding it (the original
# wrote alpha=100), so this cell stays correct if the data, the split or the
# alpha grid change.
ridge_best=Ridge(alpha=grid_search.best_params_['alpha'])
ridge_best.fit(x_train,y_train)
fits=ridge_best.predict(x_train)
np.sqrt(mean_squared_error(y_train,fits)) #training RMSE of the tuned model

In [None]:
r2_score(y_train,fits)

### LASSO REGRESSION

In [None]:
from sklearn.linear_model import Lasso
lasso_mod=Lasso(alpha=0.1).fit(x_train,y_train)

In [None]:
lasso_mod.coef_ #beta1 coefficient is significant

In [None]:
#rmse and R^2 for the training set
fits=lasso_mod.predict(x_train)
np.sqrt(mean_squared_error(y_train,fits))

In [None]:
r2_score(y_train,fits)

In [None]:
#rmse and R^2 for test set
preds=lasso_mod.predict(x_test)
np.sqrt(mean_squared_error(y_test,preds))

In [None]:
r2_score(y_test,preds)
#author's note: with Lasso, too, the model scores better on the test set than on the training set
#the results are very similar to those obtained with the Ridge regression model

In [None]:
#find the optimum alpha value by 5-fold cross-validation
from sklearn.linear_model import LassoCV
model=LassoCV(cv=5,random_state=0,max_iter=10000)
# y_train is a one-column DataFrame; flatten it to the 1-D target LassoCV expects.
# The bare LassoCV(...) expression that used to follow the fit was a pasted repr:
# it built a second, unfitted estimator and threw it away, so it is removed.
model.fit(x_train,y_train.values.ravel())
model.alpha_

In [None]:
#build a regression model on the training set according to the best alpha value determined
lasso_best=Lasso(alpha=model.alpha_)
lasso_best.fit(x_train,y_train)
fits=lasso_best.predict(x_train)

In [None]:
np.sqrt(mean_squared_error(y_train,fits))

In [None]:
r2_score(y_train,fits)

### KNN (K-Nearest Neighbors)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as mp
from sklearn.neighbors import KNeighborsRegressor

In [None]:
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.30, random_state=15)

In [None]:
#how many K's the model works with
KNNmodel=KNeighborsRegressor().fit(x_train, y_train)
KNNmodel.n_neighbors

In [None]:
y_hat_knn5=KNNmodel.predict(x_train) #Predictions of the dependent variable y

In [None]:
mean_squared_error(y_train, y_hat_knn5) #mse in train set

In [None]:
r2_score(y_train, y_hat_knn5)

In [None]:
#Which k minimises the error? Refit for k = 1..20 and record the test-set metrics.
mse = []
r2 = []

for k in range(1, 21):
    KNN_model = KNeighborsRegressor(n_neighbors=k)
    KNN_model.fit(x_train, y_train)
    y_hat_k = KNN_model.predict(x_test)
    # one entry per k, in increasing order of k
    mse.append(mean_squared_error(y_test, y_hat_k))
    r2.append(r2_score(y_test, y_hat_k))

In [None]:
# Index the errors by the k that produced them (the sweep runs k = 1..20). With the
# default 0-based index every point sat one step to the left of its k, which makes
# the minimum easy to misread.
mse_df=pd.DataFrame({"mse": mse}, index=range(1, len(mse) + 1))
ax=mse_df.plot()
ax.set_xlabel("k (n_neighbors)")
ax.set_ylabel("test MSE")
# plt.show() does not take a figure/axes argument; draw first, then show.
plt.show()
#author's note: the lowest mse was read off at 10 on the old 0-based axis - re-read the minimum from this k-indexed plot

In [None]:
#find the best number of neighbours with a 5-fold grid search
from sklearn.model_selection import GridSearchCV
KNN_arg=KNeighborsRegressor()
# candidate neighbourhood sizes 2..14
k_params={"n_neighbors":list(range(2, 15))}
KNN_model=GridSearchCV(KNN_arg, k_params, cv=5)
KNN_model.fit(x_train, y_train)
KNN_model.best_params_

### SUPPORT VECTOR REGRESSION

linear:

In [None]:
from sklearn.svm import SVR
SVR_model=SVR(kernel="linear").fit(x_train, y_train)
#for train set
y_hat_svr_l=SVR_model.predict(x_train)

In [None]:
mean_squared_error(y_train, y_hat_svr_l)

In [None]:
r2_score(y_train, y_hat_svr_l)

In [None]:
#for test set
predicted_y_svr_l=SVR_model.predict(x_test)
mean_squared_error(y_test, predicted_y_svr_l)

In [None]:
r2_score(y_test, predicted_y_svr_l)
#author's note: the model scores better on the test set than on the training set

nonlinear:

In [None]:
# Fit the RBF-kernel model and take the training predictions from that same model.
SVR_model_nl=SVR(kernel="rbf").fit(x_train, y_train)
# Previously this called SVR_model.predict(...), i.e. the linear model, so the
# "nonlinear" training metrics merely repeated the linear ones.
y_hat_svr_nl=SVR_model_nl.predict(x_train)

In [None]:
mean_squared_error(y_train, y_hat_svr_nl)

In [None]:
r2_score(y_train, y_hat_svr_nl)

In [None]:
#for test set
predicted_y_svr_nl=SVR_model_nl.predict(x_test)

In [None]:
mean_squared_error(y_test, predicted_y_svr_nl)

In [None]:
r2_score(y_test, predicted_y_svr_nl)

GRID SEARCH

In [None]:
# Candidate values for C: np.arange(0.1, 2, 0.4) yields roughly 0.1, 0.5, 0.9, 1.3, 1.7
params_svr={"C":np.arange(0.1, 2, 0.4)}
#linear - 10-fold search; GridSearchCV.fit returns the fitted search object
gs_SVR_model_l=GridSearchCV(SVR_model, params_svr, cv=10).fit(x_train, y_train)
#nonlinear - same grid, applied to the RBF-kernel estimator
gs_SVR_model_nl=GridSearchCV(SVR_model_nl, params_svr, cv=10).fit(x_train, y_train)

In [None]:
gs_SVR_model_l.best_params_ #best param for linear model

In [None]:
gs_SVR_model_nl.best_params_ #best param for nonlinear model

In [None]:
#build the models with the best parameters
# Each kernel was tuned in its own grid search, so each model gets its own best C.
# Previously the linear search's C was reused for the RBF model, and it was read via
# positional indexing on a label-indexed Series, which pandas has deprecated.
best_c_l=gs_SVR_model_l.best_params_["C"]
best_c_nl=gs_SVR_model_nl.best_params_["C"]
bp=best_c_l  # original name kept so any later reference still resolves
best_l_svr_model=SVR(kernel="linear", C=best_c_l).fit(x_train, y_train)
best_nl_svr_model=SVR(kernel="rbf", C=best_c_nl).fit(x_train, y_train)
# test-set predictions of both tuned models
y_hat_l_best=best_l_svr_model.predict(x_test)
y_hat_nl_best=best_nl_svr_model.predict(x_test)

In [None]:
mean_squared_error(y_test, y_hat_l_best) #mse for linear model

In [None]:
r2_score(y_test, y_hat_l_best) #R^2 for linear model

In [None]:
mean_squared_error(y_test, y_hat_nl_best) #mse for nonlinear model

In [None]:
r2_score(y_test, y_hat_nl_best) #R^2 for nonlinear model

### Artificial Neural Network (ANN)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
#standardise the features; the scaler is fitted on the training split only
scl=StandardScaler().fit(x_train)  # fit() returns the scaler itself
x_train_scl, x_test_scl = scl.transform(x_train), scl.transform(x_test)
# default multi-layer perceptron trained on the standardised training features
ann_model=MLPRegressor()
ann_model.fit(x_train_scl, y_train)
ann_model.n_layers_

In [None]:
ann_model.hidden_layer_sizes

In [None]:
yhat_ann0=ann_model.predict(x_train_scl) #predicted values for the training data

In [None]:
mean_squared_error(y_train, yhat_ann0)

In [None]:
r2_score(y_train, yhat_ann0)

In [None]:
#for test set
y_predict_ann0=ann_model.predict(x_test_scl)
mean_squared_error(y_test, y_predict_ann0)

In [None]:
r2_score(y_test, y_predict_ann0)

In [None]:
#find the best parameters
# 4 alpha values x 3 hidden-layer layouts x 2 activations = 24 candidates,
# each evaluated with 5-fold cross-validation on the standardised training data
params_ann = {"alpha": [0.1, 0.01, 0.02, 0.005],
"hidden_layer_sizes": [(20, 20), (100, 50, 150), (300, 200, 100)],
"activation": ["relu", "logistic"]}
gs_ann_model = GridSearchCV(ann_model, params_ann, cv=5)
gs_ann_model.fit(x_train_scl, y_train)
gs_ann_model.best_params_

In [None]:
# Rebuild the network with the tuned settings and train it on the standardised
# features. The original fitted on the raw x_train but predicted on x_train_scl /
# x_test_scl, so the network was scored on inputs on a different scale from the
# ones it had learned from.
# NOTE(review): the hard-coded values are presumably copied from
# gs_ann_model.best_params_ of an earlier run - confirm they still match.
best_ann=MLPRegressor(alpha=0.01, hidden_layer_sizes=(300,200,100), activation="relu")
model_best=best_ann.fit(x_train_scl, y_train)
best_hat=model_best.predict(x_train_scl)

In [None]:
#mse for train set
mean_squared_error(y_train,best_hat)

In [None]:
#R^2 for train set
r2_score(y_train, best_hat)

In [None]:
#for test set
best_hat_test=model_best.predict(x_test_scl)
mean_squared_error(y_test,best_hat_test)

In [None]:
r2_score(y_test, best_hat_test)

### CART (Classification & Regression Tree)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as mp
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.tree import DecisionTreeRegressor
# 70/30 split with a fixed seed (random_state=15)
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.30, random_state=15)
# regression tree with default settings, then its fitted values on the training split
cart_model=DecisionTreeRegressor()
cart_model.fit(x_train, y_train)
fitted_cart=cart_model.predict(x_train)

In [None]:
#for train set
mean_squared_error(y_train, fitted_cart)

In [None]:
r2_score(y_train, fitted_cart)

In [None]:
#for test set
preds_cart=cart_model.predict(x_test)
mean_squared_error(y_test, preds_cart)

In [None]:
r2_score(y_test, preds_cart)

In [None]:
#find the best parameters
# 98 values of min_samples_split x 8 values of max_leaf_nodes = 784 candidates,
# each evaluated with 10-fold cross-validation (7840 tree fits)
cart_params = {"min_samples_split": range(2, 100),
"max_leaf_nodes": range(2, 10)}
grid_cart_model = GridSearchCV(cart_model, cart_params, cv=10)
grid_cart_model.fit(x_train, y_train)
grid_cart_model.best_params_

In [None]:
#new model with the best params
# NOTE(review): max_leaf_nodes=8 / min_samples_split=6 are presumably copied from
# grid_cart_model.best_params_ of an earlier run - confirm they still match.
best_cart_model=DecisionTreeRegressor(max_leaf_nodes=8, min_samples_split=6).fit(x_train, y_train)
preds_best_cart_model=best_cart_model.predict(x_test)
# test-set MSE of the tuned tree
mean_squared_error(y_test, preds_best_cart_model)

In [None]:
r2_score(y_test, preds_best_cart_model)

### BAGGING (BOOTSTRAP AGGREGATION)

In [None]:
from sklearn.ensemble import BaggingRegressor
# bagging ensemble whose base estimators also draw their feature columns with replacement
bagging_model=BaggingRegressor(bootstrap_features=True)
bagging_model.fit(x_train, y_train)
# fitted values on the training split
fits_bagging=bagging_model.predict(x_train)

In [None]:
#mse for train set
mean_squared_error(y_train, fits_bagging)

In [None]:
#R^2 for train set
r2_score(y_train, fits_bagging)

In [None]:
#for test set
preds_bagging=bagging_model.predict(x_test)
mean_squared_error(y_test, preds_bagging)

In [None]:
r2_score(y_test, preds_bagging)

In [None]:
#GRIDSEARCH
bagging_params={"n_estimators":range(2,40)}
grid_bagging_model=GridSearchCV(bagging_model, bagging_params, cv=10)
grid_bagging_model.fit(x_train, y_train)

In [None]:
grid_bagging_model.best_params_

In [None]:
# NOTE(review): the grid search tuned n_estimators on an estimator created with
# bootstrap_features=True, but this refit leaves bootstrap_features at its default -
# confirm that dropping it here is intended. n_estimators=25 is presumably the value
# reported by grid_bagging_model.best_params_ in an earlier run.
best_bagging_model=BaggingRegressor(n_estimators=25).fit(x_train, y_train)

In [None]:
#for test set
preds_bagging_model=best_bagging_model.predict(x_test)
mean_squared_error(y_test, preds_bagging_model)

In [None]:
r2_score(y_test, preds_bagging_model)

In [None]:
#for train set
preds_bagging_model=best_bagging_model.predict(x_train)
mean_squared_error(y_train, preds_bagging_model)

In [None]:
r2_score(y_train, preds_bagging_model)

### RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_model=RandomForestRegressor().fit(x_train, y_train)

In [None]:
#model with train set
fits_rf_model=rf_model.predict(x_train)
mean_squared_error(y_train, fits_rf_model)

In [None]:
r2_score(y_train, fits_rf_model)

In [None]:
#for test set
preds_rf_model=rf_model.predict(x_test)
mean_squared_error(y_test, preds_rf_model)

In [None]:
r2_score(y_test, preds_rf_model)

Grid search

In [None]:

# 14 depths x 5 max_features values x 5 forest sizes = 350 candidates, each evaluated
# with 10-fold cross-validation (3500 forest fits, up to 2000 trees each).
# NOTE(review): x is built from the single column "USG" plus the constant added by
# sm.add_constant, so it appears to have only two columns; max_features values above
# 2 would exceed that - verify how the installed scikit-learn version treats them.
rf_params={"max_depth":range(1,15), "max_features":[2,3,5,10,15], "n_estimators":[100,250,500,1000,2000]}
# n_jobs=-1 runs the fits on all available cores
grid_rf_model=GridSearchCV(rf_model, rf_params, cv=10, n_jobs=-1)
grid_rf_model.fit(x_train, y_train)

In [None]:
grid_rf_model.best_params_

In [None]:
# Refit with the tuned settings and score that refitted forest on the test set.
# Fixes: x_egitim / y_egitim are not defined anywhere in the visible notebook (the
# training split is x_train / y_train), and the predictions used to come from the
# untuned rf_model instead of best_rf_model.
# NOTE(review): max_features=5 is presumably copied from an earlier grid-search run;
# x appears to have only two columns - confirm the value is still appropriate.
best_rf_model=RandomForestRegressor(max_depth=4,max_features=5,n_estimators=250).fit(x_train, y_train)
best_preds_rf_model=best_rf_model.predict(x_test)
# A cell shows only its last expression, so return (MSE, R^2) together.
mean_squared_error(y_test, best_preds_rf_model), r2_score(y_test, best_preds_rf_model)

### GRADIENT BOOSTING REGRESSION

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gb_model=GradientBoostingRegressor().fit(x_train,y_train)

In [None]:
fits_gb=gb_model.predict(x_train)
mean_squared_error(y_train, fits_gb)

In [None]:
r2_score(y_train, fits_gb)

In [None]:
preds_gb=gb_model.predict(x_test)
mean_squared_error(y_test, preds_gb)

In [None]:
r2_score(y_test, preds_gb)

Grid search

In [None]:
# 4 learning rates x 5 depths x 4 ensemble sizes x 3 subsample fractions = 240
# candidates, each evaluated with 10-fold cross-validation (2400 fits).
gb_params={"learning_rate":[0.001, 0.01, 0.1, 0.2], "max_depth":[2,5,10,50,100],"n_estimators":[100,200,500,1000],
"subsample":[1,0.5,0.75]}
# only builds the search object; the fit happens in the next cell
grid_gb_model=GridSearchCV(gb_model, gb_params, cv=10,n_jobs=-1, verbose=2)

In [None]:
grid_gb_model.fit(x_train, y_train)

In [None]:
grid_gb_model.best_params_

In [None]:
# Refit with the tuned settings on the training split. The original line stopped at
# ".fit(x_train" - the target argument and the closing parenthesis were missing, so
# the cell could not even be parsed.
best_gb_model=GradientBoostingRegressor(learning_rate=0.01,max_depth=2,n_estimators=200,subsample=0.5).fit(x_train, y_train)
preds_best_gb_model=best_gb_model.predict(x_test)
# A cell shows only its last expression, so return (MSE, R^2) together.
mean_squared_error(y_test, preds_best_gb_model), r2_score(y_test, preds_best_gb_model)

### XG BOOST

In [None]:
!pip install xgboost
import xgboost as xgb
from xgboost import XGBRegressor
xgb_model=XGBRegressor().fit(x_train,y_train)

In [None]:
fits_xgb=xgb_model.predict(x_train)
mean_squared_error(y_train, fits_xgb)

In [None]:
r2_score(y_train, fits_xgb)

In [None]:
#for test set
# Predict with the XGBoost model. The original called gb_model.predict(...), i.e. the
# scikit-learn gradient-boosting model from the previous section, so the "XGBoost"
# test metrics were really those of that other model.
preds_xgb=xgb_model.predict(x_test)
mean_squared_error(y_test, preds_xgb)

In [None]:
r2_score(y_test, preds_xgb)

Grid search

In [None]:
xgb_params={"colsample_bytree":[0.4,0.5],"n_estimators":[100,250,500],"max_depth":[5,6],
"learning_rate":[0.01,0.5]}
grid_xgb_model=GridSearchCV(xgb_model, xgb_params,cv=10,n_jobs=-1,verbose=2)
grid_xgb_model.fit(x_train, y_train)

In [None]:
grid_xgb_model.best_params_

In [None]:
best_xgb_model=XGBRegressor(colsample_bytree=0.4,learning_rate=0.01,max_depth=5,n_estimators=500).fit(x_train,y_train)

In [None]:
#model with test set
preds_best_xgb=best_xgb_model.predict(x_test)
mean_squared_error(y_test, preds_best_xgb)

In [None]:
r2_score(y_test, preds_best_xgb)

In [None]:
#model with train set
preds_best_xgb=best_xgb_model.predict(x_train)
mean_squared_error(y_train, preds_best_xgb)

In [None]:
r2_score(y_train, preds_best_xgb)