In [None]:
import os
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn import metrics
from scipy.stats.mstats import normaltest
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# List every file available under the Kaggle input directory, one full path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# pandas display options applied for the rest of the notebook.
display_options = {
    'display.min_rows': 100,
    'display.max_columns': 50,
    'display.width': 50,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the players table; the first CSV column is used as the row index.
DATA_PATH = '/kaggle/input/nba-players-data/all_seasons.csv'
dataset = pd.read_csv(DATA_PATH, index_col=0)

# Brief description of the data set and a summary of its attributes.

The dataset contains over 20 years of data on each player who has been part of an NBA team's roster. It captures demographic variables such as age, height, weight and place of birth, as well as biographical details like the team played for, draft year and round. In addition, it has basic box score statistics such as games played, average number of points, rebounds, assists, etc.

I downloaded it from https://www.kaggle.com/justinas/nba-players-data and all credit for collecting this dataset goes to Justinas Cirtautas.

In [None]:
# Preview the first rows of the loaded table as a sanity check.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
dataset.shape

In [None]:
# Column labels available in the loaded table.
dataset.columns

# Main objective(s) of this analysis.
Will the model be focused on prediction or interpretation? This analysis is focused on prediction.

In this analysis I will try to predict player_height from the other features, so player_height is my target variable and the remaining columns are my features. There might be a strong correlation between player_height and player_weight - during my predictions I will consider dropping the player_weight column to test prediction accuracy with and without it.

# EDA 

In [None]:
# Count missing values in each column.
dataset.isnull().sum()

In [None]:
# Descriptive statistics from DataFrame.describe() with its default settings.
dataset.describe()

In [None]:
# Scatter of weight against height to see how closely the two columns track each other.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(dataset['player_height'], dataset['player_weight'], s=10, c='r')
ax.set_xlabel('Player Height (cm)')
ax.set_ylabel('Player Weight (kg)')
ax.set_title('Player Height vs Weight');

In [None]:
# Histogram of the target column (25 bins) to inspect its distribution.
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(dataset['player_height'], bins=25)
ax.set_xlabel('Player Height (cm)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Player Height');

In [None]:
# Columns whose pairwise correlations are drawn as a heatmap.
heatmap_columns = [
    'age', 'player_height', 'player_weight', 'gp', 'pts', 'reb', 'ast',
    'net_rating', 'oreb_pct', 'dreb_pct', 'usg_pct', 'ts_pct', 'ast_pct',
]
heatmap_data = dataset[heatmap_columns]

fig_dims = (10, 10)
fig, ax = plt.subplots(figsize=fig_dims)
# Draw onto the axes created above rather than relying on the implicit current axes.
sn.heatmap(heatmap_data.corr(), ax=ax)

In [None]:
# Same correlation matrix as the heatmap, shown as numbers.
heatmap_data.corr()

In [None]:
# Count duplicated rows; keep=False flags every member of a duplicate group, not just the repeats.
dataset.duplicated(keep=False).sum()

In [None]:
# Average player height for each country.
player_height_column = dataset.loc[:, ['player_height', 'country']]
height_by_country = player_height_column.groupby('country')
height_by_country.mean()

# Summary of training three linear regression models.

## MODEL 1.

Simple linear regression. As there is a high correlation between player_height and player_weight, I've dropped the rest of the columns and used player_weight as X and player_height as y.

In [None]:
# Columns Model 1 does not use; per the section text, player_height and player_weight remain.
model1_unused_columns = [
    'age', 'player_name', 'team_abbreviation', 'college', 'country',
    'draft_year', 'draft_round', 'draft_number', 'season',
    'gp', 'pts', 'reb', 'ast', 'net_rating',
    'usg_pct', 'ts_pct', 'ast_pct', 'oreb_pct', 'dreb_pct',
]
data_model1 = dataset.drop(columns=model1_unused_columns)

In [None]:
# Display the reduced frame used for Model 1.
data_model1

In [None]:
# Pairwise correlations between the columns kept for Model 1.
data_model1.corr()

In [None]:
# Model 1 inputs: column 0 of data_model1 is the target, every later column is a feature.
X = data_model1.iloc[:, 1:].values
y = data_model1.iloc[:, 0].values

# Hold out 25% of the rows for testing. The seed is fixed so that the split, and
# every metric computed from it, is identical after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=72018)
print(f'X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}')

# Ordinary least squares fitted on the training rows only.
regr = LinearRegression()
regr.fit(X_train, y_train)
print(f'Coefficient: {regr.coef_}')
print(f'Intercept: {regr.intercept_}')

In [None]:
# Score Model 1 on the held-out rows. Predictions are left unrounded so the
# metrics reflect the model's actual output rather than a 2-decimal approximation.
y_pred = regr.predict(X_test)
print(f'Mean Absolute Error(MAE): {metrics.mean_absolute_error(y_test, y_pred)}')
print(f'Mean Squared Error(MSE): {metrics.mean_squared_error(y_test, y_pred)}')
print(f'R2-Score: {metrics.r2_score(y_test, y_pred)}')

## MODEL 2.
The 4 columns with the highest correlation - X (player_weight, oreb_pct, dreb_pct) and y (player_height) - with cross-validation and a pipeline using StandardScaler and GridSearchCV. We can record a significant improvement of the R2-Score from 0.684 to 0.753.

In [None]:
# Columns Model 2 does not use; unlike Model 1, oreb_pct and dreb_pct are kept.
model2_unused_columns = [
    'age', 'player_name', 'team_abbreviation', 'college', 'country',
    'draft_year', 'draft_round', 'draft_number', 'season',
    'gp', 'pts', 'reb', 'ast', 'net_rating',
    'usg_pct', 'ts_pct', 'ast_pct',
]
data_model2 = dataset.drop(columns=model2_unused_columns)

In [None]:
# Display the reduced frame used for Model 2.
data_model2

In [None]:
# Model 2 arrays: column 0 of data_model2 is the target, every later column is a feature.
y_2 = data_model2.iloc[:, 0].to_numpy()
X_2 = data_model2.iloc[:, 1:].to_numpy()

In [None]:
# Inspect the feature matrix built for Model 2.
X_2

In [None]:
# Inspect the target vector built for Model 2.
y_2

In [None]:
# Pairwise correlations between the columns kept for Model 2.
data_model2.corr()

In [None]:
# Three shuffled folds with a fixed seed, shared by every cross-validated step below.
kf = KFold(n_splits=3, shuffle=True, random_state=72018)

In [None]:
# Show the first ten row indices and the size of each train/test fold.
for fold_train, fold_test in kf.split(X_2):
    print("Train index:", fold_train[:10], len(fold_train))
    print("Test index:", fold_test[:10], len(fold_test))
    print('')

In [None]:
# Baseline Model 2 pipeline: standardise the features, then fit ordinary least squares.
s = StandardScaler()
lr = LinearRegression()
estimator = Pipeline(steps=[("scaler", s), ("regression", lr)])

In [None]:
# Cross-validated predictions from the pipeline, using the folds defined by `kf`.
predictions = cross_val_predict(estimator, X_2, y_2, cv=kf)

In [None]:
# R2 of the cross-validated predictions against the Model 2 target.
metrics.r2_score(y_2, predictions)

In [None]:
# Scale -> polynomial expansion -> ridge regression, tuned jointly by grid search.
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("polynomial_features", PolynomialFeatures()),
    ("ridge_regression", Ridge()),
]
estimator = Pipeline(steps=pipeline_steps)

# Polynomial degrees 1-3 crossed with 30 geometrically spaced ridge penalties from 4 to 20.
alpha_grid = np.geomspace(4, 20, 30)
hparams = dict(
    polynomial_features__degree=[1, 2, 3],
    ridge_regression__alpha=alpha_grid,
)

grid = GridSearchCV(estimator, param_grid=hparams, cv=kf)

In [None]:
# Run the grid search on the Model 2 features and target.
grid.fit(X_2, y_2)

In [None]:
# Best cross-validated score and the hyper-parameters that produced it.
grid.best_score_, grid.best_params_

In [None]:
# Predict on the same X_2 the grid search was fitted on, so these are in-sample predictions.
y_predict = grid.predict(X_2)

In [None]:
# Score against y_2, the target built alongside X_2, rather than Model 1's `y`.
# y_predict was produced on the data the grid search was fitted on, so this is an
# in-sample R2; grid.best_score_ is the cross-validated figure.
r2_score(y_2, y_predict)

## MODEL 3
Ridge vs. Lasso regression

In [None]:
# Model 3: Ridge vs. Lasso on the Model 1 train/test split.
# A dedicated scaler is fitted on the training rows only and then applied to the
# test rows, so no test-set statistics leak into training and the scaler object
# from the Model 2 pipeline cell is not mutated.
scaler_m3 = StandardScaler()
X_train_s = scaler_m3.fit_transform(X_train)
X_test_s = scaler_m3.transform(X_test)

r = Ridge(alpha=0.001)
r.fit(X_train_s, y_train)
y_pred_r = r.predict(X_test_s)

# Lasso uses the same penalty strength so the two regularisers are compared like for like.
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_s, y_train)
y_pred_l = lasso.predict(X_test_s)

# Label each score; `y_pred` is the Model 1 linear-regression prediction, kept as the baseline.
print(f'Ridge R2-Score: {r2_score(y_test, y_pred_r)}')
print(f'Lasso R2-Score: {r2_score(y_test, y_pred_l)}')
print(f'Linear regression (Model 1) R2-Score: {r2_score(y_test, y_pred)}')

# MODELS SUMMARY
I trained 3 different models. Models 1 and 3 were evaluated on the same train/test split, while model 2 was evaluated with 3-fold cross-validation. According to the R2 results, model 2 achieved the highest score of 0.7533 thanks to polynomial features. Summarizing, I would recommend the model 2 approach for this dataset.

# Key findings related to the main objective(s) of the analysis.
In this analysis I will try to predict player_height from the other features, so player_height is my target variable and the remaining columns are my features. There might be a strong correlation between player_height and player_weight - during my predictions I will consider dropping the player_weight column to test prediction accuracy with and without it.

After analysing the dataset and applying linear regression models to try to predict the player_height target value, I came to a few conclusions. First of all, at the beginning I thought that there was a strong correlation between player_height and features like offensive and defensive rebounds. As a basketball fan I always thought that the biggest and tallest players usually grab the most rebounds, but after analysing the dataset we can see that the correlation is only moderate: 0.589033 for offensive rebounds and 0.614650 for defensive rebounds. The negative correlation between player_height and assists was also a big surprise for me. Of course, the biggest factor turned out to be player_weight, which makes a lot of sense, as statistically a person usually gets heavier the taller they are.

# Suggestions for next steps
In my opinion the most effective way to improve the prediction score would be to add specific features that have a high correlation with player_height: features like wingspan, foot size, hand size, vertical jump, quickness, etc.