In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import plotly.express as px

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Data Extraction and Basic Exploration

In [None]:
# Read the NBA 2K20 player table from the Kaggle input directory
data = pd.read_csv(filepath_or_buffer='/kaggle/input/nba2k20-player-dataset/nba2k20-full.csv')

In [None]:
# Random five-row preview of the freshly loaded table
data.sample(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the raw table
data.info()
# Observations:
# 1. salary is stored as a string
# 2. the height, weight and salary columns will require some transformation

In [None]:
# Number of missing values in each column
data.isna().sum()
# 'team' and 'college' are the columns with missing values

In [None]:
# Show every row whose 'team' value is missing
data[data['team'].isna()]

# Data Cleaning and Transformation

In [None]:
# Keep only the rows that have a team; rows with a missing 'team' are discarded
data = data.dropna(subset=['team'])

In [None]:
# Convert the salary column from a '$'-prefixed string to an integer and derive a
# salary bracket column.
#
# regex=False makes '$' a literal character. Interpreted as a regular expression it is
# the end-of-string anchor, so relying on the default makes the result depend on the
# installed pandas version. The astype(str) keeps the cell re-runnable after the column
# has already been converted to integers.
data['salary'] = (
    data['salary']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype('int32')
)


def _salary_group(salary):
    """Return the salary bracket label for a salary expressed in dollars."""
    if salary > 30000000:
        return 'More than 30m'
    if salary >= 20000000:
        return '20m - 30m'
    # Salaries from 10m up to (but excluding) 20m get their own bracket; previously they
    # were labelled 'Less than 10m', which contradicted the label.
    if salary >= 10000000:
        return '10m - 20m'
    return 'Less than 10m'


data['salary_group'] = data['salary'].apply(_salary_group)

In [None]:
# Split the combined weight column on '/' into a pounds part and a kilograms part,
# strip the unit suffixes and convert both parts to floats.
#
# regex=False keeps the '.' in 'lbs.' / 'kg.' a literal dot rather than a regex wildcard.
# float64 is used instead of float16: half precision only keeps about three significant
# digits, so a value such as 104.3 kg would be stored as 104.3125.
weight_parts = data['weight'].str.split('/', expand=True)

data['weight_in_lb'] = (
    weight_parts[0]
    .str.replace('lbs.', '', regex=False)
    .str.strip()
    .astype('float64')
)

data['weight_in_kg'] = (
    weight_parts[1]
    .str.replace('kg.', '', regex=False)
    .str.strip()
    .astype('float64')
)

In [None]:
# Split the combined height column on '/' into a feet-inches string and a metres value.
#
# The metres part is stored as float64 rather than float16: half precision would store
# 2.06 m as 2.0605, which then leaks into hover labels and averages.
height_parts = data['height'].str.split('/', expand=True)

# The feet-inches part stays a string; only the whitespace left around the '/' is removed
data['height_in_ft'] = height_parts[0].str.strip()
data['height_in_m'] = height_parts[1].str.strip().astype('float64')

In [None]:
# Derive each player's age as of the end of 2020.
#
# At year end the age equals the difference in calendar years, so only the birth year
# is needed; .dt.year extracts it for the whole column at once instead of calling a
# Python lambda per row.
data['age_working'] = pd.to_datetime(data['b_day']).dt.year
data['age_as_of_end_2020'] = 2020 - data['age_working']

# Players aged exactly 30 fall into the 'Over 30' group
data['age_group'] = np.where(data['age_as_of_end_2020'] >= 30, 'Over 30', 'Under 30')

In [None]:
# Derived columns: rating bracket, years since the draft, and a tenure bracket


def _rating_group(rating):
    """Return the rating bracket label for a single rating value."""
    if rating < 75:
        return 'Less than 75'
    if rating >= 90:
        return 'More than or equal to 90'
    return '75-89'


def _tenure_category(years):
    """Return the tenure bracket label for a number of years in the NBA."""
    if years > 10:
        return 'More than 10'
    if years < 5:
        return 'Less than 5'
    return '5-10'


data['rating_group'] = data['rating'].map(_rating_group)
data['years_in_nba'] = 2020 - data['draft_year']
data['years_in_nba_category'] = data['years_in_nba'].map(_tenure_category)

In [None]:
# Remove the raw source columns and the helper column that are no longer needed
columns_to_drop = [
    'height',
    'weight',
    'b_day',
    'jersey',
    'age_working',
    'draft_year',
    'draft_round',
    'draft_peak',
]
data = data.drop(columns=columns_to_drop)

In [None]:
# Re-check column names, dtypes and non-null counts after the transformations above
data.info()

In [None]:
# This is the final dataset used for the EDA and visualization
data.sample(n=5)

# EDA on Prep Dataset

In [None]:
# Summary statistics of the prepared dataset, rounded to one decimal place
data.describe().round(decimals=1)

In [None]:
# Per-team summary: roster size plus the mean salary, weight, height, age, rating and
# years of experience, ordered from the highest to the lowest average salary.
#
# Aggregations are given by name rather than as NumPy callables: pandas >= 2.1 emits a
# FutureWarning for np.mean inside agg(), and 'count' states the intent (players per
# team) directly instead of counting "non-zero" name strings.
team = (
    data.groupby('team')
    .agg({
        'full_name': 'count',
        'salary': 'mean',
        'weight_in_kg': 'mean',
        'height_in_m': 'mean',
        'age_as_of_end_2020': 'mean',
        'rating': 'mean',
        'years_in_nba': 'mean',
    })
    .rename(columns={
        'full_name': 'Number of Players',
        'salary': 'Average Salary',
        'weight_in_kg': 'Average Weight (kg)',
        'height_in_m': 'Average Height (m)',
        'age_as_of_end_2020': 'Average Age',
        'rating': 'Average Rating',
        'years_in_nba': 'Average Years of Experience in NBA',
    })
    .reset_index()
    .sort_values('Average Salary', ascending=False)
)

# Round to the nearest whole number before the integer cast; a bare astype(int)
# truncates, so an average age of 26.9 would be reported as 26.
team['Average Salary'] = team['Average Salary'].round().astype('int32')
team['Average Weight (kg)'] = team['Average Weight (kg)'].round().astype('int16')
team['Average Age'] = team['Average Age'].round().astype('int16')

# Thousands separators are added only after sorting, so the sort above stays numeric
team['Average Salary'] = team['Average Salary'].apply(lambda x : f'{x:,}')
team['Average Height (m)'] = team['Average Height (m)'].round(1)
team['Average Rating'] = team['Average Rating'].round(1)
team['Average Years of Experience in NBA'] = team['Average Years of Experience in NBA'].round(1)


print('Team Overall Statistics\n')
print(team.to_string(index=False))

In [None]:
# The five players with the largest salary, with their position and team
top_paid = data.sort_values('salary', ascending=False).head(5)
top_paid_columns = ['full_name','position','team','salary']

print('Top 5 Highest Paid Players\n')
print(top_paid[top_paid_columns].to_string(index=False))

In [None]:
# The five players with the highest rating, with their position, team and salary
top_rated = data.sort_values('rating', ascending=False).head(5)
top_rated_columns = ['full_name','position','team','rating','salary']

print('Top 5 Highest Rated Players\n')
print(top_rated[top_rated_columns].to_string(index=False))

In [None]:
# Number of players per college. groupby() leaves out rows with a missing college, and
# 'count' counts the player names in each group directly instead of routing the names
# through np.count_nonzero.
college_counts = (
    data.groupby('college')
    .agg({'full_name': 'count'})
    .rename(columns={'full_name': 'Number of Players'})
    .reset_index()
)

print('Top 5 Colleges with Highest Number of drafted NBA players\n')
print(college_counts.sort_values('Number of Players', ascending=False).head(5).to_string(index=False))

In [None]:
# Random five-row preview of the prepared dataset
data.sample(n=5)

In [None]:
# Number of players and mean rating per country. Aggregations are given by name:
# pandas >= 2.1 emits a FutureWarning for np.mean inside agg(), and 'count' expresses
# "players per country" directly.
country_stats = (
    data.groupby('country')
    .agg({'full_name': 'count', 'rating': 'mean'})
    .rename(columns={'full_name': 'Number of Players', 'rating': 'Average Rating'})
    .reset_index()
)

print('Top 5 Countries with NBA Players\n')
# head() with no argument returns the first five rows
print(country_stats.sort_values(by='Number of Players', ascending=False).head().to_string(index=False))

# Visualization

# Scatterplot of Rating & Salary by Age Group

In [None]:
# Rating vs. salary coloured by age group, with an OLS trendline per group and
# marginal distributions on both axes
fig = px.scatter(
    data,
    x='rating',
    y='salary',
    color='age_group',
    labels={'salary':'Salary','rating':'Rating', 'age_group':'Age Group'},
    hover_data=['full_name','team','position','country','age_as_of_end_2020'],
    trendline='ols',
    marginal_x='box',
    marginal_y='violin',
)
# Each update_* call returns the figure, so the styling calls can be chained
fig.update_layout(
    title='Visual View between Salary and Rating By Age Group'
).update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

**Commentary:**
* There appears to be a roughly linear relationship between rating and salary (i.e. a higher rating generally translates to a higher salary)
* The median rating of players over the age of 30 is higher than the median rating of players under the age of 30
* There are more outliers in rating among players under the age of 30
* The distribution of salary for players over 30 years old is wider than that for players under 30 years old

# Relationship between Rating & Salary by Years in NBA

In [None]:
# Rating vs. salary, one panel per years-in-NBA category, coloured by age group
fig = px.scatter(
    data,
    x='rating',
    y='salary',
    facet_col='years_in_nba_category',
    color='age_group',
    trendline='ols',
    labels={'rating':'Rating', 'salary':'Salary', 'years_in_nba_category':'Years in NBA', 'age_group':'Age Group'},
)
# Each update_* call returns the figure, so the styling calls can be chained
fig.update_layout(
    title='Visual View Between Rating & Salary by Years in NBA'
).update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

**Commentary:**
* While a higher rating generally translates to a higher salary, the effect of each additional unit of rating on salary differs between players of different experience levels
* For players with more than 10 years of experience, each additional unit of rating translates to a larger salary increment than it does for players with less than 5 years of experience

# Relationship between Years in NBA & Salary by Rating Group

In [None]:
# Years in the NBA vs. salary, one panel per rating group, coloured by age group
fig = px.scatter(
    data,
    x='years_in_nba',
    y='salary',
    color='age_group',
    facet_col='rating_group',
    labels={'age_group':'Age Group', 'years_in_nba':'Years in NBA', 'salary':'Salary'},
    trendline='ols',
)
# Each update_* call returns the figure, so the styling calls can be chained
fig.update_layout(
    title='Visual View Between Years in NBA & Salary by Rating Group'
).update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

**Commentary:**
* There is a discernible difference in salary between players rated 90 and above and players rated below 75
* For players in the middle rating category (75-89) and over the age of 30, there seems to be an inverse relationship between number of years in NBA and Salary

# Distribution of Salary by Age Group

In [None]:
# Salary histogram split by age group, with a box plot in the margin
fig = px.histogram(
    data,
    x='salary',
    marginal='box',
    color='age_group',
    labels={'age_group':'Age Group'},
)
# Each update_* call returns the figure, so the styling calls can be chained
fig.update_layout(
    title='Distribution of Salary by Age Group'
).update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

**Commentary:**
* There are more outliers in salary for players under the age of 30
* Median salary for a player above 30 years old is 3-4 times higher than the median salary of a player under 30 years old

# Distribution of Salary by Rating Group

In [None]:
# Salary histogram split by rating group, with a box plot in the margin
fig = px.histogram(
    data,
    x='salary',
    marginal='box',
    color='rating_group',
    labels={'rating_group':'Rating Group'},
    hover_data=['full_name'],
)
# Each update_* call returns the figure, so the styling calls can be chained
fig.update_layout(
    title='Distribution of Salary by Rating Group'
).update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

**Commentary:**
* The variability in the players' salary in the low category (<75) is the lowest, however this category has the highest number of outliers
* The variability in the players' salary in the middle category (75-89) is high relative to the variability in the players' salary in the low/high (<75 / >=90) category

# Visual View Between Weight & Height by Position

In [None]:
# Weight vs. height, one panel (and colour) per playing position
fig = px.scatter(
    data,
    x='weight_in_kg',
    y='height_in_m',
    color='position',
    hover_data=['full_name','rating'],
    labels={'weight_in_kg':'Weight (kg)', 'height_in_m':'Height (m)'},
    facet_col='position',
)
# The legend is hidden because each panel already corresponds to one position
fig.update_layout(
    title='Scatterplot between Weight (kg) and Height (m) by Position',
    showlegend=False,
).update_xaxes(showgrid=False).update_yaxes(showgrid=False)
fig.show()

**Commentary:**
* It seems that the average Guard is physically smaller than players in other positions