In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import date
import plotly.figure_factory as ff



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<ul style='list-style-type: none'>
    <li><a href='#part1' style='text-decoration: none; font-size: 24px'>1. Preprocessing data</a></li>
    <li><a href='#part2' style='text-decoration: none; font-size: 24px'>2. Heatmap</a></li>
    <li><a href='#part3' style='text-decoration: none; font-size: 24px'>3. Height and weight distribution</a></li>
    <li><a href='#part4' style='text-decoration: none; font-size: 24px'>4. Country and college distribution</a></li>
    <li><a href='#part5' style='text-decoration: none; font-size: 24px'>5. Top-50 players by salary</a></li>
    <li><a href='#part6' style='text-decoration: none; font-size: 24px'>6. Basic regression</a></li>
    <li><a href='#part7' style='text-decoration: none; font-size: 24px'>7. Team rating</a></li>
    <li><a href='#part8' style='text-decoration: none; font-size: 24px'>8. Height by country</a></li>
    <li><a href='#part9' style='text-decoration: none; font-size: 24px'>9. Salary by draft_round</a></li>
</ul>










<a id='part1'><h1>1. Preprocessing data</h1></a>

Importing and exploring the data

In [None]:
# Load the raw NBA 2K20 player table.
# No date parsing is requested here: `parse_dates=True` only targets the index,
# and this file is read with the default integer index, so it had no effect.
# `b_day` stays a string and is converted explicitly in the preprocessing cell.
data = pd.read_csv("../input/nba2k20-player-dataset/nba2k20-full.csv")
data.head()

In [None]:
# Summary statistics of the freshly loaded frame (displayed as the cell output).
data.describe()

In [None]:
# Column dtypes as read from the CSV, to see which columns still need conversion.
data.dtypes

Some columns need to be converted to numeric data types.

In [None]:
# Convert the string-encoded columns to numeric types.
# Vectorised `.str` accessors are used instead of `data[col][i]` look-ups, so the
# conversion does not depend on the frame having a default 0..n-1 index.
# NOTE: this cell expects the freshly loaded frame; after it has run once the
# columns are no longer strings, so reload `data` before re-running it.
data['weight'] = data['weight'].str.split().str[3].astype(float)    # 4th whitespace-separated token (plotted later as kg)
data['height'] = data['height'].str.split().str[-1].astype(float)   # last token (multiplied by 100 later to get cm)
data['salary'] = data['salary'].str.split('$').str[1].astype('int64')  # text after the '$' sign
data['jersey'] = data['jersey'].str.split('#').str[1].astype('int64')  # text after the '#' sign

# Birth date -> `datetime.date`. `%y` is a two-digit year: Python maps 69-99 to
# 1969-1999 and 00-68 to 2000-2068.
data['b_day'] = data['b_day'].apply(lambda raw: datetime.strptime(raw, '%m/%d/%y').date())

# Age in whole years as of today. Casting a timedelta Series to '<m8[Y]' is not
# supported by pandas >= 2.0 (only s/ms/us/ns units), so the elapsed days are
# divided by the mean Gregorian year length (365.2425 days) that numpy uses for
# its 'Y' unit, which keeps the values the same as the old cast produced.
birth_ts = pd.to_datetime(data['b_day'])
days_alive = (pd.Timestamp.today().normalize() - birth_ts).dt.days
data['age'] = (days_alive // 365.2425).astype('int64')


def _draft_to_int(value):
    """Return 0 for 'Undrafted', otherwise the value converted to int."""
    return 0 if value == 'Undrafted' else int(value)


for draft_col in ('draft_round', 'draft_peak'):
    data[draft_col] = data[draft_col].map(_draft_to_int)

# Missing college / team become explicit categories so they show up in the charts.
data['college'] = data['college'].fillna('No college')
data['team'] = data['team'].fillna('No team')


Let's take a look at data again

In [None]:
# First rows after preprocessing, to check the converted columns.
data.head()

In [None]:
# Column dtypes after preprocessing.
data.dtypes

<a id='part2'>
    <h1>2. Heatmap</h1>
</a>

Correlation between features on a heatmap

In [None]:
# Correlation heatmap of the numeric features.
# Only numeric columns are correlated: the frame still holds text columns
# (e.g. team, college, country), and DataFrame.corr() on pandas >= 2.0 raises
# on non-numeric data instead of silently dropping it.
plt.figure(figsize=(30,15))
sns.set(font_scale=1.8)
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='Blues', annot=True);

The highest correlation is between draft_round and draft_peak. There is also a high correlation between weight and height, and between salary and rating.

<a id='part3'>
    <h1>3. Height and weight distribution</h1>
</a>

In [None]:
# Side-by-side histograms of height (converted to cm) and weight, each with a
# vertical line drawn at the column mean.
mean_height_cm = data['height'].mean()*100
mean_weight = data['weight'].mean()

fig = make_subplots(rows=1, cols=2, specs=[[{"type": "histogram"}, {"type": "histogram"}]])

# Left panel: height histogram with fixed 3 cm bins between 150 and 280.
height_hist = go.Histogram(
    x=data['height']*100,
    xbins={'start': 150, 'end': 280, 'size': 3},
    name='height, cm',
    hovertemplate='Count: %{y}<br>Height: %{x}cm',
)
fig.add_trace(height_hist, col=1, row=1)

# Mean-height marker; added without row/col, exactly as before.
height_mean_line = go.Scatter(
    x=[mean_height_cm, mean_height_cm],
    y=[0, 91],
    mode='lines',
    name='Mean height',
    hovertemplate='Mean: %{x:.2f}',
)
fig.add_trace(height_mean_line)

# Right panel: weight histogram with bins of width 3 spanning the observed range.
weight_hist = go.Histogram(
    x=data['weight'],
    xbins={'start': min(data['weight']), 'end': max(data['weight']), 'size': 3},
    name='weight, kg',
    hovertemplate='Count: %{y}<br>Weight: %{x}kg',
)
fig.add_trace(weight_hist, col=2, row=1)

weight_mean_line = go.Scatter(
    x=[mean_weight, mean_weight],
    y=[0, 91],
    mode='lines',
    name='Mean weight',
    hovertemplate='Mean: %{x:.2f}',
)
fig.add_trace(weight_mean_line, col=2, row=1)

# Layout: shared hover mode plus a centred title.
fig.update_layout(hovermode='x')
centered_title = {
    'text': "Height and weight distribution",
    'y': 0.9,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=centered_title)
fig.show()

In [None]:
# Weight against height with marginal distributions (box on x, violin on y).
# A title and unit-bearing axis labels are set so the figure can be read on its
# own: weight is the kg value and height the metre value (it is multiplied by
# 100 to get cm in the histogram cell).
fig = px.scatter(data, x="weight", y="height",
                 marginal_x="box", marginal_y="violin",
                 color_discrete_sequence=['orange'],
                 labels={'weight': 'weight, kg', 'height': 'height, m'},
                 title='Height vs. weight of players'
                )
fig.show()

<a id='part4'><h1>4. Country and college distribution</h1></a>

In [None]:
# Share of players per country, drawn as a donut chart without slice labels.
country_count = data['country'].value_counts()
country_pie = go.Pie(
    labels=country_count.index,
    values=country_count.values,
    hole=0.4,
    textinfo="none",
)
fig = go.Figure(country_pie)

# Centre the title at the top of the figure.
country_title = dict(
    text="Percentage of players by country",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(title=country_title)

fig.show()

About 75% of the players are from the USA; every other country accounts for a much smaller share.

In [None]:
# Share of players per college (including the 'No college' filler category),
# drawn as a donut chart without slice labels.
college_count = data['college'].value_counts()
college_pie = go.Pie(
    labels=college_count.index,
    values=college_count.values,
    hole=0.4,
    textinfo="none",
)
fig = go.Figure(college_pie)

# Centre the title at the top of the figure.
college_title = dict(
    text="Percentage of players by college",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
fig.update_layout(title=college_title)

fig.show()

About 15% of the players did not attend college. Kentucky and Duke are the colleges with the largest shares of players.

<a id='part5'><h1>5. Top-50 players by salary</h1></a>

The plot below shows the relationship between age, salary and rating for the 50 highest-paid players.

In [None]:
# Merge equivalent position labels: 'C-F' is folded into 'F-C' and 'G-F' into 'F-G'.
position_aliases = {'C-F': 'F-C', 'G-F': 'F-G'}
data['position'] = data['position'].replace(position_aliases)

In [None]:


# The 50 rows with the highest salary, plotted as salary (log scale) vs. age;
# marker size encodes rating and colour encodes position.
top_paid = data.sort_values(by='salary', ascending=False).head(50)
fig = px.scatter(
    top_paid,
    x="salary",
    y="age",
    size="rating",
    color="position",
    title="Top-50 players",
    log_x=True,
    size_max=20,
)
fig.show()

The highest-paid players in the top 50 are mostly 28 or older and play at the G and F positions.

<a id='part6'><h1>6. Basic regression</h1></a>

These are simple regression plots for predicting salary from player rating, split by position.

In [None]:
# Salary against rating with an OLS trend line, one facet per position
# (three facets per row). A title is set so the figure can be read on its own.
fig = px.scatter(
    data, x='rating', y='salary', opacity=0.65,
    trendline='ols', trendline_color_override='darkblue', 
    facet_col='position', facet_col_wrap=3, color='salary',
    title='Salary vs. rating by position'
)
fig.show()

In [None]:
# Salary against rating with an OLS trend line, one facet per draft round
# (0 stands for undrafted players). A title is set so the figure can be read on its own.
fig = px.scatter(data, x='rating', y='salary', opacity=0.65,
                 trendline='ols', trendline_color_override='darkblue', 
                 facet_col='draft_round', facet_col_wrap=3, color='salary',
                 title='Salary vs. rating by draft round'
                )
fig.show()

The same regression plot, split by draft_round. The highest salaries go to players picked in the first draft round.

<a id='part7'><h1>7. Team rating</h1></a>

Let's calculate average rating of players for each team

In [None]:
# Mean player rating per team, ordered from the highest-rated team to the lowest.
data_team = (
    data.loc[:, ['team', 'rating']]
    .groupby('team')
    .mean()
    .reset_index()
    .sort_values(by='rating', ascending=False)
)

In [None]:
# Bar chart of the mean rating per team; the 'No team' filler category is left out.
teams_only = data_team.query("team != 'No team'")
fig = px.bar(
    teams_only,
    x='team',
    y='rating',
    color='team',
    labels={'rating':'mean rating of players'},
    title='Mean rating of players for each team',
    color_discrete_sequence=px.colors.qualitative.Safe,
)
fig.show()

The strongest team is 'Los Angeles Clippers' with mean rating of players 79.3 and the weakest is 'Detroit Pistons' with rating 73.9

<a id='part8'><h1>8. Height by country</h1></a>

In [None]:
# Mean player height per country, ordered from the tallest average to the shortest.
data_height = (
    data.loc[:, ['height', 'country']]
    .groupby('country')
    .mean()
    .reset_index()
    .sort_values(by='height', ascending=False)
)


In [None]:
# Bar chart of the mean height per country, one colour per country.
fig = px.bar(
    data_height,
    x='country',
    y='height',
    color='country',
    labels={'height':'mean height of players'},
    color_discrete_sequence=px.colors.qualitative.Vivid,
    title='Mean height of each country',
)
fig.show()

The chart shows the mean height of players for each country. The tallest players are from Austria and the shortest are from Puerto Rico.

<a id='part9'><h1>9. Salary by draft_round</h1></a>

In [None]:
# Salary distribution per draft round as box plots with every point shown;
# quartiles are computed with plotly's "exclusive" method.
fig = px.box(
    data,
    x="draft_round",
    y="salary",
    color="draft_round",
    title='Salary exploring by draft_round',
    points='all',
)
fig.update_traces(quartilemethod="exclusive")
fig.show()

Draft round 0 (undrafted players): 50% of the salaries are between 1.4M and 4.76M; the median is 2.56M<br>
Draft round 1: 50% of the salaries are between 2.9M and 15.6M; the median is 6.5M<br>
Draft round 2: 50% of the salaries are between 1.4M and 8.5M; the median is 1.7M

So the highest-paid players come from draft round 1, where the maximum salary is 40.2M. Draft round 2 has a wider salary range than the undrafted players, but its median salary is about 0.8M lower.