<h1><center>Basketball Players Stats. Extended analysis</center></h1>

<center><img width="800" height="600" src="https://res.cloudinary.com/grohealth/image/upload/f_auto,fl_lossy,q_auto/v1581678662/DCUK/Content/iStock-959080376.jpg"></center>

In [None]:
import numpy as np
import pandas as pd

import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

# Lift the cap on how many columns pandas renders, so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [None]:
# Read the per-season player statistics and discard the 'height' and 'weight' columns.
df = pd.read_csv(
    '/kaggle/input/basketball-players-stats-per-season-49-leagues/players_stats_by_season_full_details.csv'
).drop(columns=['height', 'weight'])

# Preview the first rows of the resulting frame.
df.head()

In [None]:
# Display the column labels of the loaded frame.
df.columns

Let's look at the percentage of missing values (NaNs) in every column.

In [None]:
# Share of null entries per column, in percent, rounded to two decimals.
null_percent = df.isnull().sum().mul(100).div(len(df)).round(2)

# Tabulate as (column, percent), order from the smallest share to the largest,
# and keep only the columns that actually contain nulls.
missed = pd.DataFrame({
    'column': df.columns,
    'percent': null_percent.to_numpy(),
})
missed = missed.sort_values('percent')
missed = missed[missed['percent'] > 0]

# Horizontal bar chart: one bar per column with missing data.
fig = px.bar(
    data_frame=missed,
    x='percent',
    y='column',
    orientation='h',
    title='Missed values percent for every column (percent > 0)',
    width=600,
    height=600,
)

fig.show()

Which leagues are represented in the dataset?

In [None]:
# Number of rows per league as a two-column frame, sorted ascending so the
# largest league ends up at the top of the horizontal bar chart.
ds = (
    df['League']
    .value_counts()
    .rename_axis('league')
    .reset_index(name='number of samples')
    .sort_values('number of samples')
)

fig = px.bar(
    data_frame=ds,
    x='number of samples',
    y='league',
    orientation='h',
    title='Leagues presented in dataset',
    width=800,
    height=1000,
)

fig.show()

Let's do the same but for seasons.

In [None]:
# Number of rows per season as a two-column frame, sorted ascending by count.
ds = (
    df['Season']
    .value_counts()
    .rename_axis('season')
    .reset_index(name='number of samples')
    .sort_values('number of samples')
)

fig = px.bar(
    data_frame=ds,
    x='number of samples',
    y='season',
    orientation='h',
    title='Seasons presented in dataset',
    width=600,
    height=600,
)

fig.show()

What about stages?

In [None]:
# Number of rows per stage as a two-column frame, sorted ascending by count.
ds = (
    df['Stage']
    .value_counts()
    .rename_axis('stage')
    .reset_index(name='number of samples')
    .sort_values('number of samples')
)

# Pie chart: each slice is one stage, sized by its row count.
fig = px.pie(
    data_frame=ds,
    names='stage',
    values='number of samples',
    title='Stages presented in dataset',
    width=500,
    height=500,
)

fig.show()

Let's look at the distributions of games played per season and minutes played.

In [None]:
# (column in df, legend label) for each histogram; one subplot row per entry.
panels = [
    ('GP', 'Games Played'),
    ('MIN', 'Minutes Played'),
]

# Single-column grid with the panels stacked vertically.
fig = make_subplots(rows=len(panels), cols=1)

# Subplot rows are 1-based. `add_trace(row=, col=)` replaces the deprecated
# `append_trace`, and the explicit row counter replaces the original
# `(i // 1) + 1` / `(i % 1) + 1` arithmetic, which always evaluated to
# (i + 1, 1).
for row, (column, label) in enumerate(panels, start=1):
    fig.add_trace(
        go.Histogram(
            x=df[column],
            nbinsx=100,
            name=label,
        ),
        row=row,
        col=1,
    )

fig.update_layout(
    title_text='Games played and minutes played distribution',
    height=900,
    width=800,
)

fig.show()

Let's see the distributions of FGM (Field Goals Made) and FGA (Field Goals Attempted).

In [None]:
# Stack FGM and FGA into one long frame: `number` holds the value and `legend`
# records which statistic the row came from. The original row index is kept.
data = pd.concat(
    [
        pd.DataFrame({'number': df[stat], 'legend': stat})
        for stat in ('FGM', 'FGA')
    ]
)

In [None]:
# Histogram of the stacked FGM/FGA values, coloured by the statistic they belong to.
fig = px.histogram(
    data_frame=data,
    x='number',
    color='legend',
    nbins=200,
    title='FGM vs FGA distributions',
    height=700,
    width=800,
)

fig.show()

Let's do the same for 3PM (3-pointers made) and 3PA (3-pointers attempted).

In [None]:
# Stack 3PM and 3PA into one long frame: `number` holds the value and `legend`
# records which statistic the row came from.
data = pd.concat(
    [
        pd.DataFrame({'number': df[stat], 'legend': stat})
        for stat in ('3PM', '3PA')
    ]
)

# Histogram of both statistics, coloured by `legend`.
fig = px.histogram(
    data_frame=data,
    x='number',
    color='legend',
    nbins=80,
    title='3PM vs 3PA distributions',
    height=700,
    width=800,
)

fig.show()

Now we will do it for FTM (free throws made) and FTA (free throws attempted).

In [None]:
# Stack FTM and FTA into one long frame: `number` holds the value and `legend`
# records which statistic the row came from.
data = pd.concat(
    [
        pd.DataFrame({'number': df[stat], 'legend': stat})
        for stat in ('FTM', 'FTA')
    ]
)

# Histogram of both statistics, coloured by `legend`.
fig = px.histogram(
    data_frame=data,
    x='number',
    color='legend',
    nbins=80,
    title='FTM vs FTA distributions',
    height=700,
    width=800,
)

fig.show()

In [None]:
# Distribution of the TOV (turnovers) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='TOV',
    nbins=100,
    title='Number of turnovers distribution',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Distribution of the PF (personal fouls) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='PF',
    nbins=100,
    title='Number of personal fouls distribution',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Distribution of the ORB (offensive rebounds) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='ORB',
    nbins=100,
    title='Number of offensive rebounds distribution',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Distribution of the DRB (defensive rebounds) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='DRB',
    nbins=100,
    title='Number of defensive rebounds distribution',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Distribution of the AST (assists) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='AST',
    nbins=100,
    title='Number of assists distribution',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Distribution of the STL (steals) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='STL',
    nbins=100,
    title='Number of steals distribution',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Distribution of the BLK (blocks) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='BLK',
    nbins=100,
    title='Number of blocks distribution',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Distribution of the PTS (points) column over all rows.
fig = px.histogram(
    data_frame=df,
    x='PTS',
    nbins=100,
    title='Number of points distribution',
    height=600,
    width=800,
)

fig.show()

Time to check nationalities

In [None]:
# Collapse the data to one row per distinct (Player, nationality) pair, so a
# player with many season/team rows contributes a single entry.
# NOTE(review): rows sharing the same 'Player' value and nationality are merged
# into one entry — presumably 'Player' is a name; confirm it is unique enough.
unique_players = (
    df.groupby(['Player', 'nationality'])['Team']
    .count()
    .reset_index()
)

# Count the pairs per nationality, sort ascending, and keep the 40 largest
# (the tail of the ascending order).
ds = (
    unique_players['nationality']
    .value_counts()
    .rename_axis('nationality')
    .reset_index(name='number of samples')
    .sort_values('number of samples')
    .tail(40)
)

fig = px.bar(
    data_frame=ds,
    x='number of samples',
    y='nationality',
    orientation='h',
    title='Top 40 nationalities presented in dataset',
    width=900,
    height=900,
)
fig.show()

# TBD