# Statistical analysis of basketball players

## Import the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Display all columns

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.options.display.max_columns = None

## Read dataset CSV file

In [None]:
# Relative path of the per-season player statistics CSV.
csv_path = '../input/basketball-players-stats-per-season-49-leagues/players_stats_by_season_full_details.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)

## Inspect the data

In [None]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ between runs).
data.sample(n=5)

In [None]:
# Count the missing values in every column.
data.isna().sum()

**Description**

The date of birth, weight, height, high school, and nationality columns contain missing values. Let's look at how those missing values are distributed and decide how to clean them.

In [None]:
# Rows whose high_school value is missing, counted per league.
data_highschool = data.loc[data['high_school'].isna()]
data_highschool['League'].value_counts()

Too many values are missing from the high_school column to fill in, but the column is not needed for our analysis, so we simply drop it.

In [None]:
# Drop the high_school column. Unlike `del data['high_school']`, this does not raise
# KeyError when the cell is run a second time (errors='ignore' skips a missing column).
data = data.drop(columns=['high_school'], errors='ignore')

In [None]:
# Per-league count of rows with no birth_year.
data.loc[data['birth_year'].isna(), 'League'].value_counts()

In [None]:
# Per-league count of rows with no weight.
data.loc[data['weight'].isna(), 'League'].value_counts()

**Conclusion**

The missing height, weight, and date-of-birth values all come from leagues other than the NBA. Moreover, the number of missing values in each league is small, so we handle them uniformly by dropping every row that contains a missing value.

## Delete rows with empty data

In [None]:
# Keep only the rows with no missing value in any column, then preview five random rows.
data = data.dropna(axis=0, how='any')
data.sample(n=5)

In [None]:
# Summarise the column dtypes and non-null counts of `data`.
data.info()

## Data format conversion

In [None]:
# Convert birth_date to datetimes. The unit is spelled out ('datetime64[ns]') because
# pandas 2.x refuses to cast to the unit-less 'datetime64' dtype.
data['birth_date'] = data['birth_date'].astype('datetime64[ns]')
# Store birth_year as a 64-bit integer.
data['birth_year'] = data['birth_year'].astype('int64')
data.sample(5)

## Player height distribution

In [None]:
# Count the rows recorded at each distinct height_cm value.
b = data.loc[:, ['Player', 'height_cm']]
bb = b.groupby(by='height_cm').size()
# Heights that occur in more than 3000 rows.
bbb = bb.loc[bb.to_numpy() > 3000]

In [None]:
fig, ax = plt.subplots(figsize=(14, 3), dpi=100)
mean = data['height_cm'].mean()  # computed here but not drawn on the chart

# Light bars for every height, with darker bars drawn over the subset held in bbb.
ax.bar(bb.index, bb.values, width=0.8, color='lightsteelblue')
ax.bar(bbb.index, bbb.values, width=0.8, color='royalblue')

# One tick per centimetre from 165 to 229, labelled vertically.
height_ticks = np.arange(165.0, 230.0, 1)
ax.set_xticks(height_ticks)
ax.set_xticklabels(height_ticks, rotation=90)
ax.set_xlim(164, 230)

# Write each count slightly above its bar.
for height, count in zip(bb.index, bb.values):
    ax.text(height, count + 200, count, fontsize=10, rotation=45, horizontalalignment='center')
ax.set_ylim(0, 5000)

ax.set_title('Player Height Distribution', fontsize=16, y=1.02)
ax.set_xlabel('Height (CM)', fontsize=12)
ax.set_ylabel('Player Counts', fontsize=12)
ax.yaxis.grid(alpha=0.4, ls='--')
plt.show()

## Top 20 player nationalities

In [None]:
# One nationality per player name: sort, then take the first nationality in each Player group.
c = data.loc[:, ['Player', 'nationality']]
cc = c.sort_values(by=['Player', 'nationality']).reset_index(drop=True)
ccc = cc.groupby('Player', as_index=False).first()

# Players per nationality in ascending order, keeping the 20 largest groups.
players_per_nationality = ccc.groupby('nationality', as_index=False).count()
cccc = players_per_nationality.sort_values(by='Player').iloc[-20:]
# The five largest of those 20.
ccccc = cccc.iloc[-5:]

In [None]:
fig, ax = plt.subplots(figsize=(6, 8), dpi=100)

# All 20 nationalities in a light colour.
ax.barh(cccc['nationality'], cccc['Player'], color='lightsteelblue', height=0.8)
# Print the player count just to the right of each bar.
for country, n_players in zip(cccc['nationality'], cccc['Player']):
    ax.text(n_players + 60, country, n_players, fontsize=10, horizontalalignment='left')
# Redraw the five largest in a darker colour on top.
ax.barh(ccccc['nationality'], ccccc['Player'], color='royalblue', height=0.8)

ax.set_xlim(0, 4500)
ax.xaxis.grid(alpha=0.4, ls='--', color='lightsteelblue')
ax.set_title('Player Nationality Distribution', fontsize=16, y=1.02)
ax.set_xlabel('Number of Players', fontsize=12)
ax.set_ylabel('Country of Players', fontsize=12)
plt.show()

## The percentage of players from the top 5 countries with the most players

In [None]:
# Number of players per nationality, largest first.
data_nat = (
    ccc.groupby('nationality', as_index=False)
    .count()
    .sort_values(by='Player', ascending=False)
    .reset_index(drop=True)
)
player_counts = data_nat['Player']
# Nationalities with at least 200 players.
data_nat_200 = data_nat.loc[player_counts >= 200]
# Nationalities with 100 to 199 players. The upper bound is exclusive so that a
# nationality with exactly 200 players does not land in both groups.
data_nat_100 = data_nat.loc[(player_counts >= 100) & (player_counts < 200)]

In [None]:
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
datas = data_nat_200['Player']
labels = data_nat_200['nationality']

# Size the palette and the explode offsets from the data instead of assuming exactly
# five wedges; ax.pie raises if `explode` and the data differ in length.
n_wedges = len(datas)
colors = sns.color_palette("Blues_r", n_colors=max(n_wedges, 1))
explode = [0.0] * n_wedges
if explode:
    explode[-1] = 0.1  # pull the last (smallest) wedge out slightly

ax.pie(
    datas,
    labels=labels,
    colors=colors,
    autopct='%1.2f%%',
    textprops={'fontsize': 10, 'color': 'black'},
    startangle=135,
    counterclock=False,
    pctdistance=0.8,
    explode=explode,
    wedgeprops=dict(width=0.4, edgecolor='w'),  # width < 1 turns the pie into a ring
)
ax.legend(bbox_to_anchor=(1.2, 0, 0, 0.8))
plt.show()

## Share of players by nationality for countries with between 100 and 200 players

In [None]:
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
datas = data_nat_100['Player']
labels = data_nat_100['nationality']

# Request one palette colour per wedge rather than a hard-coded 17, so the palette
# always matches the number of nationalities actually present.
colors = sns.color_palette("Blues_d", n_colors=max(len(datas), 1))

ax.pie(
    datas,
    labels=labels,
    colors=colors,
    autopct='%1.2f%%',
    textprops={'fontsize': 10, 'color': 'white'},
    startangle=135,
    counterclock=False,
    pctdistance=0.8,
    wedgeprops=dict(width=0.4, edgecolor='w'),  # width < 1 turns the pie into a ring
)
ax.legend(bbox_to_anchor=(1.2, 0, 0, 0.9))
plt.show()

## NBA players' average total number of attempts per season

In [None]:
# Take the first 22 columns by position, then keep only the NBA rows.
data_nba = data.iloc[:, :22]
data_nba = data_nba[data_nba['League'] == 'NBA']
data_nba.head()

In [None]:
# Sum FGM and FGA for each (Season, Player) pair, ordered by season and then by
# descending attempts and makes.
d = (
    data_nba.groupby(['Season', 'Player'], as_index=False)[['FGM', 'FGA']]
    .sum()
    .sort_values(by=['Season', 'FGA', 'FGM'], ascending=[True, False, False])
)
# Average those season totals per player, most attempts first.
dd = (
    d.groupby('Player', as_index=False)[['FGM', 'FGA']]
    .mean()
    .sort_values(by='FGA', ascending=False)
)
ddd = dd.reset_index(drop=True)
ddd.head()

In [None]:
# Scatter plot with marginal histograms laid out on a 6x6 grid.
fig = plt.figure(figsize=(6, 6), dpi=100)
gs = fig.add_gridspec(6, 6)
fig.subplots_adjust(wspace=0.1, hspace=0.1)

# Main panel: lower-left 5x5 cells.
ax1 = fig.add_subplot(gs[1:, 0:5])
ax1.set_yticks(np.arange(0, 901, 100))
ax1.grid(alpha=0.2, ls='--')
# Top strip for the FGA histogram.
ax2 = fig.add_subplot(gs[0, 0:5])
ax2.set_xticks([])
# Right strip for the FGM histogram.
ax3 = fig.add_subplot(gs[1:, 5])
ax3.set_yticks([])

hist_style = dict(bins=200, color='royalblue', alpha=0.8)
ax1.scatter(ddd['FGA'], ddd['FGM'], marker='.', color='royalblue', alpha=0.4)
ax2.hist(ddd['FGA'], **hist_style)
ax3.hist(ddd['FGM'], orientation='horizontal', **hist_style)

ax2.set_title('Player FGA & FGM Distribution', fontsize=16, y=1.02)
ax1.set_xlabel('FGA', fontsize=12)
ax1.set_ylabel('FGM', fontsize=12)
plt.show()

## NBA player statistics for each season

In [None]:
# Show the first five NBA rows again for reference.
data_nba.head(n=5)

In [None]:
# Total points per player across all NBA rows, highest first.
points_by_player = data_nba.groupby('Player')['PTS']
e = points_by_player.sum().sort_values(ascending=False)
# Summary statistics of those totals.
ee = e.describe()
ee

## NBA player total score statistics

In [None]:
fig, ax = plt.subplots(figsize=(14, 4), dpi=100)

# Horizontal box plot of the per-player point totals; the mean is drawn as a line.
box_options = dict(
    widths=0.1,
    labels=['PTS'],
    vert=False,
    sym='+',
    patch_artist=False,
    meanline=True,
    showmeans=True,
    showcaps=True,
    showbox=True,
    showfliers=True,
)
ax.boxplot(e.values, **box_options)

ax.set_xticks(np.arange(0, 30001, 2000))
ax.set_xlabel('Total Scores of Player', fontsize=12)
ax.set_title('NBA Players Career Total Score Distribution', fontsize=16, y=1.02)
ax.grid(alpha=0.2)
plt.show()

## NBA players' three-pointers, free throws, rebounds, and assists statistics

In [None]:
# Pull out the four stat columns compared in the box plot:
# three-pointers made, free throws made, rebounds and assists.
e1, e2, e3, e4 = (data_nba[column] for column in ('3PM', 'FTM', 'REB', 'AST'))

In [None]:
fig, ax = plt.subplots(figsize=(4, 8), dpi=100)

# One vertical box per stat column.
stat_values = [series.values for series in (e1, e2, e3, e4)]
ax.boxplot(
    stat_values,
    widths=0.2,
    labels=['3PM', 'FTM', 'REB', 'AST'],
    sym='.',
    patch_artist=False,
    vert=True,
)

ax.set_title('Player data performance distribution in each season', fontsize=14, y=1.02)
ax.set_xlabel('Data Categories', fontsize=12)
ax.set_ylabel('Values', fontsize=12)
ax.set_yticks(np.arange(0, 1400, 50))
ax.yaxis.grid(alpha=0.2, ls='--')
plt.show()

## Data distribution of the TOP20 players with the most total points

In [None]:
# The 20 players with the highest summed PTS, highest first.
total_points = data_nba.groupby('Player')['PTS'].sum()
f = total_points.sort_values(ascending=False).iloc[:20]

In [None]:
# All NBA rows that belong to one of those 20 players.
in_top20 = data_nba['Player'].isin(f.index)
ff = data_nba[in_top20]
ff.head()

In [None]:
fig, ax = plt.subplots(figsize=(6, 10), dpi=100)

# (column, legend label, colour) for each stat; one dot per row of `ff` and stat.
stat_series = [
    ('3PM', 'Three Pointer', 'limegreen'),
    ('FTM', 'Free Throw', 'firebrick'),
    ('REB', 'Rebounds', 'steelblue'),
]
for column, legend_label, color in stat_series:
    ax.scatter(ff[column], ff['Player'], label=legend_label, marker='o', alpha=0.6, color=color)

ax.set_xticks(np.arange(0, 1001, 50))
ax.set_xticklabels(np.arange(0, 1001, 50), rotation=45)
ax.xaxis.grid(alpha=0.4, ls='--')
ax.set_title('Data Distribution of TOP20 NBA Players', fontsize=16, y=1.01)
# Each row of `ff` is a season-level line (it carries Season and Stage), so the
# axis is labelled per season rather than per game.
ax.set_xlabel('Players\' Performance Per Season', fontsize=12)
ax.set_ylabel('TOP20 Players', fontsize=12)
ax.legend()
plt.show()

Stephen Curry made the most three-pointers, with a season high of more than 400.

Blake Griffin has the most rebounds, with a season high close to 1,000.

Kevin Durant and James Harden made the most free throws (FTM), with season highs above 750. James Harden has more high-scoring seasons, and his performance is more consistent.

## Regular-season scoring, season by season, of the 5 players with the most total points

In [None]:
# The five players with the highest summed PTS.
p = data_nba.groupby('Player')['PTS'].sum().sort_values(ascending=False).head(5)
# Their rows, restricted to the columns needed for the trend plot.
pp = data_nba.loc[data_nba['Player'].isin(p.index), ['Season', 'Stage', 'Player', 'PTS']]
ppp = pp.sort_values(by=['Player', 'Season']).reset_index(drop=True)
# Regular-season rows only.
pppp = ppp[ppp['Stage'] == 'Regular_Season'].reset_index(drop=True)
pppp.head()

In [None]:
# Reshape to one row per player and one column per season.
season_player_points = pppp[['Season', 'Player', 'PTS']].set_index(['Season', 'Player'])
players = season_player_points.unstack().T
# Seasons in which a player has no row are filled with the mean PTS of all rows in pppp.
# NOTE(review): those filled points are not real season totals — confirm this is intended.
players = players.fillna(pppp['PTS'].mean())

In [None]:
fig, ax = plt.subplots(figsize=(10, 4), dpi=100)

# Draw one line per row of `players`. The legend label is read from the row index
# rather than hard-coded, so it cannot drift out of step with the data and the
# cell still works if fewer than five players are present.
line_colors = ['royalblue', 'darkblue', 'slategrey', 'teal', 'green']
for (row_key, season_points), color in zip(players.iterrows(), line_colors):
    # The row key is a tuple when the index is a MultiIndex; the player name is its last item.
    player_name = row_key[-1] if isinstance(row_key, tuple) else row_key
    ax.plot(players.columns, season_points.values, label=player_name, marker='.', lw=1, color=color)

# Rotate the existing tick labels instead of overwriting them with set_xticklabels,
# which would replace the labels without fixing the tick positions.
ax.tick_params(axis='x', labelrotation=30)
ax.set_xlabel('Seasons', fontsize=12)
ax.set_ylabel('Scores Per Season', fontsize=12)
ax.set_title('TOP5 Player Scores Trend', fontsize=16, y=1.02)
ax.xaxis.grid(alpha=0.3, ls='--')
ax.legend(loc='center', bbox_to_anchor=(1, 0.5, 0.3, 0))
plt.show()

## Correlation analysis of player data indicators

In [None]:
# NBA regular-season rows for the 2018 - 2019 season.
is_nba = data['League'] == 'NBA'
is_2018_2019 = data['Season'] == '2018 - 2019'
is_regular_season = data['Stage'] == 'Regular_Season'
data_nba1 = data.loc[is_nba & is_2018_2019 & is_regular_season].reset_index(drop=True)

# Keep the player name plus the columns used in the correlation analysis.
stat_columns = ['Player', 'GP', 'MIN', 'FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'TOV', 'PF', 'ORB', 'DRB', 'REB', 'AST', 'STL', 'BLK', 'PTS']
data_nba2 = data_nba1[stat_columns]

In [None]:
# Summary statistics of the GP column.
data_nba2.loc[:, 'GP'].describe()

In [None]:
# Summary statistics of the MIN column.
data_nba2.loc[:, 'MIN'].describe()

In [None]:
# Keep players with GP >= 71 and MIN >= 1751. Both conditions are combined into one
# mask on data_nba2; the previous chained .loc applied a mask built on the unfiltered
# frame to an already-filtered one and relied on implicit index alignment.
# NOTE(review): 71 and 1751 presumably come from the describe() output of the
# preceding cells — confirm.
plays_enough = (data_nba2['GP'] >= 71) & (data_nba2['MIN'] >= 1751)
data_nba3 = data_nba2.loc[plays_enough].reset_index(drop=True)
data_nba3 = data_nba3[['Player', 'FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'TOV', 'PF', 'ORB', 'DRB', 'REB', 'AST', 'STL', 'BLK', 'PTS']]
data_nba3.head()

In [None]:
# Pairwise correlation of the numeric stat columns. The string column 'Player' is
# excluded explicitly: pandas 2.x no longer drops non-numeric columns in corr() and
# raises when it meets one.
data_cor = data_nba3.select_dtypes(include='number').corr()
data_cor

In [None]:
fig, ax = plt.subplots(figsize=(8, 7), dpi=100)
im = ax.imshow(data_cor, cmap='Blues', origin='lower', aspect='auto')

# Take the tick count from the matrix itself instead of assuming 15 columns.
n_rows, n_cols = data_cor.shape
ax.set_xticks(np.arange(n_cols))
ax.set_yticks(np.arange(n_rows))
# imshow puts columns along x and rows along y.
ax.set_xticklabels(data_cor.columns)
ax.set_yticklabels(data_cor.index)
ax.set_title('Correlation Analysis', fontsize=16, y=1.02)

# Annotate every cell. 'Blues' maps low values to near-white, so white text is only
# used on the darker half of the value range and black text elsewhere.
corr_values = data_cor.to_numpy()
midpoint = (np.nanmin(corr_values) + np.nanmax(corr_values)) / 2
for row in range(n_rows):
    for col in range(n_cols):
        value = corr_values[row, col]
        text_color = 'white' if value > midpoint else 'black'
        ax.text(col, row, round(value, 2), color=text_color, ha='center', va='center', fontsize=10)

fig.colorbar(im, ax=ax)
plt.show()

The correlation between FGM and FGA is 0.95;
The correlation between FTM and FTA is 0.98;
The correlation between 3PM and 3PA is 0.99;
The correlations of REB with ORB and DRB are 0.97 and 0.87, respectively;
The correlations of PTS with FGM and FGA are 0.98 and 0.96, respectively.