In [1]:
#import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the three cleaned tables; the column labelled 'Unnamed: 0' in each CSV
# is used as the row index.
players, coaches, champions = (
    pd.read_csv(csv_path, index_col='Unnamed: 0')
    for csv_path in ('players.csv', 'coaches.csv', 'champions.csv')
)

In [19]:
# Report the (rows, columns) size of the players table, then preview its first five rows.
print(players.shape)
players.head(n=5)

(6371, 32)


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Champ,Runner-Up
0,Arron Afflalo,SG,24,DEN,82,75,27.1,3.3,7.1,0.465,...,3.1,1.7,0.6,0.4,0.9,2.7,8.8,2010,0,0
1,Alexis Ajinça,C,21,CHA,6,0,5.0,0.8,1.7,0.5,...,0.7,0.0,0.2,0.2,0.3,0.8,1.7,2010,0,0
2,LaMarcus Aldridge,PF,24,POR,78,78,37.5,7.4,15.0,0.495,...,8.0,2.1,0.9,0.6,1.3,3.0,17.9,2010,0,0
3,Joe Alexander,SF,23,CHI,8,0,3.6,0.1,0.8,0.167,...,0.6,0.3,0.1,0.1,0.0,1.1,0.5,2010,0,0
4,Malik Allen,PF,31,DEN,51,3,8.9,0.9,2.3,0.397,...,1.6,0.3,0.2,0.1,0.4,1.3,2.1,2010,0,0


In [20]:
# Report the (rows, columns) size of the coaches table, then preview its first five rows.
print(coaches.shape)
coaches.head(n=5)

(332, 26)


Unnamed: 0,Coach,Team,F-Seasons,Car-Seasons,CR-G,CR-W,CR-L,FR-G,FR-W,FR-L,...,CP-L,FP-G,FP-W,FP-L,Car.P-G,Car.P-W,Car.P-L,Year,Champ,Runner-Up
0,Mike Woodson,ATL,6,6,82,53,29,492,206,286,...,7.0,29.0,11.0,18.0,29.0,11.0,18.0,2010,0,0
1,Doc Rivers,BOS,6,11,82,50,32,492,280,212,...,9.0,71.0,41.0,30.0,86.0,46.0,40.0,2010,0,1
2,Larry Brown,CHA,2,29,82,44,38,164,79,85,...,4.0,4.0,0.0,4.0,235.0,120.0,115.0,2010,0,0
3,Vinny Del Negro,CHI,2,2,82,41,41,164,82,82,...,4.0,12.0,4.0,8.0,12.0,4.0,8.0,2010,0,0
4,Mike Brown,CLE,5,5,82,61,21,410,272,138,...,5.0,71.0,42.0,29.0,71.0,42.0,29.0,2010,0,0


In [5]:
# List the column labels of the players table.
players.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'Champ',
       'Runner-Up'],
      dtype='object')

In [6]:
# Feature groups for the players table: numeric stat columns vs. categorical/label columns.
p_int = []  # left empty in this cell
p_num = [
    'Age', 'G', 'GS', 'MP',                     # age, games, games started, minutes
    'FG', 'FGA', 'FG%',                         # field goals
    '3P', '3PA', '3P%',                         # three-pointers
    '2P', '2PA', '2P%', 'eFG%',                 # two-pointers and effective FG%
    'FT', 'FTA', 'FT%',                         # free throws
    'ORB', 'DRB', 'TRB',                        # rebounds
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',    # remaining box-score stats
]
p_cat = [
    'Player',
    'Pos',
    'Tm',
    'Year',
    'Champ',
    'Runner-Up',
]

## Exploring Null Values:
- Some players are on a team's roster but haven't played in any games
- Some players entered a game but didn't attempt a shot (minutes played != 0)
- Some players never attempted a two-pointer or a three-pointer, e.g. centers and some forwards
- Some players never got to the free-throw line

In [7]:
# Count the missing values in each column of the players table.
players.isna().sum()

Player          0
Pos             0
Age             0
Tm              0
G               0
GS              0
MP            237
FG            237
FGA           237
FG%           271
3P            237
3PA           237
3P%          1026
2P            237
2PA           237
2P%           309
eFG%          271
FT            237
FTA           237
FT%           540
ORB           237
DRB           237
TRB           237
AST           237
STL           237
BLK           237
TOV           237
PF            237
PTS           237
Year            0
Champ           0
Runner-Up       0
dtype: int64

Note that FG% and eFG% are missing 271 values, 34 more than the 237 rows with no recorded minutes --> a missing percentage means the player did not attempt a shot, whether or not they played. Let's see if they played. Other observations:
- 237 players were rostered, but did not play any minutes of basketball for their teams
- 271 - 237 = 34 players recorded game minutes, but never attempted a shot in a game
- 1026 - 237 = 789 players recorded minutes, but didn't attempt a three-pointer
- 309 - 237 = 72 players recorded minutes, but didn't attempt a two-pointer
- 540 - 237 = 303 players recorded minutes, but didn't attempt a free throw


### What to remove?
- We don't want to keep any players who didn't play any minutes (G == 0 or MP == 0.0; in this data they show up as rows with a missing `MP`)
- We might keep players whose average minutes round to 0, because they may still have played a game or two. We have to look at them first.

In [21]:
# Rows whose 'MP' value is missing are treated as players who logged no minutes.
no_min = int(players['MP'].isna().sum())
print('Players with 0 minutes played are {}% of the data'.format(round(no_min*100/len(players), 2)))

# Drop this population -> cannot extract info to answer the business question.
# `.notna()` replaces the `isnull() == False` comparison; it selects the same rows.
played_min = players[players['MP'].notna()]

# Sanity check: every row is either dropped (missing 'MP') or kept.
assert len(played_min) == len(players) - no_min
len(played_min)

Players with 0 minutes played are 3.72% of the data


6134

### Exploring small impact players
Some players have minutes recorded, but they may enter a game without attempting shots or recording assists, and without affecting the defensive effort through rebounds, steals, or blocks. We use a threshold of MP <= 2.0 minutes per game to see whether we should keep or drop these players.

In [29]:
# Select players whose 'MP' value is at most 2.0 and measure their share of
# the rows that have minutes recorded.
played_few = played_min.loc[played_min['MP'].le(2.0)]
pct_total = 100 * len(played_few) / len(played_min)
print('Players who had small impact are {}% of the data'.format(round(pct_total,2)))

# NOTE(review): the recorded output below starts at 'MP' although p_num begins
# with 'Age', 'G', 'GS' -- confirm p_num was not redefined by a cell run out of order.
played_few[p_num].describe()

Players who had small impact are 0.98% of the data


Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,60.0,60.0,60.0,39.0,60.0,60.0,13.0,60.0,60.0,33.0,...,6.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,1.515,0.175,0.531667,0.335051,0.033333,0.121667,0.230769,0.14,0.415,0.375758,...,0.458333,0.118333,0.211667,0.328333,0.06,0.028333,0.033333,0.101667,0.188333,0.416667
std,0.536727,0.307877,0.480287,0.384658,0.160155,0.27065,0.438529,0.271967,0.471142,0.410654,...,0.33229,0.304482,0.316277,0.436508,0.139247,0.09037,0.103607,0.217452,0.455137,0.716433
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.3125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.75,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.3,0.333,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,0.3,1.0,0.55,0.0,0.0,0.0,0.3,0.85,0.667,...,0.5,0.0,0.3,0.55,0.0,0.0,0.0,0.125,0.0,0.7
max,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,...,1.0,1.5,1.0,1.5,0.7,0.4,0.5,1.3,2.0,3.0


No statistic stands out as one to consider for these 60 players over 10 seasons. The standard deviations are large relative to the means, which makes sense: the chances of these individuals playing, and their impact on a game when they do, are essentially random and cannot be gainfully analysed.  
**Decision:** Drop them from the dataset.