In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
from pandas.plotting import scatter_matrix
%matplotlib inline

In [3]:
# Load the player statistics CSV (path is relative to the notebook's working
# directory) into the DataFrame used by every cell below.
df = pd.read_csv('player_data.csv')

**Data Dictionary**

- Player - Player Name 
- G      - Games Played
- GS     - Games Started
- MP     - Minutes Played
- FG     - Field Goals per 100 possessions
- FGA    - Field Goal Attempts per 100 possessions
- 2P     - 2 Pointers per 100 possessions
- 2PA    - 2 Point Attempts per 100 possessions
- 3P     - 3 Pointers per 100 possessions
- 3PA    - 3 Point Attempts per 100 possessions
- FT     - Free Throws per 100 possessions
- FTA    - Free Throw Attempts per 100 possessions
- TRB    - Total Rebounds per 100 possessions
- AST    - Assists per 100 possessions
- STL    - Steals per 100 possessions
- BLK    - Blocks per 100 possessions
- TOV    - Turnovers per 100 possessions
- PF     - Personal Fouls per 100 possessions
- Team   - Team
- Season - Season (Year)
- Class  - Player Class {1: Freshman, 2: Sophomore, 3: Junior, 4: Senior}
- Pos    - Position
- Height - Height (inches)

**Basic Pandas**

In [4]:
# List the column labels to check them against the data dictionary above.
df.columns

Index(['Player', 'G', 'GS', 'MP', 'FG', 'FGA', '2P', '2PA', '3P', '3PA', 'FT',
       'FTA', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'Team', 'Season',
       'Class', 'Pos', 'Height'],
      dtype='object')

In [8]:
# Preview the first 10 rows to see what a single player-season record looks like.
df.head(10)

Unnamed: 0,Player,G,GS,MP,FG,FGA,2P,2PA,3P,3PA,...,AST,STL,BLK,TOV,PF,Team,Season,Class,Pos,Height
0,Julian Edmonson,30,5.0,568,10.5,26.2,6.2,15.2,4.3,11.0,...,3.0,2.3,0.2,5.6,6.6,abilene-christian,2014,3.0,G,73.0
1,Jacob Lancaster,31,11.0,434,12.0,21.7,11.8,21.5,0.1,0.1,...,1.0,1.0,5.0,5.6,6.7,abilene-christian,2014,3.0,F,82.0
2,Parker Wentz,31,29.0,1040,7.7,15.7,3.2,5.6,4.5,10.1,...,4.9,2.8,0.1,3.4,4.0,abilene-christian,2014,2.0,G,69.0
3,LaDarrien Williams,16,14.0,474,8.1,21.8,6.4,16.3,1.7,5.5,...,7.7,2.4,0.5,4.2,5.2,abilene-christian,2014,3.0,G,72.0
4,Harrison Hawkins,16,16.0,481,6.8,18.1,3.7,10.4,3.1,7.6,...,6.3,2.1,0.0,6.0,5.8,abilene-christian,2014,3.0,G,72.0
5,James Pegues,31,0.0,445,7.3,16.6,7.2,15.8,0.1,0.8,...,2.1,2.9,0.4,6.1,9.4,abilene-christian,2014,2.0,F,78.0
6,Rafael Farley,27,11.0,421,6.2,15.6,1.5,4.9,4.6,10.7,...,1.7,0.6,0.3,2.5,4.6,abilene-christian,2014,3.0,G,77.0
7,Alexsander Milosavljevic,19,0.0,127,7.9,15.4,7.9,14.9,0.0,0.5,...,1.4,0.0,1.4,4.7,8.8,abilene-christian,2014,3.0,F,81.0
8,Austin Cooke,31,31.0,914,5.2,11.8,2.1,4.3,3.0,7.5,...,3.9,1.9,0.3,1.9,4.9,abilene-christian,2014,2.0,F,79.0
9,Michael Grant,31,17.0,642,5.1,10.2,5.1,9.8,0.0,0.5,...,2.9,2.8,1.1,4.5,6.5,abilene-christian,2014,1.0,G,77.0


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,G,GS,MP,FG,FGA,2P,2PA,3P,3PA,FT,FTA,TRB,AST,STL,BLK,TOV,PF,Season,Class,Height
count,23868.0,23867.0,23868.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23744.0,23868.0,23311.0,23311.0
mean,24.35977,12.016592,485.537791,6.331006,15.279127,4.583137,9.68499,1.747545,5.5945,3.611801,5.549212,9.475425,3.296319,1.704443,1.018299,3.668341,6.286965,2015.999623,2.455965,76.819828
std,10.496075,12.947851,369.624537,3.362839,6.736971,3.206025,5.718717,2.063585,5.509197,3.123368,4.614641,5.452278,2.837091,1.856055,1.516546,3.061034,4.307453,1.412924,1.115557,3.518153
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014.0,1.0,62.0
25%,17.0,0.0,123.0,4.4,11.3,2.4,6.0,0.0,0.6,1.8,3.0,5.8,1.4,0.9,0.0,2.5,4.2,2015.0,1.0,74.0
50%,29.0,6.0,459.0,6.3,15.1,4.3,9.3,1.3,4.9,3.2,5.0,8.7,2.7,1.5,0.5,3.4,5.6,2016.0,2.0,77.0
75%,32.0,25.0,805.0,8.2,18.8,6.3,12.9,2.9,8.8,4.9,7.4,12.5,4.5,2.2,1.4,4.5,7.7,2017.0,3.0,80.0
max,41.0,41.0,1474.0,63.4,126.8,63.4,126.8,56.7,122.9,113.6,125.2,86.5,61.1,64.8,59.5,171.3,245.5,2018.0,4.0,91.0


In [10]:
# Column dtypes and non-null counts; columns with a count below the total
# number of entries contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23868 entries, 0 to 23867
Data columns (total 23 columns):
Player    23868 non-null object
G         23868 non-null int64
GS        23867 non-null float64
MP        23868 non-null int64
FG        23744 non-null float64
FGA       23744 non-null float64
2P        23744 non-null float64
2PA       23744 non-null float64
3P        23744 non-null float64
3PA       23744 non-null float64
FT        23744 non-null float64
FTA       23744 non-null float64
TRB       23744 non-null float64
AST       23744 non-null float64
STL       23744 non-null float64
BLK       23744 non-null float64
TOV       23744 non-null float64
PF        23744 non-null float64
Team      23868 non-null object
Season    23868 non-null int64
Class     23311 non-null float64
Pos       23311 non-null object
Height    23311 non-null float64
dtypes: float64(17), int64(3), object(3)
memory usage: 4.2+ MB


**Feature Engineering**
1. Create FG_perc, 2P_perc, 3P_perc, FT_perc and Points Columns

In [None]:
# Derive the shooting-percentage and scoring columns before displaying them;
# without this step the selection below raises KeyError on a fresh kernel.
#
# *_perc columns are makes / attempts expressed as a fraction in [0, 1].
# Rows with zero attempts are mapped to NaN (rather than 0/0 or inf) so they
# are skipped by later aggregations instead of distorting them.
# Points is per 100 possessions, like its inputs: 2 per two-pointer,
# 3 per three-pointer, 1 per free throw.
#
# assign() builds a new frame from the base stat columns only, so re-running
# this cell always yields the same result.
df = df.assign(**{
    'FG_perc': df['FG'] / df['FGA'].replace(0, np.nan),
    '2P_perc': df['2P'] / df['2PA'].replace(0, np.nan),
    '3P_perc': df['3P'] / df['3PA'].replace(0, np.nan),
    'FT_perc': df['FT'] / df['FTA'].replace(0, np.nan),
    'Points': 2 * df['2P'] + 3 * df['3P'] + df['FT'],
})

df[['FG_perc', '2P_perc', '3P_perc', 'FT_perc', 'Points']].head()

**Query Questions**
1. How many teams are there?
2. Gonzaga Players by minutes played from 2018 Season
3. Senior Center with most turnovers, per 100 possessions, before 2017.
4. Team with most points, per 100 possessions, by underclassmen since 2016.
5. 5 players under 6 feet tall with most fouls--per 100 possessions.
6. Guards in top 1% for three-point shot percentage, per 100 possessions, from the 2018 season.
7. How many rows have null values?
8. Mean of Games played by players with null values.
9. Mean of Games played by players without null values.

In [None]:
# 1. How many teams are there?


In [None]:
# 2. Gonzaga Players by minutes played from 2018 Season


In [None]:
# 3. Senior Center with most turnovers, per 100 possessions, before 2017.


In [None]:
# 4. Team with most points, per 100 possessions, by underclassmen since 2016.


In [None]:
# 5. 5 players under 6 feet tall with most fouls--per 100 possessions


In [None]:
# 6. Guards in top 1% for three-point shot percentage, per 100 possessions, from the 2018 season.


In [None]:
# 7. How many rows have null values?


In [None]:
# 8. Mean of Games played by players with null values


In [None]:
# 9. Mean of Games played by players without null values


**Basic plotting**
1. Create a DataFrame for 2018 Season with only 'MP', 'FG', '2P', '3P' and no null values
2. Create histograms for MP, FG, 2P, 3P
3. Create box plots for MP, FG, 2P, 3P
4. Repeat 1 and 2 by Pos
5. Scatter FG, 2P and 3P by MP

Extra Credit - Create Scatter Matrix

**1. Create a DataFrame for 2018 Season with only 'MP', 'FG', '2P', '3P' and no null values**

**2. Create histograms for MP, FG, 2P, 3P**

**3. Create box plots for MP, FG, 2P, 3P**

**4. Repeat 1 and 2 by Pos**

**5. Scatter FG, 2P and 3P by MP**

**Extra Credit - Create Scatter Matrix**