In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier / obsolete on v0.23
plt.rcParams['figure.figsize'] = [15,7]

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 1000)

In [2]:
pd.__version__

u'0.23.0'

# DataFrames creation

In [3]:
pd.DataFrame([[1,2,3,4],[5,6,7,8]])

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,5,6,7,8


In [4]:
pd.DataFrame([[1,2,3,4],[5,6,7,8]], columns=['col1','col2','col3','col4'])

Unnamed: 0,col1,col2,col3,col4
0,1,2,3,4
1,5,6,7,8


In [5]:
pd.DataFrame([[1,2,3,4],[5,6,7,8]], columns=['col1','col2','col3','col4'], index=['a', 'b'])

Unnamed: 0,col1,col2,col3,col4
a,1,2,3,4
b,5,6,7,8


# Load data from file

In [6]:
df = pd.read_csv("nba_stats/stats.csv")

In [7]:
df.head()

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2
1,2017-2018 Regular Season,10/17/2017,Jayson Tatum,SF,Boston,Cleveland,R,...,10,3,4,0,1,0,14
2,2017-2018 Regular Season,10/17/2017,Al Horford,C,Boston,Cleveland,R,...,7,5,2,0,0,1,9
3,2017-2018 Regular Season,10/17/2017,Jaylen Brown,SF,Boston,Cleveland,R,...,6,0,5,2,3,0,25
4,2017-2018 Regular Season,10/17/2017,Kyrie Irving,PG,Boston,Cleveland,R,...,4,10,4,3,2,0,22


In [8]:
# also from JSON, and other formats
df = pd.read_json("nba_stats/stats.json")

In [9]:
df.head() # note that in JSON format columns are sorted in alphanum order

Unnamed: 0,3P,3PA,A,BL,DATA SET,DATE,DR,...,PLAYER FULL NAME,POSITION,PTS,ST,TO,TOT,VENUE (R/H)
0,0,1,0,0,2017-2018 Regular Season,2017-10-17,1,...,Gordon Hayward,SF,2,0,0,1,R
1,1,2,3,0,2017-2018 Regular Season,2017-10-17,6,...,Jayson Tatum,SF,14,0,1,10,R
2,0,2,5,1,2017-2018 Regular Season,2017-10-17,7,...,Al Horford,C,9,0,0,7,R
3,2,9,0,0,2017-2018 Regular Season,2017-10-17,5,...,Jaylen Brown,SF,25,2,3,6,R
4,4,9,10,0,2017-2018 Regular Season,2017-10-17,2,...,Kyrie Irving,PG,22,3,2,4,R


In [10]:
# read a JSON string directly
pd.read_json('[{"a": 1, "b": 2}, {"a": 3, "b": 4}]')

Unnamed: 0,a,b
0,1,2
1,3,4


In [11]:
df = pd.read_csv("nba_stats/stats.csv")

## Attributes
Most of them as similar to the Serie's attributes

In [12]:
df.values # values

array([['2017-2018 Regular Season', '10/17/2017', 'Gordon Hayward', ...,
        0, 0, 2],
       ['2017-2018 Regular Season', '10/17/2017', 'Jayson Tatum', ..., 1,
        0, 14],
       ['2017-2018 Regular Season', '10/17/2017', 'Al Horford', ..., 0,
        1, 9],
       ...,
       ['2017-2018 Regular Season', '10/18/2017', 'Kosta Koufos', ..., 1,
        1, 2],
       ['2017-2018 Regular Season', '10/18/2017', 'Vince Carter', ..., 1,
        0, 6],
       ['2017-2018 Regular Season', '10/18/2017', 'Malachi Richardson',
        ..., 0, 0, 0]], dtype=object)

In [13]:
df.index # index values

RangeIndex(start=0, stop=269, step=1)

In [14]:
df.ndim # number of dimensions of the DataFrame

2

In [15]:
df.shape # number of (rows, columns)

(269, 23)

In [17]:
df.size # total number of elements in the DataFrame (rows x columns), not to confuse with shape

6187

# Methods

In [19]:
df.info() # provide info summary on the df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 23 columns):
DATA SET            269 non-null object
DATE                269 non-null object
PLAYER FULL NAME    269 non-null object
POSITION            269 non-null object
OWN TEAM            269 non-null object
OPP TEAM            269 non-null object
VENUE (R/H)         269 non-null object
MIN                 269 non-null float64
FG                  269 non-null int64
FGA                 269 non-null int64
3P                  269 non-null int64
3PA                 269 non-null int64
FT                  269 non-null int64
FTA                 269 non-null int64
OR                  269 non-null int64
DR                  269 non-null int64
TOT                 269 non-null int64
A                   269 non-null int64
PF                  269 non-null int64
ST                  269 non-null int64
TO                  269 non-null int64
BL                  269 non-null int64
PTS                 269 non-

In [20]:
df.get_dtype_counts() # count of columns by datatype

float64     1
int64      15
object      7
dtype: int64

In [21]:
df.sum() # sum of all values in DF per column

DATA SET            2017-2018 Regular Season2017-2018 Regular Seas...
DATE                10/17/201710/17/201710/17/201710/17/201710/17/...
PLAYER FULL NAME    Gordon HaywardJayson TatumAl HorfordJaylen Bro...
POSITION            SFSFCSFPGPGCPFPGPGSFSFPFSGPGCSGSGSFSGSFPFCPGPG...
OWN TEAM            BostonBostonBostonBostonBostonBostonBostonBost...
                                          ...                        
PF                                                                554
ST                                                                197
TO                                                                375
BL                                                                150
PTS                                                              2810
Length: 23, dtype: object

In [22]:
df.mean() # mean per column

MIN    23.206320
FG      3.858736
FGA     8.535316
3P      0.962825
3PA     2.710037
         ...    
PF      2.059480
ST      0.732342
TO      1.394052
BL      0.557621
PTS    10.446097
Length: 16, dtype: float64

In [23]:
df.mean(axis=1) # the axis parameter let us set the dimension to aggregate (rows = 0, columns = 1)

0      0.89375
1      6.53125
2      5.38125
3      8.41250
4      8.15000
        ...   
264    4.05625
265    4.97500
266    2.60000
267    2.26250
268    0.59375
Length: 269, dtype: float64

... similar behavior on most of the methods available on a Series (mean, min, max, std, product ...)

## Data look up

In [24]:
df = pd.read_csv("nba_stats/stats.csv")
df.head()

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2
1,2017-2018 Regular Season,10/17/2017,Jayson Tatum,SF,Boston,Cleveland,R,...,10,3,4,0,1,0,14
2,2017-2018 Regular Season,10/17/2017,Al Horford,C,Boston,Cleveland,R,...,7,5,2,0,0,1,9
3,2017-2018 Regular Season,10/17/2017,Jaylen Brown,SF,Boston,Cleveland,R,...,6,0,5,2,3,0,25
4,2017-2018 Regular Season,10/17/2017,Kyrie Irving,PG,Boston,Cleveland,R,...,4,10,4,3,2,0,22


In [25]:
df['PLAYER FULL NAME'] # extract a single column (Series) of data

0          Gordon Hayward
1            Jayson Tatum
2              Al Horford
3            Jaylen Brown
4            Kyrie Irving
              ...        
264        Garrett Temple
265          De'Aaron Fox
266          Kosta Koufos
267          Vince Carter
268    Malachi Richardson
Name: PLAYER FULL NAME, Length: 269, dtype: object

In [26]:
df[['PLAYER FULL NAME', 'POSITION']] # extract multiple columns at once 

Unnamed: 0,PLAYER FULL NAME,POSITION
0,Gordon Hayward,SF
1,Jayson Tatum,SF
2,Al Horford,C
3,Jaylen Brown,SF
4,Kyrie Irving,PG
...,...,...
264,Garrett Temple,SG
265,De'Aaron Fox,PG
266,Kosta Koufos,C
267,Vince Carter,SG


In [28]:
df2 = df['PLAYER FULL NAME'] # by assigning to df2, we get a slice of the dataframe, wether it is a COPY or a REFERENCE
# is not very intuitive.
# For a single column, it is usually a reference
df2[0] = "Al Horford" # !!! Warning
df.head(1) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Al Horford,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2


### important warning about assignments
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

https://www.dataquest.io/blog/settingwithcopywarning/

In [29]:
df = pd.read_csv("nba_stats/stats.csv")

In [30]:
# instead it is recommended to use `.loc`
df.loc[:,'PLAYER FULL NAME']  # single column

0          Gordon Hayward
1            Jayson Tatum
2              Al Horford
3            Jaylen Brown
4            Kyrie Irving
              ...        
264        Garrett Temple
265          De'Aaron Fox
266          Kosta Koufos
267          Vince Carter
268    Malachi Richardson
Name: PLAYER FULL NAME, Length: 269, dtype: object

In [31]:
df2 = df['PLAYER FULL NAME']
df2.loc[0] = "Al Horford"
df.head(1) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Al Horford,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2


In [32]:
df = pd.read_csv("nba_stats/stats.csv")

In [34]:
# in the case above, the assignment actually worked, and also modified the original DF
# If we do:
df3 = df[['PLAYER FULL NAME', 'POSITION']]
df3['PLAYER FULL NAME'][0] = "Mickey Mouse"
df3.head(1)

Unnamed: 0,PLAYER FULL NAME,POSITION
0,Mickey Mouse,SF


In [35]:
df.head(1) # df was not affected

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2


In [36]:
df = pd.read_csv("nba_stats/stats.csv")

In [37]:
df4 = df[['PLAYER FULL NAME', 'POSITION']]
df4.loc[0, 'PLAYER FULL NAME'] = "Mickey Mouse"
df4.head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PLAYER FULL NAME,POSITION
0,Mickey Mouse,SF


In [38]:
df.head(1) # df was NOT modified, because df4 was a copy in this case

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2


In [39]:
df5 = df[['PLAYER FULL NAME', 'POSITION']].copy()
df5.loc[0, 'PLAYER FULL NAME'] = "Mickey Mouse"
df5.head(1)

Unnamed: 0,PLAYER FULL NAME,POSITION
0,Mickey Mouse,SF


In [40]:
# no warning as we implicitely defined this df as a copy

## Back to data lookup

In [46]:
df.loc[0:5, 'PLAYER FULL NAME'] # rows index 2 to 5, FULL PLAYER NAME column

0    Gordon Hayward
1      Jayson Tatum
2        Al Horford
3      Jaylen Brown
4      Kyrie Irving
5      Marcus Smart
Name: PLAYER FULL NAME, dtype: object

In [47]:
df.iloc[2:5, 2:7] # single row index 1, columns index 2 to 7

Unnamed: 0,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H)
2,Al Horford,C,Boston,Cleveland,R
3,Jaylen Brown,SF,Boston,Cleveland,R
4,Kyrie Irving,PG,Boston,Cleveland,R


In [48]:
df.loc[0:2, ['PLAYER FULL NAME', 'POSITION']] # new DF

Unnamed: 0,PLAYER FULL NAME,POSITION
0,Gordon Hayward,SF
1,Jayson Tatum,SF
2,Al Horford,C


In [49]:
df.loc[:, ['PLAYER FULL NAME', 'POSITION']]  # all rows, multiple columns

Unnamed: 0,PLAYER FULL NAME,POSITION
0,Gordon Hayward,SF
1,Jayson Tatum,SF
2,Al Horford,C
3,Jaylen Brown,SF
4,Kyrie Irving,PG
...,...,...
264,Garrett Temple,SG
265,De'Aaron Fox,PG
266,Kosta Koufos,C
267,Vince Carter,SG


In [50]:
# filter all but a specific column
df.loc[:, df.columns != 'POSITION']

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,OWN TEAM,OPP TEAM,VENUE (R/H),MIN,...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,Boston,Cleveland,R,5.3,...,1,0,1,0,0,0,2
1,2017-2018 Regular Season,10/17/2017,Jayson Tatum,Boston,Cleveland,R,36.5,...,10,3,4,0,1,0,14
2,2017-2018 Regular Season,10/17/2017,Al Horford,Boston,Cleveland,R,32.1,...,7,5,2,0,0,1,9
3,2017-2018 Regular Season,10/17/2017,Jaylen Brown,Boston,Cleveland,R,39.6,...,6,0,5,2,3,0,25
4,2017-2018 Regular Season,10/17/2017,Kyrie Irving,Boston,Cleveland,R,39.4,...,4,10,4,3,2,0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,2017-2018 Regular Season,10/18/2017,Garrett Temple,Sacramento,Houston,H,29.9,...,1,3,2,4,0,0,5
265,2017-2018 Regular Season,10/18/2017,De'Aaron Fox,Sacramento,Houston,H,23.6,...,4,5,2,0,3,0,14
266,2017-2018 Regular Season,10/18/2017,Kosta Koufos,Sacramento,Houston,H,17.6,...,6,0,3,0,1,1,2
267,2017-2018 Regular Season,10/18/2017,Vince Carter,Sacramento,Houston,H,14.2,...,1,2,1,0,1,0,6


In [51]:
df[['PLAYER FULL NAME', 'POSITION']][:5] # works fine for accessing... but use loc / iloc for assignment

Unnamed: 0,PLAYER FULL NAME,POSITION
0,Gordon Hayward,SF
1,Jayson Tatum,SF
2,Al Horford,C
3,Jaylen Brown,SF
4,Kyrie Irving,PG


## Data transformation

In [52]:
# apply a function to each cell of a column
df['PLAYER FULL NAME'].apply(str.upper)

0          GORDON HAYWARD
1            JAYSON TATUM
2              AL HORFORD
3            JAYLEN BROWN
4            KYRIE IRVING
              ...        
264        GARRETT TEMPLE
265          DE'AARON FOX
266          KOSTA KOUFOS
267          VINCE CARTER
268    MALACHI RICHARDSON
Name: PLAYER FULL NAME, Length: 269, dtype: object

In [53]:
# another example, apply your own function
def scramble(cell):
    a = list(cell)
    np.random.shuffle(a)
    return "".join(a)

In [54]:
df['PLAYER FULL NAME'].apply(scramble)

0          yHdwaar Grdoon
1            out TJasymna
2              dHr ooflrA
3            nJlw Breonay
4            IinKgrrevyi 
              ...        
264        emaertTlGte rp
265          nAeoo 'xDrFa
266          soK oasKufto
267          eciaernt VrC
268    honiachcdailMaR rs
Name: PLAYER FULL NAME, Length: 269, dtype: object

In [55]:
# apply a function to each row of the DataFrame
# or use multiple columns of a row to create a column
def transform_row(row):
    return row['PLAYER FULL NAME'] + ": " + row['OWN TEAM']

df.apply(lambda row: transform_row(row), axis=1) # need to specify the axis for this to work

0              Gordon Hayward: Boston
1                Jayson Tatum: Boston
2                  Al Horford: Boston
3                Jaylen Brown: Boston
4                Kyrie Irving: Boston
                    ...              
264        Garrett Temple: Sacramento
265          De'Aaron Fox: Sacramento
266          Kosta Koufos: Sacramento
267          Vince Carter: Sacramento
268    Malachi Richardson: Sacramento
Length: 269, dtype: object

## Renaming columns

In [56]:
# rename a single column or selected columns
df.rename(columns={'PLAYER FULL NAME': 'FULL NAME', 'OWN TEAM': 'TEAM'}) # use inplace=True or reassign to commit

Unnamed: 0,DATA SET,DATE,FULL NAME,POSITION,TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2
1,2017-2018 Regular Season,10/17/2017,Jayson Tatum,SF,Boston,Cleveland,R,...,10,3,4,0,1,0,14
2,2017-2018 Regular Season,10/17/2017,Al Horford,C,Boston,Cleveland,R,...,7,5,2,0,0,1,9
3,2017-2018 Regular Season,10/17/2017,Jaylen Brown,SF,Boston,Cleveland,R,...,6,0,5,2,3,0,25
4,2017-2018 Regular Season,10/17/2017,Kyrie Irving,PG,Boston,Cleveland,R,...,4,10,4,3,2,0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,2017-2018 Regular Season,10/18/2017,Garrett Temple,SG,Sacramento,Houston,H,...,1,3,2,4,0,0,5
265,2017-2018 Regular Season,10/18/2017,De'Aaron Fox,PG,Sacramento,Houston,H,...,4,5,2,0,3,0,14
266,2017-2018 Regular Season,10/18/2017,Kosta Koufos,C,Sacramento,Houston,H,...,6,0,3,0,1,1,2
267,2017-2018 Regular Season,10/18/2017,Vince Carter,SG,Sacramento,Houston,H,...,1,2,1,0,1,0,6


In [57]:
# rename all columns
new_columns = [c.lower() for c in df.columns.values]
df.columns = new_columns

In [58]:
df.head(2)

Unnamed: 0,data set,date,player full name,position,own team,opp team,venue (r/h),...,tot,a,pf,st,to,bl,pts
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2
1,2017-2018 Regular Season,10/17/2017,Jayson Tatum,SF,Boston,Cleveland,R,...,10,3,4,0,1,0,14


## Filtering data

In [59]:
df = pd.read_csv("nba_stats/stats.csv")

In [60]:
df['POSITION'] == 'SF' # returns a Series of Booleans

0       True
1       True
2      False
3       True
4      False
       ...  
264    False
265    False
266    False
267    False
268    False
Name: POSITION, Length: 269, dtype: bool

In [61]:
df[df['POSITION'] == 'SF'] # returns the column filtered by the boolean series

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
0,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Boston,Cleveland,R,...,1,0,1,0,0,0,2
1,2017-2018 Regular Season,10/17/2017,Jayson Tatum,SF,Boston,Cleveland,R,...,10,3,4,0,1,0,14
3,2017-2018 Regular Season,10/17/2017,Jaylen Brown,SF,Boston,Cleveland,R,...,6,0,5,2,3,0,25
10,2017-2018 Regular Season,10/17/2017,LeBron James,SF,Cleveland,Boston,H,...,16,9,3,0,4,2,29
11,2017-2018 Regular Season,10/17/2017,Jae Crowder,SF,Cleveland,Boston,H,...,5,2,2,2,1,0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,2017-2018 Regular Season,10/18/2017,Josh Jackson,SF,Phoenix,Portland,H,...,2,1,1,0,0,0,11
248,2017-2018 Regular Season,10/18/2017,Derrick Jones Jr.,SF,Phoenix,Portland,H,...,1,1,1,0,0,1,0
250,2017-2018 Regular Season,10/18/2017,Trevor Ariza,SF,Houston,Sacramento,R,...,2,2,1,0,0,0,2
256,2017-2018 Regular Season,10/18/2017,PJ Tucker,SF,Houston,Sacramento,R,...,2,1,4,0,3,1,6


In [62]:
df['TOT'] > 14

0      False
1      False
2      False
3      False
4      False
       ...  
264    False
265    False
266    False
267    False
268    False
Name: TOT, Length: 269, dtype: bool

In [63]:
df[df['TOT'] > 14]

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
10,2017-2018 Regular Season,10/17/2017,LeBron James,SF,Cleveland,Boston,H,...,16,9,3,0,4,2,29
42,2017-2018 Regular Season,10/18/2017,Dwight Howard,C,Charlotte,Detroit,R,...,15,1,2,1,3,2,10
82,2017-2018 Regular Season,10/18/2017,Hassan Whiteside,C,Miami,Orlando,R,...,22,1,1,0,1,1,26
111,2017-2018 Regular Season,10/18/2017,Marcin Gortat,C,Washington,Philadelphia,H,...,17,1,3,0,1,3,16
140,2017-2018 Regular Season,10/18/2017,Anthony Davis,PF,New Orleans,Memphis,R,...,18,0,5,0,5,1,33
252,2017-2018 Regular Season,10/18/2017,Clint Capela,C,Houston,Sacramento,R,...,17,1,2,3,2,1,22


In [64]:
# sometimes more clear to use assignements like
total_rebounds_gt_14 = df['TOT'] > 14
df[total_rebounds_gt_14]

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
10,2017-2018 Regular Season,10/17/2017,LeBron James,SF,Cleveland,Boston,H,...,16,9,3,0,4,2,29
42,2017-2018 Regular Season,10/18/2017,Dwight Howard,C,Charlotte,Detroit,R,...,15,1,2,1,3,2,10
82,2017-2018 Regular Season,10/18/2017,Hassan Whiteside,C,Miami,Orlando,R,...,22,1,1,0,1,1,26
111,2017-2018 Regular Season,10/18/2017,Marcin Gortat,C,Washington,Philadelphia,H,...,17,1,3,0,1,3,16
140,2017-2018 Regular Season,10/18/2017,Anthony Davis,PF,New Orleans,Memphis,R,...,18,0,5,0,5,1,33
252,2017-2018 Regular Season,10/18/2017,Clint Capela,C,Houston,Sacramento,R,...,17,1,2,3,2,1,22


In [65]:
turnovers_gt_4 = df['TO'] > 4

In [66]:
df[turnovers_gt_4]

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
28,2017-2018 Regular Season,10/17/2017,Kevin Durant,SF,Golden State,Houston,H,...,5,7,4,0,8,4,20
99,2017-2018 Regular Season,10/18/2017,Robert Covington,SF,Philadelphia,Washington,R,...,7,1,4,1,5,2,29
119,2017-2018 Regular Season,10/18/2017,Giannis Antetokounmpo,SG,Milwaukee,Boston,R,...,13,3,4,3,5,0,37
120,2017-2018 Regular Season,10/18/2017,Khris Middleton,PF,Milwaukee,Boston,R,...,9,6,4,2,5,0,15
140,2017-2018 Regular Season,10/18/2017,Anthony Davis,PF,New Orleans,Memphis,R,...,18,0,5,0,5,1,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,2017-2018 Regular Season,10/18/2017,Paul Millsap,PF,Denver,Utah,R,...,6,1,3,0,5,0,19
196,2017-2018 Regular Season,10/18/2017,Rudy Gobert,C,Utah,Denver,H,...,10,1,4,0,6,1,18
229,2017-2018 Regular Season,10/18/2017,Jusuf Nurkic,C,Portland,Phoenix,R,...,11,1,3,1,5,0,11
242,2017-2018 Regular Season,10/18/2017,Eric Bledsoe,PG,Phoenix,Portland,H,...,4,3,2,2,5,1,15


## Chaining filters, logical filters (AND / OR)
We can use boolean logic to chain filters

In [67]:
df[total_rebounds_gt_14 & turnovers_gt_4] # filter with both filters

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
140,2017-2018 Regular Season,10/18/2017,Anthony Davis,PF,New Orleans,Memphis,R,...,18,0,5,0,5,1,33


In [68]:
df[total_rebounds_gt_14 | turnovers_gt_4] # filter with either filters

Unnamed: 0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OWN TEAM,OPP TEAM,VENUE (R/H),...,TOT,A,PF,ST,TO,BL,PTS
10,2017-2018 Regular Season,10/17/2017,LeBron James,SF,Cleveland,Boston,H,...,16,9,3,0,4,2,29
28,2017-2018 Regular Season,10/17/2017,Kevin Durant,SF,Golden State,Houston,H,...,5,7,4,0,8,4,20
42,2017-2018 Regular Season,10/18/2017,Dwight Howard,C,Charlotte,Detroit,R,...,15,1,2,1,3,2,10
82,2017-2018 Regular Season,10/18/2017,Hassan Whiteside,C,Miami,Orlando,R,...,22,1,1,0,1,1,26
99,2017-2018 Regular Season,10/18/2017,Robert Covington,SF,Philadelphia,Washington,R,...,7,1,4,1,5,2,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,2017-2018 Regular Season,10/18/2017,Rudy Gobert,C,Utah,Denver,H,...,10,1,4,0,6,1,18
229,2017-2018 Regular Season,10/18/2017,Jusuf Nurkic,C,Portland,Phoenix,R,...,11,1,3,1,5,0,11
242,2017-2018 Regular Season,10/18/2017,Eric Bledsoe,PG,Phoenix,Portland,H,...,4,3,2,2,5,1,15
252,2017-2018 Regular Season,10/18/2017,Clint Capela,C,Houston,Sacramento,R,...,17,1,2,3,2,1,22


## Merging, joining, concat

In [69]:
df = pd.read_csv("nba_stats/stats.csv")
desc = pd.read_csv('nba_stats/team-cities.csv')
desc.head()

Unnamed: 0,Team Name,City,NBA.com Initials,BigDataBall Initials,Unnamed: 4
0,Atlanta Hawks,Atlanta,ATL,Atl,
1,Boston Celtics,Boston,BOS,Bos,
2,Brooklyn Nets,Brooklyn,BKN,Bro,
3,Charlotte Hornets,Charlotte,CHA,Cha,
4,Chicago Bulls,Chicago,CHI,Chi,


### Merging data: JOIN query like SQL
https://www.codeproject.com/KB/database/Visual_SQL_Joins/Visual_SQL_JOINS_orig.jpg

In [70]:
df.columns.values

array(['DATA SET', 'DATE', 'PLAYER FULL NAME', 'POSITION', 'OWN TEAM',
       'OPP TEAM', 'VENUE (R/H)', 'MIN', 'FG', 'FGA', '3P', '3PA', 'FT',
       'FTA', 'OR', 'DR', 'TOT', 'A', 'PF', 'ST', 'TO', 'BL', 'PTS'],
      dtype=object)

In [71]:
df.columns.values[4] = 'City'

In [72]:
df.columns

Index([u'DATA SET', u'DATE', u'PLAYER FULL NAME', u'POSITION', u'City', u'OPP TEAM', u'VENUE (R/H)', u'MIN', u'FG', u'FGA', u'3P', u'3PA', u'FT', u'FTA', u'OR', u'DR', u'TOT', u'A', u'PF', u'ST', u'TO', u'BL', u'PTS'], dtype='object')

In [73]:
desc.head()

Unnamed: 0,Team Name,City,NBA.com Initials,BigDataBall Initials,Unnamed: 4
0,Atlanta Hawks,Atlanta,ATL,Atl,
1,Boston Celtics,Boston,BOS,Bos,
2,Brooklyn Nets,Brooklyn,BKN,Bro,
3,Charlotte Hornets,Charlotte,CHA,Cha,
4,Chicago Bulls,Chicago,CHI,Chi,


In [74]:
df['City'].head()

0    Boston
1    Boston
2    Boston
3    Boston
4    Boston
Name: City, dtype: object

In [75]:
df = df.set_index('City')
df.head(1)

Unnamed: 0_level_0,DATA SET,DATE,PLAYER FULL NAME,POSITION,OPP TEAM,VENUE (R/H),MIN,...,TOT,A,PF,ST,TO,BL,PTS
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Boston,2017-2018 Regular Season,10/17/2017,Gordon Hayward,SF,Cleveland,R,5.3,...,1,0,1,0,0,0,2


In [76]:
desc = desc.set_index('City')
desc.head(1)

Unnamed: 0_level_0,Team Name,NBA.com Initials,BigDataBall Initials,Unnamed: 4
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Atlanta,Atlanta Hawks,ATL,Atl,


In [78]:
df = df.join(desc)
df.head()

ValueError: columns overlap but no suffix specified: Index([u'Team Name', u'NBA.com Initials', u'BigDataBall Initials', u'Unnamed: 4'], dtype='object')

In [79]:
df[['PLAYER FULL NAME', 'Team Name']]

Unnamed: 0_level_0,PLAYER FULL NAME,Team Name
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Atlanta,Taurean Prince,Atlanta Hawks
Atlanta,Ersan Ilyasova,Atlanta Hawks
Atlanta,Dewayne Dedmon,Atlanta Hawks
Atlanta,Kent Bazemore,Atlanta Hawks
Atlanta,Dennis Schroder,Atlanta Hawks
...,...,...
Washington,Kelly Oubre Jr.,Washington Wizards
Washington,Mike Scott,Washington Wizards
Washington,Jodie Meeks,Washington Wizards
Washington,Ian Mahinmi,Washington Wizards


## Pivot tables
Transforms data from a columnar format with duplicate index entries to a tabular format aggregating entries into columns

In [81]:
df = pd.DataFrame([
    [123, 'First Name', 'John'],
    [123, 'Last Name', 'Doe'],
    [111, 'First Name', 'Jane'],
    [111, 'Last Name', 'Smith'],
    [124, 'First Name', 'Robert'],
    [125, 'Last Name', 'Jones'],
    [123, 'Favorite Food', 'Ice Cream'],
    [124, 'Favorite Food', 'Peperoni Pizza']
], columns=['ID', 'Group', 'Value'])
df.head()

Unnamed: 0,ID,Group,Value
0,123,First Name,John
1,123,Last Name,Doe
2,111,First Name,Jane
3,111,Last Name,Smith
4,124,First Name,Robert


In [82]:
df2 = df.pivot(index="ID", columns="Group", values="Value")
df2.head()

Group,Favorite Food,First Name,Last Name
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
111,,Jane,Smith
123,Ice Cream,John,Doe
124,Peperoni Pizza,Robert,
125,,,Jones
