In [32]:
import numpy as np
import pandas as pd

In [33]:
season_stats = pd.read_csv('Seasons_Stats.csv')

## Removing 'TOT' lines

In [34]:
# One popular way

season_stats_without_tot_a = season_stats.drop(season_stats.loc[season_stats["Tm"].isin(["TOT"])].index)

In [35]:
# This works nicely

season_stats_without_tot_b = season_stats[season_stats['Tm'] != 'TOT']

10604221.0

In [38]:
# So does this

season_stats_without_tot_c = season_stats[season_stats.Tm != 'TOT']

In [39]:
# So does this

season_stats_without_tot_d = season_stats.loc[season_stats['Tm'] != 'TOT']

In [40]:
# Did they really give us the equivalent # of rows?

print (season_stats_without_tot_a.G.count())
print (season_stats_without_tot_b.G.count())
print (season_stats_without_tot_c.G.count())
print (season_stats_without_tot_d.G.count())

22501
22501
22501
22501


## Cleaning up NaNs

In [41]:
no_tots = season_stats_without_tot_d

In [42]:
no_tots.notna?

[0;31mSignature:[0m [0mno_tots[0m[0;34m.[0m[0mnotna[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Detect existing (non-missing) values.

Return a boolean same-sized object indicating if the values are not NA.
Non-missing values get mapped to True. Characters such as empty
strings ``''`` or :attr:`numpy.inf` are not considered NA values
(unless you set ``pandas.options.mode.use_inf_as_na = True``).
NA values, such as None or :attr:`numpy.NaN`, get mapped to False
values.

Returns
-------
DataFrame
    Mask of bool values for each element in DataFrame that
    indicates whether an element is not an NA value.

See Also
--------
DataFrame.notnull : Alias of notna.
DataFrame.isna : Boolean inverse of notna.
DataFrame.dropna : Omit axes labels with missing values.
notna : Top-level notna.

Examples
--------
Show which entries in a DataFrame are not NA.

>>> df = pd.DataFrame({'age': [5, 6, np.NaN],
...                    'born': [pd.NaT, pd.Timestamp('193

In [43]:
# This is one way to drop NaN elements on 'Player'

no_tots_a = no_tots[no_tots['Player'].notna()]

In [44]:
no_tots.dropna?

[0;31mSignature:[0m [0mno_tots[0m[0;34m.[0m[0mdropna[0m[0;34m([0m[0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0mhow[0m[0;34m=[0m[0;34m'any'[0m[0;34m,[0m [0mthresh[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0msubset[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0minplace[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Remove missing values.

See the :ref:`User Guide <missing_data>` for more on which values are
considered missing, and how to work with missing data.

Parameters
----------
axis : {0 or 'index', 1 or 'columns'}, default 0
    Determine if rows or columns which contain missing values are
    removed.

    * 0, or 'index' : Drop rows which contain missing values.
    * 1, or 'columns' : Drop columns which contain missing value.

    .. deprecated:: 0.23.0

       Pass tuple or list to drop on multiple axes.
       Only a single axis is allowed.

how : {'any', 'all'}, default 'any'
    Determine if row or col

In [45]:
# This uses the .dropna() feature

no_tots_b = no_tots.dropna(subset = ['Player'])

# Selecting out the Hall of Famers

### Method 1: Using a Defined Function on the last character of 'Player'
### And then use that function in .map()

### .apply() might also work here

In [46]:
def Is_HOF(x):
    if str(x)[-1]=="*":
        return True
    else:
        return False

In [51]:
HOF_Stats_a = no_tots_a.loc[no_tots_a["Player"].map(Is_HOF)]

### Method 2: Direct search on "*"

In [52]:
HOF_Stats_b = no_tots_a[no_tots_a['Player'].str.contains("\*")]

### Method 3: Use a `lambda x` function instead of a Defined Function
### Note here it searches explicitly for the last character in 'Player'

In [53]:
HOF_Stats_c = no_tots_a.loc[no_tots_a["Player"].map(lambda x: str(x)[-1] == "*")]

### Method 4: Take advantage of the .endswith feature of strings in Python

In [54]:
HOF_Stats_d = no_tots_a[no_tots_a.Player.str.endswith('*')]

In [55]:
# Did they really give us the equivalent # of rows?

print (HOF_Stats_a.G.count())
print (HOF_Stats_b.G.count())
print (HOF_Stats_c.G.count())
print (HOF_Stats_d.G.count())

1486
1486
1486
1486


In [72]:
HOF_Stats_a

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
32,32,1950.0,Al Cervi*,PG,32.0,SYR,56.0,,,,...,0.829,,,,264.0,,,,223.0,573.0
49,49,1950.0,Bob Davies*,PG,30.0,ROC,64.0,,,,...,0.752,,,,294.0,,,,187.0,895.0
68,68,1950.0,Joe Fulks*,PF,28.0,PHW,68.0,,,,...,0.696,,,,56.0,,,,240.0,965.0
71,71,1950.0,Harry Gallatin*,F-C,22.0,NYK,68.0,,,,...,0.757,,,,56.0,,,,215.0,803.0
92,92,1950.0,Alex Hannum*,PF,26.0,SYR,64.0,,,,...,0.688,,,,129.0,,,,264.0,482.0
108,108,1950.0,Red Holzman*,PG,29.0,ROC,68.0,,,,...,0.686,,,,200.0,,,,67.0,556.0
112,112,1950.0,Buddy Jeannette*,G,32.0,BLB,37.0,,,,...,0.820,,,,93.0,,,,82.0,193.0
148,148,1950.0,Ed Macauley*,C-F,21.0,STB,67.0,,,,...,0.718,,,,200.0,,,,221.0,1081.0
157,157,1950.0,Slater Martin*,PG,24.0,MNL,67.0,,,,...,0.634,,,,148.0,,,,162.0,271.0
163,163,1950.0,Dick McGuire*,PG,24.0,NYK,68.0,,,,...,0.652,,,,386.0,,,,160.0,584.0


### Now we're ready to aggregate on Hall of Famers

### The most direct way is to use .groupby()
### There are a few ways to instantiate it on a select group of stats.

In [75]:
stats_totals = [ 'G', 'GS', 'MP',
       'OWS', 'DWS', 'WS',
       'OBPM', 'DBPM', 'BPM', 'FG', 'FGA', '3P', '3PA',
       '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

In [76]:
HOF_totals_a = HOF_Stats_a.groupby('Player').sum()[stats_totals]

In [77]:
HOF_totals_b = HOF_Stats_a.groupby(HOF_Stats_a['Player']).aggregate(sum)[stats_totals]

In [78]:
HOF_totals_a

Unnamed: 0_level_0,G,GS,MP,OWS,DWS,WS,OBPM,DBPM,BPM,FG,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adrian Dantley*,955.0,546.0,34151.0,111.4,22.7,134.4,63.8,-24.2,39.5,8169.0,...,8351.0,2207.0,3248.0,5455.0,2830.0,944.0,150.0,2503.0,2550.0,23177.0
Al Cervi*,202.0,0.0,1151.0,16.0,6.0,22.0,0.0,0.0,0.0,405.0,...,931.0,0.0,0.0,261.0,648.0,0.0,0.0,0.0,669.0,1591.0
Alex English*,1193.0,753.0,38063.0,77.5,23.0,100.6,39.0,-19.9,19.0,10659.0,...,5141.0,2778.0,3760.0,6538.0,4351.0,1067.0,833.0,2821.0,3027.0,25613.0
Alex Hannum*,391.0,0.0,5563.0,-0.7,12.2,11.4,0.0,0.0,0.0,935.0,...,941.0,0.0,0.0,1519.0,696.0,0.0,0.0,0.0,1549.0,2449.0
Alfred McGuire*,191.0,0.0,2966.0,2.7,2.8,5.4,0.0,0.0,0.0,251.0,...,463.0,0.0,0.0,418.0,363.0,0.0,0.0,0.0,501.0,757.0
Allen Iverson*,914.0,901.0,37584.0,60.8,38.3,98.9,49.1,-24.3,24.8,8467.0,...,8168.0,745.0,2649.0,3394.0,5624.0,1983.0,164.0,3262.0,1777.0,24368.0
Alonzo Mourning*,838.0,686.0,25975.0,41.6,48.0,89.7,-25.5,33.3,7.5,5116.0,...,5862.0,2184.0,4953.0,7137.0,946.0,414.0,2356.0,2209.0,2810.0,14311.0
Andy Phillip*,539.0,0.0,12688.0,19.5,22.7,42.3,0.0,0.0,0.0,1645.0,...,1876.0,0.0,0.0,2031.0,2969.0,0.0,0.0,0.0,1416.0,4598.0
Arnie Risen*,577.0,0.0,12690.0,23.1,22.4,45.3,0.0,0.0,0.0,2265.0,...,2989.0,0.0,0.0,5011.0,958.0,0.0,0.0,0.0,2233.0,6638.0
Artis Gilmore*,909.0,476.0,29685.0,70.9,36.5,107.5,16.1,11.8,27.7,5732.0,...,5768.0,2639.0,6522.0,9161.0,1777.0,470.0,1747.0,2347.0,2986.0,15579.0


In [85]:
# Another method...this retains position

justsome_HOFers = HOF_Stats_a[['Player','Pos','PTS','FG','FGA','2PA','2P','3PA','3P','STL','BLK','TRB','AST','3P%','2P%','FG%','AST%']].fillna(0)

In [86]:
# when using this handy feature that allows for different aggregators on different variables,
# it helps visually to put them on individual lines
# It won't affect the operation of the code in Python (this might not be true in all languages!)

grouped_HOFers=justsome_HOFers.groupby('Player').agg({'PTS':sum,
                                        'FG':sum,
                                        'FGA':sum,
                                        '2PA':sum,
                                        '2P':sum,
                                        '3PA':sum,                                                      
                                        '3P':sum,
                                        'STL':sum,
                                        'BLK':sum,
                                        'TRB':sum,
                                        'AST':sum,                                                      
                                        '3P%':np.mean,
                                        '2P%':np.mean,
                                        'FG':np.mean,
                                        'AST%':np.mean})

In [91]:
grouped_HOFers.head()

Unnamed: 0_level_0,PTS,FG,FGA,2PA,2P,3PA,3P,STL,BLK,TRB,AST,3P%,2P%,AST%
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Adrian Dantley*,23177.0,480.529412,15121.0,15080.0,8162.0,41.0,7.0,944.0,150.0,5455.0,2830.0,0.085882,0.523,12.623529
Al Cervi*,1591.0,101.25,1128.0,1128.0,405.0,0.0,0.0,0.0,0.0,261.0,648.0,0.0,0.37625,0.0
Alex English*,25613.0,666.1875,21036.0,20953.0,10641.0,83.0,18.0,1067.0,833.0,6538.0,4351.0,0.180875,0.503937,15.925
Alex Hannum*,2449.0,155.833333,2656.0,2656.0,935.0,0.0,0.0,0.0,0.0,1519.0,696.0,0.0,0.351833,0.0
Alfred McGuire*,757.0,62.75,663.0,663.0,251.0,0.0,0.0,0.0,0.0,418.0,363.0,0.0,0.3575,0.0


In [93]:
# Grouping by position, then aggregating

SG_HOFers = justsome_HOFers[justsome_HOFers.Pos == 'SG']
grouped_SG_HOFers=SG_HOFers.groupby('Player').agg({'PTS':sum,
                                        'FG':sum,
                                        'FGA':sum,
                                        '2PA':sum,
                                        '2P':sum,
                                        '3PA':sum,                                                      
                                        '3P':sum,
                                        'STL':sum,
                                        'BLK':sum,
                                        'TRB':sum,
                                        'AST':sum,                                                      
                                        '3P%':np.mean,
                                        '2P%':np.mean,
                                        'FG':np.mean,
                                        'AST%':np.mean})

grouped_SG_HOFers.sort_values('PTS',ascending=False)

Unnamed: 0_level_0,PTS,FG,FGA,2PA,2P,3PA,3P,STL,BLK,TRB,AST,3P%,2P%,AST%
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Michael Jordan*,29277.0,843.230769,21686.0,20016.0,10407.0,1670.0,555.0,2306.0,828.0,5836.0,5012.0,0.290692,0.510308,24.630769
Reggie Miller*,25279.0,457.833333,17499.0,11013.0,5681.0,6486.0,2560.0,1505.0,299.0,4182.0,4141.0,0.391167,0.512278,14.288889
Clyde Drexler*,22195.0,520.9375,17673.0,15070.0,7508.0,2603.0,827.0,2207.0,719.0,6677.0,6125.0,0.287,0.494937,24.725
Mitch Richmond*,20497.0,521.785714,16038.0,12621.0,5979.0,3417.0,1326.0,1211.0,254.0,3801.0,3398.0,0.370429,0.467643,16.414286
Hal Greer*,20413.0,617.230769,17713.0,17713.0,8024.0,0.0,0.0,0.0,0.0,5288.0,4113.0,0.0,0.453538,8.676923
George Gervin*,16448.0,796.5,12663.0,12404.0,6295.0,259.0,77.0,699.0,475.0,2753.0,1757.0,0.230125,0.505625,13.1375
Pete Maravich*,15948.0,562.454545,14025.0,14010.0,6177.0,15.0,10.0,587.0,108.0,2747.0,3563.0,0.126,0.441091,21.718182
Gail Goodrich*,15626.0,512.083333,13303.0,13303.0,6145.0,0.0,0.0,545.0,72.0,2502.0,3682.0,0.0,0.4585,19.191667
Joe Dumars*,15608.0,441.461538,12428.0,10134.0,4870.0,2294.0,869.0,859.0,80.0,2065.0,4347.0,0.369923,0.476231,20.738462
Allen Iverson*,14700.0,511.1,12169.0,10182.0,4492.0,1987.0,619.0,1211.0,90.0,2045.0,3052.0,0.3081,0.4463,25.93


In [94]:
# Here's an extensible method using a defined function
# starts from cleaned HOF list
# add more column elements

uniqueNames = HOF_Stats_a["Player"].drop_duplicates()

def aggregateData(playerName):
    justPlayer = HOF_Stats_a.loc[HOF_Stats_a["Player"].isin([playerName])]
    data = {"Player": playerName,
           "Games": justPlayer["G"].sum(),
           "Minutes Played": justPlayer["MP"].sum(),
           "Position": justPlayer.iloc[0]["Pos"],
           "Points": justPlayer["PTS"].sum(),
           "Blocks": justPlayer["BLK"].sum(),
           "Assists": justPlayer["AST"].sum()}
    return pd.Series(data)

lifetimeHallOfFamers = pd.DataFrame(columns=["Player", "Games", "Minutes Played", "Position", "Points", "Blocks", "Assists"])
for name in uniqueNames:
    lifetimeHallOfFamers = lifetimeHallOfFamers.append(aggregateData(name), ignore_index=True)

lifetimeHallOfFamers

Unnamed: 0,Player,Games,Minutes Played,Position,Points,Blocks,Assists
0,Al Cervi*,202.0,1151.0,PG,1591.0,0.0,648.0
1,Bob Davies*,402.0,8617.0,PG,5690.0,0.0,1929.0
2,Joe Fulks*,326.0,4490.0,PF,4105.0,0.0,462.0
3,Harry Gallatin*,630.0,15813.0,F-C,8409.0,0.0,1145.0
4,Alex Hannum*,391.0,5563.0,PF,2449.0,0.0,696.0
5,Red Holzman*,298.0,2106.0,PG,1620.0,0.0,572.0
6,Buddy Jeannette*,37.0,0.0,G,193.0,0.0,93.0
7,Ed Macauley*,641.0,18071.0,C-F,11234.0,0.0,2079.0
8,Slater Martin*,745.0,21889.0,PG,7337.0,0.0,3160.0
9,Dick McGuire*,738.0,17170.0,PG,5921.0,0.0,4205.0
