## Foundations of Data Science: Computational Thinking with Python

This notebook mirrors the course Jupyter notebook, but uses standard modules (such as pandas) instead of the Berkeley data science module.

## Lecture 10: Groups and Pivot

In [4]:
import pandas as pd

# Limit how many rows of a frame are rendered before pandas truncates the output.
# Use the fully qualified option name: the abbreviated 'max_rows' is resolved by
# pattern matching and is ambiguous on newer pandas releases (it also matches
# 'styler.render.max_rows'), which raises an OptionError.
pd.set_option('display.max_rows', 9)

### Groups

In [55]:
# Small example table of ice-cream cones, built column by column.
cones = pd.DataFrame({
    'Flavor':   ['strawberry', 'chocolate', 'chocolate', 'strawberry', 'chocolate'],
    'Price':    [3.55, 4.75, 5.25, 5.25, 5.25],
    'Scoops':   [1, 1, 2, 2, 2],
    'Reaction': ['Yum', 'awesome', 'meh', ':)', 'Wow'],
})
cones

Unnamed: 0,Flavor,Price,Scoops,Reaction
0,strawberry,3.55,1,Yum
1,chocolate,4.75,1,awesome
2,chocolate,5.25,2,meh
3,strawberry,5.25,2,:)
4,chocolate,5.25,2,Wow


In [56]:
# Counting rows per flavor, two ways.
#  value_counts is essentially a groupby followed by a count
#  (value_counts orders by frequency, groupby orders by group key)
tally_by_value_counts = cones['Flavor'].value_counts()
tally_by_groupby = cones.groupby('Flavor')['Flavor'].count()

print(tally_by_value_counts)
print()
print(tally_by_groupby)

chocolate     3
strawberry    2
Name: Flavor, dtype: int64

Flavor
chocolate     3
strawberry    2
Name: Flavor, dtype: int64


In [100]:
# Viewing groups

# Build the GroupBy once and reuse it for every view below.
flavor_groups = cones.groupby('Flavor')

# Dictionary mapping each group key to the index labels of the rows in that group
print(flavor_groups.groups)
print()

# Dictionary mapping each group key to the sub-DataFrame holding that group's rows
print(dict(list(flavor_groups)))
print()

# Groups in nice tables: iterating a GroupBy yields (key, sub-DataFrame) pairs,
# so the dictionary does not have to be rebuilt on every pass through the loop.
for _, flavor_rows in flavor_groups:
    display(flavor_rows)

{'chocolate': Int64Index([1, 2, 4], dtype='int64'), 'strawberry': Int64Index([0, 3], dtype='int64')}

{'chocolate':       Flavor  Price  Scoops Reaction
1  chocolate   4.75       1  awesome
2  chocolate   5.25       2      meh
4  chocolate   5.25       2      Wow, 'strawberry':        Flavor  Price  Scoops Reaction
0  strawberry   3.55       1      Yum
3  strawberry   5.25       2       :)}



Unnamed: 0,Flavor,Price,Scoops,Reaction
1,chocolate,4.75,1,awesome
2,chocolate,5.25,2,meh
4,chocolate,5.25,2,Wow


Unnamed: 0,Flavor,Price,Scoops,Reaction
0,strawberry,3.55,1,Yum
3,strawberry,5.25,2,:)


In [101]:
# Groupby summary information

# Per-flavor descriptive statistics over the whole table
per_flavor_stats = cones.groupby('Flavor').describe()
display(per_flavor_stats)

# Same summary restricted to the text column Reaction (count / unique / top / freq)
flavor_and_reaction = cones[['Flavor', 'Reaction']]
display(flavor_and_reaction.groupby('Flavor').describe())

Unnamed: 0_level_0,Price,Price,Price,Price,Price,Price,Price,Price,Scoops,Scoops,Scoops,Scoops,Scoops,Scoops,Scoops,Scoops
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Flavor,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
chocolate,3.0,5.083333,0.288675,4.75,5.0,5.25,5.25,5.25,3.0,1.666667,0.57735,1.0,1.5,2.0,2.0,2.0
strawberry,2.0,4.4,1.202082,3.55,3.975,4.4,4.825,5.25,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0


Unnamed: 0_level_0,Reaction,Reaction,Reaction,Reaction
Unnamed: 0_level_1,count,unique,top,freq
Flavor,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
chocolate,3,3,awesome,1
strawberry,2,2,:),1


In [117]:
# GroupBy aggregates
by_flavor = cones.groupby('Flavor')

# Minimum price per flavor (a single column, so the result is a Series)
print(by_flavor.Price.agg('min'))

# Column-wise minimum per flavor; for the string column Reaction this is the
# smallest string in sort order, not a meaningful "minimum reaction".
display(by_flavor.agg('min'))

# A mean is only defined for the numeric columns. Select them explicitly:
# newer pandas no longer drops non-numeric columns silently and would raise a
# TypeError when asked to average the Reaction strings.
display(by_flavor[['Price', 'Scoops']].agg('mean'))

Flavor
chocolate     4.75
strawberry    3.55
Name: Price, dtype: float64


Unnamed: 0_level_0,Price,Scoops,Reaction
Flavor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chocolate,4.75,1,Wow
strawberry,3.55,1,:)


Unnamed: 0_level_0,Price,Scoops
Flavor,Unnamed: 1_level_1,Unnamed: 2_level_1
chocolate,5.083333,1.666667
strawberry,4.4,1.5


In [124]:
# Applying functions to GroupBy objects
def price_spread(group):
    """Return the gap between the highest and lowest Price within one group."""
    prices = group['Price']
    return prices.max() - prices.min()


cones.groupby('Flavor').apply(price_spread)

Flavor
chocolate     0.5
strawberry    1.7
dtype: float64

In [152]:
# More examples

# Load the salary table and give the season-specific salary column a short name.
# rename(columns=...) returns a new frame, so the cell stays idempotent and no
# in-place mutation is needed.
nba = (
    pd.read_csv('data/nba_salaries.csv')
      .rename(columns={'2015-2016 SALARY': 'SALARY'})
)

# Total salary per team, largest total first. SALARY is selected explicitly so
# the sum only touches the numeric column; newer pandas would otherwise also
# "sum" (concatenate) any string columns in the table.
nba.groupby('TEAM')[['SALARY']].agg('sum').sort_values('SALARY', ascending=False)

Unnamed: 0_level_0,SALARY
TEAM,Unnamed: 1_level_1
Cleveland Cavaliers,102.312412
Oklahoma City Thunder,96.832165
Golden State Warriors,94.085137
Memphis Grizzlies,93.796439
...,...
Boston Celtics,50.285499
Portland Trail Blazers,45.446878
Philadelphia 76ers,42.481345
Detroit Pistons,42.211760


In [153]:
# Number of rows per position. Only the last expression of a cell is shown
# automatically, so this result has to be displayed explicitly to be visible.
display(nba['POSITION'].value_counts())

# Mean salary per position, lowest first. Selecting SALARY before aggregating
# avoids averaging the non-numeric columns, which newer pandas rejects with a
# TypeError instead of silently dropping them.
nba.groupby('POSITION')['SALARY'].agg('mean').sort_values()

POSITION
SG    3.988195
PF    4.951344
PG    5.165487
SF    5.532675
C     6.082913
Name: SALARY, dtype: float64