# Aggregation and Grouping

In [2]:
# Pandas makes summarization efficient: aggregations such as sum(), mean(), and median()
# reduce a large dataset to a single number that gives insight into its nature.

In [3]:
 # Planets Data

 '''Let’s use the Planets dataset, which is available in the Seaborn package. It gives information on
 planets that astronomers have discovered around other stars. '''

'Let’s use the Planets dataset, which is available in the Seaborn package. It gives information on\nplanets that astronomers have discovered around other stars. '

In [4]:
# Load seaborn's "planets" example dataset and bind it to `planets`.
# NOTE(review): seaborn fetches its example datasets online on first use and
# caches them afterwards, so this cell presumably needs network access once — confirm.
import seaborn as sb
planets = sb.load_dataset('planets')
# (rows, columns) of the loaded frame
planets.shape

(1035, 6)

In [5]:
# Preview the first five rows to see the available columns and typical values.
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [6]:
# Simple Aggregation in Pandas

 # As with a one-dimensional NumPy array, an aggregate over a Pandas Series returns a single value.

In [7]:
import numpy as np
import pandas as pd

# Fixed seed so the random draws (and the outputs shown below) are reproducible.
rng = np.random.RandomState(42)

# Series: wrap five uniform random draws in a one-dimensional labelled array.
draws = rng.rand(5)
ser = pd.Series(draws)
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [8]:
# Sum of all values in the Series, collapsed to a single scalar.
ser.sum()

np.float64(2.811925491708157)

In [9]:
# Arithmetic mean of the Series values, again a single scalar.
ser.mean()

np.float64(0.5623850983416314)

In [10]:
# DataFrame: two columns, each filled with five draws from the seeded generator.
# Column A is drawn before column B, so the random stream is consumed in that order.
col_a = rng.rand(5)
col_b = rng.rand(5)
df = pd.DataFrame({'A': col_a, 'B': col_b})
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [11]:
df.mean() # no axis given: the mean is taken down each column, giving one value per column
 

A    0.477888
B    0.443420
dtype: float64

In [12]:
# Mean of each row: axis=1 aggregates across the columns, giving one value per row.
df.mean(axis=1)

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [13]:
# Equivalent spelling: axis='columns' is the named form of axis=1 (one mean per row).
df.mean(axis='columns')

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# The count row differs between columns because missing values are not counted.
planets.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [15]:
planets.dropna().describe() # drop every row containing a missing value, then summarize the 498 rows that remain


Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


In [16]:
# Display the whole frame; the rendered output elides the middle rows with "...".
planets

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [35]:
# GroupBy: Split, Apply, Combine

'''As seen in the above figure,
 • The split step breaks up and groups a DataFrame depending on the value of the specified key.
 • The apply step involves computing some function (usually an aggregate, transformation, or
 filtering) within the individual groups.
 • The combine step merges the results of these individual operations into an output array.
 Let’s look at an example.
 '''

'As seen in the above figure,\n • The split step breaks up and groups a DataFrame depending on the value of the specified key.\n • The apply step involves computing some function (usually an aggregate, transformation, or\n filtering) within the individual groups.\n • The combine step merges the results of these individual operations into an output array.\n Let’s look at an example.\n '