# Pandas Groupby
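`groupby` lets you group rows together based on the values in a column and then apply an aggregate function (such as sum, mean, or count) to each group.  
Example: all rows that share the same value in the grouping column are combined, and their values are aggregated (e.g. summed) into a single row per group.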

In [68]:
import numpy as np
import pandas as pd

In [69]:
# Sample sales records stored column-wise: company ticker, salesperson, amount sold
data = {
    "Company": ["GOOG", "GOOG", "MSFT", "MSFT", "FB", "FB"],
    "Person": ["Sam", "Charlie", "Amy", "Vanessa", "Carl", "Sarah"],
    "Sales": [200, 120, 340, 124, 243, 350],
}

# Each dictionary key becomes a column; rows get the default 0..5 integer index
df = pd.DataFrame(data=data)

df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [70]:
# Split the rows of df into one group per distinct "Company" value.
# The result is a GroupBy object: nothing is computed until an aggregate
# (mean, sum, count, ...) is called on it, and "Company" then becomes the index.
by_company = df.groupby(by="Company")

In [71]:
# Mean (average) of each numeric column per company, returned as a DataFrame
# indexed by "Company".
# numeric_only=True is required because the group still contains the text
# column "Person": since pandas 2.0 non-numeric columns are no longer dropped
# automatically, and averaging strings raises an error.
by_company.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,296.5
GOOG,160.0
MSFT,232.0


In [72]:
# Sum of each numeric column per company, returned as a DataFrame indexed by
# "Company".
# numeric_only=True keeps the text column "Person" out of the result: since
# pandas 2.0 it is no longer dropped automatically, and summing strings would
# concatenate the names (e.g. "CarlSarah") instead of being skipped.
by_company.sum(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,593
GOOG,320
MSFT,464


In [73]:
# Sample standard deviation of each numeric column per company, returned as a
# DataFrame indexed by "Company".
# numeric_only=True is required because the group still contains the text
# column "Person": since pandas 2.0 non-numeric columns are no longer dropped
# automatically, and a standard deviation of strings raises an error.
by_company.std(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,75.660426
GOOG,56.568542
MSFT,152.735065


In [74]:
# Per-column totals for Facebook only: aggregate every group, then pick the
# "FB" row by label, which yields a Series named "FB".
# numeric_only=True restricts the sum to numeric columns; without it pandas >= 2.0
# also "sums" the text column "Person" by concatenating the names, which adds
# an unwanted entry and turns the Series dtype into object instead of int64.
by_company.sum(numeric_only=True).loc["FB"]

Sales    593
Name: FB, dtype: int64

In [75]:
"""
return the maximum values of each column in the group as a DataFrame
Note: String's maxmimum are the latest in the alphabet
"""
print(by_company.max())
print()

"""
return the minimum values of each column in the group as a DataFrame
Note: String's minimum are the earliest in the alphabet
"""
print(by_company.min())

          Person  Sales
Company                
FB         Sarah    350
GOOG         Sam    200
MSFT     Vanessa    340

          Person  Sales
Company                
FB          Carl    243
GOOG     Charlie    120
MSFT         Amy    124


In [76]:
"""
count the number of instances per column

notice how the "Person" column also shows since it
just counts the total without relying on the value
"""
by_company.count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
GOOG,2,2
MSFT,2,2


In [77]:
# Summary statistics for each company, returned as a DataFrame with one row per
# company and, for the "Sales" column, the sub-columns count, mean, std, min,
# 25%, 50%, 75% and max.
by_company.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [78]:
# Transpose the summary statistics (swap rows and columns) so that each
# company becomes a column and each statistic becomes a row.
by_company.describe().T

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


In [79]:
# Facebook's summary statistics as a Series named "FB", indexed by
# (column, statistic) pairs: select the "FB" row of the described group by label.
by_company.describe().loc["FB"]

Sales  count      2.000000
       mean     296.500000
       std       75.660426
       min      243.000000
       25%      269.750000
       50%      296.500000
       75%      323.250000
       max      350.000000
Name: FB, dtype: float64