# Working with data

## pandas.DataFrame.groupby

A groupby operation involves some combination of splitting the object, applying a function, and combining the results.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

In [None]:
import pandas as pd

In [None]:
# Heads up! Proper naming -> good practice to name dataframes properly, including subsets.

# Small demo frame: a group label ("key") and one integer value ("data") per row.
df = pd.DataFrame(
    dict(
        key=["A", "C", "B", "C", "A", "B", "C", "B", "A"],
        data=[3, 10, 8, 1, 5, 0, 22, 7, 2],
    )
)
print(df)

In [None]:
# Split the "data" column into groups, one per distinct value in "key".
# (To group every column of the frame instead: df.groupby("key"))
grouped = df.groupby("key")["data"]
print(grouped)  # Just a handle on the grouping; no operation has been applied to the elements yet
print(grouped.count())  # How many "data" entries fall under each key

In [None]:
# Heads up! Operations to apply must be supported by data types in columns.

# Add up the "data" values within each key group and show the per-key totals.
totals_per_key = grouped.sum()
print(totals_per_key)

### Multi-column groups

In [44]:
# Load the sales sheet from the workbook next to this notebook and show it.
sales_path = "sales.xlsx"
sales_df = pd.read_excel(sales_path)
print(sales_df)
# Just a reminder: a 5% profit increase would be written as
#sales_df["Profit"] *= 1.05

  Order ID Product Name  Quantity  Unit Cost  Cost To Make  Profit
0    M0051    Hamburger         2      14.99           8.0   13.98
1    M0051      Fries L         2       7.99           5.0    5.98
2    M0051       Soda L         1       5.99           1.0    4.99
3    M0052       Hotdog         1      12.99           6.0    6.99
4    M0052       Soda M         1       3.99           0.8    3.19
5    M0053    Hamburger         1      14.99           8.0    6.99
6    M0053      Fries S         1       3.99           1.0    2.99
7    M0054        Salad         3      11.99           4.0   23.97
8    M0055        Salad         1      11.99           4.0    7.99


## Filtering

In [45]:
# Keep only the order lines where exactly one unit was sold.
# The boolean mask is built from sales_df, the frame read from sales.xlsx,
# so this cell also runs on a fresh kernel (Restart & Run All).
is_single_unit = sales_df["Quantity"] == 1
filtered_sales_df = sales_df[is_single_unit]
print(filtered_sales_df)

  Order ID Product Name  Quantity  Unit Cost  Cost To Make  Profit
2    M0051       Soda L         1       5.99           1.0    4.99
3    M0052       Hotdog         1      12.99           6.0    6.99
4    M0052       Soda M         1       3.99           0.8    3.19
5    M0053    Hamburger         1      14.99           8.0    6.99
6    M0053      Fries S         1       3.99           1.0    2.99
8    M0055        Salad         1      11.99           4.0    7.99


In [46]:
# Per-order totals for the single-unit rows: group on "Order ID",
# keep only the two numeric columns of interest, then sum within each order.
summed_columns = ["Quantity", "Profit"]
singe_quantity_profits_df = (
    filtered_sales_df
    .groupby("Order ID")[summed_columns]
    .sum()
)
print(singe_quantity_profits_df)

          Quantity  Profit
Order ID                  
M0051            1    4.99
M0052            2   10.18
M0053            2    9.98
M0055            1    7.99
