# Working with data

## pandas.DataFrame.groupby

A groupby operation involves some combination of splitting the object, applying a function, and combining the results.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

In [None]:
import pandas as pd

In [None]:
# Heads up! Naming matters: give every DataFrame (and every subset of one) a descriptive name.

# Toy data: nine rows, each pairing a group label ("key") with an integer ("data").
data_dict = dict(
    key=["A", "C", "B", "C", "A", "B", "C", "B", "A"],
    data=[3, 10, 8, 1, 5, 0, 22, 7, 2],
)

df = pd.DataFrame(data=data_dict)
df

In [None]:
# Syntax:
# <DataFrame>.groupby(<DataFrame>[<key>])           # group the whole frame by <key>
# <DataFrame>[<data>].groupby(<DataFrame>[<key>])   # group only the <data> column by <key>

grouped_df = df.groupby(df["key"])
#grouped_df = df["data"].groupby(df["key"]) # Column-only variant of the line above; not needed here since "data" is the only other column
grouped_df # Only a reference to the grouping: nothing has been aggregated yet
# grouped_df.count() # Number of rows in each key group

![](https://i.imgur.com/q3tdGth.png)

In [None]:
# Add up the values within each key group.
summed_df = grouped_df.sum()
# The result is indexed by the group keys rather than by the original row index.
# Uncomment the next line to turn the keys back into a regular column:
# summed_df = summed_df.reset_index()
summed_df

# Heads up! The aggregation you apply must be supported by the data types of the columns.

### Multi-column groups

In [None]:
# Same toy data as before, plus a boolean "valid" flag for every row.
data_dict_expanded = dict(
    key=["A", "C", "B", "C", "A", "B", "C", "B", "A"],
    data=[3, 10, 8, 1, 5, 0, 22, 7, 2],
    valid=[True, False, False, False, True, False, True, True, True],
)

df_expanded = pd.DataFrame(data=data_dict_expanded)
df_expanded

In [None]:
# Group the "data" column by the "key" column of the SAME frame. Taking the keys
# from a different frame (df["key"]) only works while both frames happen to share
# an identical index and key order, and silently couples this cell to df.
grouped_df_expanded = df_expanded["data"].groupby(df_expanded["key"])
# To group several columns at once, select them with a list instead:
# grouped_df_expanded = df_expanded[["data", "valid"]].groupby(df_expanded["key"])
grouped_df_expanded.count()  # number of "data" entries per key

In [None]:
# Sum the grouped values per key; numeric_only=True asks pandas to aggregate numeric data only.
summed_grouped_df_expanded = grouped_df_expanded.sum(numeric_only=True)
summed_grouped_df_expanded

****

In [None]:
# Load the sales workbook. "sales.xlsx" is a relative path, so the file must sit in the working directory.
# Later cells use its "Order ID", "Quantity", "Profit", "Product Name" and "Cost To Make" columns.
sales_df = pd.read_excel("sales.xlsx")
sales_df

In [None]:
# Just a reminder: increase profit by 5%
# NOTE: this updates sales_df in place, so re-running the cell applies another 5% on top.
sales_df["Profit"] *= 1.05
sales_df

In [None]:
# Group the whole frame (no column selection) by order and total its numeric columns.
# Pattern: sales_df.groupby(..), not sales_df[<data>].groupby(..)
sales_grouped = (
    sales_df
    .groupby(sales_df["Order ID"])
    .sum(numeric_only=True)
)
# For the number of rows per order instead:
# print(sales_df.groupby(sales_df["Order ID"]).count())

sales_grouped

In [None]:
# Group by order, but aggregate only the columns of interest.
# Pattern: sales_df[[<data>, ...]].groupby(..) -- select the columns first, then group.
sales_grouped_columns = (
    sales_df[["Quantity", "Profit"]]
    .groupby(sales_df["Order ID"])
    .sum()
)

sales_grouped_columns

In [None]:
# Grand totals: add up each column of the per-order sums.
sales_grouped_columns.sum()

## Filtering

In [None]:
# Reload the workbook so filtering starts from the file's data, without the in-place Profit change made earlier.
sales_df = pd.read_excel("sales.xlsx")
sales_df

In [None]:
# Syntax:
# <DataFrame>[<boolean mask>], where the mask is <DataFrame>[<column>]<conditional expression>

# One True/False entry per row: True where Quantity equals 1.
single_quantity_mask = sales_df["Quantity"] == 1
filtered_sales_df = sales_df[single_quantity_mask]
filtered_sales_df

In [None]:
# Heads up! Combine conditions in pandas with the element-wise operators (&, |), not the
# logical keywords (and, or). When written inline, each condition needs its own parentheses.

quantity_is_one = sales_df["Quantity"] == 1
name_starts_with_fries = sales_df["Product Name"].str.startswith("Fries")

filtered_sales_df = sales_df[quantity_is_one & name_starts_with_fries]
filtered_sales_df

Also worth checking out: [pandas.core.groupby.DataFrameGroupBy.filter](https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.filter.html)

In [None]:
# Multi-level filtering: apply one condition per step (I often prefer to start this way for sanity)
sales_df = pd.read_excel("sales.xlsx")
filtered_df = sales_df[sales_df["Cost To Make"] > 1]
filtered_df = filtered_df[filtered_df["Cost To Make"] < 7] # remember: build the second mask from "filtered_df", not "sales_df"!
filtered_df