# Working with data

## pandas.DataFrame.groupby

A groupby operation involves some combination of splitting the object, applying a function, and combining the results.

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

In [None]:
import pandas as pd

In [None]:
# Heads up! Give every dataframe (and every subset of one) a descriptive name.

# Column-oriented toy data: one grouping label and one numeric value per row.
data_dict = dict(
    key=list("ACBCABCBA"),
    data=[3, 10, 8, 1, 5, 0, 22, 7, 2],
)

df = pd.DataFrame(data=data_dict)
df

In [None]:
# Syntax:
#   <DataFrame>[<data>].groupby(<DataFrame>[<key>])
# Selecting <data> first is optional here: df["data"].groupby(df["key"]) gives
# basically the same grouping, because "data" is the only other column.

grouped_df = df.groupby(by=df["key"])

# Displaying the object only shows a reference to the grouping; no operation
# has been performed on the elements yet.
# (grouped_df.count() would show how many rows belong to each key.)
grouped_df

![](https://i.imgur.com/q3tdGth.png)

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.sum.html
# Add up the values inside each group.
summed_df = grouped_df.sum()
# The group keys become the index of the result; the original row index is gone.
# To turn the keys back into a regular column:
# summed_df = summed_df.reset_index()
summed_df

# Heads up! An operation applied to a grouping should be supported by the data types of the columns it runs on.

### Multi-column groups

In [None]:
# Toy data with a grouping label, a numeric value and a boolean flag per row,
# so that the frame holds more than one column besides the key.
data_dict_expanded = dict(
    key=list("ACBCABCBA"),
    data=[3, 10, 8, 1, 5, 0, 22, 7, 2],
    valid=[True, False, False, False, True, False, True, True, True],
)

df_expanded = pd.DataFrame(data=data_dict_expanded)
df_expanded

In [None]:
# Group the expanded frame by its OWN "key" column. Taking the keys from the
# frame being grouped (instead of from another dataframe that merely holds the
# same values in the same row order) keeps this cell independent of unrelated
# frames and lets pandas treat "key" as the grouping column, so it is left out
# of the aggregated columns.
grouped_df_expanded = df_expanded.groupby(df_expanded["key"])
# Selecting a column first would restrict the grouping to that column:
# grouped_df_expanded = df_expanded["data"].groupby(df_expanded["key"])

# Number of non-null entries per column within each group.
grouped_df_expanded.count()

In [None]:
# Sum each group; numeric_only=True skips columns that are not numeric (such as strings).
summed_grouped_df_expanded = grouped_df_expanded.sum(numeric_only=True)
summed_grouped_df_expanded

****

In [None]:
# Load the sales workbook; the relative path is resolved against the working directory.
sales_df = pd.read_excel(io="sales.xlsx")
sales_df

In [None]:
# Just a reminder of column arithmetic: increase profit by 5%.
# Note: the column is overwritten, so running this cell again compounds the increase.
sales_df["Profit"] = sales_df["Profit"] * 1.05
sales_df

In [None]:
# Group by order without selecting columns first, then total every numeric
# column within each order.
sales_grouped = (
    sales_df
    .groupby(sales_df["Order ID"])  # the whole frame is grouped, not sales_df[<data>]
    .sum(numeric_only=True)
)

sales_grouped

In [None]:
# Group by order, but include only the desired columns:
# sales_df[[<columns>]].groupby(<key>) limits the aggregation to those columns.
sales_grouped_columns = (
    sales_df[["Quantity", "Profit"]]
    .groupby(sales_df["Order ID"])
    .sum()
)

sales_grouped_columns

In [None]:
# Grand totals: add up each per-order column over all orders.
sales_grouped_columns.sum()

### Filtering

In [None]:
# Reload the workbook so the filtering examples start from the data as stored on disk.
sales_df = pd.read_excel(io="sales.xlsx")
sales_df

In [None]:
# Syntax:
#   <DataFrame>[<DataFrame>[<column>] <conditional expression>]
# The inner comparison produces a boolean mask; indexing with it keeps the rows
# where the mask is True.

filtered_sales_df = sales_df.loc[sales_df["Quantity"].eq(1)]
filtered_sales_df

In [None]:
# Heads up! Multiple conditions in pandas must be combined with the bitwise
# operators (&, |), not the logical operators (and, or).

is_single_item = sales_df["Quantity"] == 1
is_fries = sales_df["Product Name"].str.startswith("Fries")

filtered_sales_df = sales_df[is_single_item & is_fries]
filtered_sales_df

In [None]:
# Multi-level filtering: apply one condition at a time
# (I often prefer to start this way, for sanity).
sales_df = pd.read_excel(io="sales.xlsx")
filtered_df = sales_df.loc[sales_df["Cost To Make"].gt(1)]
# Remember: build the second mask from "filtered_df", not from "sales_df"!
filtered_df = filtered_df.loc[filtered_df["Cost To Make"].lt(7)]
filtered_df

Also worth checking out: [pandas.core.groupby.DataFrameGroupBy.filter](https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.filter.html)

### Manipulating dataframe

In [None]:
# Syntax:
#   <DataFrame>["new column"] = <DataFrame>[<column>] <expression>
# Start from a fresh copy of the workbook.
sales_df = pd.read_excel(io="sales.xlsx")
sales_df

In [None]:
# Add a new column: the unit cost multiplied by 1.15.
sales_df["15% Unit Cost"] = sales_df["Unit Cost"].mul(1.15)
sales_df

In [None]:
# Remove the "Profit" column (raises KeyError if the column is not present).
sales_df = sales_df.drop(columns="Profit")
sales_df

In [None]:
# Rebuild "Profit" row by row: the marked-up unit cost times the quantity,
# minus the cost to make times the quantity.
sales_df["Profit"] = (
    sales_df["15% Unit Cost"].mul(sales_df["Quantity"])
    - sales_df["Cost To Make"].mul(sales_df["Quantity"])
)
sales_df

## Aggregations

In [None]:
# Reload the workbook so the aggregations run on the data as stored on disk.
sales_df = pd.read_excel(io="sales.xlsx")
sales_df

In [None]:
# What is the average number of items sold per order?
# https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.mean.html
# Total the quantity within each order first, then average those per-order totals.
sales_df.groupby("Order ID")["Quantity"].sum().mean()

In [None]:
# How many orders had a particular type of product sold in it?
# https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.count.html
# count() tallies the non-null "Order ID" entries per product, i.e. one per sales row.
# NOTE(review): this equals the number of orders only if a product appears at most
# once per order - confirm against the data (nunique() would count distinct orders).
sales_df.groupby("Product Name")["Order ID"].count()

In [None]:
# What was our most profitable item?
# https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.max.html
# Total the profit per product.
group_by_product_profit = sales_df.groupby("Product Name")["Profit"].sum()
group_by_product_profit

In [None]:
# Keep the product(s) whose total profit equals the maximum; ties are all shown.
max_profit = group_by_product_profit.max()
group_by_product_profit.loc[group_by_product_profit.eq(max_profit)]

### Multi-aggregation

https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.agg.html

In [None]:
# Load the workbook again for the multi-aggregation examples.
sales_df = pd.read_excel(io="sales.xlsx")
sales_df

In [None]:
# agg() with a single function name; gives the same result as calling .sum()
# on the grouping directly.
sales_df.groupby("Order ID")["Quantity"].agg("sum")

In [None]:
# Several aggregations at once: each selected column gets a "sum" and a "mean"
# sub-column per order.
sales_df.groupby("Order ID")[["Quantity", "Profit"]].agg(["sum", "mean"])

In [None]:
# Show the frame again: the aggregations above returned new objects, so sales_df itself is unchanged.
sales_df

## Qualitative to Quantitative

In [None]:
# Load the survey answers and preview the first rows.
# NOTE(review): "survery" is a misspelling of "survey"; the name is kept because
# the cells below refer to it.
survery_df = pd.read_excel(io="survey_results.xlsx")
survery_df.head()

In [None]:
# Convert answers to numerical values through an explicit lookup table, e.g.:
convert_dict = {
    # Agreement scale: evenly spaced scores from 0 to 100.
    "Strongly Disagree": 0,
    "Disagree": 25,
    "Neither agree or disagree": 50,
    "Agree": 75,
    "Strongly Agree": 100,
    # Yes/no answers become booleans.
    "Yes": True,
    "No": False,
}

In [None]:
# Add new columns holding the converted data; the original answer columns stay untouched.
# Maps each new column name to the survey question it is derived from.
converted_columns = {
    "Satisfaction": "I was happy with the product",
    "Recommendation": "I will recommend the product",
    "Subscription": "Would you like to receive our newsletter",
}

for new_column, source_column in converted_columns.items():
    # Direct dict lookup: an answer missing from convert_dict raises KeyError
    # instead of silently producing a missing value.
    survery_df[new_column] = [convert_dict[answer] for answer in survery_df[source_column]]

survery_df.head()

In [None]:
# Average satisfaction and recommendation, on the 0-100 scale defined by convert_dict
survery_df.loc[:, ["Satisfaction", "Recommendation"]].mean()