# Groupby
- The groupby() method in Pandas is a powerful function used for splitting a DataFrame into groups based on specified criteria, applying a function to each group, and then combining the results.

In [4]:
import pandas as pd

# Location of the sample CSV used by the cells below.
# NOTE(review): this is an absolute path on a local D: drive, so the cell only
# runs on a machine that has the file at exactly this location.
path = "D:/tutedude/Data Analysis/CSV_Files/numeric.csv"

# Read the CSV into a DataFrame and show it as the cell's output.
df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0,S.No,A,B,C,D,E,Length
0,1,2,1,5,3,4,45
1,2,4,3,10,6,8,43
2,3,6,5,15,9,12,66
3,4,8,7,20,12,16,78
4,5,10,9,25,15,20,32


In [15]:
import numpy as np  # NOTE(review): not used in this cell

# Step 1: group the rows by every distinct (A, B) pair.
# groupby() does not return a DataFrame. It returns a GroupBy object: a
# "container" that knows how the rows of df are grouped by columns A and B.
grp = df.groupby(by=["A", "B"])

# Step 2: apply one operation to each group.
# .agg() is the short alias of .aggregate(); passing "sum" adds up the values
# of each remaining column within every group. With the default
# as_index=True, the group keys A and B form the index of the result.
grp.agg("sum")


Unnamed: 0_level_0,Unnamed: 1_level_0,S.No,C,D,E,Length
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1,1,5,3,4,45
4,3,2,10,6,8,43
6,5,3,15,9,12,66
8,7,4,20,12,16,78
10,9,5,25,15,20,32


In [23]:
# Group df by the (A, B) pairs again, this time with as_index=False:
#   - A and B stay as ordinary columns in the aggregated result
#   - they are not moved into the index (the default is as_index=True)
grp = df.groupby(
    by=["A", "B"],
    as_index=False,
)

# Sum every remaining column within each unique (A, B) combination.
# .agg() is the short alias of .aggregate(); the result is a new DataFrame
# with one row of summed values per group.
grp.agg("sum")

Unnamed: 0,A,B,S.No,C,D,E,Length
0,2,1,1,5,3,4,45
1,4,3,2,10,6,8,43
2,6,5,3,15,9,12,66
3,8,7,4,20,12,16,78
4,10,9,5,25,15,20,32


# .size()
- .size() counts the number of rows in each group.
- It tells you how many rows (entries) belong to each unique (A, B) group.
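- With `as_index=False` (as used here), the output is a DataFrame with the key columns A and B plus a `size` column holding the counts; with the default `as_index=True` it would instead be a Series with a MultiIndex (A and B) and the counts as values.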

In [34]:
# Build the grouping explicitly instead of reusing `grp`: that name is bound
# twice in this notebook (once with the default as_index=True, once with
# as_index=False), and the shape of size()'s result depends on which binding
# is live when this cell runs.
# With as_index=False, size() returns a DataFrame holding the key columns
# A and B plus a `size` column with the number of rows in each group.
df.groupby(["A", "B"], as_index=False).size()

Unnamed: 0,A,B,size
0,2,1,1
1,4,3,1
2,6,5,1
3,8,7,1
4,10,9,1


# .count()
- Returns the number of non-null (non-missing) values in each column for every group.
- Unlike .size(), which counts all rows, .count() counts only non-NaN entries per column.
- The output is a DataFrame (not a Series).

In [32]:
# Group explicitly rather than relying on whichever `grp` was bound last:
# the notebook defines `grp` twice with different as_index settings, which
# changes whether A and B come back as columns or as the index.
# count() reports, for each (A, B) group, the number of non-null values in
# every other column; as_index=False keeps A and B as ordinary columns.
df.groupby(["A", "B"], as_index=False).count()

Unnamed: 0,A,B,S.No,C,D,E,Length
0,2,1,1,1,1,1,1
1,4,3,1,1,1,1,1
2,6,5,1,1,1,1,1
3,8,7,1,1,1,1,1
4,10,9,1,1,1,1,1


# Let's create new data

In [44]:
# Raw rows for a small demo frame. Each inner list is one row:
#   first element  -> value for column A
#   second element -> value for column B
l = [
    ['apple', 1],
    ['apple', 2],
    ['apple', 2],
    ['ball', 1],
    ['ball', 1],
]

# columns=["A", "B"] names the two positions of every row, so the first item
# of each inner list lands in column A and the second in column B.
df_10 = pd.DataFrame(data=l, columns=["A", "B"])

df_10

Unnamed: 0,A,B
0,apple,1
1,apple,2
2,apple,2
3,ball,1
4,ball,1


## To count the unique elements in each group

In [49]:
# Count how many distinct B values occur within each A group:
#   apple -> B values [1, 2, 2] -> distinct {1, 2} -> 2
#   ball  -> B values [1, 1]    -> distinct {1}    -> 1
(
    df_10
    .groupby("A")["B"]
    .nunique()
)

A
apple    2
ball     1
Name: B, dtype: int64