## Group by in pandas 
In pandas, `groupby()` splits data into groups based on one or more columns,
performs an operation on each group, and then combines the results.

The `groupby()` function follows the Split–Apply–Combine approach:
- **Split** the DataFrame into groups based on the values of one or more columns
- **Apply** a function (such as sum, mean, count) to each group
- **Combine** the results into a new DataFrame or Series

In [7]:
import pandas as pd

In [9]:
# Load the student results CSV into a DataFrame and preview the first two rows.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where this exact location exists.
csv_path = "C:\\Users\\Dell\\Desktop\\_PANDAS_\\Data\\student_result1.csv"
df = pd.read_csv(csv_path)
df.head(2)

Unnamed: 0,Student ID,Section,Class,Study hrs,Social Media usage hrs,Percentage
0,1001,A,10,2,3,50
1,1002,B,10,6,2,80


### --- Basic Grouping ---
#### Create a GroupBy object based on the 'Section' column

In [14]:
# Build a DataFrameGroupBy keyed on 'Section'. No aggregation happens here:
# displaying the object only shows its repr, not the grouped rows.
gr = df.groupby('Section')
gr

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001935ADF1EE0>

In [5]:
# Mapping of each group key (a 'Section' value) to the row index labels in that group.
gr.groups 

{'A': [0, 2], 'B': [1], 'C': [3, 4]}

### --- Grouping by Multiple Columns ---
#### Group data by both 'Student ID' and 'Section' to see specific row mappings

In [6]:
# Group on the ('Student ID', 'Section') pair; each key of .groups is then a tuple
# of the two values, mapped to the row index labels that share that pair.
id_and_section = ['Student ID', 'Section']
df.groupby(id_and_section).groups

{(1001, 'A'): [0], (1002, 'B'): [1], (1003, 'A'): [2], (1004, 'C'): [3], (1005, 'C'): [4]}

### --- Iterating through Groups ---
#### Loop through each group and print the data belonging to that specific Section

In [7]:
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs, one per Section.
# Only the sub-DataFrame is printed; the key itself is not used in the loop body.
for section_key, section_rows in gr:
    print(section_rows)

   Student ID Section  Class  Study hrs  Social Media usage hrs  Percentage
0        1001       A     10          2                       3          50
2        1003       A     10          3                       2          60
   Student ID Section  Class  Study hrs  Social Media usage hrs  Percentage
1        1002       B     10          6                       2          80
   Student ID Section  Class  Study hrs  Social Media usage hrs  Percentage
3        1004       C     11          0                       1          45
4        1005       C     12          5                       2          75


### --- Selecting Specific Groups ---
#### Re-group by 'Class' and extract only the data for Class 10

In [8]:
# get_group(10) returns the rows of the group whose 'Class' key equals 10
# (a DataFrame of matching rows, not a single column).
# The result is stored under its own name so that `gr`, the 'Section' GroupBy
# created earlier in the notebook, is not overwritten with a plain DataFrame;
# otherwise re-running the `gr.groups` or group-iteration cells would fail.
class_10 = df.groupby("Class").get_group(10)
class_10

Unnamed: 0,Student ID,Section,Class,Study hrs,Social Media usage hrs,Percentage
0,1001,A,10,2,3,50
1,1002,B,10,6,2,80
2,1003,A,10,3,2,60


#### Extract records where students achieved exactly a 50% result

In [9]:
# Rows of the group whose 'Percentage' key is exactly 50.
# Stored under a descriptive name rather than reusing `gr`, so the name `gr`
# does not silently change from a GroupBy object into a DataFrame.
percentage_50 = df.groupby("Percentage").get_group(50)
percentage_50

Unnamed: 0,Student ID,Section,Class,Study hrs,Social Media usage hrs,Percentage
0,1001,A,10,2,3,50


### --- Group Aggregations ---
#### Group by Section and calculate the sum of all numerical columns

In [10]:
# Group on 'Section' and add up every remaining column within each group.
# Identifier-like columns ('Student ID', 'Class') are summed as well, so those
# totals carry no real meaning.
gr_1 = df.groupby(by="Section")
gr_1.sum()

Unnamed: 0_level_0,Student ID,Class,Study hrs,Social Media usage hrs,Percentage
Section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,2004,20,5,5,110
B,1002,10,6,2,80
C,2009,23,5,3,120


In [11]:
# Per-Section arithmetic mean of each non-key column.
gr_1.mean()

Unnamed: 0_level_0,Student ID,Class,Study hrs,Social Media usage hrs,Percentage
Section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1002.0,10.0,2.5,2.5,55.0
B,1002.0,10.0,6.0,2.0,80.0
C,1004.5,11.5,2.5,1.5,60.0


In [12]:
# Per-Section summary statistics (count, mean, std, min, quartiles, max) for each column.
# The std cells for Section B are shown blank because that group has a single row.
gr_1.describe()

Unnamed: 0_level_0,Student ID,Student ID,Student ID,Student ID,Student ID,Student ID,Student ID,Student ID,Class,Class,...,Social Media usage hrs,Social Media usage hrs,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Section,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A,2.0,1002.0,1.414214,1001.0,1001.5,1002.0,1002.5,1003.0,2.0,10.0,...,2.75,3.0,2.0,55.0,7.071068,50.0,52.5,55.0,57.5,60.0
B,1.0,1002.0,,1002.0,1002.0,1002.0,1002.0,1002.0,1.0,10.0,...,2.0,2.0,1.0,80.0,,80.0,80.0,80.0,80.0,80.0
C,2.0,1004.5,0.707107,1004.0,1004.25,1004.5,1004.75,1005.0,2.0,11.5,...,1.75,2.0,2.0,60.0,21.213203,45.0,52.5,60.0,67.5,75.0


### --- Advanced Aggregation ---

In [38]:
# Read the same CSV again and show the default preview (first five rows).
# NOTE(review): absolute, machine-specific path; same location as the initial load.
results_csv = "C:\\Users\\Dell\\Desktop\\_PANDAS_\\Data\\student_result1.csv"
df = pd.read_csv(results_csv)
df.head()

Unnamed: 0,Student ID,Section,Class,Study hrs,Social Media usage hrs,Percentage
0,1001,A,10,2,3,50
1,1002,B,10,6,2,80
2,1003,A,10,3,2,60
3,1004,C,11,0,1,45
4,1005,C,12,5,2,75


#### Apply multiple specific aggregate functions simultaneously to each group

In [14]:
# Compute several statistics per Section in one call. The result has two-level
# columns: (original column, statistic name).
stat_names = ['sum', 'max', 'mean','std']
gr_1.aggregate(stat_names)

Unnamed: 0_level_0,Student ID,Student ID,Student ID,Student ID,Class,Class,Class,Class,Study hrs,Study hrs,Study hrs,Study hrs,Social Media usage hrs,Social Media usage hrs,Social Media usage hrs,Social Media usage hrs,Percentage,Percentage,Percentage,Percentage
Unnamed: 0_level_1,sum,max,mean,std,sum,max,mean,std,sum,max,mean,std,sum,max,mean,std,sum,max,mean,std
Section,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
A,2004,1003,1002.0,1.414214,20,10,10.0,0.0,5,3,2.5,0.707107,5,3,2.5,0.707107,110,60,55.0,7.071068
B,1002,1002,1002.0,,10,10,10.0,,6,6,6.0,,2,2,2.0,,80,80,80.0,
C,2009,1005,1004.5,0.707107,23,12,11.5,0.707107,5,5,2.5,3.535534,3,2,1.5,0.707107,120,75,60.0,21.213203
