# Groupby Object in pandas

### Key concept of groupby: 
-> Split-Apply-Combine: The core concept behind groupby.
 - Split: Splitting the data into groups based on some criteria.
 - Apply: Applying a function to each group independently.
 - Combine: Combining the results into a data structure.

In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the students-performance dataset from the working directory.
data1 = pd.read_csv('StudentsPerformance.csv')

In [None]:
# Preview the first rows.
data1.head()

In [None]:
# Number of rows for each distinct 'race/ethnicity' value.
data1.value_counts('race/ethnicity')

In [None]:
# Column names, dtypes and non-null counts.
data1.info()

In [7]:
# Rename 'race/ethnicity' to 'groups' in place. Every later cell uses the new name,
# so the value_counts cell above fails if it is re-run after this one.
data1.rename(columns={'race/ethnicity':'groups'},inplace=True)

In [8]:
groups = data1.groupby('groups') # This creates a groupby object named groups

In [None]:
print(groups) # Address of the group.. 

In [None]:
type(groups)

In [None]:
for group,data in groups: # This way can be used to print that group data.. because it stores similiar as dictionary.
    print(group)
    print(data)
   

## Aggregation
-> Common aggregation functions:
 - sum()
 - mean()
 - count()
 - min()
 - max()
 - std() (standard deviation)
 - var() (variance)

-> Custom aggregation using agg() or aggregate()

-> Named aggregation (naming each output column while choosing its source column and function, e.g. `agg(avg_math=('math score', 'mean'))`)

In [None]:
# sum() with default arguments: applied to every non-key column.
# NOTE(review): on pandas >= 2.0 this presumably also "sums" (concatenates) the
# string columns; compare with the numeric_only=True cell below and confirm on your version.
groups.sum()

In [None]:
# sum() restricted to the numeric columns
groups.sum(numeric_only=True)

In [None]:
# mean() of each numeric column per group
groups.mean(numeric_only=True)

In [None]:
# max() of each numeric column per group
groups.max(numeric_only=True)

In [None]:
# count() -> number of non-null values in each column per group
groups.count()

### Practice:

In [None]:
# Find that group which scored highest total in math.
groups['math score'].sum().sort_values(ascending=False).head(1)


In [None]:
# Find that group which scored highest avg in writing.


groups['writing score'].mean().sort_values(ascending=False).head(1)

In [None]:
# find that parental level of education which has highest total in reading score.
data1.groupby('parental level of education')['reading score'].sum().sort_values(ascending=False).head(1)

In [None]:
# find the highest math score of each group
groups['math score'].max()

### groupby() attributes and Functions in Pandas:

 - groups
 - ngroups
 - len()
 - size()
 - indices
 - nunique()
 - dtypes
 - count()
 - first()
 - last()
 - head()
 - tail()
 - get_group()
 - describe()
 - apply()

In [None]:
# groups -> dict mapping each group key to the index labels of the rows in that group
data1.groupby('parental level of education').groups

In [None]:
# Presumably a spot-check: the education level stored at row position 12,
# to compare against the mapping shown above.
data1['parental level of education'].iloc[12]

In [None]:
# ngroups -> number of distinct groups
data1.groupby('gender').ngroups

In [None]:
# len() -> also the number of groups (same value as ngroups for that grouping)
len(data1.groupby('parental level of education'))

In [None]:
# size() -> number of rows in each group
data1.groupby('groups').size()

In [None]:
# indices -> dict mapping each group key to the integer positions of its rows
groups.indices


In [None]:
# nunique() -> number of distinct values in each column per group
groups.nunique()

In [None]:
# dtypes
# DataFrameGroupBy.dtypes is deprecated (pandas 2.1+). Every group shares the
# parent frame's column dtypes, so inspect them on the underlying DataFrame instead.
data1.dtypes

In [None]:
data1.head()

In [None]:
gen=data1.groupby('gender')
for x,y in gen:
    print(x)
    print(y.head())

In [None]:
# count
groups.count()

In [None]:
# first() -> Returns the first non-null value of each column within each group
# (not necessarily one intact row; use nth(0) for the literal first row).
groups.first()

In [None]:
# last() -> Returns the last non-null value of each column within each group
# (not necessarily one intact row; use nth(-1) for the literal last row).
groups.last()

In [None]:
# First 10 rows of the full frame, for comparison with the per-group head()/tail() results below.
data1.head(10)

In [None]:
# head(n) -> Returns the first n rows from each group.
data1.groupby('gender').head(1)

In [None]:
# tail(n) -> Returns the last n rows from each group.
data1.groupby('parental level of education').tail(2)

In [None]:
# First 2 rows of every group in the 'groups' grouping.
groups.head(2)

In [None]:
# sample() -> one randomly chosen row per group (n defaults to 1);
# the output changes between runs unless random_state is passed.
groups.sample()

In [None]:
# describe() -> summary statistics of the numeric columns for each group
groups.describe()

In [None]:
# get_group() on the gender grouping (gen is created in an earlier cell):
# all rows whose gender is 'male', then a preview of them.
male1=gen.get_group('male')
male1.head()

In [None]:
# get_group() -> all rows that belong to the given group key
groups.get_group('group A')

### Custom aggregation

In [None]:
# agg() with a dict -> a different aggregation function for each listed column
groups.agg({
    'math score': 'mean',
    'reading score': 'max',
    'writing score': 'sum',
})

### Multiple aggregations per column (list syntax)

In [None]:
# agg() -> using list (defining multiple aggregation operations at once for all the columns of group)
groups.agg(['min','max','count'])

In [None]:
# Adding both the syntax
groups.agg(
    {
        'math score':['mean','min'],
        'reading score':'std',
        'writing score':['sum','max','min']
    }
)

### Looping on groups.

In [None]:
# Walk over the (key, sub-DataFrame) pairs and print each pair with a single print call.
for label, frame in groups:
    print(label, frame)


In [None]:
# Append ' course' to the 'test preparation course' values of group A only,
# then print every group.
for group, data in groups:
    if group == 'group A':
        # The frame handed out by iteration is derived from data1; modify an
        # explicit copy so the edit never depends on view/copy behaviour and
        # cannot raise a chained-assignment warning. The printed output is unchanged.
        data = data.copy()
        data['test preparation course'] += ' course'
    print(data)

### groupby on multiple cols

In [None]:
# groupby on multiple cols
two_groups = data1.groupby(['groups','gender'])
two_groups


In [None]:
two_groups.mean(numeric_only=True)

In [None]:
two_groups['math score'].sum().sort_values(ascending=False).head(1)

In [None]:
# agg on multiple groupby
two_groups.agg(['min','max'])

### Exercise

# Merging in pandas.

1. pd.concat()
2. df1.merge(df2)

In [7]:
# First course-info table.
d1 = pd.read_csv("course_info_1.csv")

In [None]:
# Preview the first 4 rows of d1.
d1.head(4)

In [9]:
# Second course-info table.
d2 = pd.read_csv("course_info_2.csv")

In [None]:
# Preview the first 4 rows of d2.
d2.head(4)

In [None]:
# concat() -> stack the rows of d2 below the rows of d1.
# Each frame keeps its own index labels, so labels can repeat in the result.
ans = pd.concat([d1,d2])
ans

In [None]:
# keys -> label the rows coming from each input with an outer index level ('class1' / 'class2').
# This cell does not use ignore_index; pass ignore_index=True to concat instead for a fresh 0..n-1 index.
ans1 = pd.concat([d1,d2],keys=['class1','class2'])
ans1

### -> merge()
1. left join
2. right join
3. inner join
4. outer join

In [3]:
# Fee table; it is joined to the course info on 'course_id' in the cells below.
course_fees = pd.read_csv("course_fees.csv")

In [None]:
# Preview the fee table.
course_fees.head()

In [4]:
# Course info table (the same file that was loaded as d1 in the concat section).
course_info_1 = pd.read_csv("course_info_1.csv")

In [None]:
# Preview the course info table.
course_info_1.head()

In [None]:
# left join -> keep every row of course_info_1; fee columns are NaN where no course_id matches
course_info_1.merge(course_fees,how="left",on="course_id")

In [None]:
# right join -> keep every row of course_fees; info columns are NaN where no course_id matches
course_info_1.merge(course_fees,how="right",on="course_id")

In [None]:
# outer join -> keep the rows of both tables, filling NaN on whichever side has no match
course_info_1.merge(course_fees,how="outer",on="course_id")

In [None]:
# inner join -> keep only the course_id values present in both tables
course_info_1.merge(course_fees,how="inner",on="course_id")

In [61]:
# self join -- TODO: no example has been written for this cell yet

In [None]:
# Alternate syntax of merging: the module-level pd.merge(left, right, ...)
# instead of the DataFrame method left.merge(right, ...)

pd.merge(course_info_1,course_fees,how="inner",on='course_id')