In [1]:
import numpy as np

In [2]:
import pandas as pd

### In this tutorial we cover
apply(), agg(), transform(), groupby(),
join(), merge()

#### apply(): 
With apply() we can apply a given lambda func or user defined func or numpy func to each row or column in a dataframe

In [15]:
## Create a DataFrame

data = np.random.randint(low=10, high=20, size=(3,4))
cols = ['Q'+ str(x) for x in np.arange(1,5)]
indx = list('ABC')
    
df = pd.DataFrame(data=data, columns=cols, index=indx)
df

Unnamed: 0,Q1,Q2,Q3,Q4
A,18,12,10,13
B,10,17,16,16
C,11,10,16,16


In [17]:
# Total Sales for each product in year

df.apply(sum, axis=1)

A    53
B    59
C    53
dtype: int64

In [24]:
# Total sales for all products in each Quarter

df.apply(np.sum, axis=0)

Q1    39
Q2    39
Q3    42
Q4    45
dtype: int64

In [23]:
# Find max sale for each product across all Quarters

df.apply(np.max, axis=1)

A    18
B    17
C    16
dtype: int64

In [22]:
# Avg sales for each product across all Quarters

df.apply(np.mean, axis=1)

A    13.25
B    14.75
C    13.25
dtype: float64

#### agg()
Aggregate applies one or more operations over the specified axis.

In [25]:
df

Unnamed: 0,Q1,Q2,Q3,Q4
A,18,12,10,13
B,10,17,16,16
C,11,10,16,16


In [32]:
df.agg(['min', np.mean, 'max'], axis='columns')

Unnamed: 0,min,mean,max
A,10.0,13.25,18.0
B,10.0,14.75,17.0
C,10.0,13.25,16.0


In [33]:
df.agg([np.min, np.median, np.max], axis='rows')

Unnamed: 0,Q1,Q2,Q3,Q4
amin,10.0,10.0,10.0,13.0
median,11.0,12.0,16.0,16.0
amax,18.0,17.0,16.0,16.0


#### transform()
Creates a  new DataFrame, transforming values by applying function

In [34]:
df.transform(np.sqrt)

Unnamed: 0,Q1,Q2,Q3,Q4
A,4.242641,3.464102,3.162278,3.605551
B,3.162278,4.123106,4.0,4.0
C,3.316625,3.162278,4.0,4.0


#### groupby() - split-apply-combine

In [42]:
## First create a dataframe stud

math = pd.DataFrame({'Subject':'Math',
        'Marks':[67, 76, 88, 56, 90],
        'Gender':['M','F','F','M','F']},
        index= ['Rollno-' +  str(x) for x in [1,2,3,4,5]])
science = pd.DataFrame({'Subject':'Science',
        'Marks':[70, 60, 78, 65, 87],
        'Gender':['M','F','F','M','F']},
        index= ['Rollno-' +  str(x) for x in [1,2,3,4,5]])

stud = math.append(science)

stud

Unnamed: 0,Subject,Marks,Gender
Rollno-1,Math,67,M
Rollno-2,Math,76,F
Rollno-3,Math,88,F
Rollno-4,Math,56,M
Rollno-5,Math,90,F
Rollno-1,Science,70,M
Rollno-2,Science,60,F
Rollno-3,Science,78,F
Rollno-4,Science,65,M
Rollno-5,Science,87,F


In [43]:
# Avg. marks for each subject

stud.groupby('Subject')['Marks'].mean()

Subject
Math       75.4
Science    72.0
Name: Marks, dtype: float64

In [44]:
# Avg marks by Subject and Gender

stud.groupby(['Subject', 'Gender'])['Marks'].mean()

Subject  Gender
Math     F         84.666667
         M         61.500000
Science  F         75.000000
         M         67.500000
Name: Marks, dtype: float64

In [45]:

g = stud.groupby('Subject')


In [49]:
for sub, frame in g:
    print(f'Subject: {sub}')
    print(frame)
    print('--'*10)

Subject: Math
         Subject  Marks Gender
Rollno-1    Math     67      M
Rollno-2    Math     76      F
Rollno-3    Math     88      F
Rollno-4    Math     56      M
Rollno-5    Math     90      F
--------------------
Subject: Science
          Subject  Marks Gender
Rollno-1  Science     70      M
Rollno-2  Science     60      F
Rollno-3  Science     78      F
Rollno-4  Science     65      M
Rollno-5  Science     87      F
--------------------


In [50]:
g.groups['Math']

Index(['Rollno-1', 'Rollno-2', 'Rollno-3', 'Rollno-4', 'Rollno-5'], dtype='object')

In [51]:
g.get_group('Math')

Unnamed: 0,Subject,Marks,Gender
Rollno-1,Math,67,M
Rollno-2,Math,76,F
Rollno-3,Math,88,F
Rollno-4,Math,56,M
Rollno-5,Math,90,F


In [59]:
stud.groupby(['Subject', 'Gender'])['Marks'].agg([ 'min', np.mean, 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,min,mean,max
Subject,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Math,F,76,84.666667,90
Math,M,56,61.5,67
Science,F,60,75.0,87
Science,M,65,67.5,70


### Join
Joins columns of another DataFrame

In [76]:
indx = pd.date_range(start='01-May-2020', periods=7, freq='D')
data = np.random.randint(low=10, high=20, size=len(indx))
ser = pd.Series(data, indx)
d1 = pd.DataFrame(ser, columns=['A'])
d1

Unnamed: 0,A
2020-05-01,15
2020-05-02,10
2020-05-03,19
2020-05-04,18
2020-05-05,19
2020-05-06,16
2020-05-07,18


In [79]:
indx = pd.date_range(start='03-May-2020', periods=7, freq='D')
data = np.random.randint(low=10, high=20, size=len(indx))
ser = pd.Series(data, indx)
d2 = pd.DataFrame(ser, columns=['B'])
d2

Unnamed: 0,B
2020-05-03,11
2020-05-04,17
2020-05-05,18
2020-05-06,19
2020-05-07,19
2020-05-08,17
2020-05-09,15


In [81]:
# inner join

d1.join(d2, how='inner')

Unnamed: 0,A,B
2020-05-03,19,11
2020-05-04,18,17
2020-05-05,19,18
2020-05-06,16,19
2020-05-07,18,19


In [82]:
# left join

d1.join(d2, how='left')

Unnamed: 0,A,B
2020-05-01,15,
2020-05-02,10,
2020-05-03,19,11.0
2020-05-04,18,17.0
2020-05-05,19,18.0
2020-05-06,16,19.0
2020-05-07,18,19.0


In [83]:
# right join

d1.join(d2, how='right')

Unnamed: 0,A,B
2020-05-03,19.0,11
2020-05-04,18.0,17
2020-05-05,19.0,18
2020-05-06,16.0,19
2020-05-07,18.0,19
2020-05-08,,17
2020-05-09,,15


In [84]:
# outer join

d1.join(d2, how='outer')

Unnamed: 0,A,B
2020-05-01,15.0,
2020-05-02,10.0,
2020-05-03,19.0,11.0
2020-05-04,18.0,17.0
2020-05-05,19.0,18.0
2020-05-06,16.0,19.0
2020-05-07,18.0,19.0
2020-05-08,,17.0
2020-05-09,,15.0


#### merge()

In [107]:
Emp = pd.DataFrame(data={'EmpId':[10,11,12],'DeptId':['D1','D2', 'D3']})
Dept= pd.DataFrame(data={'DeptName':['HR','IT', 'Sales'],'DeptCode':['D1','D2','D3']})

In [108]:
Emp

Unnamed: 0,EmpId,DeptId
0,10,D1
1,11,D2
2,12,D3


In [109]:
Dept

Unnamed: 0,DeptName,DeptCode
0,HR,D1
1,IT,D2
2,Sales,D3


In [110]:
Emp.merge(Dept, how='inner', left_on='DeptId', right_on='DeptCode')

Unnamed: 0,EmpId,DeptId,DeptName,DeptCode
0,10,D1,HR,D1
1,11,D2,IT,D2
2,12,D3,Sales,D3


In [113]:
Emp[Emp.merge(Dept, how='inner', left_on='DeptId', right_on='DeptCode')['DeptName'] == 'Sales']

Unnamed: 0,EmpId,DeptId
2,12,D3
