Union == use outer join

Intersection == use inner join

In [15]:
import pandas as pd

staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},
                        {'Name': 'Sally', 'Role': 'Course liasion'},
                        {'Name': 'James', 'Role': 'Grader'}])

staff_df = staff_df.set_index('Name')

student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business'},
                          {'Name': 'Mike', 'School': 'Law'},
                          {'Name': 'Sally', 'School': 'Engineering'}])

student_df = student_df.set_index('Name')

print(staff_df.head())
print(student_df.head())

                 Role
Name                 
Kelly  Director of HR
Sally  Course liasion
James          Grader
            School
Name              
James     Business
Mike           Law
Sally  Engineering


In [16]:
# union == outer join
# pass in left and right indices as the joining columns
pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


In [17]:
# intersection == inner join
# returns those who are student AND staff
pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liasion,Engineering
James,Grader,Business


In [18]:
# left join
# get a list of all staff regardless of whether they are students or not
# but if they are students, we want to get student details as well
# 1st df == left, 2nd df == right
pd.merge(staff_df, student_df, how='left', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course liasion,Engineering
James,Grader,Business


In [19]:
# right join
pd.merge(staff_df, student_df, how='right', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Mike,,Law
Sally,Course liasion,Engineering


In [20]:
# you can use columns to join on, instead of indices

# remove index from both df
staff_df = staff_df.reset_index()
student_df = student_df.reset_index()

# merge using the on parameter
pd.merge(staff_df, student_df, how='right', on='Name')

Unnamed: 0,Name,Role,School
0,James,Grader,Business
1,Mike,,Law
2,Sally,Course liasion,Engineering


In [21]:
# What happens when we have conflict between dfs?
# Lets create new dfs with Location info
# for staff, location is office location
staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR', 'Location':'State Street'},
                        {'Name': 'Sally', 'Role': 'Course liasion', 'Location':'Washington Avenue'},
                        {'Name': 'James', 'Role': 'Grader', 'Location':'Washington Avenue'}])

# for students, location is home address
student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business', 'Location':'1024 Billiard Avenue'},
                          {'Name': 'Mike', 'School': 'Law', 'Location':'Fraternity House #22'},
                          {'Name': 'Sally', 'School': 'Engineering', 'Location':'512 Wilson Crescent'}])

pd.merge(staff_df, student_df, how='left', on='Name')

# location_x refers to location from left df
# location_y refers to location from right df

Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Director of HR,State Street,,
1,Sally,Course liasion,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grader,Washington Avenue,Business,1024 Billiard Avenue


In [23]:
# multi-indexing and multiple columns
# use a list of multiple columns that should be used to join keys from both dfs on the on parameter

# new staff and student data with last names
staff_df = pd.DataFrame([{'First Name': 'Kelly', 'Last Name':'Desjardins', 'Role': 'Director of HR'},
                        {'First Name': 'Sally', 'Last Name':'Brooks', 'Role': 'Course liasion'},
                        {'First Name': 'James','Last Name':'Wilde', 'Role': 'Grader'}])

student_df = pd.DataFrame([{'First Name': 'James', 'Last Name':'Hammond', 'School': 'Business'},
                          {'First Name': 'Mike', 'Last Name':'Smith', 'School': 'Law'},
                          {'First Name': 'Sally', 'Last Name':'Brooks', 'School': 'Engineering'}])

# James Wilde and James Hammond dont match, so inner join should not include these 2 ppl
# only Sally Brooks retained
pd.merge(staff_df, student_df, how='inner', on=['First Name', 'Last Name'])


Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liasion,Engineering


In [25]:
# merging == joining horizontally
# concatenating == joining vertically
df1 = pd.DataFrame([{'Name': 'A', 'Name': 'B', 'Name': 'C'}])
df2 = pd.DataFrame([{'Name': 'D', 'Name': 'E', 'Name': 'F'}])
df3 = pd.DataFrame([{'Name': 'G', 'Name': 'H', 'Name': 'I'}])

frames = [df1, df2, df3]
pd.concat(frames)

Unnamed: 0,Name
0,C
0,F
0,I
