# Merging Dataframes

Summary of pandas data structures:

- Series object: 1-dimensional, e.g. a single row or a single column
- DataFrame object: 2-dimensional, a table

Querying:
- iloc[]: for queries based on position
- loc[]: for queries based on label
- project a subset of columns using df[cols]
- filter rows using a boolean mask, e.g. df[df['Cost'] > 3]

Setting data:
- df[column] = [a,b,c]	#add new data
- df[column] = 2	#broadcasting: set default data, or overwrite

In [2]:
import pandas as pd

In [8]:
# Build the purchases table column by column. Each row is labelled with the
# store it came from, so the label 'Store 1' appears twice in the index.
df = pd.DataFrame(
    {
        'Cost': [22.5, 2.5, 5.0],
        'Item Purchased': ['Sponge', 'Kitty Litter', 'Spoon'],
        'Name': ['Chris', 'Kevyn', 'Filip'],
    },
    index=['Store 1', 'Store 1', 'Store 2'],
)
df

Unnamed: 0,Cost,Item Purchased,Name
Store 1,22.5,Sponge,Chris
Store 1,2.5,Kitty Litter,Kevyn
Store 2,5.0,Spoon,Filip


In [11]:
# Assigning a list creates a new column, one value per existing row.
df['Date'] = ['Dec 1st','Jan 1st','Mid may']
# Assigning a scalar broadcasts it: every row receives the same value.
df['Delivered'] = True
df

Unnamed: 0,Cost,Item Purchased,Name,Date,Delivered
Store 1,22.5,Sponge,Chris,Dec 1st,True
Store 1,2.5,Kitty Litter,Kevyn,Jan 1st,True
Store 2,5.0,Spoon,Filip,Mid may,True


In [16]:
# Swap the store labels for a default 0..n-1 integer index. drop=False (the
# default) keeps the old labels as an ordinary column named 'index'.
adf = df.reset_index(drop=False)
adf

Unnamed: 0,index,Cost,Item Purchased,Name,Date,Delivered
0,Store 1,22.5,Sponge,Chris,Dec 1st,True
1,Store 1,2.5,Kitty Litter,Kevyn,Jan 1st,True
2,Store 2,5.0,Spoon,Filip,Mid may,True


In [21]:
# Assigning a Series aligns on index labels, not on position: rows 0 and 2
# receive values, while row 1 has no matching label and becomes NaN.
adf['Date'] = pd.Series(['date 1', 'date 2'], index=[0, 2])
adf

Unnamed: 0,index,Cost,Item Purchased,Name,Date,Delivered
0,Store 1,22.5,Sponge,Chris,date 1,True
1,Store 1,2.5,Kitty Litter,Kevyn,,True
2,Store 2,5.0,Spoon,Filip,date 2,True


In [25]:
# Joining dataframes
# Two small tables indexed by person name. 'Sally' and 'James' appear in both,
# 'Kelly' is only in the staff table and 'Mike' only in the student table.
staff_df = pd.DataFrame({
    'Name': ['Kelly', 'Sally', 'James'],
    'Role': ['Director of HR', 'Course liasion', 'Grader'],
}).set_index('Name')
student_df = pd.DataFrame({
    'Name': ['James', 'Mike', 'Sally'],
    'School': ['Business', 'Law', 'Engineering'],
}).set_index('Name')

In [23]:
# Preview the staff table; head() returns at most the first n rows (5 by default).
staff_df.head(n=5)

Unnamed: 0_level_0,Role
Name,Unnamed: 1_level_1
Kelly,Director of HR
Sally,Course liasion
James,Grader


In [24]:
# Preview the student table; head() returns at most the first n rows (5 by default).
student_df.head(n=5)

Unnamed: 0_level_0,School
Name,Unnamed: 1_level_1
James,Business
Mike,Law
Sally,Engineering


In [30]:
# Inner join on the index: keep only the names present in both tables.
pd.merge(
    left=staff_df,
    right=student_df,
    left_index=True,
    right_index=True,
    how='inner',
)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liasion,Engineering
James,Grader,Business


In [31]:
# Outer join on the index: keep every name from either table; the side that
# has no matching row is filled with NaN.
pd.merge(
    left=staff_df,
    right=student_df,
    left_index=True,
    right_index=True,
    how='outer',
)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


In [32]:
# Left join on the index: keep every staff member (the left table) and attach
# the school where a matching student row exists, NaN otherwise.
pd.merge(
    left=staff_df,
    right=student_df,
    left_index=True,
    right_index=True,
    how='left',
)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course liasion,Engineering
James,Grader,Business


In [33]:
# Right join on the index: keep every student (the right table) and attach
# the role where a matching staff row exists, NaN otherwise.
pd.merge(
    left=staff_df,
    right=student_df,
    left_index=True,
    right_index=True,
    how='right',
)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Mike,,Law
Sally,Course liasion,Engineering


In [35]:
# Merging also works on ordinary columns. reset_index() turns 'Name' back into
# a column on both sides; on='Name' is shorthand for passing the same name to
# both left_on and right_on.
pd.merge(
    left=staff_df.reset_index(),
    right=student_df.reset_index(),
    how='inner',
    on='Name',
)

Unnamed: 0,Name,Role,School
0,Sally,Course liasion,Engineering
1,James,Grader,Business


In [36]:
# Both tables now have a 'Location' column, but it is not part of the join key
# and holds different values on each side.
staff_df = pd.DataFrame({
    'Name': ['Kelly', 'Sally', 'James'],
    'Role': ['Director of HR', 'Course liasion', 'Grader'],
    'Location': ['State Street', 'Washington Avenue', 'Washington Avenue'],
})
student_df = pd.DataFrame({
    'Name': ['James', 'Mike', 'Sally'],
    'School': ['Business', 'Law', 'Engineering'],
    'Location': ['1024 Billiard Avenue', 'Fraternity House #22', '512 Wilson Crescent'],
})
# Left join on 'Name'. pandas keeps the clashing columns apart with its default
# suffixes: Location_x comes from the left table, Location_y from the right.
pd.merge(left=staff_df, right=student_df, how='left', on='Name')

Unnamed: 0,Location_x,Name,Role,Location_y,School
0,State Street,Kelly,Director of HR,,
1,Washington Avenue,Sally,Course liasion,512 Wilson Crescent,Engineering
2,Washington Avenue,James,Grader,1024 Billiard Avenue,Business


In [37]:
# Several key columns can be used at once by passing a list. When the key
# columns have the same names on both sides, on=[...] is shorthand for passing
# that same list to both left_on and right_on.
staff_df = pd.DataFrame([{'First Name': 'Kelly', 'Last Name': 'Desjardins', 'Role': 'Director of HR'},
                         {'First Name': 'Sally', 'Last Name': 'Brooks', 'Role': 'Course liasion'},
                         {'First Name': 'James', 'Last Name': 'Wilde', 'Role': 'Grader'}])
student_df = pd.DataFrame([{'First Name': 'James', 'Last Name': 'Hammond', 'School': 'Business'},
                           {'First Name': 'Mike', 'Last Name': 'Smith', 'School': 'Law'},
                           {'First Name': 'Sally', 'Last Name': 'Brooks', 'School': 'Engineering'}])
# A row matches only when BOTH names agree: James Wilde and James Hammond share
# a first name but are not joined, so Sally Brooks is the only result.
pd.merge(staff_df, student_df, how='inner', on=['First Name', 'Last Name'])

Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liasion,Engineering


# Pandas idioms