In [2]:
import pandas as pd

# Build two small lookup tables, one for staff and one for students,
# each indexed by the person's name.

staff_records = [
    {"Name": "Kelly", "Role": "Director of HR"},
    {"Name": "Sally", "Role": "Course liasion"},
    {"Name": "James", "Role": "Grader"},
]
staff_df = pd.DataFrame(staff_records).set_index("Name")

student_records = [
    {"Name": "James", "school": "Business"},
    {"Name": "Mike", "school": "Law"},
    {"Name": "Sally", "school": "Engineering"},
]
student_df = pd.DataFrame(student_records).set_index("Name")

# Print both frames so the names they have in common are easy to spot.
for frame in (staff_df, student_df):
    print(frame)

                 Role
Name                 
Kelly  Director of HR
Sally  Course liasion
James          Grader
            school
Name              
James     Business
Mike           Law
Sally  Engineering


# Union or outer join

In [3]:
# Outer join on the index: keep every name that appears in either frame.
staff_df.merge(
    student_df,
    how="outer",
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,Role,school
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


# Intersection or inner join

In [4]:
# Inner join on the index: keep only the names present in both frames.
staff_df.merge(
    student_df,
    how="inner",
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,Role,school
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liasion,Engineering
James,Grader,Business


# left / right join

Left join: we want the list of all staff members and, if they are also students, their student details as well. A right join does the opposite: it keeps every student and adds the staff details where they exist.

In [5]:
# Left join on the index: every staff row is kept; "school" is filled in
# only where the same name also appears in student_df.
staff_df.merge(
    student_df,
    how="left",
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,Role,school
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course liasion,Engineering
James,Grader,Business


In [6]:
# Right join on the index: every student row is kept; "Role" is filled in
# only where the same name also appears in staff_df.
staff_df.merge(
    student_df,
    how="right",
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,Role,school
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Business
Mike,,Law
Sally,Course liasion,Engineering


# Joining by column

We can join two dataframes on a column that both of them share. This is the most common type of join.

In [7]:
# Turn the "Name" index back into an ordinary column on both frames
# so that it can serve as the join key.
staff_df, student_df = staff_df.reset_index(), student_df.reset_index()

# Right join on the shared "Name" column: every student row is kept.
staff_df.merge(student_df, how="right", on="Name")

Unnamed: 0,Name,Role,school
0,James,Grader,Business
1,Mike,,Law
2,Sally,Course liasion,Engineering


In [8]:
# Outer join on the shared "Name" column: rows from both frames are kept.
staff_df.merge(student_df, how="outer", on="Name")

Unnamed: 0,Name,Role,school
0,Kelly,Director of HR,
1,Sally,Course liasion,Engineering
2,James,Grader,Business
3,Mike,,Law


# Joining when we have non-matching columns

In [10]:
# Rebuild both frames so that each carries a "Location" column; the column
# name is shared, but the values in it differ between the two frames.

staff_df = pd.DataFrame(
    {
        "Name": ["Kelly", "Sally", "James"],
        "Role": ["Director of HR", "Course liasion", "Grader"],
        "Location": ["State Street", "Washington Avenue", "Washington Avenue"],
    }
)

student_df = pd.DataFrame(
    {
        "Name": ["James", "Mike", "Sally"],
        "School": ["Business", "Law", "Engineering"],
        "Location": [
            "1024 Billiard Avenue",
            "Fraternity House #22",
            "512 Wilson Crescent",
        ],
    }
)

# Print both frames for reference before merging them.
for frame in (staff_df, student_df):
    print(frame)



    Name            Role           Location
0  Kelly  Director of HR       State Street
1  Sally  Course liasion  Washington Avenue
2  James          Grader  Washington Avenue
    Name       School              Location
0  James     Business  1024 Billiard Avenue
1   Mike          Law  Fraternity House #22
2  Sally  Engineering   512 Wilson Crescent


In [11]:
# Keep every staff member and attach their student details where they exist.
# Both frames have a "Location" column, so the result carries one copy from
# each side, distinguished by the _x (left) and _y (right) suffixes.

staff_df.merge(student_df, how="left", on="Name")

Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Director of HR,State Street,,
1,Sally,Course liasion,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grader,Washington Avenue,Business,1024 Billiard Avenue


# Multiple indexing

In [15]:
# New staff and student data, this time identified by a first name and a
# last name instead of a single "Name" column.

staff_df = pd.DataFrame(
    {
        "First Name": ["Kelly", "Sally", "James"],
        "Last Name": ["Desjardins", "Brooks", "Wilde"],
        "Role": ["Director of HR", "Course liasion", "Grader"],
    }
)

student_df = pd.DataFrame(
    {
        "First Name": ["James", "Mike", "Sally"],
        "Last Name": ["Hammond", "Smith", "Brooks"],
        "School": ["Business", "Law", "Engineering"],
    }
)

# Note that the two "James" rows have different last names.
print(staff_df.head())
print(student_df)

  First Name   Last Name            Role
0      Kelly  Desjardins  Director of HR
1      Sally      Brooks  Course liasion
2      James       Wilde          Grader
  First Name Last Name       School
0      James   Hammond     Business
1       Mike     Smith          Law
2      Sally    Brooks  Engineering


In [16]:
# Inner join on both name columns: a row matches only when the first name
# and the last name agree in both frames.
name_keys = ["First Name", "Last Name"]
staff_df.merge(student_df, how="inner", on=name_keys)

Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liasion,Engineering


# Merging vertically

In [17]:
%%capture # this is used to supress output

df_2004 = pd.read_csv("MERGED2004_05_PP.csv", error_bad_lines=False)
df_2005 = pd.read_csv("MERGED2005_06_PP.csv", error_bad_lines=False)
df_2006 = pd.read_csv("MERGED2006_07_PP.csv", error_bad_lines=False)

In [18]:
# Preview the first five rows of the 2004 data.
df_2004.head(n=5)

Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,
3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,,...,,,,,,,,,,
4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,,...,,,,,,,,,,


In [19]:
# Stack the three yearly frames on top of each other (row-wise).
frames = [
    df_2004,
    df_2005,
    df_2006,
]

pd.concat(objs=frames)

Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
0,100654,00100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
1,100663,00105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2,100690,02503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,
3,100706,00105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,,...,,,,,,,,,,
4,100724,00100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6843,44098901,02568108,25681,Texas Barber College - Branch Campus #1,Dallas,TX,75241,,,,...,,,,,,,,,,
6844,44098902,02568101,25681,Texas Barber College - Branch Campus #2,Dallas,TX,75228,,,,...,,,,,,,,,,
6845,44098903,02568106,25681,Texas Barber Colleges and Hairstyling Schools ...,Houston,TX,77063,,,,...,,,,,,,,,,
6846,44098904,02568107,25681,Texas Barber College - Branch Campus #5,Houston,TX,77022,,,,...,,,,,,,,,,


In [20]:
# Total number of rows across the three yearly frames.
sum(len(yearly) for yearly in (df_2004, df_2005, df_2006))

20332

In [21]:
# Passing keys to concat adds an outer index level that labels each row
# with the frame it came from, so the year of every row stays identifiable.

year_labels = ["2004", "2005", "2006"]
pd.concat(frames, keys=year_labels)

Unnamed: 0,Unnamed: 1,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,OMAWDP8_NOTFIRSTTIME_POOLED_SUPP,OMENRUP_NOTFIRSTTIME_POOLED_SUPP,OMENRYP_FULLTIME_POOLED_SUPP,OMENRAP_FULLTIME_POOLED_SUPP,OMAWDP8_FULLTIME_POOLED_SUPP,OMENRUP_FULLTIME_POOLED_SUPP,OMENRYP_PARTTIME_POOLED_SUPP,OMENRAP_PARTTIME_POOLED_SUPP,OMAWDP8_PARTTIME_POOLED_SUPP,OMENRUP_PARTTIME_POOLED_SUPP
2004,0,100654,00100200,1002,Alabama A & M University,Normal,AL,35762,,,,...,,,,,,,,,,
2004,1,100663,00105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,,...,,,,,,,,,,
2004,2,100690,02503400,25034,Amridge University,Montgomery,AL,36117-3553,,,,...,,,,,,,,,,
2004,3,100706,00105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,,...,,,,,,,,,,
2004,4,100724,00100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006,6843,44098901,02568108,25681,Texas Barber College - Branch Campus #1,Dallas,TX,75241,,,,...,,,,,,,,,,
2006,6844,44098902,02568101,25681,Texas Barber College - Branch Campus #2,Dallas,TX,75228,,,,...,,,,,,,,,,
2006,6845,44098903,02568106,25681,Texas Barber Colleges and Hairstyling Schools ...,Houston,TX,77063,,,,...,,,,,,,,,,
2006,6846,44098904,02568107,25681,Texas Barber College - Branch Campus #5,Houston,TX,77022,,,,...,,,,,,,,,,
