# Merge 

In [1]:
# print all the outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('precision', 2) # show only two decimal digits

Load the survey data

In [2]:
df  = pd.read_csv('cleaned_survey.csv', index_col=0)

Let us assume that we also have a dataframe <i>df_programs</i>, which contains the units required to complete the graduate programs at our business school.

In [3]:
df_programs = pd.DataFrame({'Program' : ['MSIS', 'MBA', 'Master of Finance', 'Supply Chain Mgmt & Analytics', 'Master of Hacking'], 
                            'Units_required' : [51, 70, 48, 49, 100]})

In [4]:
df_programs

Unnamed: 0,Program,Units_required
0,MSIS,51
1,MBA,70
2,Master of Finance,48
3,Supply Chain Mgmt & Analytics,49
4,Master of Hacking,100


Note that Master of Hacking (unfortunately) does not actually exist... 

## Merge on columns

A Merge operation ("join" in relational DBs) consists of joining the columns of two tables based on the equality of one or more columns. For example, we can add to <i>df</i> a column <i>Units_required</i>, which reports the units required by the program in which each student is enrolled.

### INNER MERGE

Compact formulation: the merge will be performed on the columns with the same name in both tables. Merging <i>df</i> with <i>df_programs</i> will perform the merge on the column <i>Program</i>, because that is the only column with the same name.

In [5]:
df.merge(df_programs)

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,...,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,...,1.0,4,4,0,1,0,0,6.0,1,51
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0.0,2,2,0,0,0,1,4.0,1,51
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,...,1.0,3,3,0,0,1,0,3.0,1,51
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,...,1.0,2,3,0,0,0,1,5.0,1,51
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,0.0,1,1,0,0,1,0,4.0,1,51
5,0.0,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0.0,2,2,0,1,0,0,4.0,1,51
6,0.0,MSIS,2,1,0,0.0,1,0.0,0.0,0.0,...,0.0,2,2,1,0,0,0,3.0,1,51
7,0.5,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,1.0,2,1,0,0,0,1,4.0,1,51
8,0.5,MSIS,4,1,1,0.0,1,0.0,0.0,0.0,...,1.0,1,1,0,0,0,1,5.0,1,51
9,0.0,MSIS,5,1,0,0.0,1,1.0,0.0,0.0,...,0.0,1,1,0,0,1,0,4.0,1,51


Or we can specify the names of the columns with <i>left_on</i> (the column or list of columns on the "left" table) and <i>right_on</i> (the column or list of columns on the "right" table)

In [6]:
df.merge(df_programs,left_on='Program',right_on='Program')

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,...,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,...,1.0,4,4,0,1,0,0,6.0,1,51
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0.0,2,2,0,0,0,1,4.0,1,51
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,...,1.0,3,3,0,0,1,0,3.0,1,51
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,...,1.0,2,3,0,0,0,1,5.0,1,51
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,0.0,1,1,0,0,1,0,4.0,1,51
5,0.0,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0.0,2,2,0,1,0,0,4.0,1,51
6,0.0,MSIS,2,1,0,0.0,1,0.0,0.0,0.0,...,0.0,2,2,1,0,0,0,3.0,1,51
7,0.5,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,1.0,2,1,0,0,0,1,4.0,1,51
8,0.5,MSIS,4,1,1,0.0,1,0.0,0.0,0.0,...,1.0,1,1,0,0,0,1,5.0,1,51
9,0.0,MSIS,5,1,0,0.0,1,1.0,0.0,0.0,...,0.0,1,1,0,0,1,0,4.0,1,51


Because the result table is very large, we will retain only a few columns for the remainder of this notebook.

In [7]:
df.merge(df_programs)[['Program', 'ProgSkills','Languages','Expert','Units_required']]

Unnamed: 0,Program,ProgSkills,Languages,Expert,Units_required
0,MSIS,4,6.0,1,51
1,MSIS,3,4.0,1,51
2,MSIS,3,3.0,1,51
3,MSIS,3,5.0,1,51
4,MSIS,3,4.0,1,51
5,MSIS,3,4.0,1,51
6,MSIS,2,3.0,1,51
7,MSIS,3,4.0,1,51
8,MSIS,4,5.0,1,51
9,MSIS,5,4.0,1,51


### LEFT MERGE

This is the equivalent of the left outer join in relational DBs. If a row on the left table finds no match, it will still appear in the result and the missing values will be filled with NAs.

In [8]:
df.merge(df_programs, how='left')[['Program', 'ProgSkills','Languages','Expert','Units_required']]

Unnamed: 0,Program,ProgSkills,Languages,Expert,Units_required
0,MSIS,4,6.0,1,51.0
1,MSIS,3,4.0,1,51.0
2,MSIS,3,3.0,1,51.0
3,MSIS,3,5.0,1,51.0
4,MSIS,3,4.0,1,51.0
5,Supply Chain Mgmt & Analytics,1,1.0,0,49.0
6,MSIS,3,4.0,1,51.0
7,MSIS,2,3.0,1,51.0
8,MBA,1,0.0,0,70.0
9,MSIS,3,4.0,1,51.0


### RIGHT MERGE

In [9]:
df.merge(df_programs, how='right')[['Program', 'ProgSkills','Languages','Expert','Units_required']]

Unnamed: 0,Program,ProgSkills,Languages,Expert,Units_required
0,MSIS,4.0,6.0,1.0,51
1,MSIS,3.0,4.0,1.0,51
2,MSIS,3.0,3.0,1.0,51
3,MSIS,3.0,5.0,1.0,51
4,MSIS,3.0,4.0,1.0,51
5,MSIS,3.0,4.0,1.0,51
6,MSIS,2.0,3.0,1.0,51
7,MSIS,3.0,4.0,1.0,51
8,MSIS,4.0,5.0,1.0,51
9,MSIS,5.0,4.0,1.0,51


### OUTER MERGE

In [10]:
df.merge(df_programs,how='outer')[['Program', 'ProgSkills','Languages','Expert','Units_required']]

Unnamed: 0,Program,ProgSkills,Languages,Expert,Units_required
0,MSIS,4.0,6.0,1.0,51.0
1,MSIS,3.0,4.0,1.0,51.0
2,MSIS,3.0,3.0,1.0,51.0
3,MSIS,3.0,5.0,1.0,51.0
4,MSIS,3.0,4.0,1.0,51.0
5,MSIS,3.0,4.0,1.0,51.0
6,MSIS,2.0,3.0,1.0,51.0
7,MSIS,3.0,4.0,1.0,51.0
8,MSIS,4.0,5.0,1.0,51.0
9,MSIS,5.0,4.0,1.0,51.0


## Merge on Indices

Let's create a new DataFrame, called <i>df_programs_i</i>, which is a copy of <i>df_programs</i> but with <i>Program</i> being an index instead of a column.

In [11]:
df_programs_i = df_programs.set_index('Program')

In [12]:
df_programs_i

Unnamed: 0_level_0,Units_required
Program,Unnamed: 1_level_1
MSIS,51
MBA,70
Master of Finance,48
Supply Chain Mgmt & Analytics,49
Master of Hacking,100


In [13]:
df.head()

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,...,Tableau,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,...,0,1.0,4,4,0,1,0,0,6.0,1
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0,0.0,2,2,0,0,0,1,4.0,1
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,...,0,1.0,3,3,0,0,1,0,3.0,1
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,...,0,1.0,2,3,0,0,0,1,5.0,1
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,0,0.0,1,1,0,0,1,0,4.0,1


To merge <i>df</i> (left table) with <i>df_index_i</i> (right table), we need to specify that we use the index on the right table (<b>right_index = True</b>).

In [14]:
df.merge(df_programs_i,left_on='Program', right_index=True)

Unnamed: 0,Job,Program,ProgSkills,C,CPP,CS,Java,Python,JS,R,...,Regression,Classification,Clustering,Bach_0to1,Bach_1to3,Bach_3to5,Bach_5Plus,Languages,Expert,Units_required
0,0.0,MSIS,4,1,1,0.0,1,1.0,1.0,0.0,...,1.0,4,4,0,1,0,0,6.0,1,51
1,0.5,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0.0,2,2,0,0,0,1,4.0,1,51
2,0.0,MSIS,3,0,0,0.0,1,1.0,0.0,0.0,...,1.0,3,3,0,0,1,0,3.0,1,51
3,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,1.0,...,1.0,2,3,0,0,0,1,5.0,1,51
4,0.0,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,0.0,1,1,0,0,1,0,4.0,1,51
6,0.0,MSIS,3,1,1,0.0,1,0.0,0.0,0.0,...,0.0,2,2,0,1,0,0,4.0,1,51
7,0.0,MSIS,2,1,0,0.0,1,0.0,0.0,0.0,...,0.0,2,2,1,0,0,0,3.0,1,51
9,0.5,MSIS,3,1,0,0.0,1,1.0,0.0,0.0,...,1.0,2,1,0,0,0,1,4.0,1,51
12,0.5,MSIS,4,1,1,0.0,1,0.0,0.0,0.0,...,1.0,1,1,0,0,0,1,5.0,1,51
19,0.0,MSIS,5,1,0,0.0,1,1.0,0.0,0.0,...,0.0,1,1,0,0,1,0,4.0,1,51


## Problems

For each programming skills level, find the average number of units to be completed by students with that programming skill level


In [15]:
df.merge(df_programs).groupby('ProgSkills')['Units_required'].mean()

ProgSkills
1    63.33
2    57.73
3    54.39
4    53.92
5    60.50
Name: Units_required, dtype: float64

For each existing program (i.e., for each Program in df_programs), find the units required to complete it and the number of students belonging to that program that responded to the survey. 

In [16]:
df.merge(df_programs, how="right").groupby('Program').agg({'Units_required' : {'Units_required' : 'mean'},
                                                          'Languages' : {'Number of Students': 'count'}})

Unnamed: 0_level_0,Languages,Units_required
Unnamed: 0_level_1,Number of Students,Units_required
Program,Unnamed: 1_level_2,Unnamed: 2_level_2
MBA,16,70
MSIS,40,51
Master of Finance,1,48
Master of Hacking,0,100
Supply Chain Mgmt & Analytics,2,49


For each student in df_students, the number of weekly hours they are working, assuming that:
<ul>
<li>each required unit of coursework is 0.25 hours a week of work
<li>Job=0 is 0 hours a week of work
<li>Job=0.5 is 20 hours a week of work
<li>Job=1 is 40 hours a week of work
</ul>

In [17]:
d = df.merge(df_programs, how='left')
d.Units_required * .25 + (d.Job == 0.5) * 20 + (d.Job == 1) * 40

0     12.75
1     32.75
2     12.75
3     12.75
4     12.75
5     52.25
6     12.75
7     12.75
8     57.50
9     32.75
10    37.50
11    57.50
12    32.75
13    12.25
14    57.50
15    57.50
16      NaN
17    57.50
18    37.50
19    12.75
20    32.75
21    12.75
22    17.50
23    32.75
24    17.50
25    12.75
26    12.75
27    32.75
28    32.75
29    12.75
      ...  
31      NaN
32    17.50
33    12.75
34    12.75
35    12.00
36    57.50
37    12.75
38    17.50
39    12.75
40    52.75
41    12.75
42    12.75
43    12.75
44    12.75
45    52.75
46    12.75
47    12.75
48    32.75
49    57.50
50    37.50
51    12.75
52    12.75
53    32.75
54    12.75
55    32.75
56    57.50
57    12.75
58    32.75
59    12.75
60    57.50
dtype: float64