## Fixes the problem of jumbled data in concat by automatically arranging rows on a common column

In [1]:
import pandas as pd

In [2]:
# Three small per-city tables. Each one lists the cities in a different
# row order, so the rows cannot simply be lined up by position.
usa_temperature = pd.DataFrame({
    'city': ['new york', 'los angeles', 'las vegas', 'washington dc'],
    'temperature': [25, 18, 26, 17],
})

usa_humidity = pd.DataFrame({
    'city': ['los angeles', 'las vegas', 'new york', 'washington dc'],
    'humidity': [45, 34, 62, 49],
})

usa_population = pd.DataFrame({
    'city': ['new york', 'washington dc', 'las vegas', 'los angeles'],
    'population (in 10k)': [65, 32, 25, 54],
})

In [3]:
usa_temperature

Unnamed: 0,city,temperature
0,new york,25
1,los angeles,18
2,las vegas,26
3,washington dc,17


In [4]:
usa_humidity

Unnamed: 0,city,humidity
0,los angeles,45
1,las vegas,34
2,new york,62
3,washington dc,49


In [5]:
usa_population

Unnamed: 0,city,population (in 10k)
0,new york,65
1,washington dc,32
2,las vegas,25
3,los angeles,54


# on attribute
- it is the column common to both datasets, on the basis of which the datasets are merged

In [6]:
# pd.merge joins exactly two dataframes per call, so three tables need two
# merges: temperature + humidity first, then that result + population.
# Rows are matched on the shared 'city' column each time.

usa_data = pd.merge(
    pd.merge(usa_temperature, usa_humidity, on='city'),
    usa_population,
    on='city',
)
usa_data

Unnamed: 0,city,temperature,humidity,population (in 10k)
0,new york,25,62,65
1,los angeles,18,45,54
2,las vegas,26,34,25
3,washington dc,17,49,32


# suffixes attribute
- it adds custom suffixes to tell apart columns that have the same name in both datasets being merged

In [19]:
# Two mark sheets that share the column names 'name' and 'mark'.
# Some students appear in only one of the two sheets.
math_mark = pd.DataFrame({
    'name': ['henry', 'george', 'fieonna', 'sally', 'tom', 'ruck'],
    'mark': [12, 45, 78, 75, 85, 96],
})

science_mark = pd.DataFrame({
    'name': ['george', 'henry', 'tom', 'grace', 'ruck'],
    'mark': [14, 47, 25, 58, 65],
})

In [20]:
math_mark

Unnamed: 0,name,mark
0,henry,12
1,george,45
2,fieonna,78
3,sally,75
4,tom,85
5,ruck,96


In [21]:
science_mark

Unnamed: 0,name,mark
0,george,14
1,henry,47
2,tom,25
3,grace,58
4,ruck,65


In [23]:
# Both frames carry a 'mark' column that is not the join key, so merge keeps
# both copies and distinguishes them with its default suffixes:
# '_x' for the left frame and '_y' for the right frame.

df = pd.merge(math_mark, science_mark, on='name')
df

Unnamed: 0,name,mark_x,mark_y
0,henry,12,47
1,george,45,14
2,tom,85,25
3,ruck,96,65


In [24]:
# Replace the default '_x' / '_y' with readable suffixes:
# the first entry applies to the left frame, the second to the right frame.

df = pd.merge(
    math_mark,
    science_mark,
    on='name',
    suffixes=('_math', '_science'),
)
df

Unnamed: 0,name,mark_math,mark_science
0,henry,12,47
1,george,45,14
2,tom,85,25
3,ruck,96,65


## how attribute
- it selects the type of join, i.e. which keys are kept (set-like operations on the key column)

In [9]:
math_mark

Unnamed: 0,name,mark
0,henry,12
1,george,45
2,fieonna,78
3,sally,75
4,tom,85
5,ruck,96


In [10]:
science_mark

Unnamed: 0,name,mark
0,george,14
1,henry,47
2,tom,25
3,grace,58
4,ruck,65


In [25]:
# With no 'how' argument, merge keeps only the names that appear in both
# dataframes (the intersection of the keys).

marks = pd.merge(
    math_mark,
    science_mark,
    on='name',
    suffixes=('_math', '_science'),
)
marks

Unnamed: 0,name,mark_math,mark_science
0,henry,12,47
1,george,45,14
2,tom,85,25
3,ruck,96,65


In [13]:
# By default merge keeps only the keys present in both dataframes.
# The 'how' parameter selects which keys are kept
# (A = left dataframe, B = right dataframe):
#   how = 'inner' (default) -- keys found in both A and B (intersection)
#   how = 'outer' ------------ keys found in either A or B (union)
#   how = 'left'  ------------ every key of A; B's columns are NaN where B has no match
#   how = 'right' ------------ every key of B; A's columns are NaN where A has no match
# Note: 'left' / 'right' are NOT the set differences A-B / B-A;
# rows that match in both dataframes are kept as well.

# 'left' and 'right' refer to the order in which the dataframes are passed to merge:
# the first argument is the left dataframe, the second is the right one.

marks = pd.merge(
    math_mark,
    science_mark,
    on='name',
    how='outer',
    suffixes=('_math', '_science'),
)
marks

Unnamed: 0,name,mark_math,mark_science
0,henry,12.0,47.0
1,george,45.0,14.0
2,fieonna,78.0,
3,sally,75.0,
4,tom,85.0,25.0
5,ruck,96.0,65.0
6,grace,,58.0


In [14]:
# how='inner': keep only the names present in both mark sheets.
marks = pd.merge(
    math_mark,
    science_mark,
    on='name',
    how='inner',
    suffixes=('_math', '_science'),
)
marks

Unnamed: 0,name,mark_math,mark_science
0,henry,12,47
1,george,45,14
2,tom,85,25
3,ruck,96,65


In [15]:
# how='left': keep every row of math_mark (the left frame); names that are
# missing from science_mark get NaN in mark_science.
marks = pd.merge(
    math_mark,
    science_mark,
    on='name',
    how='left',
    suffixes=('_math', '_science'),
)
marks

Unnamed: 0,name,mark_math,mark_science
0,henry,12,47.0
1,george,45,14.0
2,fieonna,78,
3,sally,75,
4,tom,85,25.0
5,ruck,96,65.0


In [16]:
# how='right': keep every row of science_mark (the right frame); names that
# are missing from math_mark get NaN in mark_math.
marks = pd.merge(
    math_mark,
    science_mark,
    on='name',
    how='right',
    suffixes=('_math', '_science'),
)
marks

Unnamed: 0,name,mark_math,mark_science
0,george,45.0,14
1,henry,12.0,47
2,tom,85.0,25
3,grace,,58
4,ruck,96.0,65


## indicator attribute
- to know which dataframe each row came from, we use the indicator parameter

In [27]:
# indicator=True adds a '_merge' column describing where each row came from:
#   both       -> the name exists in both dataframes
#   left_only  -> the name exists only in the left dataframe
#   right_only -> the name exists only in the right dataframe

marks = pd.merge(
    math_mark,
    science_mark,
    on='name',
    how='outer',
    suffixes=('_math', '_science'),
    indicator=True,
)
marks

Unnamed: 0,name,mark_math,mark_science,_merge
0,henry,12.0,47.0,both
1,george,45.0,14.0,both
2,fieonna,78.0,,left_only
3,sally,75.0,,left_only
4,tom,85.0,25.0,both
5,ruck,96.0,65.0,both
6,grace,,58.0,right_only


## left_index & right_index attribute
- to use the dataframes' indexes as the join keys instead of a column

In [30]:
# Join on the row index of each frame instead of on a column.
# Rows are paired by index label (0, 1, 2, ...), not by student name, so the
# names on the two sides of a row do not have to match. No 'on' column is
# given, so both 'name' and 'mark' receive the default '_x' / '_y' suffixes.
df = pd.merge(
    science_mark,
    math_mark,
    left_index=True,
    right_index=True,
)
df

Unnamed: 0,name_x,mark_x,name_y,mark_y
0,george,14,henry,12
1,henry,47,george,45
2,tom,25,fieonna,78
3,grace,58,sally,75
4,ruck,65,tom,85
