In [1]:
import pandas as pd

# Create dataset

In [2]:
# Fruit records: every entry has a name and an emoji image.
orchard = [
    {"name": label, "image": emoji}
    for label, emoji in (
        ("apple", "🍏"),
        ("pear", "🍐"),
        ("tomato", "🍅"),
        ("lemon", "🍋"),
    )
]

# Vegetable records carry an extra "color" field.
# "tomato" is deliberately present in both lists so the joins have one match.
garden = [
    {"name": label, "color": shade, "image": emoji}
    for label, shade, emoji in (
        ("tomato", "red", "🍅"),
        ("potato", "brown", "🥔"),
        ("carrot", "orange", "🥕"),
    )
]

# One dataframe per record list; columns follow the key order of the records.
fruits, veggies = pd.DataFrame(orchard), pd.DataFrame(garden)

In [3]:
fruits

Unnamed: 0,name,image
0,apple,🍏
1,pear,🍐
2,tomato,🍅
3,lemon,🍋


In [4]:
veggies

Unnamed: 0,name,color,image
0,tomato,red,🍅
1,potato,brown,🥔
2,carrot,orange,🥕


# Join: ```concat()```

## Combining two dataframes along the row axis. 

- You can concatenate dataframes that have different sets of columns. By default, values for a column that is missing from one of the dataframes are filled with NaN.

In [5]:
pd.concat([fruits, veggies])

Unnamed: 0,name,image,color
0,apple,🍏,
1,pear,🍐,
2,tomato,🍅,
3,lemon,🍋,
0,tomato,🍅,red
1,potato,🥔,brown
2,carrot,🥕,orange


In [6]:
pd.concat([veggies, fruits])

Unnamed: 0,name,color,image
0,tomato,red,🍅
1,potato,brown,🥔
2,carrot,orange,🥕
0,apple,,🍏
1,pear,,🍐
2,tomato,,🍅
3,lemon,,🍋


In [7]:
pd.concat?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mobjs[0m[0;34m:[0m [0;34m'Iterable[NDFrame] | Mapping[Hashable, NDFrame]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0;34m'Axis'[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjoin[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'outer'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeys[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevels[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverify_integrity[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[

## Concatenate along the Column Axis

- This places the two dataframes next to one another, aligned on their index.
- Rows that are missing from one of the dataframes are filled with NaN.

In [8]:
pd.concat([fruits, veggies], axis="columns")

Unnamed: 0,name,image,name.1,color,image.1
0,apple,🍏,tomato,red,🍅
1,pear,🍐,potato,brown,🥔
2,tomato,🍅,carrot,orange,🥕
3,lemon,🍋,,,


## Marking your combined dataframes with Keys

- Notice that in the column-wise concatenation above, the column names `name` and `image` are repeated, with nothing to tell whether a column belongs to the fruits or to the veggies.
  
- The way to handle this is to assign a key to each dataframe with the `keys` parameter.

In [9]:
pd.concat([fruits, veggies], axis="columns", keys=["fruits", "veggies"])

Unnamed: 0_level_0,fruits,fruits,veggies,veggies,veggies
Unnamed: 0_level_1,name,image,name,color,image
0,apple,🍏,tomato,red,🍅
1,pear,🍐,potato,brown,🥔
2,tomato,🍅,carrot,orange,🥕
3,lemon,🍋,,,


In [10]:
pd.concat([fruits, veggies], axis="rows", keys=["fruits", "veggies"])

Unnamed: 0,Unnamed: 1,name,image,color
fruits,0,apple,🍏,
fruits,1,pear,🍐,
fruits,2,tomato,🍅,
fruits,3,lemon,🍋,
veggies,0,tomato,🍅,red
veggies,1,potato,🥔,brown
veggies,2,carrot,🥕,orange


## Access dataframes with multi-index

In [11]:
single = pd.concat([fruits, veggies])

In [12]:
single

Unnamed: 0,name,image,color
0,apple,🍏,
1,pear,🍐,
2,tomato,🍅,
3,lemon,🍋,
0,tomato,🍅,red
1,potato,🥔,brown
2,carrot,🥕,orange


In [13]:
# Label-based slicing on `single` is left commented out because it fails:
# the plain concatenation kept each dataframe's own row labels, so 0, 1 and 2
# each occur twice and pandas cannot resolve the slice bounds 1:2.
# single.loc[1:2, :]  # KeyError

In [14]:
multivitamin = pd.concat([fruits, veggies], keys=["fruits", "veggies"])

In [15]:
multivitamin

Unnamed: 0,Unnamed: 1,name,image,color
fruits,0,apple,🍏,
fruits,1,pear,🍐,
fruits,2,tomato,🍅,
fruits,3,lemon,🍋,
veggies,0,tomato,🍅,red
veggies,1,potato,🥔,brown
veggies,2,carrot,🥕,orange


In [16]:
multivitamin.loc[("fruits", 1):("fruits", 2), :]

Unnamed: 0,Unnamed: 1,name,image,color
fruits,1,pear,🍐,
fruits,2,tomato,🍅,


In [17]:
multivitamin.loc[("veggies", 1):("veggies", 2), :]

Unnamed: 0,Unnamed: 1,name,image,color
veggies,1,potato,🥔,brown
veggies,2,carrot,🥕,orange


## Recreate a new index after concatenation

In [18]:
pd.concat([fruits, veggies], ignore_index=True)

Unnamed: 0,name,image,color
0,apple,🍏,
1,pear,🍐,
2,tomato,🍅,
3,lemon,🍋,
4,tomato,🍅,red
5,potato,🥔,brown
6,carrot,🥕,orange


In [19]:
pd.concat([fruits, veggies], axis="columns", ignore_index=True)

Unnamed: 0,0,1,2,3,4
0,apple,🍏,tomato,red,🍅
1,pear,🍐,potato,brown,🥔
2,tomato,🍅,carrot,orange,🥕
3,lemon,🍋,,,


## Joining columns when concatenating on rows

### Outer join

In [20]:
pd.concat([fruits, veggies], axis="rows", join="outer")

Unnamed: 0,name,image,color
0,apple,🍏,
1,pear,🍐,
2,tomato,🍅,
3,lemon,🍋,
0,tomato,🍅,red
1,potato,🥔,brown
2,carrot,🥕,orange


### Inner join

In [21]:
pd.concat([fruits, veggies], axis="rows", join="inner")

Unnamed: 0,name,image
0,apple,🍏
1,pear,🍐
2,tomato,🍅
3,lemon,🍋
0,tomato,🍅
1,potato,🥔
2,carrot,🥕


## Joining rows when concatenating on columns

In [22]:
pd.concat([fruits, veggies], axis="columns", join="inner")

Unnamed: 0,name,image,name.1,color,image.1
0,apple,🍏,tomato,red,🍅
1,pear,🍐,potato,brown,🥔
2,tomato,🍅,carrot,orange,🥕


# Join: ```merge()```

- `merge()` is primarily designed for joining two dataframes on shared columns or on their indexes (database-style joins).

In [23]:
pd.merge(fruits, veggies)

Unnamed: 0,name,image,color
0,tomato,🍅,red


In [24]:
pd.merge?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mmerge[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mleft[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhow[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'inner'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mon[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_on[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_on[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m

### Inner join

In [25]:
pd.merge(fruits, veggies, how="inner")

Unnamed: 0,name,image,color
0,tomato,🍅,red


### Outer join

In [26]:
pd.merge(fruits, veggies, how="outer")

Unnamed: 0,name,image,color
0,apple,🍏,
1,pear,🍐,
2,tomato,🍅,red
3,lemon,🍋,
4,potato,🥔,brown
5,carrot,🥕,orange


### Left join

In [27]:
pd.merge(fruits, veggies, how="left")

Unnamed: 0,name,image,color
0,apple,🍏,
1,pear,🍐,
2,tomato,🍅,red
3,lemon,🍋,


### Right join

In [28]:
pd.merge(fruits, veggies, how="right")

Unnamed: 0,name,image,color
0,tomato,🍅,red
1,potato,🥔,brown
2,carrot,🥕,orange


In [29]:
veggies

Unnamed: 0,name,color,image
0,tomato,red,🍅
1,potato,brown,🥔
2,carrot,orange,🥕


In [30]:
fruits["taste"] = ["sweet", "sweet", "sour", "sour"]

In [31]:
fruits

Unnamed: 0,name,image,taste
0,apple,🍏,sweet
1,pear,🍐,sweet
2,tomato,🍅,sour
3,lemon,🍋,sour


In [32]:
pd.merge(fruits, veggies, how="right")

Unnamed: 0,name,image,taste,color
0,tomato,🍅,sour,red
1,potato,🥔,,brown
2,carrot,🥕,,orange


In [33]:
fruits = fruits.drop("taste", axis="columns")

In [34]:
fruits

Unnamed: 0,name,image
0,apple,🍏
1,pear,🍐
2,tomato,🍅
3,lemon,🍋


### Specify the Join columns explicitly

In [35]:
pd.merge(fruits, veggies, on=["name", "image"])

Unnamed: 0,name,image,color
0,tomato,🍅,red


In [36]:
pd.merge(fruits, veggies, on="name")

Unnamed: 0,name,image_x,color,image_y
0,tomato,🍅,red,🍅


In [37]:
pd.merge(fruits, veggies, on="image")

Unnamed: 0,name_x,image,name_y,color
0,tomato,🍅,tomato,red


The following doesn't work because the `color` column doesn't exist in BOTH dataframes. Any column passed to the `on` parameter must exist in both dataframes.

In [38]:
# "color" is a column of `veggies` only, so it cannot be used as a shared
# join key. Catch the resulting KeyError and display it, so that the failure
# is demonstrated without stopping a Restart & Run All of the notebook.
try:
    pd.merge(fruits, veggies, on="color")
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'color'

In [39]:
pd.merge?

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mmerge[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mleft[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhow[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'inner'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mon[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_on[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_on[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m

In [40]:
fruits

Unnamed: 0,name,image
0,apple,🍏
1,pear,🍐
2,tomato,🍅
3,lemon,🍋


In [41]:
veggies

Unnamed: 0,name,color,image
0,tomato,red,🍅
1,potato,brown,🥔
2,carrot,orange,🥕


### Joining using Index columns

In [42]:
pd.merge(fruits, veggies, left_index=True, right_index=True)

Unnamed: 0,name_x,image_x,name_y,color,image_y
0,apple,🍏,tomato,red,🍅
1,pear,🍐,potato,brown,🥔
2,tomato,🍅,carrot,orange,🥕


In [43]:
# Rebuild the fruit records, this time with an integer "amount" per fruit.
# The amounts (1, 3, 0, 1) are used further down as a join key.
orchard = [
    {"name": label, "image": emoji, "amount": count}
    for label, emoji, count in (
        ("apple", "🍏", 1),
        ("pear", "🍐", 3),
        ("tomato", "🍅", 0),
        ("lemon", "🍋", 1),
    )
]

# Vegetable records: name, color and emoji image.
garden = [
    {"name": label, "color": shade, "image": emoji}
    for label, shade, emoji in (
        ("tomato", "red", "🍅"),
        ("potato", "brown", "🥔"),
        ("carrot", "orange", "🥕"),
    )
]

# Rebind both dataframes to the freshly built records.
fruits, veggies = pd.DataFrame(orchard), pd.DataFrame(garden)

In [44]:
fruits

Unnamed: 0,name,image,amount
0,apple,🍏,1
1,pear,🍐,3
2,tomato,🍅,0
3,lemon,🍋,1


In [52]:
veggies

Unnamed: 0,name,color,image
0,tomato,red,🍅
1,potato,brown,🥔
2,carrot,orange,🥕


In [53]:
pd.merge(fruits, veggies, left_on="amount", right_index=True)

Unnamed: 0,name_x,image_x,amount,name_y,color,image_y
0,apple,🍏,1,potato,brown,🥔
3,lemon,🍋,1,potato,brown,🥔
2,tomato,🍅,0,tomato,red,🍅


### Customize column suffixes

In [55]:
pd.merge(fruits, veggies, left_on="amount", right_index=True, suffixes=["_f", "_v"])

Unnamed: 0,name_f,image_f,amount,name_v,color,image_v
0,apple,🍏,1,potato,brown,🥔
3,lemon,🍋,1,potato,brown,🥔
2,tomato,🍅,0,tomato,red,🍅


### Cross join

In [58]:
pd.merge(fruits, veggies, how="cross", suffixes=["_fruits", "_veggies"])

Unnamed: 0,name_fruits,image_fruits,amount,name_veggies,color,image_veggies
0,apple,🍏,1,tomato,red,🍅
1,apple,🍏,1,potato,brown,🥔
2,apple,🍏,1,carrot,orange,🥕
3,pear,🍐,3,tomato,red,🍅
4,pear,🍐,3,potato,brown,🥔
5,pear,🍐,3,carrot,orange,🥕
6,tomato,🍅,0,tomato,red,🍅
7,tomato,🍅,0,potato,brown,🥔
8,tomato,🍅,0,carrot,orange,🥕
9,lemon,🍋,1,tomato,red,🍅


In [59]:
pd.merge(fruits, veggies)

Unnamed: 0,name,image,amount,color
0,tomato,🍅,0,red


In [60]:
fruits

Unnamed: 0,name,image,amount
0,apple,🍏,1
1,pear,🍐,3
2,tomato,🍅,0
3,lemon,🍋,1


### Practice: Add a new entry (row) to the fruits dataframe

In [61]:
# Wrap the new record in a one-row dataframe, then append it row-wise.
# ignore_index=True renumbers the result, so the new row gets the next label.
orange_row = pd.DataFrame([{"name": "orange", "image": "🍊", "amount": 10}])
fruits = pd.concat([fruits, orange_row], ignore_index=True)

In [62]:
fruits

Unnamed: 0,name,image,amount
0,apple,🍏,1
1,pear,🍐,3
2,tomato,🍅,0
3,lemon,🍋,1
4,orange,🍊,10


In [63]:
veggies

Unnamed: 0,name,color,image
0,tomato,red,🍅
1,potato,brown,🥔
2,carrot,orange,🥕


### Practice: Merge on differently named columns with `left_on` and `right_on`

In [64]:
pd.merge(fruits, veggies, left_on="name", right_on="color")

Unnamed: 0,name_x,image_x,amount,name_y,color,image_y
0,orange,🍊,10,carrot,orange,🥕


In [65]:
pd.merge(fruits, veggies, left_on="image", right_on="name")

Unnamed: 0,name_x,image_x,amount,name_y,color,image_y


In [67]:
# Outer join on the columns the two dataframes share.
# Rows that exist only in `veggies` have no "amount", so that column ends up
# holding NaN and is upcast from int64 to float64 in the result.
merged = fruits.merge(veggies, how="outer")

In [68]:
merged.dtypes

name       object
image      object
amount    float64
color      object
dtype: object

In [69]:
fruits.dtypes

name      object
image     object
amount     int64
dtype: object

In [70]:
merged

Unnamed: 0,name,image,amount,color
0,apple,🍏,1.0,
1,pear,🍐,3.0,
2,tomato,🍅,0.0,red
3,lemon,🍋,1.0,
4,orange,🍊,10.0,
5,potato,🥔,,brown
6,carrot,🥕,,orange


In [72]:
pd.merge(fruits, veggies).dtypes

name      object
image     object
amount     int64
color     object
dtype: object