# 1) Intro to Merging, Joining and Concatenating DataFrames

In [2]:
import pandas as pd

In [55]:
# Read the four restaurant CSV files used throughout this notebook.
week1, week2, customers, foods = (
    pd.read_csv('Data/Restaurant - Week 1 Sales.csv'),
    pd.read_csv('Data/Restaurant - Week 2 Sales.csv'),
    pd.read_csv('Data/Restaurant - Customers.csv'),
    pd.read_csv('Data/Restaurant - Foods.csv'),
)

-------

# 2) The `pd.concat` Method, Part 1

In [11]:
# Peek at the first three Week 1 sales rows.
week1.head(n=3)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1


In [12]:
# Peek at the first three Week 2 sales rows.
week2.head(n=3)

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7
2,495,10


## Preserving Original Index after concat

We can easily combine week1 and week2 because their columns are the same.

There are a total of 500 rows, but if we look at the index, the last label is 249. This is because pandas stacks one DataFrame on top of the other without modifying the index. Note that index labels in pandas do not need to be unique, so duplicates are allowed.
### To preserve the original index, leave `ignore_index=False` (this is the default, so it does not need to be passed explicitly). Each DataFrame keeps its original index after concatenation.

In [14]:
# Both calls stack week2 under week1 and keep each frame's own row labels;
# the second one just spells out the ignore_index setting explicitly.
# Only the last expression is rendered as the cell output.
pd.concat([week1, week2])
pd.concat([week1, week2], ignore_index=False)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
245,783,10
246,556,10
247,547,9
248,252,9


### Otherwise, we can put `ignore_index=True`

We can see that the brand new indexes are created after concat.

In [15]:
# Stack the two weeks and renumber the rows with a fresh index.
pd.concat([week1, week2], ignore_index=True)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
495,783,10
496,556,10
497,547,9
498,252,9


# Alternative method is `append()` (deprecated in pandas 1.4 and removed in pandas 2.0 — prefer `pd.concat`)

In [18]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so calling it on current pandas raises AttributeError. pd.concat is the
# supported equivalent and produces the same stacked result.
pd.concat([week1, week2])                      # was: week1.append(other=week2)
pd.concat([week1, week2], ignore_index=False)  # was: append(..., ignore_index=False)

# Only this last expression is displayed: rows renumbered with a fresh index.
pd.concat([week1, week2], ignore_index=True)   # was: append(..., ignore_index=True)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9
...,...,...
495,783,10
496,556,10
497,547,9
498,252,9


-------

# 3) The `pd.concat()` Method, Part 2

In [56]:
# Re-read the same four CSV files so this section starts from freshly loaded frames.
week1, week2, customers, foods = (
    pd.read_csv('Data/Restaurant - Week 1 Sales.csv'),
    pd.read_csv('Data/Restaurant - Week 2 Sales.csv'),
    pd.read_csv('Data/Restaurant - Customers.csv'),
    pd.read_csv('Data/Restaurant - Foods.csv'),
)

##  What if we want the best of both worlds?
+ Create Brand New indexes
+ also Retain the Original Indexes

### The answer is `Multi Index`

## `Keys` parameter
+ represents unique key identifier attached to each dataframe

Now we have unique identifier for each row, also retaining the original index.

In [27]:
# Tag each source frame with a key; the keys become the outer level of the
# row index while the original row labels are kept underneath.
sales = pd.concat([week1, week2], keys=['Week1', 'Week2'])
sales.head(n=3)

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
Week1,0,537,9
Week1,1,97,4
Week1,2,658,1


Now we can use our good old trusty `.loc` to extract information.

In [40]:
# Select by the outer key only (the 'Week1' label attached via `keys`).
sales.loc[('Week1',)]

# Select one row by (key, original row label).
sales.loc[('Week1', 0)]
# Same kind of tuple key, written without the explicit parentheses.
sales.loc['Week2', 50]

# Row tuple first, then the column(s) to pull from that row.
sales.loc[('Week1', 240), 'Customer ID']
# Only this last expression is rendered as the cell output.
sales.loc[('Week2', 183), ['Customer ID', 'Food ID']]

Customer ID    858
Food ID          5
Name: (Week2, 183), dtype: int64

-----

# 4) Inner Joins, Part 1

In [57]:
# Re-read the same four CSV files so this section starts from freshly loaded frames.
week1, week2, customers, foods = (
    pd.read_csv('Data/Restaurant - Week 1 Sales.csv'),
    pd.read_csv('Data/Restaurant - Week 2 Sales.csv'),
    pd.read_csv('Data/Restaurant - Customers.csv'),
    pd.read_csv('Data/Restaurant - Foods.csv'),
)

In [45]:
# Peek at the first two Week 1 sales rows.
week1.head(n=2)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4


In [46]:
# Peek at the first two Week 2 sales rows.
week2.head(n=2)

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7


### `on` requires the key column to have the same name in both DataFrames

In [48]:
# Inner join on Customer ID: keeps only customers present in both weeks.
pd.merge(week1, week2, how='inner', on='Customer ID').head()

Unnamed: 0,Customer ID,Food ID_x,Food ID_y
0,537,9,5
1,155,9,3
2,155,1,3
3,503,5,8
4,503,5,9


We can see that Food ID is suffixed with `_x` and `_y`, indicating the week1 (left) and week2 (right) tables.

There are also customers who appear more than once. We can dig into this.

Example: customer ID 155 came in twice in week1 and ordered a different food item each time.

**There will be no de-dup.**

**Pandas simply produces every possible combination of the matching rows.**

In [49]:
# Week 1 orders placed by customer 155.
week1.loc[week1['Customer ID'].eq(155)]

Unnamed: 0,Customer ID,Food ID
4,155,9
17,155,1


In [51]:
# Week 2 orders placed by customer 155.
week2.loc[week2['Customer ID'].eq(155)]

Unnamed: 0,Customer ID,Food ID
208,155,3


## We can customize `suffixes` too

In [52]:
# Same inner join, with custom suffixes appended to the overlapping
# 'Food ID' column names from each side.
pd.merge(
    week1,
    week2,
    how='inner',
    on='Customer ID',
    suffixes=['Week - 1', 'Week - 2'],
)

Unnamed: 0,Customer ID,Food IDWeek - 1,Food IDWeek - 2
0,537,9,5
1,155,9,3
2,155,1,3
3,503,5,8
4,503,5,9
...,...,...,...
57,945,5,4
58,343,3,5
59,343,3,2
60,343,3,7


------

# 5) Inner Joins, Part 2

In [58]:
# Re-read the same four CSV files so this section starts from freshly loaded frames.
week1, week2, customers, foods = (
    pd.read_csv('Data/Restaurant - Week 1 Sales.csv'),
    pd.read_csv('Data/Restaurant - Week 2 Sales.csv'),
    pd.read_csv('Data/Restaurant - Customers.csv'),
    pd.read_csv('Data/Restaurant - Foods.csv'),
)

In [59]:
# Peek at the first two Week 1 sales rows.
week1.head(n=2)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4


In [60]:
# Peek at the first two Week 2 sales rows.
week2.head(n=2)

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7


### Who are the customers who ordered the same food Item in both week1 and week2?

In [66]:
# Join on both columns: a row survives only when the same customer ordered
# the same food item in both weeks.
pd.merge(week1, week2, how='inner', on=['Customer ID', 'Food ID'])

Unnamed: 0,Customer ID,Food ID
0,304,3
1,540,3
2,937,10
3,233,3
4,21,4
5,21,4
6,922,1
7,578,5
8,578,5


In [67]:
# Week 1 orders placed by customer 21.
week1.loc[week1['Customer ID'].eq(21)]

Unnamed: 0,Customer ID,Food ID
101,21,4
212,21,4


In [68]:
# Week 2 orders placed by customer 21.
week2.loc[week2['Customer ID'].eq(21)]

Unnamed: 0,Customer ID,Food ID
30,21,4


------

# 6) Outer Joins

In [69]:
# Re-read the same four CSV files so this section starts from freshly loaded frames.
week1, week2, customers, foods = (
    pd.read_csv('Data/Restaurant - Week 1 Sales.csv'),
    pd.read_csv('Data/Restaurant - Week 2 Sales.csv'),
    pd.read_csv('Data/Restaurant - Customers.csv'),
    pd.read_csv('Data/Restaurant - Foods.csv'),
)

In [70]:
# Peek at the first two Week 1 sales rows.
week1.head(n=2)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4


In [71]:
# Peek at the first two Week 2 sales rows.
week2.head(n=2)

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7


In [74]:
# Outer join: keep rows from both weeks, leaving the other side's Food ID
# empty when a customer appears in only one week.
pd.merge(
    week1,
    week2,
    how='outer',
    on='Customer ID',
    suffixes=[' - Week1', ' - Week2'],
)

Unnamed: 0,Customer ID,Food ID - Week1,Food ID - Week2
0,537,9.0,5.0
1,97,4.0,
2,658,1.0,
3,202,2.0,
4,155,9.0,3.0
...,...,...,...
449,855,,4.0
450,559,,10.0
451,276,,4.0
452,556,,10.0


In [75]:
# Number of rows produced by the outer join.
len(
    pd.merge(
        week1,
        week2,
        how='outer',
        on='Customer ID',
        suffixes=[' - Week1', ' - Week2'],
    )
)

454

## `indicator`: shows where each row's data was pulled from
+ we can see which table each row came from

In [77]:
# indicator=True adds a `_merge` column recording whether each row came from
# the left frame only, the right frame only, or both.
merged = pd.merge(
    week1,
    week2,
    how='outer',
    on='Customer ID',
    suffixes=[' - Week1', ' - Week2'],
    indicator=True,
)

merged

Unnamed: 0,Customer ID,Food ID - Week1,Food ID - Week2,_merge
0,537,9.0,5.0,both
1,97,4.0,,left_only
2,658,1.0,,left_only
3,202,2.0,,left_only
4,155,9.0,3.0,both
...,...,...,...,...
449,855,,4.0,right_only
450,559,,10.0,right_only
451,276,,4.0,right_only
452,556,,10.0,right_only


In [78]:
# Count how many merged rows fall into each `_merge` category.
merged.loc[:, '_merge'].value_counts()

right_only    197
left_only     195
both           62
Name: _merge, dtype: int64

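These counts are rows of the merged result, not unique customers: 197 rows come only from Week 2, 195 rows come only from Week 1, and 62 rows are matches found in both weeks. A customer who ordered several times contributes several rows.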

In [79]:
# The three `_merge` counts add up to the total number of merged rows.
sum((197, 195, 62))

454

## Let's say we want to find the customers who visited the restaurant in either Week1 or Week2, but not in both weeks.

In [93]:
# Two equivalent ways to keep the rows that did not match on both sides.
# Option 1: exclude the 'both' category.
mask = merged['_merge'] != 'both'
merged.loc[mask]

# Option 2: list the one-sided categories explicitly (this result is displayed).
mask = merged['_merge'].isin(['left_only', 'right_only'])
merged.loc[mask]

Unnamed: 0,Customer ID,Food ID - Week1,Food ID - Week2,_merge
1,97,4.0,,left_only
2,658,1.0,,left_only
3,202,2.0,,left_only
6,213,8.0,,left_only
7,600,1.0,,left_only
...,...,...,...,...
449,855,,4.0,right_only
450,559,,10.0,right_only
451,276,,4.0,right_only
452,556,,10.0,right_only


---

# 7) Left Joins / Right Joins


In [94]:
# Re-read the same four CSV files so this section starts from freshly loaded frames.
week1, week2, customers, foods = (
    pd.read_csv('Data/Restaurant - Week 1 Sales.csv'),
    pd.read_csv('Data/Restaurant - Week 2 Sales.csv'),
    pd.read_csv('Data/Restaurant - Customers.csv'),
    pd.read_csv('Data/Restaurant - Foods.csv'),
)

In [102]:
# Peek at the first three Week 1 sales rows.
week1.head(n=3)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1


In [101]:
# Peek at the first three rows of the foods table (Food ID, Food Item, Price).
foods.head(n=3)

Unnamed: 0,Food ID,Food Item,Price
0,1,Sushi,3.99
1,2,Burrito,9.99
2,3,Taco,2.99


### Which foods did each customer order?

In [104]:
# Left join: keep every Week 1 sale and attach the matching food item and price.
pd.merge(week1, foods, how='left', on='Food ID')

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,537,9,Donut,0.99
1,97,4,Quesadilla,4.25
2,658,1,Sushi,3.99
3,202,2,Burrito,9.99
4,155,9,Donut,0.99
...,...,...,...,...
245,413,9,Donut,0.99
246,926,6,Pasta,13.99
247,134,3,Taco,2.99
248,396,6,Pasta,13.99


## `sort` parameter

In [106]:
# Same left join, with sort=True so the result is ordered by the join key.
pd.merge(week1, foods, how='left', on='Food ID', sort=True)

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,658,1,Sushi,3.99
1,600,1,Sushi,3.99
2,155,1,Sushi,3.99
3,341,1,Sushi,3.99
4,20,1,Sushi,3.99
...,...,...,...,...
245,809,10,Drink,1.75
246,584,10,Drink,1.75
247,274,10,Drink,1.75
248,151,10,Drink,1.75


In [105]:
# Right join: keep every Week 2 sale; `_merge` shows whether a Week 1 match exists.
pd.merge(
    week1,
    week2,
    how='right',
    on='Customer ID',
    suffixes=[' - Week1', ' - Week2'],
    indicator=True,
)

Unnamed: 0,Customer ID,Food ID - Week1,Food ID - Week2,_merge
0,688,,10,right_only
1,813,,7,right_only
2,495,,10,right_only
3,495,,6,right_only
4,495,,2,right_only
...,...,...,...,...
254,945,5.0,4,both
255,783,6.0,10,both
256,556,,10,right_only
257,252,,9,right_only


-------