In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Part 1 - Join using the index

In [34]:
# Load the orders table, indexed by order_id, with the order date parsed as a datetime.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (strict format inference is now the default) and only emits a warning.
df_orders = pd.read_csv(
    "orders.csv",
    parse_dates=["order_date_order"],
    index_col="order_id",
)
df_orders.head()

Unnamed: 0_level_0,order_date_order,customer_id,order_status
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2013-07-25,11599,CLOSED
2,2013-07-25,256,PENDING_PAYMENT
3,2013-07-25,12111,COMPLETE
4,2013-07-25,8827,CLOSED
5,2013-07-25,11318,COMPLETE


In [22]:
# Check the column types; order_date_order should come back as datetime64 via parse_dates.
df_orders.dtypes

order_date_order    datetime64[ns]
customer_id                  int64
order_status                object
dtype: object

In [35]:
# Load the order line items, indexed by (order_id, order_item_id).
# `infer_datetime_format` is intentionally not passed: no column is parsed as a
# date here, so it had no effect, and the argument is deprecated since pandas 2.0.
# NOTE(review): in the preview of this frame the values look shifted relative to
# the labels (`quantity` holds 299.98 while `product_price` holds 957) — the CSV
# header may be mislabeled; confirm against the data source.
df_orderitems = pd.read_csv(
    "order_items.csv",
    index_col=["order_id", "order_item_id"],
)
df_orderitems.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,product_id,quantity,subtotal,product_price
order_id,order_item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1,299.98,299.98,957
2,2,1,199.99,199.99,1073
2,3,5,250.0,50.0,502
2,4,1,129.99,129.99,403
4,5,2,49.98,24.99,897


In [25]:
# Check the column types pandas inferred for the line items.
df_orderitems.dtypes

product_id         int64
quantity         float64
subtotal         float64
product_price      int64
dtype: object

In [26]:
# Inner-join the orders with their line items through the indexes: the frames
# are matched on the `order_id` level they share, so orders that have no line
# items do not appear in the result.
df = pd.merge(
    left=df_orders,
    right=df_orderitems,
    how="inner",
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,order_date_order,customer_id,order_status,product_id,quantity,subtotal,product_price
order_id,order_item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,2013-07-25,11599,CLOSED,1,299.98,299.98,957
2,2,2013-07-25,256,PENDING_PAYMENT,1,199.99,199.99,1073
2,3,2013-07-25,256,PENDING_PAYMENT,5,250.0,50.0,502
2,4,2013-07-25,256,PENDING_PAYMENT,1,129.99,129.99,403
4,5,2013-07-25,8827,CLOSED,2,49.98,24.99,897


In [46]:
# Persist the merged frame; the index levels are written out as the leading columns.
df.to_csv("merged_orders.csv", index=True)

# Part 2 - Join without using the index (merge on columns)

In [38]:
# Move the index levels back into ordinary columns so the frames can be merged
# on columns. Rebinding (instead of `inplace=True`) behind a guard on the index
# names keeps this cell safe to re-run: an unguarded second `reset_index` would
# insert a spurious `index` column into each frame.
if df_orders.index.name == "order_id":
    df_orders = df_orders.reset_index()
if "order_id" in df_orderitems.index.names:
    df_orderitems = df_orderitems.reset_index()

In [39]:
# Preview the orders after the reset: order_id is now an ordinary column.
df_orders.head()

Unnamed: 0,order_id,order_date_order,customer_id,order_status
0,1,2013-07-25,11599,CLOSED
1,2,2013-07-25,256,PENDING_PAYMENT
2,3,2013-07-25,12111,COMPLETE
3,4,2013-07-25,8827,CLOSED
4,5,2013-07-25,11318,COMPLETE


In [40]:
# Preview the line items after the reset: order_id and order_item_id are now ordinary columns.
df_orderitems.head()

Unnamed: 0,order_id,order_item_id,product_id,quantity,subtotal,product_price
0,1,1,1,299.98,299.98,957
1,2,2,1,199.99,199.99,1073
2,2,3,5,250.0,50.0,502
3,2,4,1,129.99,129.99,403
4,4,5,2,49.98,24.99,897


In [41]:
# Join on the shared `order_id` column. The key and join type are spelled out:
# without `on=`, merge uses every column name the two frames have in common, so
# an unrelated shared column would silently become part of the join key.
df2 = pd.merge(df_orders, df_orderitems, on="order_id", how="inner")

In [42]:
# Preview the result of the column-based join.
df2.head()

Unnamed: 0,order_id,order_date_order,customer_id,order_status,order_item_id,product_id,quantity,subtotal,product_price
0,1,2013-07-25,11599,CLOSED,1,1,299.98,299.98,957
1,2,2013-07-25,256,PENDING_PAYMENT,2,1,199.99,199.99,1073
2,2,2013-07-25,256,PENDING_PAYMENT,3,5,250.0,50.0,502
3,2,2013-07-25,256,PENDING_PAYMENT,4,1,129.99,129.99,403
4,4,2013-07-25,8827,CLOSED,5,2,49.98,24.99,897


In [44]:
# Rebuild the (order_id, order_item_id) MultiIndex on the column-based join result.
# Rebinding (instead of `inplace=True`) behind a guard on the current index keeps
# the cell re-runnable: an unguarded second `set_index` would raise KeyError
# because the key columns have already been moved into the index.
if list(df2.index.names) != ["order_id", "order_item_id"]:
    df2 = df2.set_index(["order_id", "order_item_id"])
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,order_date_order,customer_id,order_status,product_id,quantity,subtotal,product_price
order_id,order_item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,2013-07-25,11599,CLOSED,1,299.98,299.98,957
2,2,2013-07-25,256,PENDING_PAYMENT,1,199.99,199.99,1073
2,3,2013-07-25,256,PENDING_PAYMENT,5,250.0,50.0,502
2,4,2013-07-25,256,PENDING_PAYMENT,1,129.99,129.99,403
4,5,2013-07-25,8827,CLOSED,2,49.98,24.99,897
