# Python 201 - Session 3: Reshaping and Merging Dataframes

# Five-Week Schedule

#### Session 1: Basic Pandas
- DataFrame, Series
- select, add, drop rows and columns
- read and write dataframes

#### Session 2: Sorting & Grouping Data
- sort_values()
- groupby()
- agg() statistics
- drop_duplicates()

#### Session 3: Shaping & Merging Dataframes
- choosing fields to join on
- types of joins: left join, inner join, outer join
- labeling columns
- Python 101 Skills Test

#### Session 4: Editing Data
- parsing, cleaning, re-typing, coding data
- apply and row-based lambda fxn
- Python 102 Skills Test

#### Session 5: Time Series Exercise
- Python 201 Skills Test

In [1]:
# troubleshooting error: install openpyxl
# pip3 install openpyxl

# Step 1 - open Plans.xlsx

In [2]:
import pandas as pd

In [55]:
plans = pd.read_excel("Plans.xlsx",header=0)

In [56]:
plans.drop(columns = ["Unnamed: 0"],inplace=True)
plans

Unnamed: 0,RoomType,RoomStats,January,February,March,April,May,June,July,August,September,October,November,December
0,Club Deluxe King Room,Number of rooms,28,28,28,28,28,28,28,28,28,28,28,28
1,Club Deluxe King Room,Sum of Revenue Total,304164,357840,445312,365792,397600,322056,500976,500976,333984,411516,357840,370160
2,Club Deluxe King Room,Sum of Revenue Rooms,214200,252000,313600,257600,280000,226800,352800,352800,235200,289800,252000,260680
3,Club Deluxe King Room,Sum of Revenue Food,64260,75600,94080,77280,84000,68040,105840,105840,70560,86940,75600,78204
4,Club Deluxe King Room,Sum of Revenue Bar,25704,30240,37632,30912,33600,27216,42336,42336,28224,34776,30240,31276
5,Club Deluxe King Room,Average of Occupancy,31,28,31,30,31,30,31,31,30,31,30,31
6,Club Deluxe King Room,Average of ADR,350,456,513,435,458,383,577,577,397,474,426,426
7,Executive Suite,Number of rooms,36,36,36,36,36,36,36,36,36,36,36,36
8,Executive Suite,Sum of Revenue Total,383400,322056,293940,345060,357840,184032,383400,383400,230040,429408,306720,371124
9,Executive Suite,Sum of Revenue Rooms,270000,226800,207000,243000,252000,129600,270000,270000,162000,302400,216000,261360


# Step 2 - in Plans, melt() months from wide to long 

melt()
- from wide to long
- https://pandas.pydata.org/docs/reference/api/pandas.melt.html

In [57]:
idvars =  ["RoomType","RoomStats"]
# valuevars = []
varname = "Month"
plans = pd.melt(plans,id_vars=idvars,var_name=varname)
plans

Unnamed: 0,RoomType,RoomStats,Month,value
0,Club Deluxe King Room,Number of rooms,January,28
1,Club Deluxe King Room,Sum of Revenue Total,January,304164
2,Club Deluxe King Room,Sum of Revenue Rooms,January,214200
3,Club Deluxe King Room,Sum of Revenue Food,January,64260
4,Club Deluxe King Room,Sum of Revenue Bar,January,25704
5,Club Deluxe King Room,Average of Occupancy,January,31
6,Club Deluxe King Room,Average of ADR,January,350
7,Executive Suite,Number of rooms,January,36
8,Executive Suite,Sum of Revenue Total,January,383400
9,Executive Suite,Sum of Revenue Rooms,January,270000


# Step 3 - in Plans, pivot() roomstats from long to wide

pivot()
- from long to wide
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html

In [58]:
plans = plans.pivot(index=["RoomType","Month"],columns="RoomStats",values="value")
plans

Unnamed: 0_level_0,RoomStats,Average of ADR,Average of Occupancy,Number of rooms,Sum of Revenue Bar,Sum of Revenue Food,Sum of Revenue Rooms,Sum of Revenue Total,Total Average of ADR,Total Average of Occupancy,Total Number of rooms,Total Sum of Revenue Bar,Total Sum of Revenue Food,Total Sum of Revenue Rooms,Total Sum of Revenue Total
RoomType,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,April,,,,,,,,333.0,30.0,200.0,125304.0,313368.0,1044560.0,1483232.0
,August,,,,,,,,386.0,31.0,200.0,157656.0,394140.0,1313800.0,1865596.0
,December,,,,,,,,430.0,31.0,200.0,151548.0,378864.0,1262880.0,1793292.0
,February,,,,,,,,386.0,28.0,200.0,131088.0,327720.0,1092400.0,1551208.0
,January,,,,,,,,386.0,31.0,200.0,135120.0,337800.0,1126000.0,1598920.0
,July,,,,,,,,386.0,31.0,200.0,154128.0,385320.0,1284400.0,1823848.0
,June,,,,,,,,386.0,30.0,200.0,108000.0,270000.0,900000.0,1278000.0
,March,,,,,,,,333.0,31.0,200.0,130056.0,325080.0,1083600.0,1538736.0
,May,,,,,,,,333.0,31.0,200.0,132418.0,331056.0,1103520.0,1566994.0
,November,,,,,,,,386.0,30.0,200.0,120960.0,302400.0,1008000.0,1431360.0


In [59]:
plans.reset_index(inplace=True)

In [60]:
pd.set_option('display.max_rows',None)
plans

RoomStats,RoomType,Month,Average of ADR,Average of Occupancy,Number of rooms,Sum of Revenue Bar,Sum of Revenue Food,Sum of Revenue Rooms,Sum of Revenue Total,Total Average of ADR,Total Average of Occupancy,Total Number of rooms,Total Sum of Revenue Bar,Total Sum of Revenue Food,Total Sum of Revenue Rooms,Total Sum of Revenue Total
0,,April,,,,,,,,333.0,30.0,200.0,125304.0,313368.0,1044560.0,1483232.0
1,,August,,,,,,,,386.0,31.0,200.0,157656.0,394140.0,1313800.0,1865596.0
2,,December,,,,,,,,430.0,31.0,200.0,151548.0,378864.0,1262880.0,1793292.0
3,,February,,,,,,,,386.0,28.0,200.0,131088.0,327720.0,1092400.0,1551208.0
4,,January,,,,,,,,386.0,31.0,200.0,135120.0,337800.0,1126000.0,1598920.0
5,,July,,,,,,,,386.0,31.0,200.0,154128.0,385320.0,1284400.0,1823848.0
6,,June,,,,,,,,386.0,30.0,200.0,108000.0,270000.0,900000.0,1278000.0
7,,March,,,,,,,,333.0,31.0,200.0,130056.0,325080.0,1083600.0,1538736.0
8,,May,,,,,,,,333.0,31.0,200.0,132418.0,331056.0,1103520.0,1566994.0
9,,November,,,,,,,,386.0,30.0,200.0,120960.0,302400.0,1008000.0,1431360.0


In [61]:
plans = plans[~plans.RoomType.isna()]
plans

RoomStats,RoomType,Month,Average of ADR,Average of Occupancy,Number of rooms,Sum of Revenue Bar,Sum of Revenue Food,Sum of Revenue Rooms,Sum of Revenue Total,Total Average of ADR,Total Average of Occupancy,Total Number of rooms,Total Sum of Revenue Bar,Total Sum of Revenue Food,Total Sum of Revenue Rooms,Total Sum of Revenue Total
12,Club Deluxe King Room,April,435.0,30.0,28.0,30912.0,77280.0,257600.0,365792.0,,,,,,,
13,Club Deluxe King Room,August,577.0,31.0,28.0,42336.0,105840.0,352800.0,500976.0,,,,,,,
14,Club Deluxe King Room,December,426.0,31.0,28.0,31276.0,78204.0,260680.0,370160.0,,,,,,,
15,Club Deluxe King Room,February,456.0,28.0,28.0,30240.0,75600.0,252000.0,357840.0,,,,,,,
16,Club Deluxe King Room,January,350.0,31.0,28.0,25704.0,64260.0,214200.0,304164.0,,,,,,,
17,Club Deluxe King Room,July,577.0,31.0,28.0,42336.0,105840.0,352800.0,500976.0,,,,,,,
18,Club Deluxe King Room,June,383.0,30.0,28.0,27216.0,68040.0,226800.0,322056.0,,,,,,,
19,Club Deluxe King Room,March,513.0,31.0,28.0,37632.0,94080.0,313600.0,445312.0,,,,,,,
20,Club Deluxe King Room,May,458.0,31.0,28.0,33600.0,84000.0,280000.0,397600.0,,,,,,,
21,Club Deluxe King Room,November,426.0,30.0,28.0,30240.0,75600.0,252000.0,357840.0,,,,,,,


In [62]:
columns_to_keep = ["RoomType",
                   "Month",
                   "Number of rooms",
                   "Sum of Revenue Bar",
                   "Sum of Revenue Food",
                   "Sum of Revenue Rooms",
                    "Sum of Revenue Total"]

In [63]:
plans = plans[columns_to_keep]
plans

RoomStats,RoomType,Month,Number of rooms,Sum of Revenue Bar,Sum of Revenue Food,Sum of Revenue Rooms,Sum of Revenue Total
12,Club Deluxe King Room,April,28.0,30912.0,77280.0,257600.0,365792.0
13,Club Deluxe King Room,August,28.0,42336.0,105840.0,352800.0,500976.0
14,Club Deluxe King Room,December,28.0,31276.0,78204.0,260680.0,370160.0
15,Club Deluxe King Room,February,28.0,30240.0,75600.0,252000.0,357840.0
16,Club Deluxe King Room,January,28.0,25704.0,64260.0,214200.0,304164.0
17,Club Deluxe King Room,July,28.0,42336.0,105840.0,352800.0,500976.0
18,Club Deluxe King Room,June,28.0,27216.0,68040.0,226800.0,322056.0
19,Club Deluxe King Room,March,28.0,37632.0,94080.0,313600.0,445312.0
20,Club Deluxe King Room,May,28.0,33600.0,84000.0,280000.0,397600.0
21,Club Deluxe King Room,November,28.0,30240.0,75600.0,252000.0,357840.0


# Step 4 - open Actuals.xlsx and rename columns for both dataframes

In [64]:
actuals = pd.read_excel("Actual_Agg.xlsx",header=0)
actuals

Unnamed: 0.1,Unnamed: 0,RoomType,Month,Reservation_Counts,DaysStayed_Counts,RevenueRoom_Sum,RevenueFood_Sum,RevenueBeverage_Sum,RevenueTotal_Sum
0,0,Club Deluxe King Room,1,135,456,205200,30271,21888,257359
1,1,Club Deluxe King Room,2,181,545,245250,32164,26281,303695
2,2,Club Deluxe King Room,3,235,727,290800,41080,41386,373266
3,3,Club Deluxe King Room,4,172,638,255200,25222,34999,315421
4,4,Club Deluxe King Room,5,169,720,288000,16293,31365,335658
5,5,Club Deluxe King Room,6,106,468,210600,6738,21657,238995
6,6,Club Deluxe King Room,7,179,785,353250,118192,48908,520350
7,7,Club Deluxe King Room,8,186,794,357300,108673,42766,508739
8,8,Club Deluxe King Room,9,141,569,227600,69920,24745,322265
9,9,Club Deluxe King Room,10,170,637,286650,86600,33078,406328


In [65]:
plans.columns

Index(['RoomType', 'Month', 'Number of rooms', 'Sum of Revenue Bar',
       'Sum of Revenue Food', 'Sum of Revenue Rooms', 'Sum of Revenue Total'],
      dtype='object', name='RoomStats')

In [66]:
actuals.columns

Index(['Unnamed: 0', 'RoomType', 'Month', 'Reservation_Counts',
       'DaysStayed_Counts', 'RevenueRoom_Sum', 'RevenueFood_Sum',
       'RevenueBeverage_Sum', 'RevenueTotal_Sum'],
      dtype='object')

In [67]:
actuals.drop(columns=["Unnamed: 0"],inplace=True)
actuals.columns

Index(['RoomType', 'Month', 'Reservation_Counts', 'DaysStayed_Counts',
       'RevenueRoom_Sum', 'RevenueFood_Sum', 'RevenueBeverage_Sum',
       'RevenueTotal_Sum'],
      dtype='object')

In [103]:
plans.columns = ["RoomType","Month","NumberOfRooms","Bar","Food","Room","Total"]
plans.columns

Index(['RoomType', 'Month', 'NumberOfRooms', 'Bar', 'Food', 'Room', 'Total'], dtype='object')

In [104]:
actuals.columns = ["RoomType","Month","Reservations","DaysStayed","Room","Food","Bar","Total"]
actuals.columns

Index(['RoomType', 'Month', 'Reservations', 'DaysStayed', 'Room', 'Food',
       'Bar', 'Total'],
      dtype='object')

# Step 5 - in Plans, recode Month from string to int

apply()
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

In [105]:
plans.dtypes

RoomType          object
Month              int64
NumberOfRooms    float64
Bar              float64
Food             float64
Room             float64
Total            float64
dtype: object

In [106]:
actuals.dtypes

RoomType        object
Month            int64
Reservations     int64
DaysStayed       int64
Room             int64
Food             int64
Bar              int64
Total            int64
dtype: object

In [107]:
monthdict = {"January":1,
            "February":2,
            "March":3,
            "April":4,
            "May":5,
            "June":6,
            "July":7,
            "August":8,
            "September":9,
            "October":10,
            "November":11,
            "December":12}

In [73]:
plans

Unnamed: 0,RoomType,Month,NumberOfRooms,Bar,Food,Rooms,Total
12,Club Deluxe King Room,April,28.0,30912.0,77280.0,257600.0,365792.0
13,Club Deluxe King Room,August,28.0,42336.0,105840.0,352800.0,500976.0
14,Club Deluxe King Room,December,28.0,31276.0,78204.0,260680.0,370160.0
15,Club Deluxe King Room,February,28.0,30240.0,75600.0,252000.0,357840.0
16,Club Deluxe King Room,January,28.0,25704.0,64260.0,214200.0,304164.0
17,Club Deluxe King Room,July,28.0,42336.0,105840.0,352800.0,500976.0
18,Club Deluxe King Room,June,28.0,27216.0,68040.0,226800.0,322056.0
19,Club Deluxe King Room,March,28.0,37632.0,94080.0,313600.0,445312.0
20,Club Deluxe King Room,May,28.0,33600.0,84000.0,280000.0,397600.0
21,Club Deluxe King Room,November,28.0,30240.0,75600.0,252000.0,357840.0


In [77]:
plans["Month"] = plans["Month"].apply(lambda row: monthdict[row])

KeyError: 4

In [79]:
plans.dtypes

RoomType          object
Month              int64
NumberOfRooms    float64
Bar              float64
Food             float64
Rooms            float64
Total            float64
dtype: object

In [80]:
actuals.dtypes

RoomType        object
Month            int64
Reservations     int64
DaysStayed       int64
Room             int64
Food             int64
Beverage         int64
Total            int64
dtype: object

# Step 6 - merge() Actuals and Plans

types of joins
- https://www.google.com/search?q=types+of+joins+sql&rlz=1C1CHBF_enUS844US844&sxsrf=ALiCzsZk3tdWCFd8EUJ_KByvyj5ARcmDlA:1669971460127&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiDyam9yNr7AhVG_SoKHXw7CTsQ_AUoAXoECAIQAw&cshid=1669971479997402&biw=1600&bih=712&dpr=1.2#imgrc=1GSEpVAJ6YPoSM

merge()
- join two tables
- types of joins: inner, left, right, outer, cross
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

In [83]:
plans.columns

Index(['RoomType', 'Month', 'NumberOfRooms', 'Bar', 'Food', 'Rooms', 'Total'], dtype='object')

In [84]:
actuals.columns

Index(['RoomType', 'Month', 'Reservations', 'DaysStayed', 'Room', 'Food',
       'Beverage', 'Total'],
      dtype='object')

In [89]:
merged = plans.merge(actuals,on=["RoomType","Month"],how="left",suffixes=("_Plan", "_Actual"))
merged

Unnamed: 0,RoomType,Month,NumberOfRooms,Bar_Plan,Food_Plan,Room_Plan,Total_Plan,Reservations,DaysStayed,Room_Actual,Food_Actual,Bar_Actual,Total_Actual
0,Club Deluxe King Room,4,28.0,30912.0,77280.0,257600.0,365792.0,172,638,255200,25222,34999,315421
1,Club Deluxe King Room,8,28.0,42336.0,105840.0,352800.0,500976.0,186,794,357300,108673,42766,508739
2,Club Deluxe King Room,12,28.0,31276.0,78204.0,260680.0,370160.0,178,509,249410,61483,25306,336199
3,Club Deluxe King Room,2,28.0,30240.0,75600.0,252000.0,357840.0,181,545,245250,32164,26281,303695
4,Club Deluxe King Room,1,28.0,25704.0,64260.0,214200.0,304164.0,135,456,205200,30271,21888,257359
5,Club Deluxe King Room,7,28.0,42336.0,105840.0,352800.0,500976.0,179,785,353250,118192,48908,520350
6,Club Deluxe King Room,6,28.0,27216.0,68040.0,226800.0,322056.0,106,468,210600,6738,21657,238995
7,Club Deluxe King Room,3,28.0,37632.0,94080.0,313600.0,445312.0,235,727,290800,41080,41386,373266
8,Club Deluxe King Room,5,28.0,33600.0,84000.0,280000.0,397600.0,169,720,288000,16293,31365,335658
9,Club Deluxe King Room,11,28.0,30240.0,75600.0,252000.0,357840.0,190,560,252000,72942,28028,352970


# Step 7 - format dataframe

In [90]:
merged.columns

Index(['RoomType', 'Month', 'NumberOfRooms', 'Bar_Plan', 'Food_Plan',
       'Room_Plan', 'Total_Plan', 'Reservations', 'DaysStayed', 'Room_Actual',
       'Food_Actual', 'Bar_Actual', 'Total_Actual'],
      dtype='object')

In [94]:
columns_reordered = ['Month', 'RoomType',
                     'NumberOfRooms', 'Reservations', 'DaysStayed', 
                     'Bar_Plan', 'Bar_Actual', 
                     'Food_Plan', 'Food_Actual',
                     'Room_Plan', 'Room_Actual',
                     'Total_Plan', 'Total_Actual']

In [95]:
merged = merged[columns_reordered]
merged

Unnamed: 0,Month,RoomType,NumberOfRooms,Reservations,DaysStayed,Bar_Plan,Bar_Actual,Food_Plan,Food_Actual,Room_Plan,Room_Actual,Total_Plan,Total_Actual
0,4,Club Deluxe King Room,28.0,172,638,30912.0,34999,77280.0,25222,257600.0,255200,365792.0,315421
1,8,Club Deluxe King Room,28.0,186,794,42336.0,42766,105840.0,108673,352800.0,357300,500976.0,508739
2,12,Club Deluxe King Room,28.0,178,509,31276.0,25306,78204.0,61483,260680.0,249410,370160.0,336199
3,2,Club Deluxe King Room,28.0,181,545,30240.0,26281,75600.0,32164,252000.0,245250,357840.0,303695
4,1,Club Deluxe King Room,28.0,135,456,25704.0,21888,64260.0,30271,214200.0,205200,304164.0,257359
5,7,Club Deluxe King Room,28.0,179,785,42336.0,48908,105840.0,118192,352800.0,353250,500976.0,520350
6,6,Club Deluxe King Room,28.0,106,468,27216.0,21657,68040.0,6738,226800.0,210600,322056.0,238995
7,3,Club Deluxe King Room,28.0,235,727,37632.0,41386,94080.0,41080,313600.0,290800,445312.0,373266
8,5,Club Deluxe King Room,28.0,169,720,33600.0,31365,84000.0,16293,280000.0,288000,397600.0,335658
9,11,Club Deluxe King Room,28.0,190,560,30240.0,28028,75600.0,72942,252000.0,252000,357840.0,352970


In [101]:
merged.sort_values(["Month","RoomType"],inplace=True)
merged

Unnamed: 0,Month,RoomType,NumberOfRooms,Reservations,DaysStayed,Bar_Plan,Bar_Actual,Food_Plan,Food_Actual,Room_Plan,Room_Actual,Total_Plan,Total_Actual
4,1,Club Deluxe King Room,28.0,135,456,25704.0,21888,64260.0,30271,214200.0,205200,304164.0,257359
16,1,Executive Suite,36.0,323,962,32400.0,44426,81000.0,105237,270000.0,288600,383400.0,438263
28,1,King Suite,44.0,633,1332,29568.0,48043,73920.0,121782,246400.0,266400,349888.0,436225
40,1,One-Bedroom Club Room,10.0,60,177,12240.0,10025,30600.0,14138,102000.0,106200,144840.0,130363
52,1,One-Bedroom Suite,6.0,40,93,9216.0,7296,23040.0,9343,76800.0,74400,109056.0,91039
64,1,Superior King Room,76.0,464,1385,25992.0,21338,64980.0,51050,216600.0,207750,307572.0,280138
3,2,Club Deluxe King Room,28.0,181,545,30240.0,26281,75600.0,32164,252000.0,245250,357840.0,303695
15,2,Executive Suite,36.0,186,736,27216.0,26622,68040.0,63158,226800.0,220800,322056.0,310580
27,2,King Suite,44.0,439,1099,23232.0,31640,58080.0,83077,193600.0,219800,274912.0,334517
39,2,One-Bedroom Club Room,10.0,53,173,12240.0,10951,30600.0,10501,102000.0,103800,144840.0,125252


# Step 8 - export merged dataframe to Merged.xlsx

In [102]:
merged.to_excel("Merged.xlsx")