In [46]:
import os
import pandas as pd
import numpy as np
import seasonal
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import gmean
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine write-to-a-slice mistakes in later cells.
pd.options.mode.chained_assignment = None
# Seaborn style: "whitegrid" theme with seaborn's color codes enabled.
sns.set(style="whitegrid", color_codes=True)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Directory holding the external Excel inputs, relative to the notebook's
# working directory: ../data/external
data_location = os.path.join("..","data","external")

#######################################################
# Helpful Functions I may need

def getFiscalYear(dt):
    """Return the fiscal year that the date ``dt`` falls in.

    A fiscal year is labelled by the calendar year in which it ends:
    July through December roll forward to the following year, while
    January through June keep their calendar year.

    dt -- any object exposing ``year`` and ``month`` attributes
          (for example ``datetime.date`` or ``pandas.Timestamp``).
    """
    rolls_forward = dt.month > 6
    return dt.year + 1 if rolls_forward else dt.year

def wavg(group, weight_column, value_column):
    """Weighted average of ``group[value_column]`` using ``group[weight_column]``.

    group         -- DataFrame (typically one groupby group).
    weight_column -- name of the column holding the weights.
    value_column  -- name of the column holding the values to average.

    Returns sum(value * weight) / sum(weight). When the weights sum to zero
    the plain (unweighted) mean of the values is returned instead.
    """
    values = group[value_column]
    weights = group[weight_column]
    total_weight = weights.sum()
    # pandas/numpy sums do not raise ZeroDivisionError when divided by zero
    # (they yield NaN/inf), so the zero-weight case has to be tested explicitly.
    if total_weight == 0:
        return values.mean()
    return (values * weights).sum() / total_weight


#######################################################
# Create Lists of Columns for Subsetting Data for Analysis
# Create Mapping of Column Names for Cleanliness
#


# The hotel-metric columns are named "<metric>-<segment>" in the workbooks
# (e.g. 'Occ-Trans.') and are renamed to "<METRIC>_<SEGMENT>" (e.g. 'OCC_TRANS').
# Every metric/segment combination is used, so the names are generated from
# the two small tables below instead of being spelled out twice.
_oar_metrics = [('Occ', 'OCC'), ('ADR', 'ADR'), ('RevPAR', 'RevPAR')]
_oar_segments = [('Trans.', 'TRANS'), ('Grp.', 'GRP'), ('Cont.', 'CONT'), ('Total', 'TOTAL')]

# (raw column name, clean column name) pairs, metric-major:
# Occ-Trans., Occ-Grp., Occ-Cont., Occ-Total, ADR-Trans., ..., RevPAR-Total
_oar_pairs = [
    (_raw_metric + '-' + _raw_segment, _clean_metric + '_' + _clean_segment)
    for _raw_metric, _clean_metric in _oar_metrics
    for _raw_segment, _clean_segment in _oar_segments
]
_oar_raw_columns = [_raw for _raw, _clean in _oar_pairs]

# Columns kept from the comp-set sheet, and their clean names.
compset_oar_list = ['Date', 'Region'] + _oar_raw_columns
compset_oar_mapping = dict(_oar_pairs)

# Columns kept from the Monterey County sheet, and their clean names
# (a separate dict object with the same contents as the comp-set mapping).
m_cnty_oar_list = ['Year', 'Month'] + _oar_raw_columns
m_cnty_oar_mapping = dict(_oar_pairs)

# Alternate region labels -> the single spelling used throughout the analysis.
# Applied to the 'Region' column with DataFrame.replace.
region_mapping = {
    'Santa Barbara/Santa Maria, CA': 'Santa Barbara County, CA',
    'San Jose/Santa Cruz, CA': 'San Jose-Santa Cruz, CA',
    'Napa Valley, CA': 'Napa County, CA',
}

# Regions filtered out of the combined data set.
# NOTE(review): this label has no ', CA' suffix, unlike every other region
# name in this cell -- confirm it matches the values in the data.
region_exclude = ['Long Beach/Torrance']

# Region groupings (presumably by market size; not referenced in the cells shown here).
# NOTE(review): 'Santa Barbara, CA' differs from the 'Santa Barbara County, CA'
# spelling produced by region_mapping and used in comp_set_amalgam -- confirm
# which label is intended.
small_group = [
    'Napa County, CA',
    'Santa Barbara, CA',
    'Sonoma County, CA',
    'South Lake Tahoe, CA',
    'Laguna Beach, CA',
    'Newport Beach/Dana Point, CA',
]

large_group = [
    'San Diego, CA',
    'San Francisco/San Mateo, CA',
    'Palm Springs, CA',
    'San Jose-Santa Cruz, CA',
]

# The ten regions listed as the combined competitive set.
comp_set_amalgam = [
    'Napa County, CA',
    'Laguna Beach, CA',
    'Palm Springs, CA',
    'San Diego, CA',
    'Santa Barbara County, CA',
    'San Francisco/San Mateo, CA',
    'Sonoma County, CA',
    'South Lake Tahoe, CA',
    'San Jose-Santa Cruz, CA',
    'Newport Beach/Dana Point, CA',
]

# Load the raw Excel inputs from data_location.
#
# The sheet name is passed positionally (second argument of pd.read_excel):
# the keyword was spelled `sheetname` in older pandas and `sheet_name` in newer
# releases, and the positional form is accepted by both.

# Monterey County STR workbook.
# TODO: document the Monterey County workbook here.
file_path = os.path.join(data_location, "Monterey-County-City-STR-ALL.xlsx")
m_cnty = pd.read_excel(file_path, "County of Monterey 05-16 Seg")

# Competitive-set workbook: hotel metrics sheet and supply/demand/revenue sheet.
file_path = os.path.join(data_location, "Comp-Set-STR-ALL.xlsx")
compset_hm = pd.read_excel(file_path, "Comp-Set OCC-ADR-RevPARv2")
compset_econ = pd.read_excel(file_path, "Comp-Set S-D-Rv3")

# Budget workbook ("income" sheet).
file_path = os.path.join(data_location, "TID_BUDGET.xlsx")
budget = pd.read_excel(file_path, "income")

# Helper table of Region/Year rows, used to even out the number of years per
# region for the Excel visualisation.
file_path = os.path.join(data_location, "Region-Years.xlsx")
region_years = pd.read_excel(file_path, "Regions")



#### Demand-Revenue Columns Mappings

# Raw supply/demand/revenue column headers -> the short codes used in later cells.
# NOTE(review): 'Revnue-Grp.' looks misspelled; presumably it mirrors the header
# as it appears in the workbook -- confirm against the sheet before changing it.
S_D_R_mapping = {
    'Demand-Trans.': 'D-TRANS',
    'Demand-Grp.': 'D-GRP',
    'Demand-Con.': 'D-CON',
    'Demand-Total': 'D-TOTAL',
    'Revenue-Trans.': 'R-TRANS',
    'Revnue-Grp.': 'R-GRP',
    'Revenue-Con.': 'R-CON',
    'Revenue-Total': 'R-TOTAL',
    'Supply-Total': 'S-TOTAL',
}

# Columns (after renaming) kept from the supply/demand/revenue sheet, in order.
S_D_R_cols = [
    'Year', 'Region', 'Month', 'Date',
    'S-TOTAL', 'R-TOTAL', 'D-TOTAL',
    'D-GRP', 'D-TRANS', 'D-CON',
    'R-GRP', 'R-TRANS', 'R-CON',
]

In [48]:
###################################################
# Comp-set supply/demand/revenue sheet: rename the columns to their short
# codes, keep only the S_D_R_cols subset, and rebuild 'Date' from Year + Month.
# Total supply is what we need later to weight the annual averages.
compset_econ = compset_econ.rename(columns=S_D_R_mapping)
compset_econ_v1 = compset_econ[S_D_R_cols]

# Year and Month become strings so they can be concatenated into "YYYY-M".
for _label in ('Month', 'Year'):
    compset_econ_v1[_label] = compset_econ_v1[_label].apply(str)
compset_econ_v1['Date'] = pd.to_datetime(
    compset_econ_v1['Year'] + "-" + compset_econ_v1['Month'],
    format='%Y-%m',
)

# Strip the '+' marker from region labels first, then apply the canonical names.
compset_econ_v1['Region'] = compset_econ_v1['Region'].str.replace('+', '')
compset_econ_v1 = compset_econ_v1.replace({'Region': region_mapping})

###################################################
# Monterey County sheet: keep the hotel-metric columns, rename them, and
# derive Date / numeric Month / Region.
m_cnty_v1 = m_cnty[m_cnty_oar_list]
m_cnty_v2 = m_cnty_v1.rename(columns=m_cnty_oar_mapping)

# Year and Month become strings so they can be joined into "YYYY-Mon".
for _label in ('Month', 'Year'):
    m_cnty_v2[_label] = m_cnty_v2[_label].apply(str)

# '%b' parses an abbreviated month name (e.g. "Jan").
m_cnty_v2['Date'] = pd.to_datetime(m_cnty_v2['Year'] + "-" + m_cnty_v2['Month'], format="%Y-%b")

# Replace the month label with the month number taken from the parsed date.
m_cnty_v2['Month'] = m_cnty_v2['Date'].apply(lambda stamp: stamp.month)

# Every row of this sheet belongs to the same region.
m_cnty_v2['Region'] = 'Monterey County, CA'

###################################################
# Comp-set hotel-metrics sheet: keep the metric columns, rename them, derive
# the merge keys, and normalise the region labels.

# NOTE(review): `compset_hm` is deliberately rebound to the column subset here,
# exactly as the original chained assignment did; the raw frame is no longer
# reachable under that name after this cell.
compset_hm = compset_hm[compset_oar_list]
compset_hm_v1 = compset_hm
compset_hm_v2 = compset_hm_v1.rename(columns=compset_oar_mapping)

# Date & time manipulation: Month and Year are stored as strings so they match
# the string Month/Year keys of compset_econ_v1 when the two frames are merged.
compset_hm_v2['Date'] = pd.to_datetime(compset_hm_v2['Date'])
compset_hm_v2['Month'] = compset_hm_v2['Date'].apply(lambda x: str(x.month))
compset_hm_v2['Year'] = compset_hm_v2['Date'].apply(lambda x: str(x.year))

# String manipulation: strip the '+' marker from region labels, then apply the
# canonical region names. compset_econ_v1 gets the same treatment, and the two
# frames are merged on 'Region' -- without this step a renamed region would
# not match and its hotel metrics would come out as NaN after the merge.
compset_hm_v2['Region'] = compset_hm_v2['Region'].str.replace('+','')
compset_hm_v2 = compset_hm_v2.replace({'Region': region_mapping})


In [49]:
###############################################
# Combine datasets for Hotel Metrics

# Right join: every supply/demand/revenue row is kept, with the hotel metrics
# attached where Region/Date/Year/Month match.
compset_hm_v3 = pd.merge(compset_hm_v2, compset_econ_v1, on=['Region','Date','Year','Month'], how='right')

# Stack the Monterey County rows underneath the comp-set rows.
# pd.concat is used instead of DataFrame.append, which newer pandas releases
# no longer provide; the result is the same row-wise stack.
tot_set = pd.concat([compset_hm_v3, m_cnty_v2])
# City-level data is not added here (the original note says those datasets differ).

# Keep dates strictly inside (2009-01-01, 2016-07-01), then order by date.
in_window = (tot_set['Date'] < '2016-07-01') & (tot_set['Date'] > '2009-01-01')
tot_set = tot_set[in_window]
tot_set = tot_set.sort_values(by=['Date'], ascending=True)
tot_set = tot_set.round(2)
# reset_index() keeps the previous row labels as an 'index' column.
tot_set = tot_set.reset_index()

# IMPORTANT STEP: 'Year' is overwritten with the FISCAL year of each date
# (July-December roll into the next year, see getFiscalYear). From here on
# 'Year' no longer means calendar year.
tot_set['Year'] = tot_set['Date'].apply(getFiscalYear)

# Region labels change over time; make sure all of them use the same spelling.
tot_set = tot_set.replace({'Region': region_mapping})
# Month is an int on the Monterey rows and a string on the comp-set rows; unify it.
tot_set['Month'] = tot_set['Month'].apply(str)

# Drop duplicates: the same Region/Year/Month can arrive from both sources.
# NOTE(review): keep='last' depends on row order, and sort_values above is not
# a stable sort by default, so which duplicate survives for rows sharing a
# Date is not guaranteed -- consider kind='mergesort' if the appended rows
# must win.
tot_set = tot_set.drop_duplicates(subset=['Region','Year','Month'], keep='last')

# Regions we want to keep: there was only one period of Long Beach, exclude it.
tot_set = tot_set[~(tot_set['Region'].isin(region_exclude))]

# Right join against the Region/Year helper table so every Region/Year listed
# there is present in the result, even when it has no data rows.
tot_set = pd.merge(tot_set, region_years, on=['Region','Year'], how='right')


In [52]:
# Inspect the column names of the combined frame.
# `tot_set` is used because `tot_set_v2` is only created in a later cell (it is
# a row filter of `tot_set`), so referring to it here fails on a fresh
# top-to-bottom run. The bare `.head()` call was dropped: a notebook cell only
# displays its last expression.
tot_set.columns

Index([       u'index',     u'ADR_CONT',      u'ADR_GRP',    u'ADR_TOTAL',
          u'ADR_TRANS',        u'D-CON',        u'D-GRP',      u'D-TOTAL',
            u'D-TRANS',         u'Date',        u'Month',     u'OCC_CONT',
            u'OCC_GRP',    u'OCC_TOTAL',    u'OCC_TRANS',        u'R-CON',
              u'R-GRP',      u'R-TOTAL',      u'R-TRANS',       u'Region',
        u'RevPAR_CONT',   u'RevPAR_GRP', u'RevPAR_TOTAL', u'RevPAR_TRANS',
            u'S-TOTAL',         u'Year'],
      dtype='object')

In [77]:
######################################
# Build the 'micro data' frame and a per-Region/Year group share table.

# Micro data can only be calculated post 2011-04-01.
# .copy() gives this cell its own frame, so the derived columns below are
# written to an independent object rather than to a filtered slice of tot_set.
tot_set_v2 = tot_set[(tot_set['Date'] < '2016-07-01') & (tot_set['Date'] > '2011-04-01')].copy()

# `micro` is the same object as `tot_set_v2` (as in the original cell), so the
# columns added below are visible under both names.
micro = tot_set_v2

# Segment volumes: total supply scaled by the segment's occupancy percentage
# (presumably room nights, going by the _RN suffix).
micro['GRP_RN'] = micro['S-TOTAL']*(micro['OCC_GRP']/100)
micro['TRANS_RN'] = micro['S-TOTAL']*(micro['OCC_TRANS']/100)
micro['TOTAL_RN'] = micro['S-TOTAL']*(micro['OCC_TOTAL']/100)

# Segment revenue divided by the segment volume computed above.
micro['GRP_ADR'] = micro['R-GRP']/micro['GRP_RN']
micro['TRANS_ADR'] = micro['R-TRANS']/micro['TRANS_RN']
micro['TOTAL_ADR'] = micro['R-TOTAL']/micro['TOTAL_RN']

# Group share per Region/Year: summed GRP_RN over summed S-TOTAL.
_by_region_year = micro.groupby(['Region','Year'])
x = _by_region_year['GRP_RN'].sum() / _by_region_year['S-TOTAL'].sum()
# The ratio Series has no name, so reset_index() labels its column 0.
x = x.reset_index()
# Change of the share relative to the previous row within each region.
x['pct_change'] = x.groupby(['Region'])[0].pct_change(1)
x

Unnamed: 0,Region,Year,0,pct_change
0,"City of Monterey, CA",2012.0,0.231291,
1,"City of Monterey, CA",2013.0,0.222537,-0.037846
2,"City of Monterey, CA",2014.0,0.230847,0.037341
3,"Laguna Beach, CA",2014.0,0.275697,
4,"Laguna Beach, CA",2015.0,0.253103,-0.081952
5,"Laguna Beach, CA",2016.0,0.210567,-0.168055
6,"Monterey County, CA",2011.0,,
7,"Monterey County, CA",2012.0,0.239464,
8,"Monterey County, CA",2013.0,0.20591,-0.140122
9,"Monterey County, CA",2014.0,0.227039,0.102615


In [71]:
# Display `x` again.
# NOTE(review): the saved output of this cell is a bare Series, while the cell
# above turns `x` into a DataFrame; the execution counts (71 here, 77 above)
# suggest this output was captured from an earlier state -- re-run or remove.
x

0     0.231291
1     0.222537
2     0.230847
3     0.275697
4     0.253103
5     0.210567
6          NaN
7     0.239464
8     0.205910
9     0.227039
10    0.227315
11    0.217076
12    0.249959
13    0.190282
14    0.206166
15    0.200286
16    0.177152
17    0.241246
18    0.246370
19    0.235811
20    0.211821
21    0.188936
22    0.228453
23    0.224192
24    0.260019
25    0.253424
26    0.248574
27    0.232020
28    0.213565
29    0.205616
30    0.115863
31    0.147336
32    0.183001
33    0.169379
34    0.157218
35    0.157688
36    0.123731
37         NaN
38         NaN
39    0.205972
40    0.160056
41    0.181667
42    0.197392
43    0.214756
44    0.192562
45    0.210557
46    0.165081
47    0.137275
Name: 0, dtype: float64