# Imports

In [2]:
#Enable matplotlib to display in jupyter notebook & import it
%matplotlib inline

import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim #used in filling missing zipcodes


# Reading In and Cleaning Calendar.csv

In [4]:
# Show the column labels of the calendar frame.
# NOTE(review): `calendar` is only created by the "Calendar.csv READING" cell
# further down (execution count In [4] here vs In [12] there), so on a fresh
# kernel this cell raises NameError. It should be moved below the loading cell.
calendar.columns

Index(['listing_id', 'date', 'available', 'price'], dtype='object')

In [12]:
#Calendar.csv READING

CALENDAR = 'data/calendar.csv'

#Read in all columns from calendar.csv : listing_id, date, available, price
calendar = pd.read_csv(CALENDAR)


#############################
#         Cleaning          #
#############################

# Every cleaned column is assigned back onto the frame explicitly. Calling an
# in-place method on `calendar.<col>` operates on an intermediate Series:
# pandas warns about that pattern and, with Copy-on-Write enabled, the change
# never reaches `calendar` at all.

# 'date' ##########
# Parse the date strings into datetimes so the .dt accessor can be used later.
calendar['date'] = pd.to_datetime(calendar['date'])

# 'available' ##########
# Turn the 't'/'f' flags into booleans; any other value is left untouched.
calendar['available'] = calendar['available'].replace({'f': False, 't': True})

# 'price' ##########
# Remove every character that is not a digit or '.', then convert to float.
# Missing prices remain NaN.
calendar['price'] = calendar['price'].replace('[^0-9.]+', '', regex=True).astype(float)






# Calendar Column Augmentation

In [15]:
#create column to represent the day of the week for each date
calendar['day_of_week'] = calendar['date'].dt.dayofweek

#Fill in missing price values for each listing using mean value for day of week from that listing.
# transform("mean") returns one value per row of `calendar`, so it lines up
# with the price column. The result is assigned back explicitly: an in-place
# fillna on `calendar.price` acts on an intermediate Series, which pandas
# warns about and which does not update `calendar` under Copy-on-Write.
# A price stays NaN when its listing has no known price for that weekday.
calendar['price'] = calendar['price'].fillna(
    calendar.groupby(['listing_id', 'day_of_week'])['price'].transform('mean')
)

#create column for revenue generated by property (all prices for occupied days are modeled from mean)
# Days still available earn nothing; unavailable days count as booked at `price`.
calendar['day_revenue'] = np.where(calendar['available'], 0.0, calendar['price'])

In [32]:
# Persist the cleaned calendar as a pickle: unlike CSV, this format keeps the
# index and the column types intact when the file is loaded again.
pd.to_pickle(calendar, 'data/calendar_cleaned.pkl')

# Using Calendar to Augment Listings DataFrame

In [53]:
#Create Quarter Revenue Columns in Listings (computed from Calendar)

quarter_dates = ['2016-09-06','2016-12-06','2017-03-06','2017-06-06','2017-09-06']

#Create a list of calendars separated into 4 quarters.
# Each quarter is the half-open interval [start, end). A closed range on both
# ends would place every boundary date (e.g. 2016-12-06) in two quarters and
# count its revenue twice.
# NOTE(review): the final bound 2017-09-06 is therefore excluded -- confirm the
# calendar holds no rows on that date that should be counted.
q_cal = [
    calendar[
        (calendar.date >= pd.Timestamp(quarter_dates[n]))
        & (calendar.date < pd.Timestamp(quarter_dates[n+1]))
    ]
    for n in range(4)
]

listings = pd.read_pickle('data/listings_cleaned.pkl')

# Add one column per quarter (q1_revenue ... q4_revenue) holding each listing's
# summed day_revenue. The join matches on the index of `listings`.
# NOTE(review): assumes `listings` is indexed by listing id -- confirm.
# Listings with no calendar rows in a quarter get NaN for that quarter.
for n, q in enumerate(q_cal):
    revenue = q.groupby('listing_id').day_revenue.sum().rename('q'+str(n+1)+'_revenue')
    listings = listings.join(revenue)

# NOTE(review): the input is 'listings_cleaned.pkl' but the output is
# 'listing_cleaned.pkl' (no "s") -- confirm that the two names are intentional.
listings.to_pickle('data/listing_cleaned.pkl')

In [56]:
#Create Quarter occupancy rate Columns in Listings (computed from Calendar)

quarter_dates = ['2016-09-06','2016-12-06','2017-03-06','2017-06-06','2017-09-06']

#Create a list of calendars separated into 4 quarters.
# Each quarter is the half-open interval [start, end) so that a boundary date
# (e.g. 2016-12-06) belongs to exactly one quarter.
# NOTE(review): the final bound 2017-09-06 is therefore excluded -- confirm the
# calendar holds no rows on that date that should be counted.
q_cal = [
    calendar[
        (calendar.date >= pd.Timestamp(quarter_dates[n]))
        & (calendar.date < pd.Timestamp(quarter_dates[n+1]))
    ]
    for n in range(4)
]

# Load the file written by the quarter-revenue cell so its q*_revenue columns
# are kept. Re-reading 'data/listings_cleaned.pkl' here and then saving over
# 'data/listing_cleaned.pkl' would silently discard them.
listings = pd.read_pickle('data/listing_cleaned.pkl')

# Add one column per quarter (q1_occupancy_rate ... q4_occupancy_rate).
# The join matches on the index of `listings`.
# NOTE(review): assumes `listings` is indexed by listing id -- confirm.
for n, q in enumerate(q_cal):
    column = 'q'+str(n+1)+'_occupancy_rate'
    # Number of days in the half-open quarter; matches the filter above.
    q_len = (pd.Timestamp(quarter_dates[n+1]) - pd.Timestamp(quarter_dates[n])).days
    # Occupied share = (days in quarter - available days) / days in quarter.
    # Days with no calendar row for a listing therefore count as occupied.
    occupancy = ((q_len - q.groupby('listing_id').available.sum()) / q_len).rename(column)
    # The input file may already hold this column from an earlier run of this
    # cell; drop the stale copy so the join cannot create a duplicate column.
    listings = listings.drop(columns=column, errors='ignore').join(occupancy)

listings.to_pickle('data/listing_cleaned.pkl')