# Imports

In [2]:
#Enable matplotlib to display in jupyter notebook & import it
%matplotlib inline

import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim #used in filling missing zipcodes


# Read and Clean Listings.csv

In [3]:
#listings.csv READING

LISTINGS = 'data/listings.csv'

#Columns to load from the csv; every other column in the file is skipped.
listings_cols = [
    'id',
    'host_id',
    'neighbourhood_cleansed',
    'zipcode',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'amenities',
    'price',
    'cleaning_fee',
    'number_of_reviews',
    'first_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'calculated_host_listings_count',
    'reviews_per_month',
]

#Columns to rename after loading
rename_dict = {'id':'listing_id',
              'price':'listed_price'}

#Read the csv, rename columns, and use listing_id as the index.
#zipcode is forced to str: left to type inference, numeric-looking codes could be
#parsed as numbers (dropping any leading zero) and the later text slicing of the
#column would fail on them. Missing zipcodes are still read as NaN.
listings = (
    pd.read_csv(LISTINGS, usecols=listings_cols, dtype={'zipcode': str})
    .rename(columns=rename_dict)
    .set_index('listing_id')
)



#############################
#         Cleaning          #
#############################

# 'zipcode' ##########
#Helper for the zipcode cleaning step below: reverse-geocodes coordinates into a zipcode
def latLonToZip(lat, lon):
    '''Reverse-geocode a latitude/longitude pair and return its 5-digit zipcode.

    Parameters
    ----------
    lat, lon : number or str
        Coordinates. They are stringified and sent to the geocoder as "lat,lon".

    Returns
    -------
    str or float
        The 5 digits that sit between a whitespace character and ", United" in
        the address returned by the geocoder. np.nan is returned when the lookup
        raises, returns nothing, or the address has no such code; in that case
        the query and whatever was returned are printed instead of raising, so a
        single bad row does not abort a whole DataFrame.apply.
    '''
    # Nominatim's usage policy asks for an identifying user agent, and recent
    # geopy releases refuse to build the geocoder without one.
    geolocator = Nominatim(user_agent='boston-airbnb-zipcode-fill')
    query = str(lat)+','+str(lon)

    # Bound before the try so the failure message can always print it, even
    # when reverse() itself raises.
    location = None
    try:
        location = geolocator.reverse(query)
        match = re.search(r'\s([0-9]{5}),\sUnited', location.address)
        if match:
            return match.group(1)
    except Exception:
        # Deliberate best effort: any lookup problem (service error, timeout,
        # no result so `location` is None) falls through to the report below.
        pass

    print(query,'-----',location)
    return np.nan
#Find all rows with a missing zipcode : missing_zipcodes
missing_zipcodes = listings[listings.zipcode.isnull()].copy()

#Look up a zipcode for each of those rows from its coordinates.
#Built as an explicit Series (not DataFrame.apply) so the result is a Series
#even when no zipcode is missing.
filled_zipcodes = pd.Series(
    [latLonToZip(lat, lon)
     for lat, lon in zip(missing_zipcodes.latitude, missing_zipcodes.longitude)],
    index=missing_zipcodes.index,
    dtype=object,
)

#Fill the gaps by index alignment and assign the column back, instead of
#mutating listings.zipcode in place through attribute access.
listings['zipcode'] = listings['zipcode'].fillna(filled_zipcodes)

#Remove 'zip+4' part of any zipcode.
#Lookups that failed are still NaN (a float), so only strings are sliced.
listings['zipcode'] = listings['zipcode'].map(lambda z: z[:5] if isinstance(z, str) else z)

#Matches every run of characters that is not a digit or a decimal point
CURRENCY_JUNK = r'[^0-9.]+'

# 'price' --> 'listed_price' ##########
#Strip the non-numeric characters, then cast the remaining text to float
listings['listed_price'] = listings['listed_price'].replace(CURRENCY_JUNK,'',regex=True).astype(float)

# 'cleaning_fee' ##########
listings['cleaning_fee'] = listings['cleaning_fee'].replace(CURRENCY_JUNK,'',regex=True).astype(float)

# 'first_review' ##########
listings['first_review'] = pd.to_datetime(listings['first_review'])

# 'amenities' ##########
#Drop every character that is not a word character, comma, whitespace or '/',
#then split the remaining text on commas into a list per listing.
#Raw string: '\w' and '\s' are invalid escape sequences in a normal string literal.
listings['amenities'] = (
    listings['amenities']
    .replace(r'[^\w,\s/]+','',regex=True)
    .apply(lambda x: x.split(','))
)



#Save to Pickle because it preserves the index and types###################
listings.to_pickle('data/listings_cleaned.pkl')

# Read and Clean Calendar.csv

In [4]:
#Calendar.csv READING

CALENDAR = 'data/calendar.csv'

#Read in all columns from calendar.csv : listing_id, date, available, price
calendar = pd.read_csv(CALENDAR)


#############################
#         Cleaning          #
#############################

# 'date' ##########
calendar['date'] = pd.to_datetime(calendar['date'])

# 'available' ##########
#Turn the 'f'/'t' flags into booleans and assign the column back.
#Calling replace(..., inplace=True) on calendar.available mutates a temporary
#under pandas Copy-on-Write and would leave the strings in place, where both
#'t' and 'f' are truthy. Values other than 'f'/'t' become NaN.
calendar['available'] = calendar['available'].map({'f':False,'t':True})

# 'price' ##########
#Strip everything except digits and the decimal point, then cast to float
calendar['price'] = calendar['price'].replace(r'[^0-9.]+','',regex=True).astype(float)

#############################
#         Augmenting        #
#############################

#create column to represent the day of the week for each date
calendar['day_of_week'] = calendar['date'].dt.dayofweek

#Fill in missing price values for each listing using mean value for day of week from that listing.
#The filled column is assigned back: fillna(..., inplace=True) on calendar.price
#is a chained in-place call that does not reach the frame under pandas Copy-on-Write.
#A listing with no price at all for a given weekday keeps NaN for those rows.
mean_price_by_weekday = calendar.groupby(['listing_id','day_of_week'])['price'].transform("mean")
calendar['price'] = calendar['price'].fillna(mean_price_by_weekday)

#create column for revenue generated by property (all prices for occupied days are modeled from mean):
#0.0 on days the listing is available, the (possibly filled-in) price otherwise
calendar['day_revenue'] = np.where(calendar['available'], 0.0, calendar['price'])

#Save to Pickle because it preserves the index and types
calendar.to_pickle('data/calendar_cleaned.pkl')

# Using Calendar to Augment Listings DataFrame

In [5]:
listings = pd.read_pickle('data/listings_cleaned.pkl')
calendar = pd.read_pickle('data/calendar_cleaned.pkl')

#Quarter boundaries. Each quarter is the half-open interval [start, next start),
#so a boundary date belongs to exactly one quarter. With inclusive ranges every
#inner boundary day was counted in two quarters and each quarter length was one
#day too long.
quarter_dates = ['2016-09-06','2016-12-06','2017-03-06','2017-06-06','2017-09-06']
quarter_bounds = pd.to_datetime(quarter_dates)
quarter_spans = list(zip(quarter_bounds[:-1], quarter_bounds[1:]))

#Create a list of calendars separated into 4 quarters
q_cal = [calendar[(calendar.date >= start) & (calendar.date < end)]
         for start, end in quarter_spans]

#Revenue Per Quarter: sum of day_revenue per listing, joined on as q<N>_revenue
for quarter_no, quarter_calendar in enumerate(q_cal, start=1):
    quarter_revenue = quarter_calendar.groupby('listing_id').day_revenue.sum()
    listings = listings.join(quarter_revenue.rename('q'+str(quarter_no)+'_revenue'))

#Occupancy Per Quarter: share of the quarter's days on which the listing was not
#available, joined on as q<N>_occupancy_rate. Days with no calendar row for a
#listing are not in the available sum and therefore count as occupied.
for quarter_no, (quarter_calendar, (start, end)) in enumerate(zip(q_cal, quarter_spans), start=1):
    q_len = (end - start).days
    available_days = quarter_calendar.groupby('listing_id').available.sum()
    occupancy_rate = (q_len - available_days)/q_len
    listings = listings.join(occupancy_rate.rename('q'+str(quarter_no)+'_occupancy_rate'))

#NOTE(review): this writes 'data/listing_cleaned.pkl' (no "s"), a different file
#from the 'data/listings_cleaned.pkl' read at the top of this cell. Confirm that is intended.
listings.to_pickle('data/listing_cleaned.pkl')

In [6]:
#Every distinct amenity string across all listings, minus two placeholder values.
#discard() rather than remove() so data that lacks a placeholder does not raise,
#and sorted() so the column order of amn_frame is the same on every run.
amenity_set = {item for item_list in listings.amenities for item in item_list}
amenity_set.discard('translation missing enhosting_amenity_49')
amenity_set.discard('')
amenities = sorted(amenity_set)

#One boolean column per amenity: True where the listing's amenity list contains it.
#Built in one go instead of joining one column at a time; `amn=amn` pins the
#amenity for each lambda.
amn_frame = pd.DataFrame(
    {amn: listings.amenities.map(lambda amns, amn=amn: amn in amns) for amn in amenities},
    index=listings.index,
)

analysis_table = amn_frame.copy()
#Look up each listing's row of amn_frame by its index label and store the result
#in a single 'analysis_table' column.
#NOTE(review): this puts a whole row object in every cell; confirm it is intended.
listings['analysis_table'] = listings.index
listings['analysis_table'] = pd.DataFrame(listings.analysis_table.map(lambda x: amn_frame.loc[x]))

# Hosts by Listings Count

In [None]:
import folium
import os
from matplotlib import colors

#map with our data in center
thisMap = folium.Map([42.321145, -71.057083], zoom_start=12, tiles="Cartodb Positron")


#create a color dictionary with custom buckets
#Each bucket is a range() pair (first count, last count + 1) and takes the colour
#at the same position in color_names.
buckets = [(1,1+1),(2,4+1),(5,10+1),(11,49+1),(50,136+1)]
color_names = ['blue','whitesmoke','silver','grey','red']
color_dict = {
    count: colors.to_hex(color_name)
    for (first, stop), color_name in zip(buckets, color_names)
    for count in range(first, stop)
}

#Largest count that has an entry in color_dict. Larger counts are clamped to it
#so they take the last bucket's colour instead of raising KeyError.
max_bucketed_count = buckets[-1][1] - 1


#One circle marker per listing, coloured by how many listings its host has.
#The three columns are iterated together instead of fetching each row with .loc.
marker_columns = zip(listings.latitude,
                     listings.longitude,
                     listings.calculated_host_listings_count)
for lat, lon, host_listing_count in marker_columns:
    marker_color = color_dict[min(host_listing_count, max_bucketed_count)]
    folium.CircleMarker(location=[lat, lon], radius=1,
                        popup=str(host_listing_count), color=marker_color,
                        fill_color=marker_color).add_to(thisMap)

thisMap.save('Listings_Colored_by_Host_Listing_Count.html')

thisMap

In [36]:
#Distribution of hosts by portfolio size:
#  index = calculated_host_listings_count (averaged per host_id), value = number of hosts
listings_per_host = listings.groupby('host_id').calculated_host_listings_count.mean()
host_counts = listings_per_host.value_counts().sort_index()
host_counts

1      1768
2       233
3        71
4        37
5        17
6        14
7        10
8         3
9         2
10        5
11        3
12        1
13        1
14        1
15        2
17        1
19        1
20        2
24        3
25        1
50        1
58        1
61        1
79        1
136       1
Name: calculated_host_listings_count, dtype: int64

In [41]:
#Listings accounted for by each host-size group:
#(listings per host) x (number of hosts with that many listings)
group_sizes = host_counts.index
listings_in_group = group_sizes * host_counts
pd.Series(listings_in_group, index = group_sizes)

1      1768
2       466
3       213
4       148
5        85
6        84
7        70
8        24
9        18
10       50
11       33
12       12
13       13
14       14
15       30
17       17
19       19
20       40
24       72
25       25
50       50
58       58
61       61
79       79
136     136
dtype: int64

In [43]:
#Scratch check of range membership: range(1,6) stops before 6, so 8 is not in it
8 in range(1,6)

False

In [81]:
#Spot-check the hex colour stored for a host listing count of 3
color_dict[3]

'#f5f5f5'

In [74]:
#Spot-check the hex string matplotlib returns for the named colour 'whitesmoke'
from matplotlib import colors
colors.to_hex('whitesmoke')

'#f5f5f5'

In [82]:
#Display the whole host-listing-count -> hex colour mapping
color_dict

{1: '#0000ff',
 2: '#f5f5f5',
 3: '#f5f5f5',
 4: '#f5f5f5',
 5: '#c0c0c0',
 6: '#c0c0c0',
 7: '#c0c0c0',
 8: '#c0c0c0',
 9: '#c0c0c0',
 10: '#c0c0c0',
 11: '#808080',
 12: '#808080',
 13: '#808080',
 14: '#808080',
 15: '#808080',
 16: '#808080',
 17: '#808080',
 18: '#808080',
 19: '#808080',
 20: '#808080',
 21: '#808080',
 22: '#808080',
 23: '#808080',
 24: '#808080',
 25: '#808080',
 26: '#808080',
 27: '#808080',
 28: '#808080',
 29: '#808080',
 30: '#808080',
 31: '#808080',
 32: '#808080',
 33: '#808080',
 34: '#808080',
 35: '#808080',
 36: '#808080',
 37: '#808080',
 38: '#808080',
 39: '#808080',
 40: '#808080',
 41: '#808080',
 42: '#808080',
 43: '#808080',
 44: '#808080',
 45: '#808080',
 46: '#808080',
 47: '#808080',
 48: '#808080',
 49: '#808080',
 50: '#ff0000',
 51: '#ff0000',
 52: '#ff0000',
 53: '#ff0000',
 54: '#ff0000',
 55: '#ff0000',
 56: '#ff0000',
 57: '#ff0000',
 58: '#ff0000',
 59: '#ff0000',
 60: '#ff0000',
 61: '#ff0000',
 62: '#ff0000',
 63: '#ff0000',
 

In [88]:
#Shell escape: print the notebook's current working directory
! pwd

/Users/collinreinking/Google_Drive/Data_Science/MIDS/W18_Python/boston-airbnb-open-data
