# NYC Airbnb Price Prediction - Explore Geospatial Data

Use the dataset published on Kaggle - https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data - to train a simple deep learning model to predict prices for Airbnb properties.

This notebook contains the geospatial exploration steps:

- show the data on a map with clusters by listing location
- show the data on a map with clusters by listing price


In [120]:
# %pip (rather than ! pip) installs into the environment of the running kernel,
# so the folium that gets upgraded is the one this notebook actually imports
%pip install -U folium


Requirement already up-to-date: folium in c:\users\ryanm\appdata\local\programs\python\python37\lib\site-packages (0.11.0)


You are using pip version 19.0.3, however version 20.2.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# Load libraries

In [121]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
import os
from folium.plugins import MarkerCluster
import folium
from folium.plugins import HeatMap
import logging
import yaml
from collections import Counter
import numbers



In [122]:
def get_config(config_file):
    ''' read a YAML config file located in the current working directory

    Args:
        config_file: name of the config file, joined onto os.getcwd()

    Returns:
        config: whatever yaml.safe_load produces for the file, or None when the
        file cannot be opened or parsed (the error is printed, not raised)

    '''
    yaml_location = os.path.join(os.getcwd(), config_file)
    print("path_to_yaml " + yaml_location)
    try:
        with open(yaml_location, 'r') as handle:
            parsed = yaml.safe_load(handle)
    except Exception as error:
        # best effort: report the problem and hand None back to the caller
        print('Error reading the config file ' + str(error))
        return None
    return parsed

In [123]:
def print_config_values(config):
    ''' print each top-level config entry on its own line

    Args:
        config: object that yields keys when iterated and supports config[key]
        lookups (e.g. a dictionary)

    '''
    for key in config:
        setting = str(config[key])
        print("config value ", key, " ", setting)

# Load dataframe

- load pickled dataframe

In [124]:
def get_path():
    ''' build the absolute path of the data directory

    The data is expected in a directory called "data" that shares a parent
    with the current working directory (the directory holding the notebook).

    Returns:
        path: absolute path of the data directory

    '''
    return os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data'))

In [125]:
def ingest_data(path,input_csv,pickled_input_dataframe,save_raw_dataframe,load_from_scratch):
    ''' load the input data into a dataframe, from CSV or from a pickle
    Args:
        path: directory containing the input files
        input_csv: name of the CSV file, read when load_from_scratch is true
        pickled_input_dataframe: name of the pickle file; read when
            load_from_scratch is false, written when save_raw_dataframe is true
        save_raw_dataframe: when loading from CSV, also pickle the dataframe
        load_from_scratch: true to read the CSV, false to read the pickle

    Returns:
        the loaded dataframe
    '''
    if not load_from_scratch:
        # fast path: reuse the dataframe pickled by an earlier run
        # NOTE(review): read_pickle can execute arbitrary code - only use trusted files
        frame = pd.read_pickle(os.path.join(path, pickled_input_dataframe))
        logging.debug("reloader done")
        return frame
    frame = pd.read_csv(os.path.join(path, input_csv))
    if save_raw_dataframe:
        file_name = os.path.join(path, pickled_input_dataframe)
        print("file_name is ",file_name)
        frame.to_pickle(file_name)
    return frame

# Visualize using Folium: clustering listing by price
Use Folium to display a cluster view of the Airbnb listings

In [126]:
def point_box(min_tuple,max_tuple):
    ''' build the closed outline of the rectangle spanned by two opposite corners

    Args:
        min_tuple: first corner, indexable as (a, b)
        max_tuple: opposite corner, indexable as (a, b)

    Returns:
        list of five points: min_tuple, (min_tuple[0], max_tuple[1]), max_tuple,
        (max_tuple[0], min_tuple[1]) and min_tuple again to close the box
    '''
    second_corner = (min_tuple[0], max_tuple[1])
    fourth_corner = (max_tuple[0], min_tuple[1])
    # repeat the starting corner at the end so the outline is closed
    return [min_tuple, second_corner, max_tuple, fourth_corner, min_tuple]

In [127]:
# Disabled draft of a cluster-marker version of create_heatmap. The whole draft
# is wrapped in a triple-quoted string, so none of it executes; because the
# string is the last expression in the cell, the notebook echoes its text as
# the cell output.
# NOTE(review): the draft refers to names that are not defined anywhere in the
# visible notebook (dfn, gpx, map_tor) and its indentation is inconsistent, so
# it would not run if the quotes were removed - presumably left over from
# another notebook; confirm and consider deleting this cell.
'''
def create_heatmap(df, bounding_box):
    NYC_COORDINATES = (df['latitude'].mean(), df['longitude'].mean())
    min_tuple = tuple([bounding_box[min_lat],bounding_box[min_long]])
    max_tuple = tuple([bounding_box[max_lat],bounding_box[max_long]])

    points = []
    # latitude - distance from the equator; longitude - distance from prime meridian
    # points.append(min_tuple)
    # points.append(max_tuple)
    # points = point_box(min_tuple,max_tuple)

for route in dfn['Route']:
    # df.loc[df['column_name'] == some_value]
    dfs = dfn.loc[dfn['Route'] == route]
    # purchase_group['Column_name'].values[0]
    min_tuple = tuple([dfs['min_lat'].values[0],dfs['min_long'].values[0]])
    max_tuple = tuple([dfs['max_lat'].values[0],dfs['max_long'].values[0]])
    pbox = point_box(min_tuple,max_tuple)
    points.append(pbox)

    #points 

    # folium.PolyLine(points).add_to(my_map)

    points = []
    for track in gpx.tracks:
    for segment in track.segments:        
        for point in segment.points:
            points.append(tuple([point.latitude, point.longitude]))
 
    # subset to match subset of locations
    MAX_RECORDS = 2500
  
    # create empty map zoomed in on NYC
    map_nyc = folium.Map(location=NYC_COORDINATES, zoom_start=12)
    folium.PolyLine(points).add_to(map_tor)

    mc = MarkerCluster()

    # iterate through dataset to create clusters

    for row in df[0:MAX_RECORDS].itertuples():
    mc.add_child(folium.Marker(location=[row.latitude,  row.longitude],
                 popup=row.Location))

    # map_tor.add_child(mc)
    display(map_nyc)
'''

"\ndef create_heatmap(df, bounding_box):\n    NYC_COORDINATES = (df['latitude'].mean(), df['longitude'].mean())\n    min_tuple = tuple([bounding_box[min_lat],bounding_box[min_long]])\n    max_tuple = tuple([bounding_box[max_lat],bounding_box[max_long]])\n\n    points = []\n    # latitude - distance from the equator; longitude - distance from prime meridian\n    # points.append(min_tuple)\n    # points.append(max_tuple)\n    # points = point_box(min_tuple,max_tuple)\n\nfor route in dfn['Route']:\n    # df.loc[df['column_name'] == some_value]\n    dfs = dfn.loc[dfn['Route'] == route]\n    # purchase_group['Column_name'].values[0]\n    min_tuple = tuple([dfs['min_lat'].values[0],dfs['min_long'].values[0]])\n    max_tuple = tuple([dfs['max_lat'].values[0],dfs['max_long'].values[0]])\n    pbox = point_box(min_tuple,max_tuple)\n    points.append(pbox)\n\n    #points \n\n    # folium.PolyLine(points).add_to(my_map)\n\n    points = []\n    for track in gpx.tracks:\n    for segment in track.seg

# Visualize using Folium: heatmap of listings
Use Folium to display a heat map view of the listings, weighted by a chosen column (price in this notebook)

In [128]:

# build and display a folium heat map of the listings
def create_heatmap(df, bounding_box,map_col):
    ''' create a heatmap from a dataframe and display it in the notebook

    Rows that share the same (latitude, longitude) are merged and their map_col
    values are summed; that sum is the heat weight of the point. Passing
    map_col="count" weights every point by its number of rows.

    Args:
        df: dataframe to create map from; needs 'latitude' and 'longitude'
            columns. It is not modified.
        bounding_box: dimensions of focus area. Not used by the rendering at
            present; kept so that existing callers keep working.
        map_col: column on which to base mapping, or "count"

    Returns:
        None. The map is stored in the module-level variable map_nyc and
        rendered with display(); returning it as well would make a calling
        cell render the map a second time.

    '''
    global map_nyc
    # define centre point of map: the mean listing position
    NYC_COORDINATES = (df['latitude'].mean(), df['longitude'].mean())

    # create empty map zoomed in on NYC
    map_nyc = folium.Map(location=NYC_COORDINATES, zoom_start=12)

    # add the helper 'count' column on a copy so the caller's dataframe is left
    # untouched (it only matters when map_col == "count")
    weighted = df.assign(count=1)[['latitude', 'longitude', map_col]]
    heat_points = (
        weighted.groupby(['latitude', 'longitude'])
        .sum()
        .reset_index()
        .values.tolist()
    )

    # define heat map
    HeatMap(data=heat_points, radius=8, max_zoom=13).add_to(map_nyc)

    display(map_nyc)

# Master Cell

This cell contains calls to the other functions in this notebook to do the map renderings.

In [129]:
# master cell to call the other functions
# get the path for data files
path = get_path()
print("path is ",path)
config = get_config('data_exploration_config.yml')
# get_config reports a read/parse failure by printing it and returning None
# (an empty YAML file also loads as None); stop here with a clear message
# instead of failing further down with an unrelated TypeError
if config is None:
    raise RuntimeError("could not load data_exploration_config.yml from " + os.getcwd())
print("past config definition")
logging.getLogger().setLevel(logging.WARNING)
logging.warning("logging check")
print_config_values(config)
# load dataframe
df = ingest_data(
    path,
    config['file_names']['input_csv'],
    config['file_names']['pickled_input_dataframe'],
    config['general']['save_raw_dataframe'],
    config['general']['load_from_scratch'],
)
print("columns is "+str(config['columns']))
# render the heat map weighted by listing price
create_heatmap(df, config['bounding_box'],"price")

# Tableau rendering of the same dataset

Here is an example of the same dataset rendered in Tableau:

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/end_to_end_deep_learning_liveproject/master/media/tableau_south_west_cp_points.jpg" width="900" alt="Icon"> </th>
   </tr>
</table>

# Tableau rendering using colour to distinguish data points

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/end_to_end_deep_learning_liveproject/master/media/tableau_neighbourhood_colours.jpg" width="900" alt="Icon"> </th>
   </tr>
</table>