# NYC Airbnb Price Prediction - Explore Geospatial Data

Use the dataset published on Kaggle - https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data - to train a simple deep learning model to predict prices for Airbnb properties.

This notebook contains the geospatial exploration steps:

- show the data on a map with clusters by listing location
- show the data on a map with clusters by listing price


In [1]:
! pip install -U folium


Requirement already up-to-date: folium in c:\users\ryanm\appdata\local\programs\python\python37\lib\site-packages (0.12.1)


You are using pip version 19.0.3, however version 21.0.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# Load libraries

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# import seaborn as sns
import datetime
import os
from folium.plugins import MarkerCluster
import folium
from folium.plugins import HeatMap
import logging
import yaml
from collections import Counter
import numbers



In [3]:
def get_config(config_file):
    ''' load the YAML config file for this notebook from the current working directory

    Args:
        config_file: name of the YAML file; it is joined onto the current working directory

    Returns:
        config: the object produced by yaml.safe_load for the file, or None
        (after printing the error) if the file could not be opened or parsed

    '''
    path_to_yaml = os.path.join(os.getcwd(), config_file)
    print("path_to_yaml " + path_to_yaml)
    try:
        with open(path_to_yaml, 'r') as yaml_stream:
            parsed = yaml.safe_load(yaml_stream)
    except Exception as error:
        # best effort: report the problem and hand back None rather than raising
        print('Error reading the config file ' + str(error))
        return None
    return parsed

In [4]:
def print_config_values(config):
    ''' print every top-level entry of the config, one entry per line

    Args:
        config: config object to print; it is iterated and indexed by each item it yields

    '''
    for key in config:
        rendered = str(config[key])
        print("config value ", key, " ", rendered)

# Load dataframe

- load pickled dataframe

In [5]:
def get_path():
    ''' build the absolute path of the data directory

    The data directory is named "data" and is a sibling of the current
    working directory (the directory the notebook runs from).

    Returns:
        path: absolute path of the data directory

    '''
    notebook_dir = os.getcwd()
    # step up one level from the notebook directory, then down into "data"
    sibling_data_dir = os.path.join(notebook_dir, os.pardir, 'data')
    return os.path.abspath(sibling_data_dir)

In [6]:
def ingest_data(path,input_csv,pickled_input_dataframe,save_raw_dataframe,load_from_scratch):
    ''' load the input dataset into a dataframe, from the CSV or from a pickled copy
    Args:
        path: directory containing the input files
        input_csv: CSV file name, read when load_from_scratch is true
        pickled_input_dataframe: pickle file name, read when load_from_scratch is false
            and written when both load_from_scratch and save_raw_dataframe are true
        save_raw_dataframe: whether to pickle the dataframe after reading the CSV
        load_from_scratch: whether to read the CSV rather than the pickle

    Returns:
        the loaded dataframe
    '''
    if not load_from_scratch:
        # NOTE: unpickling can execute arbitrary code - only load pickle files you trust
        loaded_df = pd.read_pickle(os.path.join(path, pickled_input_dataframe))
        logging.debug("reloader done")
        return loaded_df
    loaded_df = pd.read_csv(os.path.join(path, input_csv))
    if save_raw_dataframe:
        pickle_target = os.path.join(path, pickled_input_dataframe)
        print("file_name is ", pickle_target)
        loaded_df.to_pickle(pickle_target)
    return loaded_df

# Visualize using Folium: create a basic map
Use Folium to display a basic map

In [7]:

def create_basic_map(df, bounding_box,map_col):
    ''' display a Folium map centred on the dataframe rows with the bounding box outlined
    Args:
        df: dataframe with 'latitude' and 'longitude' columns; their means set the map centre
        bounding_box: dictionary with 'min_lat', 'max_lat', 'min_long' and 'max_long' keys
        map_col: not used by this function

    The map is stored in the global map_nyc and rendered with display();
    nothing is returned.
    '''
    global map_nyc
    # centre the map on the mean coordinates of the rows
    centre = (df['latitude'].mean(), df['longitude'].mean())

    lat_lo, lat_hi = bounding_box['min_lat'], bounding_box['max_lat']
    long_lo, long_hi = bounding_box['min_long'], bounding_box['max_long']
    outline = [
        (lat_lo, long_lo),
        (lat_lo, long_hi),
        (lat_hi, long_hi),
        (lat_hi, long_lo),
    ]
    # repeat the first corner so the polyline closes into a rectangle
    outline.append(outline[0])

    # create the map, then draw the bounding box on top of it
    map_nyc = folium.Map(location=centre, zoom_start=9)
    folium.PolyLine(outline).add_to(map_nyc)

    display(map_nyc)

# Visualize using Folium: heatmap of listings
Use Folium to display a heat map of the listings, weighted by a chosen column (for example, price)

In [8]:

# build and display a heat map of the dataframe rows, weighted by the chosen column
def create_heatmap(df, bounding_box,map_col):
    ''' display a Folium heat map of the rows in a dataframe, weighted by one column

    Rows are grouped by (latitude, longitude) and map_col is summed per
    location to give each heat map point its weight. Passing map_col='count'
    weights every row as 1, so the weight is the number of rows at a location.

    Args:
        df: dataframe with 'latitude' and 'longitude' columns; it is not modified
        bounding_box: not used at present; kept so existing callers keep working
        map_col: column to sum per location, or 'count' for a row count

    The map is stored in the global map_nyc and rendered with display();
    nothing is returned.
    '''
    global map_nyc
    # centre the map on the mean coordinates of the rows
    centre = (df['latitude'].mean(), df['longitude'].mean())

    # create empty map zoomed in on the centre point
    map_nyc = folium.Map(location=centre, zoom_start=12)

    # add the helper 'count' column to a copy so the caller's dataframe is left untouched
    heat_source = df.assign(count=1)[['latitude', 'longitude', map_col]]
    # one [latitude, longitude, weight] entry per distinct location
    heat_points = heat_source.groupby(['latitude', 'longitude']).sum().reset_index().values.tolist()
    HeatMap(data=heat_points, radius=8, max_zoom=13).add_to(map_nyc)

    display(map_nyc)

# Master Cell

This cell contains calls to the other functions in this notebook to do the map renderings.

In [9]:
# master cell: locate the data, read the config, load the dataframe, render the heat map
path = get_path()
print("path is ",path)
config = get_config('data_exploration_config.yml')
print("past config definition")
# emit warnings and above only; the warning below confirms logging is working
logging.getLogger().setLevel(logging.WARNING)
logging.warning("logging check")
print_config_values(config)
# load dataframe using the file names and load/save switches from the config
file_names = config['file_names']
general_settings = config['general']
df = ingest_data(
    path,
    file_names['input_csv'],
    file_names['pickled_input_dataframe'],
    general_settings['save_raw_dataframe'],
    general_settings['load_from_scratch'],
)
print("columns is "+str(config['columns']))

# heat map of the loaded rows, weighted by the "price" column
create_heatmap(df, config['bounding_box'],"price")




path is  C:\personal\manning_liveproject\end_to_end_deep_learning_live_project\data
path_to_yaml C:\personal\manning_liveproject\end_to_end_deep_learning_live_project\notebooks\data_exploration_config.yml
past config definition
config value  general   {'load_from_scratch': False, 'save_raw_dataframe': False, 'save_transformed_dataframe': False, 'remove_bad_values': True}
config value  columns   {'categorical': ['neighbourhood_group', 'neighbourhood', 'room_type'], 'continuous': ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude'], 'date': ['last_review'], 'text': ['name', 'host_name'], 'excluded': ['price', 'id']}
config value  bounding_box   {'max_long': -73.70018092, 'max_lat': 40.91617849, 'min_long': -74.25909008, 'min_lat': 40.47739894}
config value  newark_bounding_box   {'max_long': -74.11278706, 'max_lat': 40.67325015, 'min_long': -74.25132408, 'min_lat': 40.78813864}
config value  geo_columns   ['latitude', 'lo

# Tableau rendering of the same dataset

Here is an example of the same dataset rendered in Tableau:

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/end_to_end_deep_learning_liveproject/master/media/tableau_south_west_cp_points.jpg" width="900" alt="Icon"> </th>
   </tr>
</table>

# Tableau rendering using colour to distinguish data points

<table style="border: none" align="left">
   <tr style="border: none">
       <th style="border: none"><img src="https://raw.githubusercontent.com/ryanmark1867/end_to_end_deep_learning_liveproject/master/media/tableau_neighbourhood_colours.jpg" width="900" alt="Icon"> </th>
   </tr>
</table>