# NYC Airbnb Price Prediction - Data Preparation

Use the dataset published on Kaggle - https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data - to train a simple deep learning model that predicts prices for Airbnb properties.


This notebook contains the common data loading and preparation steps:
- load data from the input CSV
- fix missing values
- clean up anomalies

# Common imports and variables
Imports and variable definitions that are common to the entire notebook


In [67]:
# shell commands (IPython "!" syntax): install the requests and xlrd packages with pip
# NOTE(review): no versions are pinned, and xlrd is not imported by any cell visible
# in this notebook - confirm it is still needed
!pip install requests
!pip install xlrd



You are using pip version 19.0.3, however version 20.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.





You are using pip version 19.0.3, however version 20.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [68]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datetime as dt
# common imports
import zipfile
import time
# import datetime, timedelta
import datetime
from datetime import datetime, timedelta
from datetime import date
from dateutil import relativedelta
from io import StringIO
import pandas as pd
import pickle
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from io import StringIO
import requests
import json
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline 
import os
import math
from subprocess import check_output
from IPython.display import display
import logging
import yaml
from collections import Counter
import re
import os


In [69]:
def get_config(config_file):
    ''' read the YAML config file for this notebook and return its contents

    The file is looked up relative to the current working directory.

    Args:
        config_file: name of the YAML file containing config parameters

    Returns:
        config: parsed contents of the config file (whatever yaml.safe_load
        produces), or None if the file could not be opened or parsed


    '''
    working_dir = os.getcwd()
    print("current directory is: " + working_dir)

    path_to_yaml = os.path.join(working_dir, config_file)
    print("path_to_yaml " + path_to_yaml)
    try:
        with open(path_to_yaml, 'r') as c_file:
            return yaml.safe_load(c_file)
    except Exception as error:
        # best effort: report the problem and fall through to return None
        print('Error reading the config file ' + str(error))
    return None

# Load Data
- ingest CSV into a Pandas dataframe 

In [70]:
def get_path():
    ''' build the absolute path of the data directory

    The data is expected in a directory called "data" that is a sibling of
    the current working directory (the directory the notebook runs from).

    Returns:
        path: absolute path for the data directory

    '''
    notebook_dir = os.getcwd()
    # step up one level from the working directory, then down into "data"
    sibling_data_dir = os.path.join(notebook_dir, os.pardir, 'data')
    return os.path.abspath(sibling_data_dir)

In [71]:
def fill_missing(dataset,textcols,continuouscols,collist,text_default,continuous_default,categorical_default):
    ''' replace missing values with placeholders by column type

    The dataframe passed in is updated and the same object is returned.

    Args:
        dataset: dataframe in which missing values are being processed
        textcols: list of text columns (None is treated as an empty list)
        continuouscols: list of continuous columns (None is treated as an empty list)
        collist: list of categorical columns (None is treated as an empty list)
        text_default: replacement value for missing values in text columns
        continuous_default: replacement value for missing values in continuous columns
        categorical_default: replacement value for missing values in categorical columns

    Returns:
        dataset: dataframe with missing values replaced

    Raises:
        KeyError: if a listed column is not present in the dataframe

    '''
    logging.debug("before mv")
    # processing order is kept from the original code: categorical, continuous, text
    defaults_by_group = (
        (collist, categorical_default),
        (continuouscols, continuous_default),
        (textcols, text_default),
    )
    for columns, default in defaults_by_group:
        # an empty key in the YAML config is loaded as None rather than []
        for col in (columns or []):
            # assign the filled column back to the frame: calling
            # fillna(inplace=True) on dataset[col] updates a temporary Series
            # and does not reach the dataframe under pandas Copy-on-Write
            dataset[col] = dataset[col].fillna(value=default)
    return (dataset)

# Load dataframe
- load the dataframe from the input CSV, or from its pickled copy
- show info about the dataset


In [72]:
def ingest_data(path,input_csv,pickled_input_dataframe,save_raw_dataframe,load_from_scratch):
    ''' load data into dataframe, either from the input CSV or from a pickled copy

    Args:
        path: path containing input file
        input_csv: input file name
        pickled_input_dataframe: file name of the pickled version of the input dataframe
        save_raw_dataframe: when loading from the CSV, if True also pickle the
            freshly loaded dataframe to pickled_input_dataframe
        load_from_scratch: if True read the CSV; otherwise read the pickled dataframe

    Returns:
        dataframe loaded from the CSV or from the pickle file
    '''
    if load_from_scratch:
        raw_df = pd.read_csv(os.path.join(path,input_csv))
        if save_raw_dataframe:
            file_name = os.path.join(path,pickled_input_dataframe)
            print("file_name is ",file_name)
            # pickle the dataframe that was just read, not a notebook-level variable
            raw_df.to_pickle(file_name)
    else:
        # NOTE: unpickling can execute arbitrary code - only load pickle files
        # that come from a trusted source (e.g. written by this notebook)
        raw_df = pd.read_pickle(os.path.join(path,pickled_input_dataframe))
        logging.debug("reloader done")
    return(raw_df)

# General cleanup
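- correct types for Route and Vehicle
- fill missing values
- create report-date-time index

Note: the helpers in this section operate on columns (`Route`, `Vehicle`, `Min Delay`, `Min Gap`, `Report Date`, `Time`) that do not appear in the Airbnb dataframe shown at the end of this notebook, and the master cell does not call them.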

In [73]:
# the dataset incorporated some anomalies in the 2019 data, including:
# extraneous Incident ID in April 2019 tab
# Gap and Delay columns in April and June 2019 tabs for what had otherwise been called Min Gap and Min Delay
# this function cleans up these anomalies
def fix_anomalous_columns(df):
    ''' merge the anomalous Delay / Gap columns into Min Delay / Min Gap and drop extras

    The dataframe passed in is updated and the same object is returned.

    Args:
        df: dataframe containing the columns 'Min Delay', 'Delay', 'Min Gap',
            'Gap' and 'Incident ID'

    Returns:
        df: the dataframe with missing 'Min Delay' / 'Min Gap' values taken from
            'Delay' / 'Gap', and with 'Delay', 'Gap' and 'Incident ID' removed

    Raises:
        KeyError: if one of the expected columns is not present

    '''
    # for rows where there is NaN in the Min Delay or Min Gap columns, copy over value from Delay or Gap
    # (assign the result back: fillna(inplace=True) on df[...] updates a temporary
    # Series and does not reach the dataframe under pandas Copy-on-Write)
    df['Min Delay'] = df['Min Delay'].fillna(df['Delay'])
    df['Min Gap'] = df['Min Gap'].fillna(df['Gap'])
    # now that the useful values have been copied from Delay and Gap, remove them
    del df['Delay']
    del df['Gap']
    # remove Incident ID column - it's extraneous
    del df['Incident ID']
    return(df)

In [74]:
def replace_time(date_time_value,time_value):
    ''' given a datetime replace the time portion

    Args:
        date_time_value: datetime whose date part is kept
        time_value: object with hour, minute and second attributes that
            supplies the new time of day

    Returns:
        copy of date_time_value with hour, minute and second taken from
        time_value (any microsecond part of date_time_value is left as is)
    '''
    # second must come from time_value.second (it was previously set from .minute)
    return(date_time_value.replace(
        hour=time_value.hour,
        minute=time_value.minute,
        second=time_value.second,
    ))


In [75]:
def general_cleanup(df):
    ''' apply type fixes, anomaly cleanup and missing-value handling, then index by report date/time

    Args:
        df: dataframe with 'Route', 'Vehicle', 'Report Date' and 'Time' columns,
            plus the columns required by fix_anomalous_columns

    Returns:
        tuple of (df, allcols, textcols, continuouscols, timecols, collist): the
        updated dataframe followed by the column lists returned by
        define_feature_categories

    NOTE(review): define_feature_categories is not defined in the visible part of
    this notebook, and the fill_missing call below passes 6 arguments while the
    fill_missing defined in this notebook takes 7, in a different order
    (dataset, textcols, continuouscols, collist, then three default values).
    As written this function cannot run against that fill_missing - confirm
    whether it is still needed.
    '''
    # ensure Route and Vehicle are strings, not numeric
    df['Route'] = df['Route'].astype(str)
    df['Vehicle'] = df['Vehicle'].astype(str)
    # remove extraneous characters left from Vehicle values being floats
    # (drops the last two characters - presumably a trailing ".0"; verify against the data)
    df['Vehicle'] = df['Vehicle'].str[:-2]
    # tactical definition of categories
    allcols,textcols,continuouscols,timecols,collist = define_feature_categories(df)
    # fill in missing values
    # NOTE(review): the result of this expression is discarded, so the line has no effect
    df.isnull().sum(axis = 0)
    df = fix_anomalous_columns(df)
    # NOTE(review): argument count/order does not match fill_missing as defined in this notebook
    df = fill_missing(df,allcols,textcols,continuouscols,timecols,collist)
    # create new column combining date + time (needed for resampling) and make it the index
    df['Report Date Time'] = df.apply(lambda x: replace_time(x['Report Date'], x['Time']), axis=1)
    df.index = df['Report Date Time']
    # return the updated dataframe along with the column category lists
    return(df,allcols,textcols,continuouscols,timecols,collist)

# Master cell
This cell contains calls to the other functions in this notebook to complete the data preparation

In [76]:
# master cell to call the other functions
# get the path for data files
path = get_path()
print("path is ",path)
config = get_config('data_preparation_config.yml')
# get_config prints the error and returns None when the file cannot be read;
# stop here with a clear message instead of failing later on config[...]
if config is None:
    raise RuntimeError("could not load data_preparation_config.yml - see the error printed above")
logging.getLogger().setLevel(logging.WARNING)
logging.warning("logging check")
# load the input dataset (from the CSV or from its pickled copy, depending on the config)
df = ingest_data(path,config['file_names']['input_csv'],config['file_names']['pickled_input_dataframe'],config['general']['save_raw_dataframe'],config['general']['load_from_scratch'])
# iterate through columns to get basic information
for col in list(df):
    print("Missing values in ",col," ",str(df[col].isna().sum()))
    print("Distinct values ",str(df[col].nunique()))
# replace missing values, using the column lists and default values from the config file
# (a commented-out block here also read config['excluded']; it is not used by this cell)
df = fill_missing(df,config['text'],config['continuous'],config['categorical'],config['general']['text_default'],config['general']['continuous_default'],config['general']['categorical_default'])

# optionally save the prepared dataframe as a pickle file in the data directory
if config['general']['save_transformed_dataframe']:
    print("path is ",path)
    file_name = os.path.join(path,config['file_names']['pickled_output_dataframe'])
    print("file_name is ",file_name)
    df.to_pickle(file_name)
# last expression of the cell: show the first rows of the prepared dataframe
df.head()



path is  C:\personal\manning_liveproject\end_to_end_deep_learning_live_project\data
current directory is: C:\personal\manning_liveproject\end_to_end_deep_learning_live_project\notebooks
path_to_yaml C:\personal\manning_liveproject\end_to_end_deep_learning_live_project\notebooks\data_preparation_config.yml
Missing values in  id   0
Distinct values  48895
Missing values in  name   0
Distinct values  47906
Missing values in  host_id   0
Distinct values  37457
Missing values in  host_name   0
Distinct values  11453
Missing values in  neighbourhood_group   0
Distinct values  5
Missing values in  neighbourhood   0
Distinct values  221
Missing values in  latitude   0
Distinct values  19048
Missing values in  longitude   0
Distinct values  14718
Missing values in  room_type   0
Distinct values  3
Missing values in  price   0
Distinct values  674
Missing values in  minimum_nights   0
Distinct values  109
Missing values in  number_of_reviews   0
Distinct values  394
Missing values in  last_revie

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149.0,1.0,9.0,2018-10-19,0.21,6.0,365.0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225.0,1.0,45.0,2019-05-21,0.38,2.0,355.0
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150.0,3.0,0.0,2019-01-01,0.0,1.0,365.0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89.0,1.0,270.0,2019-07-05,4.64,1.0,194.0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80.0,10.0,9.0,2018-11-19,0.1,1.0,0.0


In [77]:
# display the first rows of df
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149.0,1.0,9.0,2018-10-19,0.21,6.0,365.0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225.0,1.0,45.0,2019-05-21,0.38,2.0,355.0
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150.0,3.0,0.0,2019-01-01,0.0,1.0,365.0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89.0,1.0,270.0,2019-07-05,4.64,1.0,194.0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80.0,10.0,9.0,2018-11-19,0.1,1.0,0.0
