# NYC Airbnb Price Prediction - Data Preparation

Use the dataset published on Kaggle (https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data) to train a simple deep learning model that predicts prices for Airbnb properties.


This notebook contains the common data loading and preparation steps:
- load data from the input CSV
- do an assessment of the dataset to understand the number of distinct, missing, or invalid values by column


# Common imports and variables
Imports and variable definitions that are common to the entire notebook


In [2]:
import logging
import os
import pickle
import time
from collections import Counter

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import yaml

In [3]:
def get_config(config_file):
    ''' read the YAML config file for this notebook from the current working directory

    Args:
        config_file: name of the file containing config parameters, resolved
            relative to os.getcwd()

    Returns:
        config: object produced by yaml.safe_load for the file contents, or
            None when the file cannot be opened or parsed (the error is
            printed rather than raised)
    '''
    yaml_location = os.path.join(os.getcwd(), config_file)
    print("path_to_yaml " + yaml_location)
    try:
        with open(yaml_location, 'r') as yaml_handle:
            parsed = yaml.safe_load(yaml_handle)
    except Exception as error:
        # best-effort load: report the problem and hand back None to the caller
        print('Error reading the config file ' + str(error))
        return None
    return parsed


In [4]:
def print_config_values(config):
    ''' print every top-level config entry, one line per entry

    Args:
        config: dictionary of config parameters, e.g. as returned by get_config
    '''
    # f-strings (PEP 498, Python 3.6+) apply str() to each value, so no explicit
    # conversion is needed regardless of the type the YAML loader produced
    for key in config:
        value = config[key]
        print(f"config value {key} = {value}")

# Load Data
- ingest CSV into a Pandas dataframe 

In [5]:
def get_path():
    ''' build the absolute path of the data directory

    The data directory is expected to be called "data" and to be a sibling of
    the current working directory (the directory containing the notebook).

    Returns:
        path: absolute, normalised path of the data directory
    '''
    sibling_data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
    # abspath collapses the parent-directory hop into a clean absolute path
    return os.path.abspath(sibling_data_dir)

In [6]:
def ingest_data(path,input_csv,pickled_input_dataframe,save_raw_dataframe,load_from_scratch):
    ''' load the input data into a dataframe, either from the raw CSV or from a pickled copy
    Args:
        path: directory containing the input file and the pickled dataframe
        input_csv: name of the raw CSV file (read when load_from_scratch is set)
        pickled_input_dataframe: name of the pickled version of the input file
        save_raw_dataframe: when loading from scratch, also write the dataframe to the pickle file
        load_from_scratch: if true read the CSV, otherwise read the pickled dataframe

    Returns:
        dataframe holding the loaded data
    '''
    if not load_from_scratch:
        # NOTE: unpickling can execute arbitrary code - only read pickle files from a trusted source
        loaded_df = pd.read_pickle(os.path.join(path,pickled_input_dataframe))
        logging.debug("reloader done")
        return loaded_df
    # loading from scratch: the raw CSV file is expected to be in the data directory
    csv_location = os.path.join(path,input_csv)
    loaded_df = pd.read_csv(csv_location)
    if save_raw_dataframe:
        pickle_location = os.path.join(path,pickled_input_dataframe)
        print("file_name is ",pickle_location)
        loaded_df.to_pickle(pickle_location)
    return loaded_df

# Assess values
- assess columns for missing or invalid values

In [7]:
def not_in_list(x, list):
    ''' check whether a value is absent from a list
    Args:
        x: value to check
        list: list in which to check for the value

    Returns:
        True if the value is not in the list, False otherwise
    '''
    # the result is a bool, so summing the results counts the absent values
    is_present = x in list
    return not is_present
    

In [8]:
def neg_val(x):
    ''' check whether a value is negative
    Args:
        x: value to check

    Returns:
        result of the comparison x < 0 (True for a negative number, False otherwise)
    '''
    is_negative = x < 0
    return is_negative

In [9]:
def basic_assessment(df,columns,valid_values,non_neg_continuous):
    ''' assess the values in a dataframe and print the findings

    Prints, in order:
      - the number of missing and distinct values for every column of df
      - the number of values outside the valid set for each column in valid_values
      - the number of non-numeric values for each continuous column and, when a
        column has none and must be non-negative, its number of negative values

    Args:
        df: dataframe for assessment
        columns: dictionary of column names by category; only the 'continuous' entry is read here
        valid_values: dictionary of valid values for categorical columns with limited number of valid values
        non_neg_continuous: list of continuous columns with only non-negative values as valid
    '''
    # iterate over the dataframe's own columns; the columns argument is the config
    # dictionary and is only consulted for the continuous columns further down
    for col in df.columns:
        print("Missing values in ",col," ",df[col].isna().sum())
        print("Distinct values in ",col," ", df[col].nunique())

    # for categorical columns with a limited number of valid values, count the number of invalid values by column
    # (isin is the vectorized form of a per-row "not in list" check; missing values count as invalid)
    for col in valid_values:
        invalid = ~df[col].isin(valid_values[col])
        print("non-valid values in column ",col," ", invalid.sum())

    for col in columns['continuous']:
        numeric = pd.to_numeric(df[col], errors='coerce')
        # a value is non-numeric only if it was present and could not be converted;
        # values that were already missing are reported by the missing-values count above
        non_numeric_count = (numeric.isna() & df[col].notna()).sum()
        print("non-numeric values in continuous col ",col," ", non_numeric_count)
        # if there are no non-numeric values in the column and it must have non-negative values, count negative values
        if non_numeric_count == 0 and col in non_neg_continuous:
            # compare the converted values so numeric strings do not raise; missing values compare as not negative
            print("negative values in column ",col," ", (numeric < 0).sum())
    

In [10]:
def out_of_range(x,max,min):
    ''' check whether a value lies outside a range
    Args:
        x: value to check in range
        max: top of the range to check (inclusive)
        min: bottom of the range to check (inclusive)

    Returns:
        truthy if x is above max or below min, falsy otherwise
    '''
    above_top = x > max
    if above_top:
        return above_top
    # not above the top, so the answer is decided by the bottom of the range
    return x < min

In [11]:
def out_of_bounding_box(latitude,longitude,bounding_box):
    ''' count the locations that fall outside a bounding box

    A location is out of bounds when its latitude or longitude is outside the
    bounding box, or outside the globally valid ranges (-90..90 for latitude,
    -180..180 for longitude). Each location is counted at most once, however
    many of those limits it breaks. Missing (NaN) coordinates fail every
    comparison and are therefore not counted.

    Args:
        latitude: pandas Series with the latitude portion of each location
        longitude: pandas Series with the longitude portion of each location,
            sharing its index with latitude
        bounding_box: dictionary with min_lat, max_lat, min_long and max_long
            values to compare the locations with

    Returns:
        total: number of locations that are out of bounds
    '''
    min_lat =  bounding_box['min_lat']
    max_lat =  bounding_box['max_lat']
    min_long = bounding_box['min_long']
    max_long = bounding_box['max_long']
    t1 = time.perf_counter_ns()
    # vectorized comparisons on the Series; a row-wise apply() was measured at
    # roughly 10X slower for the same checks
    bad_latitude = (
        (latitude > 90) | (latitude < -90)
        | (latitude > max_lat) | (latitude < min_lat)
    )
    bad_longitude = (
        (longitude > 180) | (longitude < -180)
        | (longitude > max_long) | (longitude < min_long)
    )
    # OR the masks before summing so one location is never counted twice
    total = int((bad_latitude | bad_longitude).sum())
    print(f"lat/long checks took {time.perf_counter_ns() - t1} ns.")
    return total

In [12]:
def geo_assessment(df,bounding_box):
    ''' assess the geo columns in a dataframe and print the out-of-bounds count
    Args:
        df: dataframe for assessment; its latitude and longitude columns are checked
        bounding_box: dictionary of maximum and minimum valid latitude and longitude values
    '''
    out_of_bounds_count = out_of_bounding_box(
        latitude=df.latitude,
        longitude=df.longitude,
        bounding_box=bounding_box,
    )
    print("location out of bounds count ",out_of_bounds_count)

In [13]:
# Master cell
# Drives the data preparation by calling the functions defined in the cells above.

# locate the data directory
path = get_path()
print("path is ",path)
# read the parameters for this notebook
config = get_config('data_preparation_config.yml')
# set the root logger to WARNING and emit one record to confirm logging works
logging.getLogger().setLevel(logging.WARNING)
logging.warning("logging check")
print_config_values(config)
# load the dataframe and, if the parameter is set, save the raw dataframe as a pickle
df = ingest_data(
    path,
    config['file_names']['input_csv'],
    config['file_names']['pickled_input_dataframe'],
    config['general']['save_raw_dataframe'],
    config['general']['load_from_scratch'],
)
# report missing, distinct, invalid, non-numeric and negative values
basic_assessment(
    df,
    config['columns'],
    config['valid_values'],
    config['non_negative_continuous'],
)
# report how many locations fall outside the configured bounding box
geo_assessment(df,config['bounding_box'])
df.head()
# draw a reproducible 0.1% sample; its shape is the last expression of the cell
samp = df.sample(frac=.001, random_state=42)
samp.shape



path is  /media/srutis/Acer/Users/SRUTIS/projects/Manning/DLAirBnBPrices-LP/data
path_to_yaml /media/srutis/Acer/Users/SRUTIS/projects/Manning/DLAirBnBPrices-LP/solution_milestone1/data_preparation_config.yml
config value general = {'load_from_scratch': True, 'save_raw_dataframe': True, 'save_transformed_dataframe': True, 'remove_bad_values': True}
config value columns = {'categorical': ['neighbourhood_group', 'neighbourhood', 'room_type'], 'continuous': ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude'], 'date': ['last_review'], 'text': ['name', 'host_name'], 'excluded': ['id', 'hostid']}
config value category_defaults = {'categorical': 'missing', 'continuous': 0.0, 'text': 'missing', 'date': datetime.date(2019, 1, 1), 'excluded': 'missing'}
config value category_invalid_value_replacements = {'categorical': 'bad_categorical', 'continuous': 'bad_continuous', 'text': 'bad_text', 'date': 'bad_date', 'exclude': 'bad_excl

(49, 16)