# NYC Airbnb Price Prediction - fastai model training

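Use the dataset published on Kaggle - https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data - to train a simple fastai tabular model to predict prices for Airbnb properties (when the target is not treated as continuous, the model predicts whether a listing's price is at or above the mean price).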

This notebook contains the code to train the model from the dataset prepared in the [data cleanup](https://github.com/ryanmark1867/fastai_basics/blob/master/notebooks/data_cleanup.ipynb) notebook. It is adapted from the [Keras model training notebook](https://github.com/ryanmark1867/deep_learning_basics/blob/master/notebooks/model_training.ipynb) trained on the same dataset.


# Links to key parts of the notebook <a name='linkanchor' />
<a href=#ingestdash>Ingest data</a>

<a href=#buildpipe>Build pipeline</a>

<a href=#modelfit>Define and fit model</a>



# Common imports and global variable definitions

In [30]:

''' check to see if the notebook is being run in Colab, and if so, set the current directory appropriately'''
# the check looks for 'google.colab' in the string form of the IPython shell object
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  # mount Google Drive so that the notebook directory below is reachable
  drive.mount('/content/drive')
  # later cells resolve the config file relative to the current directory, so switch to the notebook directory
  %cd /content/drive/MyDrive/machine_learning_tabular_book/code/fastai_basics/notebooks

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/machine_learning_tabular_book/code/fastai_basics/notebooks


In [31]:
import time
# record the start time so the last cell of the notebook can print the elapsed run time
start_time = time.time()

In [32]:
# fastai imports
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.tabular.all import *

In [33]:
# common imports
import zipfile
import pandas as pd
import numpy as np
import time
import seaborn as sns
from matplotlib import pyplot
# import datetime, timedelta
import datetime
import pydotplus
from datetime import datetime, timedelta
from datetime import date
from dateutil import relativedelta
from io import StringIO
import pandas as pd
import pickle
from pickle import dump
from pickle import load
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
# DSX code to import uploaded documents
from io import StringIO
import requests
import json
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import os
import yaml
import math
import sys
from subprocess import check_output
from IPython.display import display
#model libraries

#from datetime import date
from sklearn import metrics



In [34]:
# load config file
# the config file model_training_config.yml is read from the current working directory
current_path = os.getcwd()
print("current directory is: "+current_path)

path_to_yaml = os.path.join(current_path, 'model_training_config.yml')
print("path_to_yaml "+path_to_yaml)
try:
    with open (path_to_yaml, 'r') as c_file:
        config = yaml.safe_load(c_file)
except Exception as e:
    # report the cause and stop here: the following cells all read `config`, so
    # carrying on without it would only fail later with an unrelated NameError
    print('Error reading the config file')
    print(e)
    raise


current directory is: /content/drive/MyDrive/machine_learning_tabular_book/code/fastai_basics/notebooks
path_to_yaml /content/drive/MyDrive/machine_learning_tabular_book/code/fastai_basics/notebooks/model_training_config.yml


In [35]:
# load parameters
# every value below is read from the `config` dictionary loaded from the YAML config file,
# except time_of_day and early_stop, which are set directly in this cell

repeatable_run = config['test_parms']['repeatable_run']
# fix seeds to get identical results on multiple runs
if repeatable_run:
    import random
    import torch
    from numpy.random import seed
    seed(4)
    # seed PyTorch (used by fastai) rather than TensorFlow, which this notebook never imports
    torch.manual_seed(7)
    # also seed Python's random module; fastai's own set_seed helper seeds it together with torch and numpy
    random.seed(7)


testproportion = config['test_parms']['testproportion'] # proportion of data reserved for test set
trainproportion = config['test_parms']['trainproportion'] # proportion of non-test data dedicated to training (vs. validation)
get_test_train_acc = config['test_parms']['get_test_train_acc']
verboseout = config['general']['verboseout']
includetext = config['general']['includetext'] # switch to determine whether text columns are included in the model
save_model_plot = config['general']['save_model_plot'] # switch to determine whether to generate plot with plot_model
tensorboard_callback = config['general']['tensorboard_callback'] # switch to determine if tensorboard callback defined

presaved = config['general']['presaved']
savemodel = config['general']['savemodel']
picklemodel = config['general']['picklemodel']
hctextmax = config['general']['hctextmax']
maxwords = config['general']['maxwords']
textmax = config['general']['textmax']

targetthresh = config['general']['targetthresh']
targetcontinuous = config['general']['targetcontinuous'] # switch: use the target column as-is (True) or binarize it around its mean (False)
target_col = config['general']['target_col'] # name of the dataframe column the target is derived from

#time of day thresholds
# each bucket covers the half-open hour range start <= hour < end (see get_time)
time_of_day = {'overnight':{'start':0,'end':5},'morning_rush':{'start':5,'end':10},
              'midday':{'start':10,'end':15},'aft_rush':{'start':15,'end':19},'evening':{'start':19,'end':24}}



emptythresh = config['general']['emptythresh']
zero_weight = config['general']['zero_weight']
one_weight = config['general']['one_weight']
one_weight_offset = config['general']['one_weight_offset'] # added to the class ratio in set_experiment_parameters
patience_threshold = config['general']['patience_threshold']


# modifier for saved model elements
modifier = config['general']['modifier']

# control whether training controlled by early stop
early_stop = True

# default hyperparameter values
learning_rate = config['hyperparameters']['learning_rate']
dropout_rate = config['hyperparameters']['dropout_rate']
l2_lambda = config['hyperparameters']['l2_lambda']
loss_func = config['hyperparameters']['loss_func']
output_activation = config['hyperparameters']['output_activation']
batch_size = config['hyperparameters']['batch_size']
epochs = config['hyperparameters']['epochs']

# date values
date_today = datetime.now()
print("date today",date_today)

# pickled original dataset and post-preprocessing dataset
pickled_data_file = config['general']['pickled_data_file']
pickled_dataframe = config['general']['pickled_dataframe'] # file name that ingest_data loads from the data directory

# experiment parameter

current_experiment = config['test_parms']['current_experiment']

# load lists of column categories
collist = config['categorical']
textcols = config['text']
continuouscols = config['continuous']
excludefromcolist = config['excluded']

date today 2022-10-23 03:59:09.540405


# Helper functions

In [36]:
# time_of_day = {'overnight':{'start':0,'end':5},'morning_rush':{'start':5,'end':10},
#              'midday':{'start':10,'end':15},'aft_rush':{'start':15,'end':19},'evening':{'start':19,'end':23}}


def get_time(hour, thresholds=None):
    '''map an hour to the name of the time-of-day bucket that contains it

    Args:
        hour: hour value, matched against half-open ranges start <= hour < end
        thresholds: optional dictionary of bucket name -> {'start': ..., 'end': ...};
            when omitted, the module-level time_of_day dictionary is used

    Returns:
        tod_out: name of the first bucket whose range contains hour

    Raises:
        ValueError: if hour does not fall inside any bucket
    '''
    if thresholds is None:
        thresholds = time_of_day
    for tod, bounds in thresholds.items():
        if bounds['start'] <= hour < bounds['end']:
            return(tod)
    # without this explicit error, an out-of-range hour would surface as an unbound local variable
    raise ValueError("hour %r is outside every time of day range" % (hour,))

def weekend_time(day, tod):
    '''prefix the time-of-day label with 'w' when the day is a weekend day

    Args:
        day: day name; only 'Saturday' and 'Sunday' are treated as weekend
        tod: time-of-day label

    Returns:
        'w' followed by tod for weekend days, otherwise tod unchanged
    '''
    is_weekday = (day != 'Saturday') and (day != 'Sunday')
    if is_weekday:
        return(tod)
    return('w'+tod)




In [37]:
# get the paths required

def get_path():
    '''get the path for data files

    Returns:
        path: absolute path of the "data" directory next to the current working directory
    '''
    # step up one level from the current working directory and into its sibling "data" directory
    data_dir = os.path.join(os.getcwd(), '..', 'data')
    return(os.path.abspath(data_dir))

def get_pipeline_path():
    '''get the path for pipeline files

    Returns:
        path: absolute path of the "pipelines" directory next to the current working directory
    '''
    # pipeline files are in a directory called "pipelines" that is a sibling to the current working directory
    sibling_dir = os.path.join(os.getcwd(), '..', 'pipelines')
    path = os.path.abspath(sibling_dir)
    return(path)

def get_model_path():
    '''get the path for model files

    Returns:
        path: absolute path of the "models" directory next to the current working directory
    '''
    # model files are in a directory called "models" that is a sibling to the current working directory
    relative_parts = (os.getcwd(), '..', 'models')
    return(os.path.abspath(os.path.join(*relative_parts)))

In [38]:
def set_experiment_parameters(experiment_number, count_no_delay, count_delay):
    ''' set the appropriate parameters for the experiment
    Args:
        experiment_number: number identifying which experiment preset to apply
        count_no_delay: count of negative outcomes in the dataset
        count_delay: count of positive outcomes in the dataset

    Returns:
        early_stop: whether the experiment includes an early stop callback
        one_weight: weight applied to positive outcomes
        epochs: number of epochs in the experiment
        es_monitor: performance measurement tracked in callbacks
        es_mode: direction of performance being tracked in callbacks

    Note:
        experiments 3, 4, 5 and 9 divide count_no_delay by count_delay, so
        count_delay must not be zero for those experiments.
        For an experiment number with no preset, early stopping is switched on
        and the module-level one_weight and epochs values are returned.
    '''
    print("setting parameters for experiment ", experiment_number)
    # one row per experiment:
    # (early_stop, weight positives by the class ratio, epochs, es_monitor, es_mode)
    # "val_loss" / "min" are the default settings for early stopping.
    # The original comments for experiment 5 suggest switching "val_accuracy" to
    # "val_acc" (for Python older than 3.7) if early stopping fails because of the
    # level of TensorFlow/Python.
    # Experiment 9 used to assign a local get_test_train_acc = False that was never
    # read or returned; that dead assignment is omitted here.
    presets = {
        0: (False, False, 1, "val_loss", "min"),
        1: (False, False, 10, "val_loss", "min"),
        2: (False, False, 50, "val_loss", "min"),
        3: (False, True, 50, "val_loss", "min"),
        4: (True, True, 50, "val_loss", "min"),
        5: (True, True, 50, "val_accuracy", "max"),
        9: (True, True, 20, "val_accuracy", "max"),
    }
    if experiment_number not in presets:
        # one_weight and epochs are never assigned inside this function, so these
        # names refer to the module-level values loaded from the config file
        return(True, one_weight, epochs, "val_loss", "min")
    stop_early, use_class_ratio, n_epochs, es_monitor, es_mode = presets[experiment_number]
    if use_class_ratio:
        # weight positives by how much rarer they are than negatives, plus the configured offset
        positive_weight = (count_no_delay/count_delay) + one_weight_offset
    else:
        positive_weight = 1.0
    return(stop_early, positive_weight, n_epochs, es_monitor, es_mode)






# Ingest data and create refactored dataframe <a name='ingestdash' />
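- Ingest the prepared Airbnb dataset from the pickled dataframe produced by the data cleanup notebook
- Add a `target` column derived from the configured target column (either the value itself or a 0/1 flag for values at or above the column mean)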


<a href=#linkanchor>Back to link list</a>

In [39]:
def ingest_data(path):
    '''load the prepared dataset from its pickle file into a dataframe
    Args:
        path: path for data files

    Returns:
        merged_data: dataframe loaded from pickle file
    '''
    # pickled_dataframe is the module-level file name read from the config file
    file_name = os.path.join(path,pickled_dataframe)
    # NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source
    merged_data = pd.read_pickle(file_name)
    return(merged_data)

In [40]:
def prep_merged_data(merged_data,target_col):
    '''add the derived target column to the merged_data dataframe
    Args:
        merged_data: input dataframe; it is modified in place
        target_col: column that is the target

    Returns:
        merged_data: the same dataframe with a 'target' column added
    '''
    source_values = merged_data[target_col]
    if not targetcontinuous:
        # binary target: 1 where the value is at or above the column mean, otherwise 0
        at_or_above_mean = source_values >= source_values.mean()
        merged_data['target'] = np.where(at_or_above_mean, 1, 0 )
        return(merged_data)
    # continuous target: use the column values unchanged
    merged_data['target'] = source_values
    return(merged_data)

# Master Prep Cell
Contains calls to functions to load the data and add the target column to the dataframe

In [41]:
# master calls

path = get_path()
print("path is",path)
# load the prepared dataframe from the data directory, then add the 'target' column
merged_data = ingest_data(path)
merged_data = prep_merged_data(merged_data,target_col)


# the block below is disabled code kept as a string literal; because it is the last
# expression in the cell, the notebook echoes the string as the cell output
'''
print("shape of pre refactored dataset", merged_data.shape)
#merged_data['year'].value_counts()
#merged_data.groupby(['Route','Direction']).size().reset_index().rename(columns={0:'count'}).tail(50)
# create refactored dataframe with one row for each route / direction / timeslot combination
print("shape of refactored dataset", merged_data.shape)
count_no_delay = merged_data[merged_data['target']==0].shape[0]
count_delay = merged_data[merged_data['target']==1].shape[0]
print("count under mean ",count_no_delay)
print("count over mean ",count_delay)
# define parameters for the current experiment
experiment_number = current_experiment
early_stop, one_weight, epochs,es_monitor,es_mode = set_experiment_parameters(experiment_number, count_no_delay, count_delay)
print("early_stop is ",early_stop)
print("one_weight is ",one_weight)
print("epochs is ",epochs)
print("es_monitor is ",es_monitor)
print("es_mode is ",es_mode)
'''

path is /content/drive/MyDrive/machine_learning_tabular_book/code/fastai_basics/data


'\nprint("shape of pre refactored dataset", merged_data.shape)\n#merged_data[\'year\'].value_counts()\n#merged_data.groupby([\'Route\',\'Direction\']).size().reset_index().rename(columns={0:\'count\'}).tail(50)\n# create refactored dataframe with one row for each route / direction / timeslot combination\nprint("shape of refactored dataset", merged_data.shape)\ncount_no_delay = merged_data[merged_data[\'target\']==0].shape[0]\ncount_delay = merged_data[merged_data[\'target\']==1].shape[0]\nprint("count under mean ",count_no_delay)\nprint("count over mean ",count_delay)\n# define parameters for the current experiment\nexperiment_number = current_experiment\nearly_stop, one_weight, epochs,es_monitor,es_mode = set_experiment_parameters(experiment_number, count_no_delay, count_delay)\nprint("early_stop is ",early_stop)\nprint("one_weight is ",one_weight)\nprint("epochs is ",epochs)\nprint("es_monitor is ",es_monitor)\nprint("es_mode is ",es_mode)\n'

In [42]:
# show the number of rows and columns in the prepared dataframe
merged_data.shape

(48895, 18)

In [43]:
# preview the first rows, including the derived target column
merged_data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,"(latitude, longitude)",target
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,"(40.64749, -73.97237)",0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,"(40.75362, -73.98377)",1
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,2019-01-01,0.0,1,365,"(40.80902, -73.9419)",0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,"(40.68514, -73.95976)",0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,"(40.79851, -73.94399)",0


# Define training, validation, and test subsets of the dataset

In [44]:
def get_train_validation_test(dataset, random_state=None):
    '''get training, validation and test subsets of the dataset
    Args:
        dataset: input dataframe
        random_state: optional seed used for both splits; when None (the default)
            the test split is unseeded and the train/validation split uses the
            fixed seed 123

    Returns:
        dtrain: training subset of dataset
        dvalid: validation subset of dataset
        dtest: test subset of dataset
    '''
    # testproportion and trainproportion are module-level values read from the config file
    train, test = train_test_split(dataset, test_size = testproportion, random_state=random_state)
    # keep the fixed seed for the train/validation split unless the caller supplies one
    valid_seed = 123 if random_state is None else random_state
    dtrain, dvalid = train_test_split(train, random_state=valid_seed, train_size=trainproportion)
    print("Through train test split. Test proportion:")
    print(testproportion)
    return(dtrain,dvalid,test)



# Build Pipeline <a name='buildpipe' />

Create pipeline objects to perform final data preparation steps for training and inference.

Note that cleanup on the training dataset is completed upstream in the [data cleanup notebook](https://github.com/ryanmark1867/end_to_end_deep_learning_liveproject/blob/master/notebooks/data_cleanup.ipynb). 
- The pipelines only accomplish the subset of preparation that is required for both training and inference
- Because the scoring data coming in for inference is forced by the web deployment to avoid the invalid values that the data cleanup notebook deals with, the pipelines don't have to deal with those problems.

<a href=#linkanchor>Back to link list</a>

In [45]:
# Features are
# neighbourhood_group
# neighbourhood
# room_type
# minimum_nights
# number_of_reviews
# reviews_per_month
# calculated_host_listings_count



In [46]:
# explicitly define the dependent variable and the categorical (cat) and continuous (cont) feature lists
# Features are
# neighbourhood_group
# neighbourhood
# room_type
# minimum_nights
# number_of_reviews
# reviews_per_month
# calculated_host_listings_count
# dep_var is the column the model predicts
dep_var = 'target'
cat = ['neighbourhood_group','neighbourhood','room_type']
cont = ['minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count']
print("continuous columns are: ",cont)
print("categorical columns are: ",cat)

continuous columns are:  ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count']
categorical columns are:  ['neighbourhood_group', 'neighbourhood', 'room_type']


In [47]:
# confirm the name of the dependent variable column
dep_var

'target'

In [48]:
# important: if the target column isn't explicitly cast to string, fastai will interpret the
# problem as regression rather than classification and accuracy will be bad and static
# this is a tricky problem because the values in the target column look like they are string but are integer
# clue is that show_batch shows target column values as floating point if you don't cast target to string explicitly
merged_data['target'] =merged_data.target.astype(str)


In [49]:
# preview the dataframe again after the target column has been cast to string
merged_data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,"(latitude, longitude)",target
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,"(40.64749, -73.97237)",0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,"(40.75362, -73.98377)",1
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,2019-01-01,0.0,1,365,"(40.80902, -73.9419)",0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,"(40.68514, -73.95976)",0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,"(40.79851, -73.94399)",0


## Define and fit model <a name='modelfit' />
- use the unique fastai tabular data capabilities

<a href=#linkanchor>Back to link list</a>

In [50]:
# number of rows in the dataset
merged_data.shape[0]

48895

In [51]:
# index range covering the last 5000 rows of the dataset
# (note: the dataloader definition below holds out the last 10000 rows, not 5000)
range((merged_data.shape[0]-5000),merged_data.shape[0])

range(43895, 48895)

In [59]:
# define TabularDataLoaders object using the dataframe, the list of pre-processing steps, the categorical and continuous
# column lists
# valid_idx: the indices to use for the validation set - here the last 10000 rows of merged_data, in their existing order
# bs: batch size, hard-coded to 32 here rather than read from the batch_size config value
# note: path is rebound to '.' here; earlier in the notebook it held the data directory returned by get_path()
path = '.'
procs = [FillMissing,Categorify, Normalize]
dls = TabularDataLoaders.from_df(merged_data,
                                 path,
                                 procs= procs, 
                                 cat_names= cat, 
                                 cont_names = cont, 
                                 y_names = dep_var,
                                 valid_idx=list(range((merged_data.shape[0]-10000),merged_data.shape[0])), 
                                 bs=32)



In [53]:
# display a sample batch from the validation dataloader
dls.valid.show_batch()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,target
0,Manhattan,Hell's Kitchen,Private room,1.0,18.0,2.49,30.000001,1
1,Manhattan,Hell's Kitchen,Private room,1.0,16.0,2.22,30.000001,1
2,Manhattan,Hell's Kitchen,Private room,1.0,13.0,1.82,30.000001,1
3,Brooklyn,Bushwick,Private room,15.0,1e-06,1.455204e-08,2.0,0
4,Brooklyn,Williamsburg,Private room,19.999999,1.999999,0.42,1.0,0
5,Queens,Maspeth,Private room,30.000001,1e-06,1.455204e-08,103.000005,0
6,Manhattan,Kips Bay,Entire home/apt,2.0,1.999999,1.4,1.0,1
7,Queens,Rosedale,Private room,1.0,42.000001,5.78,2.0,0
8,Manhattan,Roosevelt Island,Private room,21.0,0.999999,0.24,1.0,0
9,Brooklyn,Clinton Hill,Entire home/apt,3.0,0.999999,0.16,1.0,1


In [54]:
# define and fit the model
# no loss function is passed to tabular_learner; the one the learner ends up with is displayed in a later cell
#loss_func = CrossEntropyLossFlat(reduction='none')
learn = tabular_learner(dls, metrics=accuracy)
# train for 3 epochs
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,time
0,0.434672,0.407458,0.8179,00:17
1,0.424458,0.406396,0.8149,00:11
2,0.385092,0.405317,0.8171,00:10


In [55]:
# show the loss function used by the learner (none was passed explicitly to tabular_learner)
learn.loss_func

FlattenedLoss of CrossEntropyLoss()

In [56]:
# show a set of results from the model: the target column next to the model's target_pred
learn.show_results()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,target,target_pred
0,2.0,14.0,2.0,-0.279778,-0.20918,1.611017,-0.198527,0.0,0.0
1,2.0,27.0,1.0,-0.233155,-0.579368,-0.680717,-0.198527,0.0,0.0
2,3.0,145.0,1.0,-0.093285,-0.579368,-0.680717,-0.198527,1.0,1.0
3,4.0,69.0,2.0,-0.279778,-0.168048,3.710052,-0.045376,0.0,0.0
4,3.0,96.0,2.0,-0.279778,-0.579368,-0.680717,0.260926,1.0,0.0
5,4.0,184.0,1.0,-0.233155,-0.188614,2.38181,-0.198527,1.0,0.0
6,4.0,167.0,1.0,-3.8e-05,-0.538236,0.096959,-0.198527,1.0,0.0
7,3.0,95.0,2.0,-0.093285,-0.51767,0.585586,0.158825,1.0,0.0
8,3.0,95.0,2.0,-3.8e-05,-0.455972,0.227718,-0.198527,0.0,0.0


In [57]:
# print the layer-by-layer summary of the model
learn.summary()

TabularModel (Input shape: 32 x 3)
Layer (type)         Output Shape         Param #    Trainable 
                     32 x 4              
Embedding                                 24         True      
____________________________________________________________________________
                     32 x 33             
Embedding                                 7326       True      
____________________________________________________________________________
                     32 x 3              
Embedding                                 12         True      
Dropout                                                        
BatchNorm1d                               8          True      
____________________________________________________________________________
                     32 x 200            
Linear                                    8800       True      
ReLU                                                           
BatchNorm1d                               400        T

In [58]:

# print elapsed time to run the notebook, measured from start_time recorded near the top of the notebook
print("--- %s seconds ---" % (time.time() - start_time))

--- 45.46606183052063 seconds ---
