In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

In [2]:
# This isn't exactly production-grade, but a quick check for development
# These checks can save some head-scratching in development when moving from
# one python environment to another, for example
expected_model_version = '1.0'
model_path = 'C:/Users/Mohammad/DataScienceCapstone/models/Airbnb_pricing_model.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    if model.version != expected_model_version:
        print("Expected model version doesn't match version loaded")
    if model.sklearn_version != sklearn_version:
        print("Warning: model created under different sklearn version")
else:
    print("Expected model not found")

In [5]:
ski_data = pd.read_csv('C:/Users/Mohammad/DataScienceCapstone/data/Airbnb_step3_features.csv')


In [6]:
big_mountain = ski_data[ski_data.id == 108061]

In [7]:
big_mountain.T

Unnamed: 0,0
id,108061
name,Walk to stores/parks/downtown. Fenced yard/Pet...
host_id,320564
host_name,Lisa
neighbourhood,28801
latitude,35.6067
longitude,-82.5556
room_type,Entire home/apt
price,82
minimum_nights,30


In [9]:

X = ski_data.loc[ski_data.id != 108061, model.X_columns]
y = ski_data.loc[ski_data.id != 108061, 'price']

In [10]:
len(X), len(y)

(2081, 2081)

In [11]:
model.fit(X, y)

Pipeline(memory=None,
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n

In [12]:
cv_results = cross_validate(model, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [13]:
cv_results['test_score']

array([-70.6870024 , -62.65021394, -90.09667308, -80.7395649 ,
       -95.34677404])

In [14]:
mae_mean, mae_std = np.mean(-1 * cv_results['test_score']), np.std(-1 * cv_results['test_score'])
mae_mean, mae_std

(79.90404567192401, 12.047524237556132)

In [15]:
X_bm = ski_data.loc[ski_data.id == 108061, model.X_columns]
y_bm = ski_data.loc[ski_data.id == 108061, 'price']

In [16]:
bm_pred = model.predict(X_bm).item()

In [17]:
y_bm = y_bm.values.item()

In [18]:
print(f'Big Mountain Resort modelled price is ${bm_pred:.2f}, actual price is ${y_bm:.2f}.')
print(f'Even with the expected mean absolute error of ${mae_mean:.2f}, this suggests there is room for an increase.')

Big Mountain Resort modelled price is $132.20, actual price is $82.00.
Even with the expected mean absolute error of $79.90, this suggests there is room for an increase.
