In [3]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

In [26]:
# This isn't exactly production-grade, but a quick check for development
# These checks can save some head-scratching in development when moving from
# one python environment to another, for example
expected_model_version = '1.0'
model_path = 'C:/Users/Mohammad/DataScienceCapstone/models/df_pricing.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    if model.version != expected_model_version:
        print("Expected model version doesn't match version loaded")
    if model.sklearn_version != sklearn_version:
        print("Warning: model created under different sklearn version")
else:
    print("Expected model not found")

In [27]:
ski_data = pd.read_csv('C:/Users/Mohammad/DataScienceCapstone/data1/df1.csv')
ski_data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0_x,id_x,neighbourhood_x,price,minimum_nights_x,number_of_reviews_x,reviews_per_month_x,calculated_host_listings_count_x,availability_365_x,...,minimum_nights_y,number_of_reviews_y,reviews_per_month_y,calculated_host_listings_count_y,availability_365_y,diff_y,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,0,0,108061,28801,82,30,89,0.79,2,173,...,30,89,0.79,2,173,516.0,1,0,0,0
1,1,1,155305,28806,90,1,279,2.44,6,352,...,1,279,2.44,6,352,144.0,1,0,0,0
2,2,2,156926,28806,30,1,290,2.78,6,0,...,1,290,2.78,6,0,179.0,0,0,0,1
3,3,3,160594,28801,125,30,58,0.51,1,0,...,30,58,0.51,1,0,2019.0,0,0,1,0
4,4,4,209068,28804,134,30,56,0.5,1,345,...,30,56,0.5,1,345,216.0,1,0,0,0


In [29]:
big_mountain = ski_data[ski_data.id_x == 160594]

In [30]:
big_mountain.T

Unnamed: 0,3
Unnamed: 0,3.0
Unnamed: 0_x,3.0
id_x,160594.0
neighbourhood_x,28801.0
price,125.0
minimum_nights_x,30.0
number_of_reviews_x,58.0
reviews_per_month_x,0.51
calculated_host_listings_count_x,1.0
availability_365_x,0.0


In [31]:

X = ski_data.loc[ski_data.id_x != 160594, model.X_columns]
y = ski_data.loc[ski_data.id_x != 160594, 'price']

In [42]:
len(X), len(y)

(2257, 2257)

In [43]:
model.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', None),
                ('randomforestregressor',
                 RandomForestRegressor(n_estimators=784, random_state=47))])

In [44]:
cv_results = cross_validate(model, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [45]:
cv_results['test_score']

array([ -64.04967999,  -62.74500237, -108.49533633, -102.73020838,
       -151.15573951])

In [46]:
mae_mean, mae_std = np.mean(-1 * cv_results['test_score']), np.std(-1 * cv_results['test_score'])
mae_mean, mae_std

(97.83519331702033, 32.721439086057735)

In [47]:
X_bm = ski_data.loc[ski_data.id_x == 160594, model.X_columns]
y_bm = ski_data.loc[ski_data.id_x == 160594, 'price']

In [48]:
bm_pred = model.predict(X_bm).item()

In [49]:
y_bm = y_bm.values.item()

In [51]:
print(f'Big Mountain Resort modelled price is ${bm_pred:.2f}, actual price is ${y_bm:.2f}.')
print(f'Even with the expected mean absolute error of ${mae_mean:.2f}, this suggests there is room for an change.')

Big Mountain Resort modelled price is $74.68, actual price is $125.00.
Even with the expected mean absolute error of $97.84, this suggests there is room for an change.
