In [1]:
import pandas as pd

In [2]:
from sklearn import linear_model

In [3]:
import numpy as np

In [4]:
import sklearn.metrics as skm

In [5]:
from sklearn.model_selection import GridSearchCV

### Data Reading and Subsetting

Run this to for debug:

In [81]:
quick_df = pd.read_csv("../fully_processed_data4.csv",
                      index_col = 0,
                      nrows = 2500)

Run this to use the whole dataset:

In [6]:
quick_df = pd.read_csv("../fully_processed_data4.csv",
                      index_col = 0)

  mask |= (ar1 == a)


In [7]:
(quick_df['price'] > 2500).sum()

1177

In [8]:
quick_df = quick_df[quick_df['price'] < 2500]

In [9]:
print(quick_df.shape)
quick_df.drop("date", axis = 1, inplace=True)   #we already have day month year column
quick_df.drop("zipcode", axis = 1, inplace=True)
print(quick_df.shape)

(1126711, 284)
(1126711, 282)


In [10]:
quick_df = quick_df.sample(frac = 0.6)

Partition the model into training, validation and test. Each set should contain different listing_ids:

In [11]:
listing_ids = pd.Series(list(set(quick_df["listing_id"])))
listing_ids

0         1048577
1         1966084
2        12713991
3        17432584
4        12713995
5        15073292
6          655374
7         8650772
8        10747925
9        20840469
10       16646171
11       15859741
12        8257567
13       15597600
14        9044001
15       15073315
16       10747942
17       14549031
18       11010086
19        8126505
20        1835049
21        6291497
22        2359340
23        8650800
24        8519730
25        1179701
26        6553657
27       14549056
28        6291522
29       16121924
           ...   
58702    11534269
58703    17301440
58704    15466434
58705    12713927
58706    15466441
58707    15990734
58708      524239
58709     1310672
58710     6553553
58711     4325329
58712     1572815
58713     1703893
58714     6553559
58715    11010008
58716    15204319
58717    19005408
58718    16383969
58719     4849634
58720    16777183
58721    16515048
58722    17956844
58723     6422511
58724     4718577
58725     5636082
58726     

In [12]:
len(listing_ids)

58732

In [13]:
train_id, validate_id, test_id = np.split(listing_ids.sample(frac=1), 
                                         [int(.6*len(listing_ids)), int(.8*len(listing_ids))])

In [14]:
train_id.shape

(35239,)

In [15]:
validate_id.shape

(11746,)

In [16]:
test_id.shape

(11747,)

In [17]:
set(train_id.values).intersection(validate_id.values)

set()

In [18]:
set(train_id.values).intersection(test_id.values)

set()

In [19]:
set(test_id.values).intersection(validate_id.values)

set()

In [20]:
train_df = quick_df[quick_df["listing_id"].isin(train_id)]
train_df.shape

(404723, 282)

In [21]:
validate_df = quick_df[quick_df["listing_id"].isin(validate_id)]
validate_df.shape

(136257, 282)

In [22]:
test_df = quick_df[quick_df["listing_id"].isin(test_id)]
test_df.shape

(135047, 282)

In [23]:
train_val_df = quick_df[quick_df["listing_id"].isin(train_id.append(validate_id))]
train_val_df.shape

(540980, 282)

### Generating Custom Folds

To make sure that each house's every listing only is in the training or validation set and is not in both sets a custom Kfold function was applied:

In [24]:
from sklearn.model_selection import GroupKFold

In [25]:
custom_cv = list(GroupKFold(n_splits = 3).split(train_val_df.drop("price",axis=1),
                                                train_val_df["price"],
                                                train_val_df["listing_id"]))
custom_cv

[(array([     0,      2,      3, ..., 540975, 540976, 540978]),
  array([     1,      6,      8, ..., 540973, 540977, 540979])),
 (array([     0,      1,      5, ..., 540977, 540978, 540979]),
  array([     2,      3,      4, ..., 540961, 540962, 540974])),
 (array([     1,      2,      3, ..., 540974, 540977, 540979]),
  array([     0,      5,     10, ..., 540975, 540976, 540978]))]

## Error Functions

In [26]:
def mean_abs_err(validate_df,y_pred):
    return sum(abs(validate_df["price"] - y_pred)/validate_df["price"])/len(y_pred)

In [27]:
def price_conf_int_check(validate_df, upper_perc, lower_perc, y_pred):
    upper_bound = validate_df["price"] * (1 + upper_perc)
    lower_bound = validate_df["price"] * (1 - lower_perc)
    
    return sum(list(lower_bound < y_pred) & (y_pred < upper_bound)) / len(y_pred)

## Random Forest

In [75]:
from sklearn.ensemble import RandomForestRegressor

In [76]:
from sklearn.model_selection import ParameterGrid

### Single Random Forest Fit

In [None]:
#rand_for = RandomForestRegressor(max_depth=10, random_state=0)
rand_for = RandomForestRegressor()
rand_for

In [None]:
rand_for.fit(train_df.drop("price",axis=1),train_df["price"])

In [None]:
rand_for.feature_importances_

In [None]:
y_pred_randfor = rand_for.predict(validate_df.drop("price",axis=1))
y_pred_randfor

In [None]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_randfor)

In [None]:
mean_abs_err(validate_df,y_pred_lasso)

In [None]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_randfor)

### Grid Search CV for RF

In [26]:
param_rand_for = {'n_estimators': [20, 50, 100], 
                  'max_depth': [None, 5], 
                  'min_samples_split': [10, 50]}
param_rand_for

{'max_depth': [None, 5],
 'min_samples_split': [10, 50],
 'n_estimators': [20, 50, 100]}

In [27]:
#rand_for = RandomForestRegressor(max_depth=10, random_state=0)
rand_for = RandomForestRegressor()
rand_for

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

For the Random Forest we can use the model's default scoring metric since it is MSE:

In [61]:
grid_rand_for = GridSearchCV(rand_for, 
                             param_rand_for, 
                             cv = custom_cv,
                             scoring = "neg_mean_squared_error")

In [62]:
grid_rand_for.fit(train_val_df.drop("price",axis=1),
                  train_val_df["price"])

GridSearchCV(cv=[(array([   0,    1, ..., 8202, 8203]), array([ 390,  391, ..., 8209, 8210])), (array([  26,   27, ..., 8209, 8210]), array([   0,    1, ..., 8171, 8172])), (array([   0,    1, ..., 8209, 8210]), array([  26,   27, ..., 8140, 8141])), (array([   0,    1, ..., 8209, 8210]), array([  64,   65, ..., 8202, 8203])), (array([   0,    1, ..., 8209, 8210]), array([ 189,  190, ..., 8034, 8035]))],
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [20, 50, 100], 'max_depth': [None, 5], 'min_samples_split': [10, 50]},
       pre_di

In [63]:
grid_rand_for.best_score_

-15242.25342682474

In [64]:
grid_rand_for.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [65]:
grid_rand_for.best_params_

{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}

## Linear Regression with Lasso

### Single Fit for LR with Lasso

In [68]:
lasso_alpha = 0.1

In [70]:
lasso_reg = linear_model.Lasso(alpha = lasso_alpha,
                               max_iter = 100000)

In [71]:
lasso_reg.fit(train_df.drop("price",axis=1),
              train_df["price"])

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [72]:
y_pred_lasso = lasso_reg.predict(validate_df.drop("price",axis=1))
y_pred_lasso

array([  87.44729073,  489.52224396,  350.67032685, ...,  278.92729904,
        122.63471519,  344.51550627])

In [73]:
(y_pred_lasso < 0).sum()

3341

In [74]:
y_pred_lasso[y_pred_lasso < 0] = 10

In [75]:
(y_pred_lasso < 0).sum()

0

In [76]:
validate_df["price"].values

array([  59.,  407.,  325., ...,  336.,   74.,  900.])

In [77]:
r2_score_lasso = skm.r2_score(validate_df["price"], y_pred_lasso)
r2_score_lasso

0.56039138706147062

In [78]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_lasso)

17160.638071194138

In [79]:
mean_abs_err(validate_df,y_pred_lasso)

0.54314637565235413

In [80]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_lasso)

0.30108544882097799

In [85]:
coef_dict = {}
for coef, feat in zip(lasso_reg.coef_,train_df.drop("price",axis=1).columns):
    coef_dict[feat] = coef
    
coef_dict

{'Households; Estimate; $10,000 to $14,999': 0.87992440261252602,
 'Households; Estimate; $100,000 to $149,999': -0.65902632652875215,
 'Households; Estimate; $15,000 to $24,999': 3.2081403830079367,
 'Households; Estimate; $150,000 to $199,999': 0.67082999079513606,
 'Households; Estimate; $200,000 or more': 2.4260529487021931,
 'Households; Estimate; $25,000 to $34,999': -2.3726650129623619,
 'Households; Estimate; $35,000 to $49,999': -2.0840076216120993,
 'Households; Estimate; $50,000 to $74,999': -0.2930070878166714,
 'Households; Estimate; $75,000 to $99,999': -1.6752739033019319,
 'Households; Estimate; Less than $10,000': 0.0,
 'Total; Estimate; AGE - 10 to 14 years': -8.0201555182659146,
 'Total; Estimate; AGE - 15 to 19 years': 1.4919931035378458,
 'Total; Estimate; AGE - 20 to 24 years': -2.4528466300948564,
 'Total; Estimate; AGE - 25 to 29 years': 1.7092104433866688,
 'Total; Estimate; AGE - 30 to 34 years': 0.23853919767438808,
 'Total; Estimate; AGE - 35 to 39 years': -

In [87]:
import operator
sorted_x = sorted(coef_dict.items(), key=operator.itemgetter(1))
sorted_x

[('state: CO', -29.678294764006996),
 ('state: TN', -20.802854417686934),
 ('review_scores_value', -17.993085439795397),
 ('Total; Estimate; AGE - 85 years and over', -11.5295896686718),
 ('state: NY', -9.197866251437592),
 ('Total; Estimate; AGE - 10 to 14 years', -8.0201555182659146),
 ('state: CA', -5.4395154021091194),
 ('state: LA', -3.9295226199282332),
 ('Total; Estimate; AGE - Under 5 years', -3.036883744555714),
 ('Total; Estimate; AGE - 70 to 74 years', -2.9014518576492678),
 ('review_scores_communication', -2.7505524135718842),
 ('reviews_per_month', -2.7006565764368329),
 ('review_scores_checkin', -2.4582228875665315),
 ('Total; Estimate; AGE - 20 to 24 years', -2.4528466300948564),
 ('Total; Estimate; AGE - 40 to 44 years', -2.4201267012934875),
 ('Households; Estimate; $25,000 to $34,999', -2.3726650129623619),
 ('amenities: Safety Card', -2.2668145112299491),
 ('amenities: Pool', -2.2289543815526525),
 ('amenities: Free parking on premises', -2.1577284705703548),
 ('Hous

In [84]:
sorted(coef_dict, key=coef_dict.get)

['state: CO',
 'state: TN',
 'review_scores_value',
 'Total; Estimate; AGE - 85 years and over',
 'state: NY',
 'Total; Estimate; AGE - 10 to 14 years',
 'state: CA',
 'state: LA',
 'Total; Estimate; AGE - Under 5 years',
 'Total; Estimate; AGE - 70 to 74 years',
 'review_scores_communication',
 'reviews_per_month',
 'review_scores_checkin',
 'Total; Estimate; AGE - 20 to 24 years',
 'Total; Estimate; AGE - 40 to 44 years',
 'Households; Estimate; $25,000 to $34,999',
 'amenities: Safety Card',
 'amenities: Pool',
 'amenities: Free parking on premises',
 'Households; Estimate; $35,000 to $49,999',
 'room_type: Shared room',
 'Households; Estimate; $75,000 to $99,999',
 'amenities: Washer',
 'amenities: Keypad',
 'host_response_time: within a day',
 'Total; Estimate; AGE - 60 to 64 years',
 'Total; Estimate; AGE - 35 to 39 years',
 'amenities: First aid kit',
 'amenities: 24-hour check-in',
 'host_is_superhost: t',
 'property_type: Guesthouse',
 'Households; Estimate; $100,000 to $149,9

### Grid Search CV for LR with Lasso

In [180]:
param_lasso = {'alpha': [0.1, 1, 10]}
param_lasso

{'alpha': [0.1, 1, 10]}

In [181]:
lasso_reg = linear_model.Lasso(max_iter = 100000)

In [184]:
grid_lasso = GridSearchCV(lasso_reg, 
                          param_lasso, 
                          cv = custom_cv,
                          scoring = "neg_mean_squared_error")

DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20

GridSearchCV, by convention, always tries to maximize its score so loss functions like MSE have to be negated.

In [None]:
grid_lasso.fit(train_val_df.drop("price",axis=1),
               train_val_df["price"])

In [58]:
grid_lasso.best_score_

-14209.247074593983

A non-negative floating point value (the best value is 0.0), or an array of floating point values, one for each individual target.

In [59]:
grid_lasso.best_estimator_

Lasso(alpha=0.25, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [60]:
grid_lasso.best_params_

{'alpha': 0.25}

## XGBoost

In [23]:
import xgboost as xgb

In [24]:
from xgboost import XGBRegressor

### Single Fit for XGBoost

In [93]:
xgb = XGBRegressor()
xgb

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [94]:
xgb.fit(train_df.drop("price",axis=1),
        train_df["price"])

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [95]:
y_pred_xgb = xgb.predict(validate_df.drop("price",axis=1))
y_pred_xgb

array([  48.90805817,  141.31481934,  203.44876099, ...,  195.69056702,
         90.83380127,   85.05639648], dtype=float32)

In [96]:
r2_score_xgb = skm.r2_score(validate_df["price"], y_pred_xgb)
r2_score_xgb

0.67213598586220358

In [97]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_xgb)

11509.385381692235

In [61]:
mean_abs_err(validate_df,y_pred_xgb)

0.38584839320455511

In [62]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_xgb)

0.28437986507524649

### Grid Search CV for XGBoost

In [25]:
#param_xgb = {'learning_rate': [0.1, 0.25, 0.5]}
param_xgb = {'learning_rate': [0.1, 0.01],
             'max_depth':[3,10],
             'n_estimators':[100,200]}
param_xgb

{'learning_rate': [0.1, 0.01],
 'max_depth': [3, 10],
 'n_estimators': [100, 200]}

In [26]:
xgb = XGBRegressor()

In [27]:
grid_xgb = GridSearchCV(xgb, 
                        param_xgb, 
                        cv = custom_cv,
                        scoring = "neg_mean_squared_error")

In [28]:
grid_xgb.fit(train_val_df.drop("price",axis=1),
             train_val_df["price"])

GridSearchCV(cv=[(array([     0,      1, ..., 902442, 902443]), array([    16,     17, ..., 902496, 902497])), (array([    16,     17, ..., 902496, 902497]), array([     0,      1, ..., 902363, 902364])), (array([     0,      1, ..., 902496, 902497]), array([   253,    254, ..., 902173, 902174])), (array([     0,      1, ..., 902496, 902497]), array([   136,    137, ..., 902413, 902414])), (array([     0,      1, ..., 902496, 902497]), array([   235,    236, ..., 902442, 902443]))],
       error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.1, 0.01], 'max_depth': [3, 10], 'n_estimators': [100, 200

In [29]:
grid_xgb.best_score_

-514571.8145945365

In [30]:
grid_xgb.best_estimator_

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [31]:
grid_xgb.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}

### Fitting the Best Model to Train + Validation Data

In [34]:
opt_xgb = XGBRegressor(**grid_xgb.best_params_)
opt_xgb

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [35]:
opt_xgb.fit(train_val_df.drop("price",axis=1),
            train_val_df["price"])

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [38]:
y_pred_xgb_opt = opt_xgb.predict(test_df.drop("price",axis=1))
y_pred_xgb_opt

array([  70.82410431,   70.82410431,   70.82410431, ...,  104.7182312 ,
        104.7182312 ,  104.7182312 ], dtype=float32)

In [87]:
min(y_pred_xgb_opt)

69.333809

In [88]:
max(y_pred_xgb_opt)

2199.0784

In [39]:
skm.mean_squared_error(y_true= test_df["price"],
                       y_pred= y_pred_xgb_opt)

22797.470194279438

## Linear Regression - Baseline Model

In [121]:
linreg = linear_model.LinearRegression(n_jobs=-1)
linreg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [122]:
linreg.fit(X = train_df.drop("price",axis=1),
           y = train_df["price"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [123]:
linreg.coef_

array([  1.96841592e+01,   4.94718800e-01,   9.33105838e-03,
         5.79997504e-02,   2.58641138e-01,   5.64408590e+01,
         4.47867048e+01,   1.79056006e+00,   5.13162758e-02,
         3.92018546e+00,  -5.81461766e+05,   5.81461867e+05,
        -6.47794359e-07,  -2.36714968e-08,  -3.91980712e-01,
        -1.18365742e-01,   3.13808995e+00,  -2.23152083e+00,
         9.58654086e+00,  -5.21081435e+00,   9.50583157e+00,
         1.91309957e+00,  -1.65081226e+01,  -2.39093686e+00,
        -7.29080719e+00,  -1.70335531e+00,  -3.05900553e+00,
         1.60298062e-01,   3.96410754e+00,  -1.40866695e+00,
        -8.23409047e-01,  -3.73432375e+00,  -4.60844919e+01,
         2.05029942e+00,   8.09690751e-01,  -1.22372644e+01,
         7.33111294e+00,   2.89151837e-04,  -1.43806851e+00,
         4.39308842e+00,  -1.21592963e+01,   1.70983542e-01,
         1.18071666e+01,  -5.36052154e-04,  -1.97082558e+00,
         1.58075686e+00,   2.80365978e+00,   6.72137902e-01,
        -1.45005281e+00,

In [124]:
linreg.intercept_

-44632.92108926949

### Testing Linear Regression

In [130]:
y_pred_linreg = linreg.predict(test_df.drop("price",axis=1))
y_pred_linreg

array([ 222.01962665,   88.60842779,  128.27681623, ...,  422.62179021,
        121.46605026,   84.08537915])

In [136]:
test_df["price"]

234653      150.0
912128       83.0
520614      115.0
1025792     209.0
169008       69.0
502222       85.0
10034        60.0
412533      155.0
907566       85.0
471885      152.0
759821      195.0
812594       25.0
997816       35.0
525170       48.0
1049407     200.0
458590      295.0
573035       68.0
360038      115.0
536078       78.0
40118       125.0
1072817      71.0
14695       135.0
545049      140.0
66473        34.0
329009      199.0
738215       85.0
1092928     169.0
208314       34.0
64411        21.0
105694       55.0
            ...  
606821     1300.0
130036      129.0
1034962      60.0
711696      725.0
595033      130.0
628997      259.0
1102244     280.0
916874       80.0
663050      338.0
607822      203.0
359208      260.0
313972      155.0
926254      150.0
172939      142.0
71950       250.0
518340      630.0
448652       79.0
859816       80.0
668308      561.0
942234       52.0
957519       78.0
659840      450.0
526328       55.0
888453      248.0
747262    

In [131]:
skm.mean_squared_error(y_true= test_df["price"],
                       y_pred= y_pred_linreg)

14913.169213254996

In [132]:
min(y_pred_linreg)

-127.34485259756912

In [133]:
(y_pred_linreg < 0).sum()

3168

In [137]:
mean_abs_err(test_df,y_pred_linreg)

0.54449606953057972

In [138]:
price_conf_int_check(validate_df=test_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_linreg)

0.29592760180995475

## Baseline = Mean

In [96]:
skm.mean_squared_error(y_true= test_df["price"],
                       y_pred= np.ones(len(test_df["price"])) * test_df["price"].values.mean())

35275.249229753244

In [97]:
skm.mean_squared_error(y_true= train_df["price"],
                       y_pred= np.ones(len(train_df["price"])) * train_df["price"].values.mean())

35447.188631247045