In [76]:
import pandas as pd

In [77]:
from sklearn import linear_model

In [78]:
import numpy as np

In [79]:
import sklearn.metrics as skm

In [80]:
from sklearn.model_selection import GridSearchCV

### Data Reading and Subsetting

Run this to for debug:

In [81]:
quick_df = pd.read_csv("../fully_processed_data4.csv",
                      index_col = 0,
                      nrows = 2500)

Run this to use the whole dataset:

In [None]:
quick_df = pd.read_csv("../fully_processed_data4.csv",
                      index_col = 0)

In [82]:
print(quick_df.shape)
quick_df.drop("date", axis = 1, inplace=True)   #we already have day month year column
quick_df.drop("zipcode", axis = 1, inplace=True)
print(quick_df.shape)

(2500, 284)
(2500, 282)


Partition the model into training, validation and test. Each set should contain different listing_ids:

In [83]:
listing_ids = pd.Series(list(set(quick_df["listing_id"])))
listing_ids

0        213006
1       6214161
2        284690
3       3455514
4      11730986
5       4764715
6        881707
7       1290287
8       8505396
9       7616053
10      3889212
11      2283071
12      9476164
13       761414
14      1948744
15      2500681
16      3741771
17     11653712
18      3522134
19      4249688
20      5231708
21      2761822
22      2192480
23       726625
24      7387243
25     12250733
26      1422958
27     11839089
28      8051829
29      3569783
         ...   
87     11392366
88      6325103
89      2176375
90      6071161
91      9532795
92      3574658
93      6741903
94     10335139
95      8895397
96      6869926
97      7712679
98      8656299
99      1785262
100      597424
101     8496052
102     9442229
103     7132603
104     2142139
105     9592262
106     1332684
107     3268055
108     3522532
109     6599654
110     4215788
111     7361005
112     6419437
113     3767793
114     7543799
115     6234618
116     5354494
Length: 117, dtype: int6

In [84]:
train_id, validate_id, test_id = np.split(listing_ids.sample(frac=1), 
                                         [int(.6*len(listing_ids)), int(.8*len(listing_ids))])

In [85]:
train_id.shape

(70,)

In [86]:
validate_id.shape

(23,)

In [87]:
test_id.shape

(24,)

In [88]:
set(train_id.values).intersection(validate_id.values)

set()

In [89]:
set(train_id.values).intersection(test_id.values)

set()

In [90]:
set(test_id.values).intersection(validate_id.values)

set()

In [91]:
train_df = quick_df[quick_df["listing_id"].isin(train_id)]
train_df.shape

(1410, 282)

In [92]:
validate_df = quick_df[quick_df["listing_id"].isin(validate_id)]
validate_df.shape

(511, 282)

In [93]:
test_df = quick_df[quick_df["listing_id"].isin(test_id)]
test_df.shape

(579, 282)

In [94]:
train_val_df = quick_df[quick_df["listing_id"].isin(train_id.append(validate_id))]
train_val_df.shape

(1921, 282)

### Generating Custom Folds

To make sure that each house's every listing only is in the training or validation set and is not in both sets a custom Kfold function was applied:

In [95]:
from sklearn.model_selection import GroupKFold

In [96]:
custom_cv = list(GroupKFold(n_splits = 5).split(train_val_df.drop("price",axis=1),
                                                train_val_df["price"],
                                                train_val_df["listing_id"]))
custom_cv

[(array([  16,   17,   18, ..., 1918, 1919, 1920]),
  array([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
           11,   12,   13,   14,   15,  343,  344,  345,  346,  347,  348,
          349,  350,  351,  352,  353,  354,  355,  356,  357,  358,  359,
          360,  361,  362,  363,  364,  365,  366,  367,  368,  369,  370,
          371,  372,  373,  374,  375,  376,  377,  378,  379,  380,  381,
          382,  383,  384,  385,  386,  387,  388,  389,  390,  391,  392,
          393,  394,  395,  396,  397,  398,  399,  400,  401,  402,  403,
          404,  405,  406,  407,  408,  409,  410,  411,  412,  413,  414,
          415,  416,  417,  638,  639,  640,  641,  642,  643,  644,  645,
          646,  647,  648,  649,  650,  651,  652,  653,  654,  655,  656,
          657,  658,  659,  660,  661,  662,  663,  664,  665,  666,  667,
          668,  669,  670,  781,  782,  783,  784,  785,  786,  787,  788,
          789,  790,  791,  792,  793,  794,  79

## Error Functions

In [22]:
def mean_abs_err(validate_df,y_pred):
    return sum(abs(validate_df["price"] - y_pred)/validate_df["price"])/len(y_pred)

In [23]:
def price_conf_int_check(validate_df, upper_perc, lower_perc, y_pred):
    upper_bound = validate_df["price"] * (1 + upper_perc)
    lower_bound = validate_df["price"] * (1 - lower_perc)
    
    return sum(list(lower_bound < y_pred) & (y_pred < upper_bound)) / len(y_pred)

## Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
from sklearn.model_selection import ParameterGrid

### Single Random Forest Fit

In [None]:
#rand_for = RandomForestRegressor(max_depth=10, random_state=0)
rand_for = RandomForestRegressor()
rand_for

In [None]:
rand_for.fit(train_df.drop("price",axis=1),train_df["price"])

In [None]:
rand_for.feature_importances_

In [None]:
y_pred_randfor = rand_for.predict(validate_df.drop("price",axis=1))
y_pred_randfor

In [None]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_randfor)

In [None]:
mean_abs_err(validate_df,y_pred_lasso)

In [None]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_randfor)

### Grid Search CV for RF

In [26]:
param_rand_for = {'n_estimators': [20, 50, 100], 
                  'max_depth': [None, 5], 
                  'min_samples_split': [10, 50]}
param_rand_for

{'max_depth': [None, 5],
 'min_samples_split': [10, 50],
 'n_estimators': [20, 50, 100]}

In [27]:
#rand_for = RandomForestRegressor(max_depth=10, random_state=0)
rand_for = RandomForestRegressor()
rand_for

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

For the Random Forest we can use the model's default scoring metric since it is MSE:

In [61]:
grid_rand_for = GridSearchCV(rand_for, 
                             param_rand_for, 
                             cv = custom_cv,
                             scoring = "neg_mean_squared_error")

In [62]:
grid_rand_for.fit(train_val_df.drop("price",axis=1),
                  train_val_df["price"])

GridSearchCV(cv=[(array([   0,    1, ..., 8202, 8203]), array([ 390,  391, ..., 8209, 8210])), (array([  26,   27, ..., 8209, 8210]), array([   0,    1, ..., 8171, 8172])), (array([   0,    1, ..., 8209, 8210]), array([  26,   27, ..., 8140, 8141])), (array([   0,    1, ..., 8209, 8210]), array([  64,   65, ..., 8202, 8203])), (array([   0,    1, ..., 8209, 8210]), array([ 189,  190, ..., 8034, 8035]))],
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [20, 50, 100], 'max_depth': [None, 5], 'min_samples_split': [10, 50]},
       pre_di

In [63]:
grid_rand_for.best_score_

-15242.25342682474

In [64]:
grid_rand_for.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [65]:
grid_rand_for.best_params_

{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}

## Linear Regression with Lasso

### Single Fit for LR with Lasso

In [30]:
lasso_alpha = 120

In [31]:
lasso_reg = linear_model.Lasso(alpha = lasso_alpha,
                               max_iter = 5000000)

In [32]:
lasso_reg.fit(train_df.drop("price",axis=1),train_df["price"])

Lasso(alpha=120, copy_X=True, fit_intercept=True, max_iter=5000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [33]:
y_pred_lasso = lasso_reg.predict(validate_df.drop("price",axis=1))
y_pred_lasso

array([ 103.85342634,  103.85342634,  103.85342634, ...,  116.81368974,
        116.81368974,  116.81368974])

In [34]:
validate_df["price"].values

array([ 125.,  125.,  125., ...,   75.,   75.,   75.])

In [35]:
r2_score_lasso = skm.r2_score(validate_df["price"], y_pred_lasso)
r2_score_lasso

0.037232337595621501

In [36]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_lasso)

16605.810241244606

In [38]:
mean_abs_err(validate_df,y_pred_lasso)

0.47505132790737165

In [42]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_lasso)

0.29475869226777374

### Grid Search CV for LR with Lasso

In [28]:
#param_lasso = {'alpha': [0.1, 0.25, 0.5, 1]}
param_lasso = {'alpha': [0.1, 0.25]}
param_lasso

{'alpha': [0.1, 0.25]}

In [45]:
lasso_reg = linear_model.Lasso(max_iter = 100000)

In [56]:
grid_lasso = GridSearchCV(lasso_reg, 
                          param_lasso, 
                          cv = custom_cv,
                          scoring = "neg_mean_squared_error")

DeprecationWarning: Scoring method mean_squared_error was renamed to neg_mean_squared_error in version 0.18 and will be removed in 0.20

In [57]:
grid_lasso.fit(train_val_df.drop("price",axis=1),
               train_val_df["price"])

GridSearchCV(cv=[(array([   0,    1, ..., 8202, 8203]), array([ 390,  391, ..., 8209, 8210])), (array([  26,   27, ..., 8209, 8210]), array([   0,    1, ..., 8171, 8172])), (array([   0,    1, ..., 8209, 8210]), array([  26,   27, ..., 8140, 8141])), (array([   0,    1, ..., 8209, 8210]), array([  64,   65, ..., 8202, 8203])), (array([   0,    1, ..., 8209, 8210]), array([ 189,  190, ..., 8034, 8035]))],
       error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.25]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [58]:
grid_lasso.best_score_

-14209.247074593983

A non-negative floating point value (the best value is 0.0), or an array of floating point values, one for each individual target.

In [59]:
grid_lasso.best_estimator_

Lasso(alpha=0.25, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [60]:
grid_lasso.best_params_

{'alpha': 0.25}

## XGBoost

In [97]:
import xgboost as xgb

In [98]:
from xgboost import XGBClassifier

### Single Fit for XGBoost

In [99]:
xgb = XGBClassifier()
xgb

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [100]:
xgb.fit(train_df.drop("price",axis=1),train_df["price"])

KeyboardInterrupt: 

In [58]:
y_pred_xgb = xgb.predict(validate_df.drop("price",axis=1))
y_pred_xgb

array([ 125.,  115.,  115., ...,   70.,   70.,   70.])

In [59]:
r2_score_xgb = skm.r2_score(validate_df["price"], y_pred_xgb)
r2_score_xgb

0.3283092499355158

In [60]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_xgb)

11585.317592112091

In [61]:
mean_abs_err(validate_df,y_pred_xgb)

0.38584839320455511

In [62]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_xgb)

0.28437986507524649

### Grid Search CV for XGBoost

In [101]:
#param_xgb = {'learning_rate': [0.1, 0.25, 0.5]}
param_xgb = {'learning_rate': [0.25, 0.5]}
param_xgb

{'learning_rate': [0.25, 0.5]}

In [102]:
xgb = XGBClassifier()

In [103]:
grid_xgb = GridSearchCV(xgb, 
                          param_xgb, 
                          cv = custom_cv,
                        scoring = "neg_mean_squared_error")

In [104]:
grid_xgb.fit(train_val_df.drop("price",axis=1),
             train_val_df["price"])

GridSearchCV(cv=[(array([  16,   17, ..., 1919, 1920]), array([   0,    1, ..., 1905, 1906])), (array([   0,    1, ..., 1919, 1920]), array([  16,   17, ..., 1875, 1876])), (array([   0,    1, ..., 1919, 1920]), array([ 173,  174, ..., 1846, 1847])), (array([   0,    1, ..., 1919, 1920]), array([ 143,  144, ..., 1776, 1848])), (array([   0,    1, ..., 1905, 1906]), array([ 108,  109, ..., 1919, 1920]))],
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.25, 0.5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=

In [105]:
grid_xgb.best_score_

-2500.1697032795419

In [106]:
grid_xgb.best_estimator_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.25, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [107]:
grid_xgb.best_params_

{'learning_rate': 0.25}