In [1]:
import pandas as pd

In [2]:
from sklearn import linear_model

In [3]:
import numpy as np

In [4]:
import sklearn.metrics as skm

In [5]:
from sklearn.model_selection import GridSearchCV

In [6]:
quick_df = pd.read_csv("../fully_processed_data4.csv",
                      index_col = 0,
                      nrows = 10000)

In [None]:
quick_df = pd.read_csv("../fully_processed_data4.csv",
                      index_col = 0)

In [7]:
print(quick_df.shape)
quick_df.drop("date", axis = 1, inplace=True)   #we already have day month year column
quick_df.drop("zipcode", axis = 1, inplace=True)
print(quick_df.shape)

(10000, 284)
(10000, 282)


Partition the model into training, validation and test. Each set should contain different listing_ids:

In [8]:
listing_ids = pd.Series(list(set(quick_df["listing_id"])))
listing_ids

0        213006
1        284690
2      10496018
3       7757846
4       5333015
5       6869019
6       8503329
7        667683
8       1054757
9        725030
10     11730986
11      1290287
12      8505396
13       483384
14      6576184
15      8304698
16      3889212
17      9476164
18      7665735
19      2500681
20      3741771
21      4241484
22      4249688
23      9363548
24     12105828
25      3649639
26      7387243
27      5226605
28      3569783
29      8841337
         ...   
428     9756504
429     3403609
430     4124504
431     3645277
432    10647402
433     5685113
434     4917118
435     3563399
436     4937608
437     6741903
438    11186065
439      233377
440     7712679
441     5279661
442     7940019
443    12189623
444     2142139
445     4515772
446      632766
447     1212352
448      731073
449    10901464
450     1415137
451     3522532
452    11208688
453     2691057
454     8634354
455     2222067
456      563189
457    11167742
Length: 458, dtype: int6

In [9]:
train_id, validate_id, test_id = np.split(listing_ids.sample(frac=1), 
                                         [int(.6*len(listing_ids)), int(.8*len(listing_ids))])

In [10]:
train_id.shape

(274,)

In [11]:
validate_id.shape

(92,)

In [12]:
test_id.shape

(92,)

In [13]:
set(train_id.values).intersection(validate_id.values)

set()

In [14]:
set(train_id.values).intersection(test_id.values)

set()

In [15]:
set(test_id.values).intersection(validate_id.values)

set()

In [16]:
train_df = quick_df[quick_df["listing_id"].isin(train_id)]
train_df.shape

(5872, 282)

In [17]:
validate_df = quick_df[quick_df["listing_id"].isin(validate_id)]
validate_df.shape

(1927, 282)

In [18]:
test_df = quick_df[quick_df["listing_id"].isin(test_id)]
test_df.shape

(2201, 282)

In [19]:
train_val_df = quick_df[quick_df["listing_id"].isin(train_id.append(validate_id))]
train_val_df.shape

(7799, 282)

In [20]:
from sklearn.model_selection import GroupKFold

In [21]:
custom_cv = list(GroupKFold(n_splits = 5).split(train_val_df.drop("price",axis=1),
                                                train_val_df["price"],
                                                train_val_df["listing_id"]))
custom_cv

[(array([   0,    1,    2, ..., 7796, 7797, 7798]),
  array([  42,   43,   44, ..., 7571, 7572, 7573])),
 (array([   0,    1,    2, ..., 7758, 7759, 7760]),
  array([  71,   72,   73, ..., 7796, 7797, 7798])),
 (array([   0,    1,    2, ..., 7796, 7797, 7798]),
  array([ 313,  314,  315, ..., 7758, 7759, 7760])),
 (array([  26,   27,   28, ..., 7796, 7797, 7798]),
  array([   0,    1,    2, ..., 7646, 7647, 7648])),
 (array([   0,    1,    2, ..., 7796, 7797, 7798]),
  array([  26,   27,   28, ..., 7674, 7675, 7676]))]

## Error Functions

In [39]:
def mean_abs_err(validate_df,y_pred):
    return sum(abs(validate_df["price"] - y_pred)/validate_df["price"])/len(y_pred)

In [41]:
def price_conf_int_check(validate_df, upper_perc, lower_perc, y_pred):
    upper_bound = validate_df["price"] * (1 + upper_perc)
    lower_bound = validate_df["price"] * (1 - lower_perc)
    
    return sum(list(lower_bound < y_pred) & (y_pred < upper_bound)) / len(y_pred)

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
from sklearn.model_selection import ParameterGrid

### Grid Search CV for RF

In [24]:
param_rand_for = {'n_estimators': [20, 50, 100], 
                  'max_depth': [None, 5], 
                  'min_samples_split': [10, 50]}
param_rand_for

{'max_depth': [None, 1],
 'min_samples_split': [2, 3],
 'n_estimators': [5, 10, 15]}

In [25]:
#rand_for = RandomForestRegressor(max_depth=10, random_state=0)
rand_for = RandomForestRegressor()
rand_for

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [26]:
grid_rand_for = GridSearchCV(rand_for, 
                             param_rand_for, 
                             cv = custom_cv)

In [27]:
grid_rand_for.fit(train_val_df.drop("price",axis=1),
                  train_val_df["price"])

GridSearchCV(cv=[(array([   0,    1, ..., 7797, 7798]), array([  42,   43, ..., 7572, 7573])), (array([   0,    1, ..., 7759, 7760]), array([  71,   72, ..., 7797, 7798])), (array([   0,    1, ..., 7797, 7798]), array([ 313,  314, ..., 7759, 7760])), (array([  26,   27, ..., 7797, 7798]), array([   0,    1, ..., 7647, 7648])), (array([   0,    1, ..., 7797, 7798]), array([  26,   27, ..., 7675, 7676]))],
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15], 'max_depth': [None, 1], 'min_samples_split': [2, 3]},
       pre_dispat

In [28]:
grid_rand_for.best_score_

0.46874375532083934

In [29]:
grid_rand_for.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Single Random Forest Fit

In [266]:
rand_for.fit(train_df.drop("price",axis=1),train_df["price"])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [267]:
rand_for.feature_importances_

array([  1.66925899e-02,   2.33919121e-02,   1.23643506e-02,
         1.74241828e-03,   3.91349686e-03,   2.31432118e-01,
         3.63408816e-01,   1.22900372e-01,   3.02365375e-03,
         8.68667395e-02,   9.58410732e-04,   1.73454222e-03,
         1.00542631e-02,   5.67802507e-04,   4.90512808e-03,
         1.11094896e-02,   1.53825103e-04,   9.28853194e-06,
         6.45805122e-04,   2.68411208e-03,   6.80769745e-05,
         5.73294622e-03,   8.18527425e-04,   3.23400207e-02,
         0.00000000e+00,   0.00000000e+00,   1.56393248e-04,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.20941726e-03,   1.24329255e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         3.41533543e-06,   0.00000000e+00,   2.71841740e-05,
         3.93685092e-04,

In [268]:
y_pred_randfor = rand_for.predict(validate_df.drop("price",axis=1))
y_pred_randfor

array([ 64.35340091,  64.35340091,  64.35340091, ...,  72.28264579,
        62.61473187,  62.61473187])

In [269]:
r2_score_randfor = skm.r2_score(validate_df["price"], y_pred_randfor)
r2_score_randfor

0.47784087075411363

In [270]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_randfor)

2911.0389553823429

In [271]:
mean_abs_err(validate_df,y_pred_lasso)

0.40570251855953832

In [272]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_randfor)

0.40640260444926751

## Linear Regression with Lasso

### Single Fit for LR with Lasso

In [30]:
lasso_alpha = 120

In [31]:
lasso_reg = linear_model.Lasso(alpha = lasso_alpha,
                               max_iter = 5000000)

In [32]:
lasso_reg.fit(train_df.drop("price",axis=1),train_df["price"])

Lasso(alpha=120, copy_X=True, fit_intercept=True, max_iter=5000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [33]:
y_pred_lasso = lasso_reg.predict(validate_df.drop("price",axis=1))
y_pred_lasso

array([ 103.85342634,  103.85342634,  103.85342634, ...,  116.81368974,
        116.81368974,  116.81368974])

In [34]:
validate_df["price"].values

array([ 125.,  125.,  125., ...,   75.,   75.,   75.])

In [35]:
r2_score_lasso = skm.r2_score(validate_df["price"], y_pred_lasso)
r2_score_lasso

0.037232337595621501

In [36]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_lasso)

16605.810241244606

In [38]:
mean_abs_err(validate_df,y_pred_lasso)

0.47505132790737165

In [42]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_lasso)

0.29475869226777374

### Grid Search CV for LR with Lasso

In [44]:
param_lasso = {'alpha': [0.1, 0.25, 0.5, 1]}
param_lasso

{'alpha': [0.1, 0.25, 0.5, 1]}

In [48]:
lasso_reg = linear_model.Lasso(max_iter = 100000)

In [49]:
grid_lasso = GridSearchCV(lasso_reg, 
                          param_lasso, 
                          cv = custom_cv)

In [50]:
grid_lasso.fit(train_val_df.drop("price",axis=1),
               train_val_df["price"])

GridSearchCV(cv=[(array([   0,    1, ..., 7797, 7798]), array([  42,   43, ..., 7572, 7573])), (array([   0,    1, ..., 7759, 7760]), array([  71,   72, ..., 7797, 7798])), (array([   0,    1, ..., 7797, 7798]), array([ 313,  314, ..., 7759, 7760])), (array([  26,   27, ..., 7797, 7798]), array([   0,    1, ..., 7647, 7648])), (array([   0,    1, ..., 7797, 7798]), array([  26,   27, ..., 7675, 7676]))],
       error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 0.25, 0.5, 1]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [51]:
grid_lasso.best_score_

0.42711860118044209

In [53]:
grid_lasso.best_estimator_

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

## XGBoost

In [54]:
import xgboost as xgb



In [55]:
from xgboost import XGBClassifier

### Single Fit for XGBoost

In [56]:
xgb = XGBClassifier()
xgb

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [57]:
xgb.fit(train_df.drop("price",axis=1),train_df["price"])

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [58]:
y_pred_xgb = xgb.predict(validate_df.drop("price",axis=1))
y_pred_xgb

array([ 125.,  115.,  115., ...,   70.,   70.,   70.])

In [59]:
r2_score_xgb = skm.r2_score(validate_df["price"], y_pred_xgb)
r2_score_xgb

0.3283092499355158

In [60]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_xgb)

11585.317592112091

In [61]:
mean_abs_err(validate_df,y_pred_xgb)

0.38584839320455511

In [62]:
price_conf_int_check(validate_df=validate_df,
                    upper_perc = 0.2,
                    lower_perc = 0.2,
                    y_pred = y_pred_xgb)

0.28437986507524649

### Grid Search CV for XGBoost

In [63]:
param_xgb = {'learning_rate': [0.1, 0.25, 0.5]}
param_xgb

{'learning_rate': [0.1, 0.25, 0.5]}

In [64]:
xgb = XGBClassifier()

In [65]:
grid_xgb = GridSearchCV(xgb, 
                          param_xgb, 
                          cv = custom_cv)

In [66]:
grid_xgb.fit(train_val_df.drop("price",axis=1),
             train_val_df["price"])

GridSearchCV(cv=[(array([   0,    1, ..., 7797, 7798]), array([  42,   43, ..., 7572, 7573])), (array([   0,    1, ..., 7759, 7760]), array([  71,   72, ..., 7797, 7798])), (array([   0,    1, ..., 7797, 7798]), array([ 313,  314, ..., 7759, 7760])), (array([  26,   27, ..., 7797, 7798]), array([   0,    1, ..., 7647, 7648])), (array([   0,    1, ..., 7797, 7798]), array([  26,   27, ..., 7675, 7676]))],
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.1, 0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [68]:
grid_xgb.best_score_

0.088857545839210156

In [69]:
grid_xgb.best_estimator_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

## Feed Forward

In [71]:
from sklearn.neural_network import MLPRegressor

### Single Fit for Feed Forward

In [72]:
mlp_reg = MLPRegressor(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(1000,1000,1000, 200), 
                       random_state=1)
mlp_reg

MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000, 1000, 1000, 200),
       learning_rate='constant', learning_rate_init=0.001, max_iter=200,
       momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [73]:
mlp_reg.fit(train_df.drop("price",axis=1),train_df["price"])

MLPRegressor(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000, 1000, 1000, 200),
       learning_rate='constant', learning_rate_init=0.001, max_iter=200,
       momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [74]:
y_pred_mlp = mlp_reg.predict(validate_df.drop("price",axis=1))
y_pred_mlp

array([ 113.88730324,  114.60623189,  115.25974435, ...,  115.0971873 ,
        115.75990757,  114.77429905])

In [75]:
[coef.shape for coef in mlp_reg.coefs_]

[(281, 1000), (1000, 1000), (1000, 1000), (1000, 200), (200, 1)]

In [76]:
r2_score_mlp = skm.r2_score(validate_df["price"], y_pred_mlp)
r2_score_mlp

-0.43380852675711501

In [77]:
skm.mean_squared_error(y_true= validate_df["price"],
                       y_pred= y_pred_mlp)

24730.31993840143

In [78]:
mean_abs_err(validate_df,y_pred_mlp)

0.58345062818329052

In [79]:
price_conf_int_check(validate_df=validate_df,
                     upper_perc = 0.4,
                     lower_perc = 0.4,
                     y_pred = y_pred_mlp)

0.3476907109496627

### Grid Search CV for Feed Forward

In [80]:
param_ff = {'alpha': [0.0001, 0.01, 0.1],
            'learning_rate': ['constant','adaptive']}
param_ff

{'alpha': [0.0001, 0.01, 0.1], 'learning_rate': ['constant', 'adaptive']}

In [81]:
ff = MLPRegressor(solver='lbfgs',
                       hidden_layer_sizes=(1000,1000,1000, 200), 
                       random_state=1)
ff

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000, 1000, 1000, 200),
       learning_rate='constant', learning_rate_init=0.001, max_iter=200,
       momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [82]:
grid_ff = GridSearchCV(ff, 
                          param_ff, 
                          cv = custom_cv)

In [83]:
grid_ff.fit(train_val_df.drop("price",axis=1),
             train_val_df["price"])

KeyboardInterrupt: 

In [None]:
grid_ff.best_score_

In [None]:
grid_ff.best_estimator_