In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge

In [2]:
data = pd.read_csv('./airbnb_v2.csv', low_memory=False, index_col=0)
data = data[data['Y'] <= 200]
data['zipcode'] = pd.to_numeric(data['zipcode'], errors='coerce')
data = data.dropna()
x = data.iloc[:, 0:-1]
y = data.iloc[:, -1]
x.head()

Unnamed: 0,zipcode,accommodates,bathrooms,bedrooms,beds,minimum_nights,number_of_reviews,review_scores_rating,calculated_host_listings_count,property_type_Apartment,...,Suitable for events,Self Check-In,Keypad,Hot tub,Lockbox,Other pet(s),Free parking on street,Doorman Entry,Washer / Dryer,Paid parking off premises
685006,11225.0,2,1.0,1.0,1.0,2,145,92.0,1,1.0,...,0,0,0,0,0,0,0,0,0,0
9461238,11211.0,5,1.0,1.0,2.0,2,50,94.0,1,0.0,...,0,0,0,0,0,0,0,0,0,0
4873690,11101.0,2,1.0,1.0,1.0,1,43,95.0,1,1.0,...,0,0,0,0,0,0,0,0,0,0
15359479,10003.0,4,1.0,1.0,1.0,5,1,100.0,1,1.0,...,0,0,0,0,0,0,0,0,0,0
810483,10009.0,1,1.0,1.0,1.0,2,1,89.0,1,1.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
x = x.as_matrix()
y = y.as_matrix()

# Linear Models

In [3]:
class model:
    
    def __init__(self, model):
        self.model = model
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_test = None
        self.y_pred_train = None
        self.y_pred_test = None
        
    
    def data_split(self, x, y, test_size=0.3):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=test_size)

    def fit_model(self, x, y, test_size=0.3):
        self.data_split(x, y, test_size)
        self.model = self.model.fit(self.x_train, self.y_train)
        
        self.y_pred_train = self.model.predict(self.x_train)
        self.y_pred_test = self.model.predict(self.x_test)
    
    def err_reg(self):
        mae_train = metrics.median_absolute_error(self.y_pred_train, self.y_train)
        mae_test = metrics.median_absolute_error(self.y_pred_test, self.y_test)
        
        mse_train = metrics.mean_squared_error(self.y_pred_train, self.y_train)
        mse_test = metrics.mean_squared_error(self.y_pred_test, self.y_test)
        return mae_test, mse_test, mae_train, mse_train

In [4]:
import numpy as np

reg_params = 10.**np.arange(-7, 3, 1)
models = [RidgeCV(alphas=reg_params, cv=5), LassoCV(alphas=reg_params, cv=5), ElasticNet(alpha=1.0, l1_ratio=0.85), BayesianRidge()]
model_labels = ['RidgeCV', 'LassoCV', 'ElasticNet', 'BayesRidge']
err = {}
n = len(models)

In [5]:
for i in range(n):
    new_model = model(models[i])
    new_model.fit_model(x, y)
    err[model_labels[i]] = new_model.err_reg()

In [6]:
err_df = pd.DataFrame.from_dict(err, orient='index')
err_df.columns = ["mae_test", "mse_test", "mae_train", "mse_train"]

In [10]:
err_df

Unnamed: 0,mae_test,mse_test,mae_train,mse_train
RidgeCV,20.590887,1029.138764,20.453169,993.178143
LassoCV,20.57021,1013.042958,20.411991,1000.767289
ElasticNet,23.860418,1175.598578,23.740487,1180.45967
BayesRidge,20.789886,1018.914883,20.37871,999.440961


# KRR(Kernalized Ridge Regression) and SVR(Support Vector Regression)

In [11]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_ridge import KernelRidge

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35)

In [13]:
kr = GridSearchCV(KernelRidge(kernel='rbf'), cv=5,
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})

In [14]:
kr.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='rbf',
      kernel_params=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1.0, 0.1, 0.01, 0.001], 'gamma': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
means = kr.cv_results_['mean_test_score']
print(kr.best_params_)
print(means)

{'alpha': 1.0, 'gamma': 0.01}
[ 0.48763168 -0.07958779 -4.77877046 -4.80499315 -4.80501869  0.48651936
  0.04616077 -4.76042605 -4.80452116 -4.80456671  0.39514536  0.0431459
 -4.75711448 -4.80452246 -4.80457188  0.18846007  0.04048647 -4.75675641
 -4.80452445 -4.80457429]


In [16]:
model = KernelRidge(kernel='rbf', alpha=1, gamma=0.01)
model.fit(x_train, y_train)

KernelRidge(alpha=1, coef0=1, degree=3, gamma=0.01, kernel='rbf',
      kernel_params=None)

In [17]:
metrics.median_absolute_error(model.predict(x_test), y_test)

21.320797704308973

In [18]:
svr = GridSearchCV(SVR(kernel='rbf'), cv=5,
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

In [19]:
svr.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'gamma': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [20]:
means = svr.cv_results_['mean_test_score']
print(svr.best_params_)
print(means)

{'C': 100.0, 'gamma': 0.01}
[  3.06641167e-01   1.24886473e-01  -3.18045984e-02  -3.19226687e-02
  -3.19228774e-02   4.67257290e-01   3.33468521e-01  -3.66125557e-02
  -3.77836945e-02  -3.77871991e-02   4.98066726e-01   3.91410934e-01
   4.93990345e-03  -3.36738813e-05  -4.03125225e-05   4.25835801e-01
   3.65832960e-01   4.94006484e-03  -3.36678176e-05  -4.03109793e-05]


In [21]:
model = SVR(kernel='rbf', C=100, gamma=0.01)
model.fit(x_train, y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [22]:
metrics.median_absolute_error(model.predict(x_test), y_test)

20.456004130275645

# Random Forest Regressor

In [81]:
from sklearn.ensemble import RandomForestRegressor

def random_forest_model(x, y):
    best_score = 0
    best_depth = 0
    # tune for tree depth from 1 to 20
    for j in range(1, 21):
        rf = RandomForestRegressor(n_estimators=50, max_depth=j, oob_score=True)
        rf.fit(x, y)
        score = rf.score(x, y)
        score = rf.oob_score_
        if score > best_score:
            best_score = score
            best_depth = j
    return best_score, best_depth

def random_forest_tuned(best_depth, x_train, x_test, y_train, y_test):
    rf = RandomForestRegressor(n_estimators=50, max_depth=best_depth)
    rf.fit(x_train, y_train)
    y_pred_test = rf.predict(x_test)
    y_pred_train = rf.predict(x_train)
    
    print ("Train Median Absolute Error", metrics.median_absolute_error(y_pred_train, y_train))
    print ("Train Mean Squared Error", metrics.mean_squared_error(y_pred_train, y_train))
    print ("Test Median Absolute Error", metrics.median_absolute_error(y_pred_test, y_test))
    print ("Test Mean Squared Error", metrics.mean_squared_error(y_pred_test, y_test))
    print ("Train Relative Error", np.mean(np.abs(1 - y_pred_train / y_train)))
    print ("Test Relative Error", np.mean(np.abs(1 - y_pred_test / y_test)))

In [57]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35)

In [47]:
rf_values = random_forest_model(x, y)

In [50]:
print('Best OOB Score:', rf_values[0])
print('Best Depth', rf_values[1])

Best OOB Score: 0.540501839099
Best Depth 12


In [82]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35)
random_forest_tuned(rf_values[1]+1, x_train, x_test, y_train, y_test)

Train Median Absolute Error 12.4017369616
Train Mean Squared Error 376.388430654
Test Median Absolute Error 16.5102490067
Test Mean Squared Error 763.970458092
Train Relative Error 0.17158246820286588
Test Relative Error 0.23983905053596066


# GBRT

In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

In [32]:
def GBRT_model(x, y):
    best_score = 1e5
    best_depth = 0
    for j in range(1, 21):
        model = GradientBoostingRegressor(loss='huber', n_estimators=50, max_depth=j)
        n = len(x)
        n0 = int(n*0.8)
        model.fit(x[:n0], y[:n0])
        score = metrics.median_absolute_error(model.predict(x[n0:]), y[n0:])
        if score < best_score:
            best_score = score
            best_depth = j
    return best_score, best_depth

def GBRT_tuned(best_depth, x_train, x_test, y_train, y_test):
    model = GradientBoostingRegressor(loss='huber', n_estimators=50, max_depth=best_depth)
    model.fit(x_train, y_train)
    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    print ("Train Median Absolute Error", metrics.median_absolute_error(y_pred_train, y_train))
    print ("Train Mean Squared Error", metrics.mean_squared_error(y_pred_train, y_train))
    print ("Test Median Absolute Error", metrics.median_absolute_error(y_pred_test, y_test))
    print ("Test Mean Squared Error", metrics.mean_squared_error(y_pred_test, y_test))

In [33]:
values = GBRT_model(x_train, y_train)

In [34]:
print(values)

(19.751988785774479, 9)


In [35]:
GBRT_tuned(values[1], x_train, x_test, y_train, y_test)

Train Median Absolute Error 13.9390961853
Train Mean Squared Error 527.399097372
Test Median Absolute Error 19.3443108926
Test Mean Squared Error 976.065545249


In [5]:
def GBRT_tuned(best_depth, x_train, x_test, y_train, y_test):
    model = GradientBoostingRegressor(loss='huber', n_estimators=50, max_depth=best_depth)
    model.fit(x_train, y_train)
    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    print ("Train Median Absolute Error", metrics.median_absolute_error(y_pred_train, y_train))
    print ("Train Mean Squared Error", metrics.mean_squared_error(y_pred_train, y_train))
    print ("Test Median Absolute Error", metrics.median_absolute_error(y_pred_test, y_test))
    print ("Test Mean Squared Error", metrics.mean_squared_error(y_pred_test, y_test))