In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample

In [24]:
import time

In [26]:
# Read the listings table (the first CSV column becomes the index) and keep
# only the rows whose 'Y' value is at most 200.
data = (
    pd.read_csv('./airbnb_v2.csv', low_memory=False, index_col=0)
    .loc[lambda frame: frame['Y'] <= 200]
)

# Cleaning steps that are currently disabled:
#data['zipcode'] = pd.to_numeric(data['zipcode'], errors='coerce')
#data = data.drop(['review_scores_rating', 'number_of_reviews', 'calculated_host_listings_count'], axis=1)
#data = data.dropna()

# Model inputs are every column except the last; the target is the last column.
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [28]:
# Convert the pandas objects to plain NumPy arrays; the later cells index them
# positionally (e.g. x_train[bs_index, 1:5]).
# `as_matrix()` was deprecated in pandas 0.23 and removed in pandas 1.0.
# np.asarray returns the same values and, unlike a method call on the pandas
# object, is a no-op when this cell is re-run on an existing ndarray.
x = np.asarray(x)
y = np.asarray(y)

In [29]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# identical on every run, so the losses computed in later cells always refer
# to the same test rows.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [33]:
from sklearn.linear_model import LinearRegression

# Bootstrap evaluation of ordinary least squares: refit on a resampled
# training set each round and score every fit on the same held-out test set.
n_bootstrap = 50
loss_md_ae = np.zeros(n_bootstrap)  # median absolute error per replicate
loss_mn_se = np.zeros(n_bootstrap)  # mean squared error per replicate

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()

for i in range(n_bootstrap):
    model = LinearRegression()
    # Row indices drawn with replacement (unseeded, so draws differ per run).
    bs_index = resample(np.arange(len(x_train)), replace=True)
    # This model is fitted on columns 1 through 4 only.
    model.fit(x_train[bs_index, 1:5], y_train[bs_index])
    y_pred_test = model.predict(x_test[:, 1:5])

    # scikit-learn metrics are documented as (y_true, y_pred).
    loss_md_ae[i] = metrics.median_absolute_error(y_test, y_pred_test)
    loss_mn_se[i] = metrics.mean_squared_error(y_test, y_pred_test)

runtime = time.perf_counter() - start
print(runtime)

0.19690771886962466


In [9]:
# Write each loss array to its own CSV file in the working directory.
# NOTE(review): the same two file names are used by the other save cells
# visible in this notebook, so each run replaces the previously written files.
for csv_name, losses in (('loss_md_ae.csv', loss_md_ae),
                         ('loss_mn_se.csv', loss_mn_se)):
    df = pd.DataFrame(losses)
    df.to_csv(csv_name)

In [34]:
# Mean and standard deviation of each bootstrap loss array
# (median absolute error first, then mean squared error).
(loss_md_ae.mean(), loss_md_ae.std(),
 loss_mn_se.mean(), loss_mn_se.std())

(28.86701933525416, 0.24909380883855486, 1721.778030179049, 1.8063220206326074)

In [62]:
from sklearn.kernel_ridge import KernelRidge

# Bootstrap evaluation of RBF kernel ridge regression: refit on a resampled
# training set each round and score every fit on the same held-out test set.
n_bootstrap = 50
loss_md_ae = np.zeros(n_bootstrap)  # median absolute error per replicate
loss_mn_se = np.zeros(n_bootstrap)  # mean squared error per replicate

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()

for i in range(n_bootstrap):
    model = KernelRidge(kernel='rbf', alpha=1, gamma=0.01)
    # Row indices drawn with replacement (unseeded, so draws differ per run).
    bs_index = resample(np.arange(len(x_train)), replace=True)
    # Every column except the first is used for this model.
    model.fit(x_train[bs_index, 1:], y_train[bs_index])
    y_pred_test = model.predict(x_test[:, 1:])

    # scikit-learn metrics are documented as (y_true, y_pred).
    loss_md_ae[i] = metrics.median_absolute_error(y_test, y_pred_test)
    loss_mn_se[i] = metrics.mean_squared_error(y_test, y_pred_test)

runtime = time.perf_counter() - start
print(runtime)

947.874364893556


In [51]:
# Write each loss array to its own CSV file in the working directory.
# NOTE(review): the same two file names are used by the other save cells
# visible in this notebook, so each run replaces the previously written files.
for csv_name, losses in (('loss_md_ae.csv', loss_md_ae),
                         ('loss_mn_se.csv', loss_mn_se)):
    df = pd.DataFrame(losses)
    df.to_csv(csv_name)

In [52]:
# Mean and standard deviation of each bootstrap loss array
# (median absolute error first, then mean squared error).
(loss_md_ae.mean(), loss_md_ae.std(),
 loss_mn_se.mean(), loss_mn_se.std())

(21.337720490487971,
 0.19159754581008218,
 1101.9605491923505,
 5.7464370974973447)

In [63]:
from sklearn.svm import SVR

# Bootstrap evaluation of RBF support vector regression: refit on a resampled
# training set each round and score every fit on the same held-out test set.
n_bootstrap = 50
loss_md_ae = np.zeros(n_bootstrap)  # median absolute error per replicate
loss_mn_se = np.zeros(n_bootstrap)  # mean squared error per replicate

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()

for i in range(n_bootstrap):
    model = SVR(kernel='rbf', C=100, gamma=0.01)
    # Row indices drawn with replacement (unseeded, so draws differ per run).
    bs_index = resample(np.arange(len(x_train)), replace=True)
    # Every column except the first is used for this model.
    model.fit(x_train[bs_index, 1:], y_train[bs_index])
    y_pred_test = model.predict(x_test[:, 1:])

    # scikit-learn metrics are documented as (y_true, y_pred).
    loss_md_ae[i] = metrics.median_absolute_error(y_test, y_pred_test)
    loss_mn_se[i] = metrics.mean_squared_error(y_test, y_pred_test)

runtime = time.perf_counter() - start
print(runtime)

2678.8472520478354


In [54]:
# Write each loss array to its own CSV file in the working directory.
# NOTE(review): the same two file names are used by the other save cells
# visible in this notebook, so each run replaces the previously written files.
for csv_name, losses in (('loss_md_ae.csv', loss_md_ae),
                         ('loss_mn_se.csv', loss_mn_se)):
    df = pd.DataFrame(losses)
    df.to_csv(csv_name)

In [55]:
# Mean and standard deviation of each bootstrap loss array
# (median absolute error first, then mean squared error).
(loss_md_ae.mean(), loss_md_ae.std(),
 loss_mn_se.mean(), loss_mn_se.std())

(20.688845990376159,
 0.20524879905319376,
 1102.9789200601385,
 7.7247054935751818)

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

# Bootstrap evaluation of gradient boosting with Huber loss: refit on a
# resampled training set each round and score every fit on the same held-out
# test set.
n_bootstrap = 50
loss_md_ae = np.zeros(n_bootstrap)  # median absolute error per replicate
loss_mn_se = np.zeros(n_bootstrap)  # mean squared error per replicate

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()

for i in range(n_bootstrap):
    model = GradientBoostingRegressor(loss='huber', n_estimators=50, max_depth=9)
    # Row indices drawn with replacement (unseeded, so draws differ per run).
    bs_index = resample(np.arange(len(x_train)), replace=True)
    # All columns are used for this model.
    model.fit(x_train[bs_index], y_train[bs_index])
    y_pred_test = model.predict(x_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    loss_md_ae[i] = metrics.median_absolute_error(y_test, y_pred_test)
    loss_mn_se[i] = metrics.mean_squared_error(y_test, y_pred_test)

runtime = time.perf_counter() - start
print(runtime)

772.4821224689872


In [12]:
# Write each loss array to its own CSV file in the working directory.
# NOTE(review): the same two file names are used by the other save cells
# visible in this notebook, so each run replaces the previously written files.
for csv_name, losses in (('loss_md_ae.csv', loss_md_ae),
                         ('loss_mn_se.csv', loss_mn_se)):
    df = pd.DataFrame(losses)
    df.to_csv(csv_name)

In [13]:
# Mean and standard deviation of each bootstrap loss array
# (median absolute error first, then mean squared error).
(loss_md_ae.mean(), loss_md_ae.std(),
 loss_mn_se.mean(), loss_mn_se.std())

(16.175643305727377,
 0.17355293283801901,
 768.36865134130335,
 7.7443497375198378)

In [14]:
import sys
# NOTE(review): machine-specific absolute path to a local xgboost checkout.
# It is kept so the original environment still works; on other machines the
# import below relies on xgboost being installed as a regular package.
sys.path.append('C:\\Users\\xsyso\\xgboost\\python-package')
import xgboost as xgb

# Bootstrap evaluation of XGBoost regression: refit on a resampled training
# set each round and score every fit on the same held-out test set.
n_bootstrap = 50
loss_md_ae = np.zeros(n_bootstrap)  # median absolute error per replicate
loss_mn_se = np.zeros(n_bootstrap)  # mean squared error per replicate

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()

for i in range(n_bootstrap):
    model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=8, colsample_bylevel=0.9, colsample_bytree=0.5)
    # Row indices drawn with replacement (unseeded, so draws differ per run).
    bs_index = resample(np.arange(len(x_train)), replace=True)
    # All columns are used for this model.
    model.fit(x_train[bs_index], y_train[bs_index])
    y_pred_test = model.predict(x_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    loss_md_ae[i] = metrics.median_absolute_error(y_test, y_pred_test)
    loss_mn_se[i] = metrics.mean_squared_error(y_test, y_pred_test)

runtime = time.perf_counter() - start
print(runtime)

76.05476614928693


In [15]:
# Write each loss array to its own CSV file in the working directory.
# NOTE(review): the same two file names are used by the other save cells
# visible in this notebook, so each run replaces the previously written files.
for csv_name, losses in (('loss_md_ae.csv', loss_md_ae),
                         ('loss_mn_se.csv', loss_mn_se)):
    df = pd.DataFrame(losses)
    df.to_csv(csv_name)

In [16]:
# Mean and standard deviation of each bootstrap loss array
# (median absolute error first, then mean squared error).
(loss_md_ae.mean(), loss_md_ae.std(),
 loss_mn_se.mean(), loss_mn_se.std())

(16.193305015563965,
 0.14038869116448621,
 732.17730822885528,
 5.6841995638853122)

In [35]:
from sklearn.ensemble import RandomForestRegressor

# Bootstrap evaluation of a random forest: refit on a resampled training set
# each round and score every fit on the same held-out test set.
n_bootstrap = 50
loss_md_ae = np.zeros(n_bootstrap)  # median absolute error per replicate
loss_mn_se = np.zeros(n_bootstrap)  # mean squared error per replicate

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()

for i in range(n_bootstrap):
    model = RandomForestRegressor(n_estimators=100, max_depth=13)
    # Row indices drawn with replacement (unseeded, so draws differ per run).
    bs_index = resample(np.arange(len(x_train)), replace=True)
    # All columns are used for this model.
    model.fit(x_train[bs_index], y_train[bs_index])
    y_pred_test = model.predict(x_test)

    # scikit-learn metrics are documented as (y_true, y_pred).
    loss_md_ae[i] = metrics.median_absolute_error(y_test, y_pred_test)
    loss_mn_se[i] = metrics.mean_squared_error(y_test, y_pred_test)

runtime = time.perf_counter() - start
print(runtime)

400.12627723574406


In [36]:
# Mean and standard deviation of each bootstrap loss array
# (median absolute error first, then mean squared error).
(loss_md_ae.mean(), loss_md_ae.std(),
 loss_mn_se.mean(), loss_mn_se.std())

(18.273857982296377,
 0.14891315623546078,
 901.34982976304013,
 6.0348240109450266)