In [2]:
import pandas as pd
import numpy as np

In [95]:
data = pd.read_csv('./airbnb_v1.csv')
data = data[data['Y'] <= 200]
data = data.iloc[:, 1::]
x = data.iloc[:, 0:-1]
y = data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [104]:
x_train.to_csv('./x_train.csv', index=False)
x_test.to_csv('./x_test.csv', index=False)
y_train.to_csv('./y_train.csv', index=False)
y_test.to_csv('./y_test.csv', index=False)

In [3]:
x_train = pd.read_csv('./x_train.csv')
x_test = pd.read_csv('./x_test.csv')
y_train = pd.read_csv('./y_train.csv', header = None).iloc[:, -1]
y_test = pd.read_csv('./y_test.csv', header = None).iloc[:, -1]

In [4]:
from sklearn.cross_validation import train_test_split
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.utils import resample



# Linear Models

In [5]:
class model:
    
    def __init__(self, model, x_train, y_train, x_test, y_test):
        self.model = model
        self.x_train = x_train.as_matrix()
        self.y_train = y_train.as_matrix()
        self.x_test = x_test.as_matrix()
        self.y_test = y_test.as_matrix()
        self.y_pred_train = None
        self.y_pred_test = None

    def fit_model(self):
        self.model = self.model.fit(self.x_train, self.y_train)
        
        self.y_pred_train = self.model.predict(self.x_train)
        self.y_pred_test = self.model.predict(self.x_test)
        return self
    
    def error(self):
        mae_train = metrics.median_absolute_error(self.y_pred_train, self.y_train)
        mae_test = metrics.median_absolute_error(self.y_pred_test, self.y_test)
        
        mse_train = metrics.mean_squared_error(self.y_pred_train, self.y_train)
        mse_test = metrics.mean_squared_error(self.y_pred_test, self.y_test)
        return mae_test, mse_test, mae_train, mse_train

In [6]:
def bootstrap(algo, x_train, y_train, x_test, y_test, B=50):
    mae_test = np.zeros(B)
    mse_test = np.zeros(B)
    for i in range(B):
        x_train_b, y_train_b = resample(x_train, y_train)
        new_model = model(algo, x_train_b, y_train_b, x_test, y_test)
        new_model = new_model.fit_model()
        err = new_model.error()
        mae_test[i] = err[0]
        mse_test[i] = err[1]
    return mae_test, mse_test

In [7]:
x_train_reg = x_train.iloc[:,1:]
x_test_reg = x_test.iloc[:,1:]

## Ridge

In [9]:
reg_params = 10.**np.arange(-7, 3, 1)
mae, mse = bootstrap(RidgeCV(alphas=reg_params, cv=5), x_train_new, y_train, x_test_new, y_test)

In [10]:
mae = np.append(mae, np.mean(mae))
mae = np.append(mae, np.std(mae))
mse = np.append(mse, np.mean(mse))
mse = np.append(mse, np.std(mse))
    
df_mae = pd.DataFrame(mae)
df_mse = pd.DataFrame(mse)

df_mae.columns = ['ClusterDum+RidgeCV']
df_mse.columns = ['ClusterDum+RidgeCV']

In [12]:
def add_to_df(mae, mse, df_mae, df_mse, modelname):
    mae = np.append(mae, np.mean(mae))
    mae = np.append(mae, np.std(mae))
    mse = np.append(mse, np.mean(mse))
    mse = np.append(mse, np.std(mse))
    
    df_mae[modelname] = mae
    df_mse[modelname] = mse

## Lasso

In [13]:
mae, mse = bootstrap(LassoCV(alphas=reg_params, cv=5), x_train_new, y_train, x_test_new, y_test)



In [14]:
add_to_df(mae, mse, df_mae, df_mse, 'ClusterDum+LassoCV')

## Elastic Net

In [18]:
err = {}

for alpha in [10**x for x in range(-5, 4)]:
    for l1_ratio in np.arange(0.8, 1.8, 0.1):
        new_model = model(ElasticNet(alpha=alpha, l1_ratio=l1_ratio), x_train_new, y_train, x_test_new, y_test)
        new_model = new_model.fit_model()
        err[alpha, l1_ratio]=new_model.error()



In [21]:
err_df = pd.DataFrame.from_dict(err, orient='index')
err_df.columns = ["mae_test", "mse_test", "mae_train", "mse_train"]
err_df.sort(['mae_test','mae_test']).head()

  app.launch_new_instance()


Unnamed: 0,mae_test,mse_test,mae_train,mse_train
"(0.0001, 0.9)",20.097138,960.111965,19.581966,952.879046
"(1e-05, 1.1)",20.098504,960.268454,19.585478,952.859024
"(1e-05, 1.0)",20.099373,960.254115,19.585451,952.858926
"(1e-05, 0.9)",20.100175,960.241304,19.585331,952.85917
"(0.01, 1.0)",20.10026,959.83848,19.603085,954.246985


In [22]:
mae, mse = bootstrap(ElasticNet(alpha=0.0001, l1_ratio=0.9), x_train_new, y_train, x_test_new, y_test)



In [23]:
add_to_df(mae, mse, df_mae, df_mse, 'ClusterDum+ElasticNet')

## BayesianRidge

In [27]:
params_l = [10**x for x in range(-6, 2)]
params_s = [10**x for x in range(-13, -5)]
params = [10**x for x in range(-9, -1)]

err = {}

for a1 in params_l:
    for a2 in params_s:
        for lam1 in params_s:
            for lam2 in params_l:
                new_model = model(BayesianRidge(alpha_1=a1, alpha_2=a2, lambda_1=lam1, lambda_2=lam2), x_train_new, y_train, x_test_new, y_test)
                new_model = new_model.fit_model()
                err[a1, a2, lam1, lam2]=new_model.error()

In [28]:
err_df = pd.DataFrame.from_dict(err, orient='index')
err_df.columns = ["mae_test", "mse_test", "mae_train", "mse_train"]
err_df.sort(['mae_test','mae_test']).head()

  app.launch_new_instance()


Unnamed: 0,mae_test,mse_test,mae_train,mse_train
"(10, 1e-08, 1e-12, 10)",20.117045,959.694519,19.596052,953.577098
"(10, 1e-06, 1e-12, 10)",20.117045,959.694519,19.596052,953.577098
"(10, 1e-13, 1e-13, 10)",20.117045,959.694519,19.596052,953.577098
"(10, 1e-10, 1e-13, 10)",20.117045,959.694519,19.596052,953.577098
"(10, 1e-11, 1e-13, 10)",20.117045,959.694519,19.596052,953.577098


In [29]:
mae, mse = bootstrap(BayesianRidge(alpha_1=10, alpha_2=1e-8, lambda_1=1e-12, lambda_2=10), x_train_new, y_train, x_test_new, y_test)

In [30]:
add_to_df(mae, mse, df_mae, df_mse, 'Bayesian Ridge')

# Cluster + Linear regression 1

In [103]:
clu_feature = ['zipcode']
reg_feature = list(set(x_train.columns).difference(set(clu_feature)))

In [104]:
def extract_clu_member(i, labels):
    member = []
    for ix, it in enumerate(labels):
        if it == i:
            member.append(ix)
    return member

In [105]:
class model_2:
    
    def __init__(self, model, x_train, y_train, x_test, y_test, clu_feature, reg_feature, k=4):
        self.model = model
        self.x_train_clu = x_train[clu_feature]
        self.x_train_reg = x_train[reg_feature]
        self.y_train = y_train
        self.x_test_clu = x_test[clu_feature]
        self.x_test_reg = x_test[reg_feature]
        self.y_test = y_test
        self.k = k
        
        self.reg_model = [None]*k
        self.x_train_clued = [None]*k
        self.y_train_clued = [None]*k
        self.x_test_clued = [None]*k
        self.y_test_clued = [None]*k
        self.y_pred_train=np.zeros(0)
        self.y_pred_test=np.zeros(0)
        self.y_train_concat = np.zeros(0)
        self.y_test_concat = np.zeros(0)
        
        
    def fit_model_2(self):
        kmeans = KMeans(n_clusters=self.k, random_state=0).fit(self.x_train_clu)
        for i in range(self.k):
            clu_mem = extract_clu_member(i, kmeans.labels_)
            self.x_train_clued[i] = self.x_train_reg.iloc[clu_mem]
            self.y_train_clued[i] = self.y_train.iloc[clu_mem]
            
            clu_mem = extract_clu_member(i, kmeans.predict(self.x_test_clu))
            self.x_test_clued[i] = self.x_test_reg.iloc[clu_mem]
            self.y_test_clued[i] = self.y_test.iloc[clu_mem]
            
            self.reg_model[i] = model(self.model, self.x_train_clued[i], self.y_train_clued[i], self.x_test_clued[i], self.y_test_clued[i])
            self.reg_model[i].fit_model()
            
            self.y_pred_train = np.concatenate((self.y_pred_train, self.reg_model[i].y_pred_train))
            self.y_pred_test= np.concatenate((self.y_pred_test, self.reg_model[i].y_pred_test))
            self.y_train_concat= np.concatenate((self.y_train_concat, self.y_train_clued[i].as_matrix()))
            self.y_test_concat= np.concatenate((self.y_test_concat, self.y_test_clued[i].as_matrix()))
        return self
    
    def error_2(self):
        mae_train = metrics.median_absolute_error(self.y_pred_train, self.y_train_concat)
        mae_test = metrics.median_absolute_error(self.y_pred_test, self.y_test_concat)
        mse_train = metrics.mean_squared_error(self.y_pred_train, self.y_train_concat)
        mse_test = metrics.mean_squared_error(self.y_pred_test, self.y_test_concat)
        return mae_test, mse_test, mae_train, mse_train

In [106]:
def bootstrap_2(algo, x_train, y_train, x_test, y_test, clu_feature, reg_feature, B=50):
    mae_test = np.zeros(B)
    mse_test = np.zeros(B)
    for i in range(B):
        x_train_b, y_train_b = resample(x_train, y_train)
        new_model = model_2(algo, x_train_b, y_train_b, x_test, y_test, clu_feature, reg_feature)
        new_model = new_model.fit_model_2()
        err = new_model.error_2()
        mae_test[i] = err[0]
        mse_test[i] = err[1]
    return mae_test, mse_test

## RidgeCV

In [107]:
mae, mse = bootstrap_2(RidgeCV(alphas=reg_params, cv=5), x_train, y_train, x_test, y_test, clu_feature, reg_feature, B=50)

In [108]:
add_to_df(mae, mse, df_mae, df_mse, 'Cluster+RidgeCV')

## LassoCV

In [117]:
mae, mse = bootstrap_2(LassoCV(alphas=reg_params, cv=5), x_train, y_train, x_test, y_test, clu_feature, reg_feature)



In [118]:
add_to_df(mae, mse, df_mae, df_mse, 'Cluster+LassoCV')

## ElasticNet

In [121]:
err = {}

for alpha in [10**x for x in range(-5, 4)]:
    for l1_ratio in np.arange(0.1, 1, 0.1):
        new_model = model_2(ElasticNet(alpha=alpha, l1_ratio=l1_ratio), x_train, y_train, x_test, y_test, clu_feature, reg_feature)
        new_model = new_model.fit_model_2()
        err[alpha, l1_ratio]=new_model.error_2()




In [122]:
err_df = pd.DataFrame.from_dict(err, orient='index')
err_df.columns = ["mae_test", "mse_test", "mae_train", "mse_train"]
err_df.sort(['mae_test','mae_test']).head()

  app.launch_new_instance()


Unnamed: 0,mae_test,mse_test,mae_train,mse_train
"(0.01, 0.7)",19.67491,949.878388,19.37628,943.804542
"(1e-05, 0.9)",19.702201,949.009762,19.266355,941.193038
"(1e-05, 0.8)",19.703498,949.007759,19.266259,941.193309
"(1e-05, 0.7)",19.70474,949.005171,19.266178,941.193721
"(1e-05, 0.6)",19.717754,949.002605,19.266108,941.194261


In [124]:
mae, mse = bootstrap_2(ElasticNet(alpha=0.01, l1_ratio=0.7), x_train, y_train, x_test, y_test, clu_feature, reg_feature)

In [125]:
add_to_df(mae, mse, df_mae, df_mse, 'Cluster+ElasticNet')

## BayesRidge

In [133]:
params_l = [10**x for x in range(-6, -1)]
params_s = [10**x for x in range(-13, -8)]

In [134]:
err = {}

for a1 in params_s:
    for a2 in params_l:
        for lam1 in params_l:
            for lam2 in params_s:
                new_model = model_2(BayesianRidge(alpha_1=a1, alpha_2=a2, lambda_1=lam1, lambda_2=lam2), x_train, y_train, x_test, y_test, clu_feature, reg_feature)
                new_model = new_model.fit_model_2()
                err[a1, a2, lam1, lam2]=new_model.error_2()

In [135]:
err_df = pd.DataFrame.from_dict(err, orient='index')
err_df.columns = ["mae_test", "mse_test", "mae_train", "mse_train"]
err_df.sort(['mae_test','mae_test']).head()

  app.launch_new_instance()


Unnamed: 0,mae_test,mse_test,mae_train,mse_train
"(1e-12, 0.01, 0.01, 1e-12)",19.767728,948.799921,19.341457,942.787457
"(1e-12, 0.01, 0.01, 1e-13)",19.767728,948.799921,19.341457,942.787457
"(1e-13, 0.01, 0.01, 1e-13)",19.767728,948.799921,19.341457,942.787457
"(1e-13, 0.01, 0.01, 1e-11)",19.767728,948.799921,19.341457,942.787457
"(1e-11, 0.01, 0.01, 1e-12)",19.767728,948.799921,19.341457,942.787457


In [139]:
mae, mse = bootstrap_2(BayesianRidge(alpha_1=1e-12, alpha_2=0.01, lambda_1=0.01, lambda_2=1e-12), x_train, y_train, x_test, y_test, clu_feature, reg_feature)

In [140]:
add_to_df(mae, mse, df_mae, df_mse, 'Cluster+BayesianRidge')

# Cluster + Linear regression 2

In [51]:
x_train_reg = x_train.iloc[:,1:]
x_test_reg = x_test.iloc[:,1:]

In [8]:
kmeans_tr = KMeans(n_clusters=5).fit(x_train[['zipcode']])
cluster_tr = pd.get_dummies(kmeans_tr.labels_)
cluster_te = pd.get_dummies(kmeans_tr.predict(x_test[['zipcode']]))
x_train_new = pd.concat([x_train_reg, cluster_tr], axis=1)
x_test_new = pd.concat([x_test_reg, cluster_te], axis=1)

# Random Forest Regressor

In [151]:
from sklearn.ensemble import RandomForestRegressor

def random_forest_model(x, y):
    best_score = 0
    best_depth = 0
    # tune for tree depth from 1 to 20
    for j in range(1, 21):
        rf = RandomForestRegressor(n_estimators=100, max_depth=j, oob_score=True)
        rf.fit(x, y)
        score = rf.score(x, y)
        score = rf.oob_score_
        if score > best_score:
            best_score = score
            best_depth = j
    return best_score, best_depth

def random_forest_tuned(best_depth, x_train, x_test, y_train, y_test):
    rf = RandomForestRegressor(n_estimators=100, max_depth=best_depth)
    rf.fit(x_train, y_train)
    y_pred_test = rf.predict(x_test)
    y_pred_train = rf.predict(x_train)
    
    mae_train = metrics.median_absolute_error(y_pred_train, y_train)
    mse_train = metrics.mean_squared_error(y_pred_train, y_train)
    mae_test = metrics.median_absolute_error(y_pred_test, y_test)
    mse_test = metrics.mean_squared_error(y_pred_test, y_test)
    
    return mae_test, mse_test, mae_train, mse_train
    
    

In [152]:
rf_values = random_forest_model(x_train, y_train)

In [153]:
dep = rf_values[1]
dep

13

In [154]:
random_forest_tuned(dep, x_train, x_test, y_train, y_test)

(16.638008456792832,
 769.98080225720309,
 12.770363165113242,
 423.68982858467746)

In [155]:
B = 50
mae_test = np.zeros(B)
mse_test = np.zeros(B)

for i in range(B):
    x_train_b, y_train_b = resample(x_train, y_train)
    err = random_forest_tuned(dep, x_train_b, x_test, y_train_b, y_test)
    mae_test[i] = err[0]
    mse_test[i] = err[1]

In [156]:
add_to_df(mae_test, mse_test, df_mae, df_mse, 'RandomForest')

In [157]:
df_mae

Unnamed: 0,RidgeCV,LassoCV,Elastic Net,Bayesian Ridge,Cluster+RidgeCV,Cluster+LassoCV,Cluster+ElasticNet,Cluster+BayesianRidge,RandomForest
0,22.502401,22.277641,22.491768,22.303621,19.808924,19.853719,19.86593,19.920144,17.214124
1,22.260726,22.572311,22.327935,22.387959,19.850527,19.905292,19.871863,19.711863,16.973145
2,22.48432,22.15064,22.225014,22.201448,19.673575,19.828611,19.865896,19.782468,16.660871
3,22.460174,22.462387,22.398942,22.239297,19.794686,19.683496,19.905214,19.679832,16.992076
4,22.347536,22.278736,22.360506,22.586287,19.602049,19.865757,19.879985,19.995347,17.163071
5,22.415236,22.172011,22.28888,22.363778,19.830526,19.827931,19.976835,19.764557,16.81646
6,22.43329,22.424421,22.41397,22.352696,20.039252,19.839518,19.914018,19.768828,17.008459
7,22.406611,22.101011,22.121561,22.270664,19.900161,19.802701,19.879641,19.85007,17.195894
8,22.171628,22.322204,22.333176,22.358787,19.856051,19.883596,19.842464,19.840141,17.109556
9,22.240735,22.440242,22.288964,22.493874,19.763349,19.898675,20.00701,19.951937,17.181746


In [31]:
df_mae.to_csv("median absolute error_add.csv")

In [32]:
df_mse.to_csv("mean squared error_add.csv")