In [81]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [82]:
# Columns read from the CSV; 'price' is later split off as the target.
columns = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'price',
]

df = pd.read_csv('AB_NYC_2019.csv', usecols=columns)

# Missing review rates are replaced with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [83]:
# Replace any remaining NaNs with 0. `fillna` returns a new frame, so the
# result has to be assigned back; a bare `df.fillna(0)` leaves `df` unchanged.
df = df.fillna(0)

# Work with log(1 + price) as the target.
# NOTE: re-running this cell applies the transform a second time; reload `df`
# from the CSV cell before re-executing.
df['price'] = np.log1p(df['price'])

In [84]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: 20% of the rows go to `test`, then 20% of
# the remaining rows go to `val`.
# NOTE(review): this yields roughly 64% / 16% / 20% (train / val / test). If a
# 60/20/20 split was intended, the second test_size should be 0.25 -- confirm.
train, test = train_test_split(df, random_state=1, test_size=0.2)
train, val = train_test_split(train, random_state=1, test_size=0.2)

In [85]:
# Separate the target from each split: `pop` returns the 'price' column and
# removes it from the frame in one step.
y_train = train.pop('price')
y_test = test.pop('price')
y_val = val.pop('price')

In [86]:
from sklearn.feature_extraction import DictVectorizer

# Dense output so the matrices can be passed straight to the estimators below.
v = DictVectorizer(sparse=False)

# One dict per row, keyed by column name.
dict_train = train.to_dict(orient='records')
dict_val = val.to_dict(orient='records')

# The vocabulary is learned from the training rows only; validation rows are
# encoded with that same vocabulary.
X_train = v.fit_transform(dict_train)
X_val = v.transform(dict_val)

In [87]:
from sklearn.tree import DecisionTreeRegressor

# A depth-1 tree, fitted on the training split; `fit` returns the estimator
# itself, so the chained call leaves the fitted model in `regressor`.
regressor = DecisionTreeRegressor(max_depth=1, random_state=0).fit(X_train, y_train)
regressor

DecisionTreeRegressor(max_depth=1, random_state=0)

In [88]:
# Pair every encoded feature with its importance in the fitted tree.
# `feature_names_` is the vectorizer's fitted attribute (already used for the
# xgboost DMatrix further down); `get_feature_names()` no longer exists in
# recent scikit-learn releases.
dict(zip(v.feature_names_, regressor.feature_importances_.round(3)))

{'availability_365': 0.0,
 'calculated_host_listings_count': 0.0,
 'latitude': 0.0,
 'longitude': 0.0,
 'minimum_nights': 0.0,
 'neighbourhood_group=Bronx': 0.0,
 'neighbourhood_group=Brooklyn': 0.0,
 'neighbourhood_group=Manhattan': 0.0,
 'neighbourhood_group=Queens': 0.0,
 'neighbourhood_group=Staten Island': 0.0,
 'number_of_reviews': 0.0,
 'reviews_per_month': 0.0,
 'room_type=Entire home/apt': 1.0,
 'room_type=Private room': 0.0,
 'room_type=Shared room': 0.0}

In [89]:
def predict(estimators=10, max_depth=None):
    """Fit a random forest on the training split and score it on validation.

    Reads the notebook-level arrays ``X_train``, ``y_train``, ``X_val`` and
    ``y_val``.

    Parameters
    ----------
    estimators : int, default 10
        Number of trees, passed as ``n_estimators``.
    max_depth : int or None, default None
        Maximum depth of each tree. Any falsy value (``None`` or ``0``) leaves
        the depth unlimited, as before.

    Returns
    -------
    tuple
        ``(regressor, rmse)``: the fitted ``RandomForestRegressor`` and its
        root-mean-squared error on the validation split.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error

    # Build the model once; `max_depth=None` is the estimator's own default,
    # so there is no need for a second constructor call.
    regressor = RandomForestRegressor(
        n_estimators=estimators,
        max_depth=max_depth or None,
        random_state=1,
        n_jobs=-1,
    )
    regressor.fit(X_train, y_train)
    y_val_pred = regressor.predict(X_val)

    # sqrt(MSE) rather than `squared=False`: the keyword is deprecated and
    # dropped in newer scikit-learn, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    return regressor, rmse

In [90]:
# Baseline: 10 trees, no depth limit.
regressor, rmse = predict(estimators=10)
rmse

0.46078067210100937

In [91]:
# Sweep the number of trees from 10 to 200 in steps of 10 and report the
# validation RMSE for each setting.
for estimator in range(10, 201, 10):
    regressor, rmse = predict(estimator)
    print(estimator, rmse, sep=' === ')

10 === 0.46078067210100937
20 === 0.44777342751030147
30 === 0.44348203033518463
40 === 0.4421868299723399
50 === 0.4414563841054784
60 === 0.4409265549453157
70 === 0.44015536393193144
80 === 0.43958432181362317
90 === 0.43935814374764653
100 === 0.43912841008144876
110 === 0.4387094007857012
120 === 0.43848857961017723
130 === 0.4380852951696702
140 === 0.43793312374001275
150 === 0.4379224946086783
160 === 0.4378122599508284
170 === 0.43761517304863007
180 === 0.4376249204741554
190 === 0.4374779847346011
200 === 0.43754618547576885


In [92]:
# Grid over (n_estimators, max_depth); each entry is
# [estimator, max_depth, validation rmse].
rmse_df = []
for estimator in range(10, 201, 10):
    for max_depth in (10, 15, 20, 25):
        regressor, rmse = predict(estimator, max_depth)
        rmse_df += [[estimator, max_depth, rmse]]

In [95]:
# Tabulate the grid results and display the row with the lowest RMSE.
rmse_val_df = pd.DataFrame(rmse_df, columns=['estimator', 'max_depth', 'rmse'])
idx = rmse_val_df['rmse'].idxmin()
# `idxmin()` returns an index *label*, so it must be looked up with `.loc`;
# `.iloc` is positional and only agrees while the index is the default range.
rmse_val_df.loc[idx]

estimator    200.000000
max_depth     15.000000
rmse           0.434755
Name: 77, dtype: float64

In [96]:
# Refit with 10 trees and max_depth=20; `regressor` is inspected for feature
# importances in the next cell.
regressor, rmse = predict(estimators=10, max_depth=20)

In [102]:
# Feature importances of the fitted forest, sorted ascending.
# `feature_names_` replaces `get_feature_names()`, which no longer exists in
# recent scikit-learn releases and matches what the xgboost cell uses.
pd.DataFrame(
    zip(v.feature_names_, regressor.feature_importances_.round(3)),
    columns=['feature', 'importance'],
).sort_values(by='importance')

Unnamed: 0,feature,importance
5,neighbourhood_group=Bronx,0.0
9,neighbourhood_group=Staten Island,0.0
6,neighbourhood_group=Brooklyn,0.001
8,neighbourhood_group=Queens,0.001
13,room_type=Private room,0.003
14,room_type=Shared room,0.006
1,calculated_host_listings_count,0.031
7,neighbourhood_group=Manhattan,0.035
10,number_of_reviews,0.043
11,reviews_per_month,0.053


In [119]:
import xgboost as xgb
def evaluate(eta):
    """Train an xgboost regressor with learning rate ``eta`` and score it.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
    the fitted vectorizer ``v`` (for feature names). The parameter dict is
    printed before training.

    Parameters
    ----------
    eta : float
        Value stored under the ``'eta'`` key of the xgboost parameters.

    Returns
    -------
    float
        Root-mean-squared error of the 100-round model on the validation split.
    """
    # Imported here because no cell imports it at notebook level (it is only a
    # local name inside `predict`), so a fresh kernel would raise NameError.
    from sklearn.metrics import mean_squared_error

    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,

        'objective': 'reg:squarederror',
        'nthread': 8,

        'seed': 1,
        'verbosity': 1,
    }
    print(xgb_params)

    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=v.feature_names_)
    dval = xgb.DMatrix(X_val, label=y_val, feature_names=v.feature_names_)

    model = xgb.train(xgb_params, dtrain, num_boost_round=100)
    y_val_pred = model.predict(dval)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
    return np.sqrt(mean_squared_error(y_val, y_val_pred))

In [120]:
# Validation RMSE with eta = 0.3.
evaluate(eta=0.3)

{'eta': 0.3, 'max_depth': 6, 'min_child_weight': 1, 'objective': 'reg:squarederror', 'nthread': 8, 'seed': 1, 'verbosity': 1}


0.43723814828957047

In [121]:
# Validation RMSE with eta = 0.1.
evaluate(eta=0.1)

{'eta': 0.1, 'max_depth': 6, 'min_child_weight': 1, 'objective': 'reg:squarederror', 'nthread': 8, 'seed': 1, 'verbosity': 1}


0.4345855408382242

In [122]:
# Validation RMSE with eta = 0.01.
# NOTE(review): the recorded result is far worse than for 0.3 / 0.1 --
# presumably 100 boosting rounds are too few for so small a step; confirm.
evaluate(eta=0.01)

{'eta': 0.01, 'max_depth': 6, 'min_child_weight': 1, 'objective': 'reg:squarederror', 'nthread': 8, 'seed': 1, 'verbosity': 1}


1.631293499371654