In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw listings table from the CSV file under data/.
df = pd.read_csv(filepath_or_buffer='data/AB_NYC_2019.csv')

In [2]:
# Keep only the columns used for modelling, then replace every missing
# value in that subset with 0 (applies to all selected columns alike).
df_cleaned = (
    df[[
        'neighbourhood_group',
        'neighbourhood',
        'latitude',
        'longitude',
        'room_type',
        'price',
        'minimum_nights',
        'number_of_reviews',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365',
    ]]
    .fillna(0)
)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for df_cleaned;
# left as the last expression so the notebook renders the resulting table.
df_cleaned.describe()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0
mean,40.728949,-73.95217,152.720687,7.029962,23.274466,1.09091,7.143982,112.781327
std,0.05453,0.046157,240.15417,20.51055,44.550582,1.597283,32.952519,131.622289
min,40.49979,-74.24442,0.0,1.0,0.0,0.0,1.0,0.0
25%,40.6901,-73.98307,69.0,1.0,1.0,0.04,1.0,0.0
50%,40.72307,-73.95568,106.0,3.0,5.0,0.37,1.0,45.0
75%,40.763115,-73.936275,175.0,5.0,24.0,1.58,2.0,227.0
max,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [17]:


# LabelEncoder is imported in this cell because it is needed here, before the
# cell that imports the rest of scikit-learn; without this import a fresh
# "Restart Kernel -> Run All" fails on this line with a NameError.
from sklearn.preprocessing import LabelEncoder

# Convert categorical values to labels.
# Numeric columns are passed through unchanged; every object-typed column is
# replaced by integer codes produced by its own, freshly fitted LabelEncoder.
# The two parts are then put side by side again (numeric columns first).
df_clean_labeled = pd.concat(
    [
        df_cleaned.select_dtypes(include='number'),
        df_cleaned.select_dtypes(include='object').apply(
            lambda column: LabelEncoder().fit_transform(column)
        ),
    ],
    axis=1,
)


    

In [25]:
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso, Lars, SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate, train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

# The 'price' column is the regression target; every remaining column of the
# label-encoded frame is used as a feature.
y = df_clean_labeled['price']
X = df_clean_labeled.drop('price', axis=1)

# Candidate regressors to compare. Each entry pairs the estimator instance
# ('model') with the label ('name') used when reporting its scores.
models = [
    {'name': label, 'model': estimator}
    for label, estimator in (
        ('Linear Regression', LinearRegression()),
        ('Lasso Regression', Lasso()),
        ('Ridge Regression', Ridge()),
        ('ElasticNet Regression', ElasticNet()),
        ('Lars Regression', Lars()),
        ('Decision Tree Regressor', DecisionTreeRegressor(max_depth=4)),
        ('KNeighbors Regression', KNeighborsRegressor()),
        # Currently disabled candidates:
        # ('SGD Regression', SGDRegressor(max_iter=10000)),
        # ('Linear SVR Regression', LinearSVR()),
    )
]



In [26]:
# Run 5-fold cross-validation (R^2 scoring, train scores included) for every
# candidate model and collect one result dict per model, tagged with its name.
# NOTE(review): KFold(5) is used without shuffling, so the folds follow the
# row order of X -- confirm that this is intended.
results = []
for candidate in models:
    label = candidate['name']
    print("Calculating " + label)
    cv_scores = cross_validate(
        candidate['model'],
        X,
        y,
        scoring='r2',
        cv=KFold(5),
        return_train_score=True,
    )
    cv_scores['name'] = label
    results.append(cv_scores)

Calculating Linear Regression
Calculating Lasso Regression
Calculating Ridge Regression
Calculating ElasticNet Regression
Calculating Lars Regression
Calculating Decision Tree Regressor
Calculating KNeighbors Regression


In [27]:
from IPython.display import display, Markdown
import numpy as np

def printResults(r):
    """Render a Markdown summary for each cross-validation result in ``r``.

    ``r`` is an iterable of dict-like results. Each must provide a 'name'
    string plus 'fit_time', 'test_score' and 'train_score' sequences. For
    every result a level-2 heading with the name is displayed, followed by
    the average of each of the three sequences formatted to four decimals.
    """
    reported_metrics = (
        ('Avg Fit Time', 'fit_time'),
        ('Avg Test Score', 'test_score'),
        ('Avg Train Score', 'train_score'),
    )
    for entry in r:
        display(Markdown('## ' + entry['name']))
        for label, key in reported_metrics:
            display(Markdown('**%s:** %2.4f ' % (label, np.mean(entry[key]))))
                
# Report every model, best average test R^2 first.
display(Markdown('# Results Sorted by r2 score'))
printResults(
    sorted(
        results,
        key=lambda cv_result: np.mean(cv_result['test_score']),
        reverse=True,
    )
)

# Results Sorted by r2 score

## Ridge Regression

**Avg Fit Time:** 0.0136 

**Avg Test Score:** 0.0872 

**Avg Train Score:** 0.0903 

## Linear Regression

**Avg Fit Time:** 0.0244 

**Avg Test Score:** 0.0871 

**Avg Train Score:** 0.0903 

## Lars Regression

**Avg Fit Time:** 0.0220 

**Avg Test Score:** 0.0871 

**Avg Train Score:** 0.0903 

## Lasso Regression

**Avg Fit Time:** 0.0217 

**Avg Test Score:** 0.0775 

**Avg Train Score:** 0.0806 

## ElasticNet Regression

**Avg Fit Time:** 0.0185 

**Avg Test Score:** 0.0502 

**Avg Train Score:** 0.0532 

## Decision Tree Regressor

**Avg Fit Time:** 0.0902 

**Avg Test Score:** -0.0674 

**Avg Train Score:** 0.1752 

## KNeighbors Regression

**Avg Fit Time:** 0.1939 

**Avg Test Score:** -0.1197 

**Avg Train Score:** 0.3012 

In [31]:
from sklearn.model_selection import GridSearchCV

# Grid-search the neighbour count for k-NN with 5-fold CV and R^2 scoring.
KNeighborsGrid = GridSearchCV(
    KNeighborsRegressor(),
    {'n_neighbors': [5, 10, 20, 50]},
    scoring='r2',
    return_train_score=True,
    cv=KFold(5),
)
KNeighborsGrid.fit(X, y)

# Same search setup for the decision tree, varying its maximum depth.
DecisionTreeGrid = GridSearchCV(
    DecisionTreeRegressor(),
    {'max_depth': [4, 8, 12, 24]},
    scoring='r2',
    return_train_score=True,
    cv=KFold(5),
)
DecisionTreeGrid.fit(X, y)

# Report the best parameters and best cross-validated score found by each
# grid search. One loop replaces the two copy-pasted display triplets, and
# the decision-tree heading is spelled correctly ("Decision", not "Desicion").
display(Markdown('# Hyperparameter Tuning'))

for title, search in (
    ('KNeighbors', KNeighborsGrid),
    ('Decision Tree', DecisionTreeGrid),
):
    display(Markdown('## %s Grid Search Results' % title))
    # Wrap in a 1-tuple so dict-valued best_params_ is formatted as a value.
    display(Markdown('**Best Params:** %s ' % (search.best_params_,)))
    display(Markdown('**Best Score:** %s ' % (search.best_score_,)))

# Hyperparameter Tuning

## KNeighbors Grid Search Results

**Best Params:** {'n_neighbors': 50} 

**Best Score:** 0.032899972545914855 

## Desicion Tree Grid Search Results

**Best Params:** {'max_depth': 4} 

**Best Score:** -0.06743423886122246 