In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
pd.set_option('display.max_columns', None)
import os
import chart_studio.plotly as py
# import plotly.graph_ogjs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
#Importing the data
final_df=pd.DataFrame()
a_df=pd.DataFrame()
b_df=pd.DataFrame()
c_df=pd.DataFrame()

for file_n in os.listdir('listings'):
    if file_n[0]=='.':
        continue
    else:
        add_df = pd.read_csv(f'listings/{file_n}')
        if len(add_df.columns)==106:
            final_df = pd.concat([final_df, add_df], axis=0)
        elif len(add_df.columns)==96:
            a_df = pd.concat([a_df, add_df], axis=0)
        elif len(add_df.columns)==95:
            b_df = pd.concat([b_df, add_df], axis=0)
        else:
            c_df = pd.concat([c_df, add_df], axis=0)


Columns (43) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (61,62) have mixed types. Specify dtype option on import or set low_memory=False.


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [16]:
#These are the columns not present in 2018 reports. They will be dropped.
not_in_2018 = ['minimum_minimum_nights','maximum_minimum_nights','minimum_maximum_nights','maximum_maximum_nights',
       'minimum_nights_avg_ntm','maximum_nights_avg_ntm', 'number_of_reviews_ltm', 
       'calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms', 
       'calculated_host_listings_count_shared_rooms']
sf_df = final_df.drop(columns=not_in_2018)
sf_df = pd.concat([sf_df, a_df], axis=0)

#drop one column not in 2017
sf_df = sf_df.drop(columns=['is_business_travel_ready'])
sf_df = pd.concat([sf_df, b_df], axis=0)

#drop 3 columns not in 2016 & Prior
sf_df = sf_df.drop(columns=['access', 'interaction', 'house_rules'])
sf_df = pd.concat([sf_df, c_df], axis=0)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





## Data Cleaning

In [17]:
pd.set_option('display.max_rows', 20)
#Converting Existing Data Columns to Dates
sf_df['last_scraped'] = pd.to_datetime(sf_df['last_scraped'])
sf_df['host_since'] = pd.to_datetime(sf_df['host_since'])
#Adding new date features
sf_df['year'] = sf_df['last_scraped'].transform(lambda x: x.year)
sf_df['month-year'] = sf_df['last_scraped'].transform(lambda x: f'{x.month} {x.year}')
sf_df['month'] = sf_df['last_scraped'].transform(lambda x: x.month)
sf_df['day_of_week'] = sf_df['last_scraped'].transform(lambda x: x.dt.dayofweek)
sf_df['day'] = sf_df['last_scraped'].transform(lambda x: x.day)

#Converting All Price Related Columns from Objects to Floats
sf_df['price'] = sf_df['price'].transform(lambda x: float(x.replace(',', '').replace('$', '')))
sf_df['extra_people'] = sf_df['extra_people'].transform(lambda x: float(x.replace(',', '').replace('$', '')))

#Fill NaNs in fee columns with 0's, because no additional fee, then convert
sf_df[['security_deposit','cleaning_fee']] = sf_df[['security_deposit','cleaning_fee']].fillna(int(0))
sf_df['security_deposit'] = sf_df['security_deposit'].transform(lambda x: 0 if x==int(0) else float(x.replace(',', '').replace('$', '')))
sf_df['cleaning_fee'] = sf_df['cleaning_fee'].transform(lambda x: 0 if x==int(0) else float(x.replace(',', '').replace('$', '')))

#Dropping columns that have over 75% null
over_70_null = sf_df.columns[sf_df.isnull().sum()/len(sf_df) > 0.70]
sf_df = sf_df.drop(columns=over_70_null)

#Removing Outliers (0 & over 2000 daily rate)
outlier_thresh=2000
sf_df = sf_df.loc[~((sf_df['price'] == 0) | (sf_df['price'] > outlier_thresh))]

In [103]:
sf_df['month-year'].nunique()

52

## Train Test Split #1: Use 2015 - 2018 to predict 2019 & 2020

In [48]:
test_years=[2020,2019]
train_df = sf_df[~sf_df['year'].isin(test_years)]
test_df = sf_df[sf_df['year'].isin(test_years)]

In [49]:
#Filling the NaNs in beds, bathrooms, bedrooms
#Assume that if the tenant has the full apartment, they have a bathroom/bedroom
cond1= (train_df['room_type']=='Entire home/apt')
train_df.loc[cond1 & (train_df['beds']==0), train_df.columns=='beds'] = 1
train_df.loc[cond1 & (train_df['bathrooms']==0), train_df.columns=='bathrooms']
#For the rest, just fill na
train_df[['beds','bathrooms', 'bedrooms']] = train_df[['beds','bathrooms', 'bedrooms']].fillna(0)

#For review scores, let's just fillin with the average
review_lst = ['review_scores_accuracy', 'review_scores_checkin',
       'review_scores_cleanliness', 'review_scores_communication',
       'review_scores_location', 'review_scores_rating', 'review_scores_value']
for review in review_lst:
    train_df[review] = train_df[review].fillna(train_df[review].mean())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Baseline Models: No Parameter Tuning or Feature Engineering

In [84]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [77]:
features=['accommodates','bathrooms', 'bed_type','bedrooms', 'beds','cleaning_fee',
          'extra_people', 'host_response_time',
         'neighbourhood_cleansed','property_type','review_scores_cleanliness',
          'review_scores_rating', 'room_type', 'security_deposit', 'month-year',
          'month','day_of_week']
X = train_df[features]
y = train_df['price'].apply(np.log)

In [78]:
X = pd.get_dummies(X, columns=['bed_type','host_response_time', 
                           'neighbourhood_cleansed','property_type',
                           'room_type', 'month-year','month','day_of_week'])

In [79]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)

In [80]:
X_train.isnull().sum().sort_values(ascending=False)

day_of_week_6                              0
neighbourhood_cleansed_South of Market     0
property_type_Apartment                    0
property_type_Aparthotel                   0
neighbourhood_cleansed_Western Addition    0
                                          ..
month-year_10 2016                         0
month-year_1 2018                          0
month-year_1 2017                          0
room_type_Shared room                      0
accommodates                               0
Length: 158, dtype: int64

In [96]:
estimator = LinearRegression()
model = estimator.fit(X_train,y_train)
preds = model.predict(X_val)
r2 = model.score(X_val, y_val)
rmse = mean_squared_error(preds, y_val)
cs_val = cross_val_score(estimator, X_train, y_train, cv=3)
cs_val2 = cross_val_score(estimator, X, y, cv=3)
print(f'R2: {r2}, RMSE: {np.exp(rmse)},\nCross Val Scores: {cs_val}\n {cs_val2}')

R2: 0.6173952663311504, RMSE: 1.2014911787387206,
Cross Val Scores: [0.61926593 0.62268511 0.62378648]
 [0.52507051 0.59334955 0.27202679]


In [97]:
estimator = DecisionTreeRegressor()
model = estimator.fit(X_train,y_train)
preds = model.predict(X_val)
r2 = model.score(X_val, y_val)
rmse = mean_squared_error(preds, y_val)
cs_val = cross_val_score(estimator, X_train, y_train, cv=3)
cs_val2 = cross_val_score(estimator, X, y, cv=3)
print(f'R2: {r2}, RMSE: {np.exp(rmse)},\nCross Val Scores: {cs_val}\n {cs_val2}')

R2: 0.8095543120834257, RMSE: 1.095675102991079,
Cross Val Scores: [0.78017047 0.77630018 0.77717544]
 [0.60992562 0.80553189 0.72205495]


In [99]:
estimator = RandomForestRegressor()
model = estimator.fit(X_train,y_train)
preds = model.predict(X_val)
r2 = model.score(X_val, y_val)
rmse = mean_squared_error(preds, y_val)
cs_val = cross_val_score(estimator, X_train, y_train, cv=3)
cs_val2 = cross_val_score(estimator, X, y, cv=3)
print(f'R2: {r2}, RMSE: {np.exp(rmse)},\nCross Val Scores: {cs_val}\n {cs_val2}')


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



R2: 0.8689382716749707, RMSE: 1.064898919273087,
Cross Val Scores: [0.85360751 0.85187293 0.85314819]
 [0.72735395 0.8669464  0.81268163]


In [100]:
estimator = GradientBoostingRegressor()
model = estimator.fit(X_train,y_train)
preds = model.predict(X_val)
r2 = model.score(X_val, y_val)
rmse = mean_squared_error(preds, y_val)
cs_val = cross_val_score(estimator, X_train, y_train, cv=3)
cs_val2 = cross_val_score(estimator, X, y, cv=3)
print(f'R2: {r2}, RMSE: {np.exp(rmse)},\nCross Val Scores: {cs_val}\n {cs_val2}')

R2: 0.656331075665477, RMSE: 1.1792552355223438,
Cross Val Scores: [0.65950634 0.65872468 0.65577119]
 [0.63179432 0.6356818  0.65560885]
