## Capstone: Airbnb in DC
## Modeling

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

%matplotlib inline

### Functions

In [2]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print train/test R^2, RMSE and MAE for an already fitted regressor.

    R^2 is taken from ``model.score`` on the targets exactly as passed in.
    RMSE and MAE are computed after applying ``np.exp`` to both the targets
    and the predictions (the notebook models ``log_price``, so this
    presumably reports the errors on the original price scale).

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict``
    X_train, X_test : feature matrices accepted by ``model``
    y_train, y_test : targets matching the corresponding feature matrix

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Score model: R-squared (coefficient of determination)
    print('train R^2:', model.score(X_train, y_train))
    print('test R^2:', model.score(X_test, y_test))
    print('')

    # Predict once per split and reuse the result for both error metrics
    # (previously each split was predicted twice with identical inputs).
    actual_train = np.exp(y_train)
    predicted_train = np.exp(model.predict(X_train))
    actual_test = np.exp(y_test)
    predicted_test = np.exp(model.predict(X_test))

    # compare train & test RMSE (root mean squared error)
    train_rmse = np.sqrt(mean_squared_error(actual_train, predicted_train))
    test_rmse = np.sqrt(mean_squared_error(actual_test, predicted_test))

    print('train RMSE:', train_rmse)
    print('test RMSE:', test_rmse)
    print('')

    # compare train & test MAE (mean absolute error)
    train_mae = mean_absolute_error(actual_train, predicted_train)
    test_mae = mean_absolute_error(actual_test, predicted_test)

    print('train MAE:', train_mae)
    print('test MAE:', test_mae)

### Import cleaned data

In [3]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the stderr residue saved with this cell looks like pandas'
# mixed-type DtypeWarning - confirm, and consider passing explicit dtypes for
# the affected columns.
dflist = pd.read_csv('../data/airbnb01.csv', low_memory=False)
dflist.shape

  interactivity=interactivity, compiler=compiler, result=result)


(82942, 103)

### Data preprocessing for linear regression

In [4]:
# Columns used as-is (numeric)
features_num = [
    'calculated_host_listings_count',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'availability_365',
    'number_of_reviews',
    'review_scores_rating',
]

# Columns that get one-hot encoded later (categorical)
features_cat = [
    'host_is_superhost',
    'neighbourhood_cleansed',
    'zipcode',
    'property_type',
    'room_type',
    'cancellation_policy',
]

# Every column pulled from the data: predictors first, target (log_price) last
features = [*features_num, *features_cat, 'log_price']
features

['calculated_host_listings_count',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'availability_365',
 'number_of_reviews',
 'review_scores_rating',
 'host_is_superhost',
 'neighbourhood_cleansed',
 'zipcode',
 'property_type',
 'room_type',
 'cancellation_policy',
 'log_price']

In [5]:
# Keep only rows where every selected column (predictors and target) is present
X = dflist.loc[:, features].dropna()

# Split off the target, then one-hot encode the categorical predictors,
# dropping the first level of each
y = X.loc[:, 'log_price']
X = pd.get_dummies(
    X.drop('log_price', axis=1),
    columns=features_cat,
    drop_first=True,
)
X.head()

Unnamed: 0,calculated_host_listings_count,accommodates,bathrooms,bedrooms,beds,availability_365,number_of_reviews,review_scores_rating,host_is_superhost_t,"neighbourhood_cleansed_Brookland, Brentwood, Langdon",...,property_type_Serviced apartment,property_type_Townhouse,property_type_other,room_type_Private room,room_type_Shared room,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,18,4,1.0,1.0,2.0,283,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6,3.0,3.0,3.0,343,65,94.0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,2,1.0,1.0,1.0,365,0,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,4,1.0,1.0,1.0,351,0,0.0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,1,4,1.0,2.0,4.0,361,0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [6]:
# X and y were already built by the identical cell directly above, so they are
# validated here instead of being rebuilt a second time.
assert 'log_price' not in X.columns, 'target column leaked into the feature matrix'
assert X.index.equals(y.index), 'X and y rows are misaligned'
assert not X.isna().any().any(), 'missing feature values remain after dropna'
assert not y.isna().any(), 'missing target values remain after dropna'
X.head()

Unnamed: 0,calculated_host_listings_count,accommodates,bathrooms,bedrooms,beds,availability_365,number_of_reviews,review_scores_rating,host_is_superhost_t,"neighbourhood_cleansed_Brookland, Brentwood, Langdon",...,property_type_Serviced apartment,property_type_Townhouse,property_type_other,room_type_Private room,room_type_Shared room,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,18,4,1.0,1.0,2.0,283,0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6,3.0,3.0,3.0,343,65,94.0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,2,1.0,1.0,1.0,365,0,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,4,1.0,1.0,1.0,351,0,0.0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,1,4,1.0,2.0,4.0,361,0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
# Hold-out split with scikit-learn's default test size; the seed is fixed so
# the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [8]:
# Put every feature on a common scale (training mean 0, standard deviation 1).
# Features measured in different units become comparable, and many machine
# learning algorithms expect standardized inputs.
# The scaler learns its statistics from the training rows only; the same
# statistics are then applied to the test rows.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

### Baseline regression model

In [9]:
# Baseline: a dummy regressor that ignores the features and always predicts the
# mean of the training target.
# ref: https://chrisalbon.com/machine_learning/model_evaluation/create_baseline_regression_model/
dummy_mean = DummyRegressor(strategy='mean').fit(X_train_sc, y_train)

# Errors are computed after np.exp() on both the target and the prediction
y_pred = dummy_mean.predict(X_test_sc)
mse = mean_squared_error(np.exp(y_test), np.exp(y_pred))
rmse = np.sqrt(mse)
mae = mean_absolute_error(np.exp(y_test), np.exp(y_pred))

# Report R-squared, RMSE and MAE on the test split
print('baseline R^2:', dummy_mean.score(X_test_sc, y_test))
print('baseline RMSE:', rmse)
print('baseline MAE:', mae)

baseline R^2: -5.143814048769179e-05
baseline RMSE: 393.1244874668375
baseline MAE: 141.9438372966505


### Model 01: Linear regression

In [10]:
# Ordinary least squares on the standardized training features
model = LinearRegression().fit(X_train_sc, y_train)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# Train/test metrics for the linear regression
evaluate_model(
    model,
    X_train=X_train_sc,
    y_train=y_train,
    X_test=X_test_sc,
    y_test=y_test,
)

train R^2: 0.5665116260426284
test R^2: 0.5664101819341408

train RMSE: 304.6476528132647
test RMSE: 331.0587985350242

train MAE: 105.48749832225363
test MAE: 107.24797471496684


### Model 02: Lasso

In [12]:
# LassoCV chooses the regularization strength (alpha) by 3-fold
# cross-validation while fitting
lasso_model = LassoCV(cv=3).fit(X_train_sc, y_train)

# alpha selected by cross-validation
lasso_model.alpha_

0.0004103850950474946

In [13]:
# Train/test metrics for the lasso model
evaluate_model(
    lasso_model,
    X_train=X_train_sc,
    y_train=y_train,
    X_test=X_test_sc,
    y_test=y_test,
)

train R^2: 0.5659975083788471
test R^2: 0.5659293958876668

train RMSE: 304.98919475309043
test RMSE: 331.33616722949535

train MAE: 105.51339964569439
test MAE: 107.28701618277356


In [14]:
# Coefficient table: one row per input column, ranked by absolute size

lasso_coefs = pd.DataFrame({'coef': lasso_model.coef_}, index=X_train.columns)
lasso_coefs['abs_coef'] = lasso_coefs['coef'].abs()
lasso_coefs.sort_values(by='abs_coef', ascending=False)

Unnamed: 0,coef,abs_coef
review_scores_rating,-0.241026,0.241026
room_type_Private room,-0.203300,0.203300
bedrooms,0.189268,0.189268
room_type_Shared room,-0.187615,0.187615
accommodates,0.181607,0.181607
"neighbourhood_cleansed_Downtown, Chinatown, Penn Quarters, Mount Vernon Square, North Capitol Street",0.110560,0.110560
"neighbourhood_cleansed_Capitol Hill, Lincoln Park",0.102147,0.102147
"neighbourhood_cleansed_Shaw, Logan Circle",0.091821,0.091821
bathrooms,0.089964,0.089964
"neighbourhood_cleansed_Union Station, Stanton Park, Kingman Park",0.087670,0.087670


In [15]:
# Features whose coefficient the lasso shrank all the way to zero

lasso_coefs.loc[lasso_coefs['abs_coef'].eq(0)]

Unnamed: 0,coef,abs_coef
"neighbourhood_cleansed_Friendship Heights, American University Park, Tenleytown",0.0,0.0


### Data preprocessing for model 03

In [16]:
# Add the time-related columns (year_* and q_*) to the model inputs.
# The list is rebuilt from its parts rather than extended in place, so running
# this cell more than once cannot append the time columns repeatedly (which
# would make dflist[features] select duplicate columns).
features_time = ['year_2015', 'year_2018', 'q_2', 'q_3', 'q_4']
features = features_num + features_cat + ['log_price'] + features_time

# Keep complete rows only, then separate the target from the predictors
X = dflist[features].dropna()
y = X['log_price']
X = X.drop(columns='log_price')

# One-hot encode the categorical predictors (first level of each dropped)
X = pd.get_dummies(X, columns=features_cat, drop_first=True)
X.head()

Unnamed: 0,calculated_host_listings_count,accommodates,bathrooms,bedrooms,beds,availability_365,number_of_reviews,review_scores_rating,year_2015,year_2018,...,property_type_Serviced apartment,property_type_Townhouse,property_type_other,room_type_Private room,room_type_Shared room,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,18,4,1.0,1.0,2.0,283,0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6,3.0,3.0,3.0,343,65,94.0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1,2,1.0,1.0,1.0,365,0,0.0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,1,4,1.0,1.0,1.0,351,0,0.0,1,0,...,0,1,0,0,0,0,0,0,0,0
5,1,4,1.0,2.0,4.0,361,0,0.0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [17]:
# Same seeded hold-out split as before, now on the matrix with time features
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [18]:
# Learn the scaling statistics from the training rows, then apply them to both splits
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

### Model 03: Lasso (with time features)

In [19]:
# Lasso with alpha chosen by 3-fold cross-validation during the fit
lasso_model = LassoCV(cv=3).fit(X_train_sc, y_train)

# alpha selected by cross-validation
lasso_model.alpha_

0.0004103850950474946

In [20]:
# Train/test metrics for the lasso model with time features
evaluate_model(
    lasso_model,
    X_train=X_train_sc,
    y_train=y_train,
    X_test=X_test_sc,
    y_test=y_test,
)

train R^2: 0.5679197167645623
test R^2: 0.5675369115322966

train RMSE: 304.2621516245557
test RMSE: 330.82651264691464

train MAE: 105.21655130504932
test MAE: 107.03556963017107


In [21]:
# Coefficient table for this fit, ranked by absolute size

lasso_coefs = pd.DataFrame({'coef': lasso_model.coef_}, index=X_train.columns)
lasso_coefs['abs_coef'] = lasso_coefs['coef'].abs()
lasso_coefs.sort_values(by='abs_coef', ascending=False)

Unnamed: 0,coef,abs_coef
review_scores_rating,-0.239697,0.239697
room_type_Private room,-0.203378,0.203378
bedrooms,0.189527,0.189527
room_type_Shared room,-0.187872,0.187872
accommodates,0.180149,0.180149
"neighbourhood_cleansed_Downtown, Chinatown, Penn Quarters, Mount Vernon Square, North Capitol Street",0.109824,0.109824
"neighbourhood_cleansed_Capitol Hill, Lincoln Park",0.101238,0.101238
"neighbourhood_cleansed_Shaw, Logan Circle",0.091110,0.091110
bathrooms,0.090239,0.090239
"neighbourhood_cleansed_Union Station, Stanton Park, Kingman Park",0.086829,0.086829


In [22]:
# Features dropped by the lasso (coefficient exactly zero)

lasso_coefs.loc[lasso_coefs['abs_coef'].eq(0)]

Unnamed: 0,coef,abs_coef
"neighbourhood_cleansed_Friendship Heights, American University Park, Tenleytown",-0.0,0.0


### Data preprocessing for model 04

In [23]:
# Rebuild the design matrix and target from the current `features` list
X = dflist.loc[:, features].dropna()
y = X.loc[:, 'log_price']

# One-hot encode the categorical predictors, dropping the first level of each
X = pd.get_dummies(
    X.drop('log_price', axis=1),
    columns=features_cat,
    drop_first=True,
)
X.head()

Unnamed: 0,calculated_host_listings_count,accommodates,bathrooms,bedrooms,beds,availability_365,number_of_reviews,review_scores_rating,year_2015,year_2018,...,property_type_Serviced apartment,property_type_Townhouse,property_type_other,room_type_Private room,room_type_Shared room,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,18,4,1.0,1.0,2.0,283,0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6,3.0,3.0,3.0,343,65,94.0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,1,2,1.0,1.0,1.0,365,0,0.0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,1,4,1.0,1.0,1.0,351,0,0.0,1,0,...,0,1,0,0,0,0,0,0,0,0
5,1,4,1.0,2.0,4.0,361,0,0.0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [24]:
# Seeded hold-out split (scikit-learn's default test size)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [25]:
# to reduce total number of polynomial features:
# include interaction terms only (not squared terms)
poly = PolynomialFeatures(interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
# The test split is only transformed: a transformer must be fitted on the
# training data alone and never refitted on held-out rows.
X_test_poly = poly.transform(X_test)

In [26]:
# Standardize the expanded features using statistics from the training rows only
ss = StandardScaler().fit(X_train_poly)
X_train_sc = ss.transform(X_train_poly)
X_test_sc = ss.transform(X_test_poly)

### Model 04: Lasso (with polynomial features)

In [27]:
# Lasso on the interaction features; alpha is picked by 3-fold cross-validation
lasso_model = LassoCV(cv=3).fit(X_train_sc, y_train)

# alpha selected by cross-validation
lasso_model.alpha_



0.00041038509504750636

In [28]:
# Train/test metrics for the lasso model with interaction features
evaluate_model(
    lasso_model,
    X_train=X_train_sc,
    y_train=y_train,
    X_test=X_test_sc,
    y_test=y_test,
)

train R^2: 0.6557417109384336
test R^2: 0.6451750458361454

train RMSE: 268.70595444503414
test RMSE: 297.2311919155132

train MAE: 92.70622024405951
test MAE: 95.99006562887536


In [29]:
# look at coefficients

# PolynomialFeatures.get_feature_names() was replaced by get_feature_names_out()
# in newer scikit-learn releases; use whichever this installation provides so
# the cell runs on both old and new versions.
feature_namer = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names

# One row per expanded feature: its name (column 0), coefficient and magnitude
lasso_coefs = pd.DataFrame(feature_namer(X_train.columns))
lasso_coefs['coef'] = lasso_model.coef_
lasso_coefs['abs_coef'] = np.abs(lasso_coefs['coef'])
lasso_coefs.sort_values('abs_coef', ascending=False)

Unnamed: 0,0,coef,abs_coef
2,accommodates,0.447982,0.447982
8,review_scores_rating,-0.229277,0.229277
167,accommodates review_scores_rating,-0.216619,0.216619
7,number_of_reviews,-0.182784,0.182784
75,room_type_Private room,-0.158738,0.158738
3,bathrooms,0.156852,0.156852
162,accommodates bathrooms,-0.147533,0.147533
626,review_scores_rating host_is_superhost_t,0.145233,0.145233
547,number_of_reviews review_scores_rating,0.141494,0.141494
14,host_is_superhost_t,-0.138282,0.138282


In [30]:
# Expanded features whose coefficient is exactly zero

lasso_coefs.loc[lasso_coefs['abs_coef'].eq(0)]

Unnamed: 0,0,coef,abs_coef
0,1,0.0,0.0
12,q_3,-0.0,0.0
15,"neighbourhood_cleansed_Brookland, Brentwood, L...",0.0,0.0
19,"neighbourhood_cleansed_Cleveland Park, Woodley...",0.0,0.0
25,"neighbourhood_cleansed_Edgewood, Bloomingdale,...",0.0,0.0
27,"neighbourhood_cleansed_Georgetown, Burleith/Hi...",0.0,0.0
29,neighbourhood_cleansed_Historic Anacostia,-0.0,0.0
35,"neighbourhood_cleansed_North Cleveland Park, F...",-0.0,0.0
36,"neighbourhood_cleansed_North Michigan Park, Mi...",-0.0,0.0
38,neighbourhood_cleansed_Southwest Employment Ar...,0.0,0.0


### Fit final model and create predictions

In [31]:
# Full design matrix for the final fit: complete rows only, target split off,
# categorical predictors one-hot encoded (first level of each dropped)
X = dflist.loc[:, features].dropna()
y = X.loc[:, 'log_price']
X = pd.get_dummies(
    X.drop('log_price', axis=1),
    columns=features_cat,
    drop_first=True,
)
X.shape

(80649, 81)

In [32]:
# Keep the expansion manageable: pairwise interaction terms only,
# no squared terms
poly = PolynomialFeatures(interaction_only=True).fit(X)
X_poly = poly.transform(X)

In [33]:
# Standardize the expanded features (here the scaler is fitted on all rows)
ss = StandardScaler().fit(X_poly)
X_sc = ss.transform(X_poly)

In [34]:
# Final lasso fit on every row; alpha is picked by 3-fold cross-validation
lasso_model = LassoCV(cv=3).fit(X_sc, y)

# alpha selected by cross-validation
lasso_model.alpha_



0.0010870747187579346

In [35]:
# In-sample metrics: the model is scored on the same rows it was fitted on.
# RMSE / MAE are computed after np.exp() on both the target and the prediction.
y_pred = lasso_model.predict(X_sc)
mse = mean_squared_error(np.exp(y), np.exp(y_pred))
rmse = np.sqrt(mse)
mae = mean_absolute_error(np.exp(y), np.exp(y_pred))

# R-squared (coefficient of determination), RMSE and MAE
print('R^2:', lasso_model.score(X_sc, y))
print('RMSE:', rmse)
print('MAE:', mae)

R^2: 0.6454394003587867
RMSE: 278.54784339596824
MAE: 94.85770012771434
