In [22]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer, normalize
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Others
from sklearn_pandas import DataFrameMapper
import statsmodels.api as sm

In [28]:
# Globals
# Shared random seed; passed as random_state to train_test_split further down
# so the train/test partitions are reproducible across runs.
SEED = 42

In [2]:
# Load the room listings as a single pipeline: drop duplicate rows, cast the
# superhost flag to int, fill missing ratings with the mean rating of the
# de-duplicated rows, then index the frame by room id.
rooms = (
    pd.read_csv('../data/room_data.csv')
    .drop_duplicates()
    .astype({'isSuperhost': int})
    .assign(avgRating=lambda df: df['avgRating'].fillna(df['avgRating'].mean()))
    .set_index('id')
)
rooms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2435 entries, 18669058 to 16214536
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cleaning_fee    2435 non-null   int64  
 1   price           2435 non-null   int64  
 2   bedrooms        2428 non-null   float64
 3   beds            2415 non-null   float64
 4   bathrooms       2435 non-null   float64
 5   personCapacity  2435 non-null   int64  
 6   reviewsCount    2435 non-null   int64  
 7   isSuperhost     2435 non-null   int64  
 8   avgRating       2435 non-null   float64
 9   lat             2435 non-null   float64
 10  lng             2435 non-null   float64
dtypes: float64(6), int64(5)
memory usage: 228.3 KB


In [3]:
# Preview the first rows of the cleaned rooms frame as a sanity check.
rooms.head()

Unnamed: 0_level_0,cleaning_fee,price,bedrooms,beds,bathrooms,personCapacity,reviewsCount,isSuperhost,avgRating,lat,lng
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
18669058,5,10,1.0,1.0,1.0,2,82,0,4.71,4.6503,-74.06013
32308869,12,12,1.0,1.0,1.0,2,52,0,4.5,4.64668,-74.07234
35159913,5,12,1.0,1.0,1.0,2,8,0,5.0,4.58942,-74.07299
37721845,11,11,1.0,1.0,1.0,2,20,0,4.7,4.63778,-74.06339
35522458,8,12,1.0,1.0,1.0,2,18,0,4.94,4.64031,-74.06816


In [4]:
# Read in amenities data from a JSON object.
# The file is decoded explicitly as UTF-8 instead of relying on the platform
# default encoding, so non-ASCII amenity names load the same on every OS.
with open('../data/amenities_strings.json', 'r', encoding='utf-8') as f:
    amenities_dict = json.load(f)

In [5]:
# Convert each entry in amenities_dict to one space-separated document.
# Inside an amenity name, every run of whitespace or of other non-word
# characters is replaced with an underscore, so the only spaces left in a
# document are the ones separating amenities.
separator_run = re.compile(r'(\s+|\W+)')  # compiled once, reused for every name

corpus = []
key_list = []
for key, word_list in amenities_dict.items():
    # int() raises on a non-numeric room id; the error is left to propagate
    # as-is (the previous try/except only re-raised it).
    key_list.append(int(key))
    corpus.append(' '.join(separator_run.sub('_', doc) for doc in word_list))

In [6]:
# Fit a CountVectorizer on the amenity documents and preview the vocabulary.
cv = CountVectorizer()
amenities_vec = cv.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back on older installs so the
# cell runs on either.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Plain str so the preview renders the same for list and ndarray results.
[str(name) for name in feature_names[:5]]

['air_conditioning',
 'baby_bath',
 'baby_monitor',
 'babysitter_recommendations',
 'baking_sheet']

In [7]:
# Convert the transformed CountVectorizer data to a DataFrame with room ids as
# the index.
# Densify only while amenities_vec is still a sparse matrix: after the first
# run the name refers to an ndarray (which has no .toarray()), so the guard
# keeps this cell safe to re-run.
if hasattr(amenities_vec, 'toarray'):
    amenities_vec = amenities_vec.toarray()
amenities = pd.DataFrame(amenities_vec, index=key_list)
amenities.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,80
6922245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
31137797,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
31440906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
33579033,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,0,1,0
7200800,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0


In [8]:
# Sanity check: for every room id, the number of amenity strings in the input
# must equal the total of that room's row in the vectorized amenities frame.
for room_id, amenity_names in amenities_dict.items():
    row_total = amenities.loc[int(room_id)].sum()
    assert row_total == len(amenity_names)

In [19]:
# Compare correlation values for price and log of the price with the features,
# before and after taking the log of the features.
# price_r and log_price_r are the correlations with the untransformed features.
# price_l and log_price_l are the correlations with the log1p of the features.

# Derive log_price here so the cell works on a fresh kernel. It is the natural
# log of price, matching the log_price values that rooms.head() displays
# further down. The guard leaves an already-present column untouched.
if 'log_price' not in rooms.columns:
    rooms['log_price'] = np.log(rooms['price'])

fields_to_compare = ['price', 'log_price',  'bedrooms', 'beds', 'bathrooms','personCapacity',
                     'reviewsCount', 'isSuperhost', 'avgRating', 'cleaning_fee']
log_rooms = rooms[fields_to_compare].copy()
features_to_log = ['bedrooms', 'beds', 'bathrooms', 'personCapacity', 'cleaning_fee']
for feature in features_to_log:
    # log1p rather than log so zero-valued entries stay finite
    log_rooms[feature] = np.log1p(log_rooms[feature])
r = rooms[fields_to_compare].corr()
l = log_rooms.corr()
features = ['price', 'log_price']
# Side-by-side table: *_r columns from the raw frame, *_l from the logged one.
r[features].merge(l[features], left_index=True, right_index=True, suffixes=('_r', '_l'))

Unnamed: 0,price_r,log_price_r,price_l,log_price_l
price,1.0,0.937462,1.0,0.937462
log_price,0.937462,1.0,0.937462,1.0
bedrooms,0.291814,0.272612,0.310533,0.305958
beds,0.241047,0.235329,0.267179,0.266311
bathrooms,0.406953,0.410623,0.450905,0.469571
personCapacity,0.295645,0.286233,0.304884,0.310077
reviewsCount,-0.088857,-0.084103,-0.088857,-0.084103
isSuperhost,-0.06127,-0.0467,-0.06127,-0.0467
avgRating,0.070585,0.076367,0.070585,0.076367
cleaning_fee,0.212214,0.21342,0.015984,0.027427


**Based on the correlation values above, price will be fit with the following features:**  
log of bedrooms  
log of beds  
log of bathrooms  
log of personCapacity  
reviewsCount - low correlation so likely noise  
isSuperhost - low correlation so likely noise  
avgRating - low correlation so likely noise  
cleaning_fee  

In [None]:
# Hold out a test set. train_test_split returns two pieces per array passed
# in, so features and target are passed separately to get the four results
# unpacked here (passing only `rooms` yields two and the unpacking fails).
# The target, its log, and the coordinates are excluded from the features,
# mirroring the feature frames built further down; errors='ignore' tolerates
# log_price not having been added yet. SEED makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    rooms.drop(columns=['price', 'log_price', 'lat', 'lng'], errors='ignore'),
    rooms['price'],
    random_state=SEED,
)

In [23]:
# Preview rooms again; log_price is expected to appear alongside the raw
# columns by this point.
rooms.head()

Unnamed: 0_level_0,cleaning_fee,price,bedrooms,beds,bathrooms,personCapacity,reviewsCount,isSuperhost,avgRating,lat,lng,log_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
18669058,5,10,1.0,1.0,1.0,2,82,0,4.71,4.6503,-74.06013,2.302585
32308869,12,12,1.0,1.0,1.0,2,52,0,4.5,4.64668,-74.07234,2.484907
35159913,5,12,1.0,1.0,1.0,2,8,0,5.0,4.58942,-74.07299,2.484907
37721845,11,11,1.0,1.0,1.0,2,20,0,4.7,4.63778,-74.06339,2.397895
35522458,8,12,1.0,1.0,1.0,2,18,0,4.94,4.64031,-74.06816,2.484907


In [27]:
# Build the two feature/target pairs used for train_test_split.
#   X_r, y_r: the original feature values, kept as a control.
#   X_l, y_l: the same features, with the columns named in `logs` replaced by
#             their log1p.
# Both pairs share the same target (price) and exclude the target, its log,
# and the coordinates from the features.
logs = ['bedrooms', 'beds', 'bathrooms', 'personCapacity']
excluded_columns = ['price', 'log_price', 'lat', 'lng']

y_r = rooms['price']
y_l = rooms['price']

X_r = rooms.drop(columns=excluded_columns)
X_l = X_r.copy()
X_l[logs] = np.log1p(X_l[logs])

In [35]:
# Split both feature/target pairs with a 20% test share. Each split is
# stratified on floor(log2(price)) of its own target, and SEED fixes the
# shuffle so the partitions are reproducible.
split_options = {'test_size': 0.2, 'random_state': SEED}

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_r, y_r, stratify=np.floor(np.log2(y_r)), **split_options
)
Xl_train, Xl_test, yl_train, yl_test = train_test_split(
    X_l, y_l, stratify=np.floor(np.log2(y_l)), **split_options
)

id
18669058    3.0
32308869    3.0
35159913    3.0
37721845    3.0
35522458    3.0
           ... 
16052355    6.0
17890512    6.0
17420956    6.0
22904699    7.0
16214536    6.0
Name: price, Length: 2435, dtype: float64