In [11]:
#Enable matplotlib to display in jupyter notebook & import it
%matplotlib inline

import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import scipy
import statsmodels.formula.api as smf
import re


In [12]:
listings = pd.read_pickle('data/listings_cleaned.pkl')
calendar = pd.read_pickle('data/calendar_cleaned.pkl')

In [13]:
####################################################################################
#################################### Cleaning ######################################
####################################################################################

# listings.host_since = pd.to_datetime(listings.host_since)
# listings.first_review = pd.to_datetime(listings.first_review)
# listings.last_review = pd.to_datetime(listings.last_review)

# listings.price = listings.price.replace('[^0-9.]+','',regex=True).astype(float)
# listings.weekly_price = listings.weekly_price.replace('[^0-9.]+','',regex=True).astype(float)
# listings.monthly_price = listings.monthly_price.replace('[^0-9.]+','',regex=True).astype(float)
# listings.cleaning_fee = listings.cleaning_fee.replace('[^0-9.]+','',regex=True).astype(float)
# listings.cleaning_fee.fillna(0, inplace = True)
# listings.cleaning_fee_perc = listings.cleaning_fee / listings.price

#Audience Segmentation
listings['user_type'] = listings.calculated_host_listings_count.apply(lambda x:  'Sharers' if x <= 2 else 'Businesses')
# listings['user_type'][listings.calculated_host_listings_count > 2] = "Businesses"
# listings['user_type'][listings.calculated_host_listings_count <= 2] = "Sharers"

In [7]:
listings['user_type'].value_counts()

Sharers       2234
Businesses    1351
Name: user_type, dtype: int64

In [8]:
#Important KPI metrics have ~20% Nas
kpi = ['reviews_per_month', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location']
Review_Score_NA = pd.DataFrame([[x, listings[x].isnull().sum()/len(listings[x])] for x in kpi])
Review_Score_NA.columns =['review_score_metrics', '%_of_column_NA']
Review_Score_NA

Unnamed: 0,review_score_metrics,%_of_column_NA
0,reviews_per_month,0.210879
1,review_scores_rating,0.226778
2,review_scores_accuracy,0.229568
3,review_scores_cleanliness,0.228173
4,review_scores_checkin,0.228731
5,review_scores_communication,0.228173
6,review_scores_location,0.229289


In [9]:
####################################################################################
###################################### Dumify ######################################
####################################################################################

# Amenities
listings['amenities'] = listings['amenities'].map(
    lambda amns: "|".join([amn.replace("}", "").replace("{", "").replace('"', "")\
                           for amn in amns.split(",")]))
                           
amenities = np.unique(np.concatenate(listings['amenities'].map(lambda amns: amns.split("|"))))[1:]
amenity_arr = np.array([listings['amenities'].map(lambda amns: amn in amns) for amn in amenities[0:43]])

final_dummies = pd.DataFrame()
for i in ['instant_bookable', 'cancellation_policy', 'property_type', 'room_type']:
    df = pd.concat([listings.id, pd.get_dummies(listings[i])], axis = 1)
    final_dummies = pd.concat([final_dummies, df], axis = 1)
final_dummies = final_dummies.T.drop_duplicates().T
final_dummies.columns = ['id', 'instant_bookable_f', 'instant_bookable_t', \
'cancel_pol_flexible', 'cancel_pol_moderate','cancel_pol_strict', \
'cancel_pol_super_strict_30', 'Prop_type_Apartment', 'Prop_type_Bed_&_Breakfast', \
'Prop_type_Boat', 'Prop_type_Camper/RV', 'Prop_type_Condominium', 'Prop_type_Dorm', \
'Prop_type_Entire_Floor', 'Prop_type_Guesthouse', 'Prop_type_House', \
'Prop_type_Loft', 'Prop_type_Other', 'Prop_type_Townhouse', 'Prop_type_Villa', \
'Room_type_Entire_home/apt', 'Room_type_Private room', 'Room_type_Shared_room']

####################################################################################
###################################### Merging #####################################
####################################################################################

#Features is the generic base dataframe, Features2, and Features_3 are modified dfs
#for stats and analysis

#Select specific listing columns
listingColumns = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'cleaning_fee', 'guests_included', \
'minimum_nights', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', \
'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'user_type']
listings[listingColumns]

#Turning Amenities dummies to a dataframe to concatenate
amenitiesDummies = pd.DataFrame(data=amenity_arr.T, columns=amenities[0:43])

features = pd.concat([listings[listingColumns], final_dummies, amenitiesDummies], axis=1)
features.drop('id', 1, inplace = True)
features.T.drop_duplicates().T
features.ix[:, 40:83] = features.ix[:, 40:83].astype(int)
features.columns = [i.replace(" ", "_") for i in features.columns]
features.head()

AttributeError: 'list' object has no attribute 'split'

In [43]:
features.user_type.value_counts()

Sharers       2234
Businesses    1351
Name: user_type, dtype: int64

In [46]:
####################################################################################
################################# Audience Analysis ################################
####################################################################################

listingColumns2 = ['neighbourhood_cleansed','accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'cleaning_fee', \
'number_of_reviews', 'instant_bookable', 'property_type', 'room_type', 'review_scores_rating', \
'cancellation_policy', 'review_scores_value']
listings[listingColumns2].columns

features2 = pd.concat([listings[listingColumns2], amenitiesDummies], axis = 1)
features2.ix[:, 14:57] = features2.ix[:, 14:57].astype(int)
features2.price = features2.price.replace('[^0-9.]+','',regex=True).astype(float)
features2.cleaning_fee = features2.cleaning_fee.replace('[^0-9.]+','',regex=True).astype(float)
features2.columns = [i.replace(" ", "_").replace("/","_").replace("(", "").replace(")", "").replace("__", "_").replace("&", "_").replace("-", "_").replace("24", "TwentyFour") for i in features2.columns]

features['user_count'] = 1
featureMean = features.groupby('user_type', as_index = False)['accommodates', 'bathrooms', 'bedrooms', 'beds', 'price',
       'cleaning_fee', 'guests_included', 'minimum_nights',
       'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value'].mean().round(2)

featureSum = features.groupby('user_type', as_index = False)['user_count','instant_bookable_f', 'instant_bookable_t', 'cancel_pol_flexible',
       'cancel_pol_moderate', 'cancel_pol_strict',
       'cancel_pol_super_strict_30', 'Prop_type_Apartment',
       'Prop_type_Bed_&_Breakfast', 'Prop_type_Boat', 'Prop_type_Camper/RV',
       'Prop_type_Condominium', 'Prop_type_Dorm', 'Prop_type_Entire_Floor',
       'Prop_type_Guesthouse', 'Prop_type_House', 'Prop_type_Loft',
       'Prop_type_Other', 'Prop_type_Townhouse', 'Prop_type_Villa',
       'Room_type_Entire_home/apt', 'Room_type_Private_room',
       'Room_type_Shared_room', '24-Hour_Check-in', 'Air_Conditioning',
       'Breakfast', 'Buzzer/Wireless_Intercom', 'Cable_TV',
       'Carbon_Monoxide_Detector', 'Cat(s)', 'Dog(s)', 'Doorman', 'Dryer',
       'Elevator_in_Building', 'Essentials', 'Family/Kid_Friendly',
       'Fire_Extinguisher', 'First_Aid_Kit', 'Free_Parking_on_Premises',
       'Free_Parking_on_Street', 'Gym', 'Hair_Dryer', 'Hangers', 'Heating',
       'Hot_Tub', 'Indoor_Fireplace', 'Internet', 'Iron', 'Kitchen',
       'Laptop_Friendly_Workspace', 'Lock_on_Bedroom_Door', 'Other_pet(s)',
       'Paid_Parking_Off_Premises', 'Pets_Allowed',
       'Pets_live_on_this_property', 'Pool', 'Safety_Card', 'Shampoo',
       'Smoke_Detector', 'Smoking_Allowed', 'Suitable_for_Events', 'TV',
       'Washer', 'Washer_/_Dryer', 'Wheelchair_Accessible',
       'Wireless_Internet'].sum().round(2)

#For the amenity dummies, divide by each audience by user counts
featureSum.ix[2, :] = featureSum.sum()
featureSum.ix[0, 2:] = featureSum.ix[0, 2:]/featureSum.ix[0, 1]
featureSum.ix[1, 2:] = featureSum.ix[1, 2:]/featureSum.ix[1, 1]

user_summary = pd.concat([featureMean,featureSum], axis = 1).round(2).T
user_summary

Unnamed: 0,0,1,2
user_type,Businesses,Sharers,
accommodates,3.29,2.89,
bathrooms,1.33,1.16,
bedrooms,1.3,1.23,
beds,1.69,1.56,
price,185.66,166.83,
cleaning_fee,69.99,33.52,
guests_included,1.41,1.44,
minimum_nights,4.01,2.67,
number_of_reviews,21.13,17.78,


In [16]:
####################################################################################
####################################### Stats ######################################
####################################################################################

import scipy
import statsmodels.formula.api as smf

#Multicollinearity
features3 = features.drop(features[['reviews_per_month', 'user_type', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value']], axis = 1)
features3.columns = [i.replace("/","_").replace("(", "").replace(")", "").replace("__", "_").replace("&", "_").replace("-", "_").replace("24", "TwentyFour") for i in features3.columns]

CorrCoef = pd.DataFrame({i: scipy.stats.pearsonr(features3[i], features3['review_scores_rating']) for i in features3.columns}).T
CorrCoef.columns = ['Correlation_Coefficient', '2-Tailed-P_Value']
CorrCoef.sort_values(['Correlation_Coefficient'], ascending = False, inplace = True)
CorrCoef

#From the looks of it, there doesn't appear to be multicollinearity as the max coef is ~.15. 
#We're safe to go ahead and forward select

Unnamed: 0,Correlation_Coefficient,2-Tailed-P_Value
review_scores_rating,1.000000,0.000000e+00
Essentials,0.153612,6.514051e-16
Air_Conditioning,0.144154,3.552423e-14
Shampoo,0.135918,9.377090e-13
Smoke_Detector,0.131174,5.660978e-12
Fire_Extinguisher,0.131147,5.718904e-12
Iron,0.129737,9.635219e-12
Wireless_Internet,0.123447,9.234492e-11
Dryer,0.118991,4.281449e-10
Carbon_Monoxide_Detector,0.118484,5.080909e-10


In [13]:
def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

model = forward_selected(features2, 'review_scores_rating')
print(model.model.formula)
print(model.rsquared_adj)

results = smf.ols('review_scores_rating ~ review_scores_value + price + Pets_live_on_this_property + accommodates + Wireless_Internet + Kitchen + cleaning_fee + instant_bookable + Doorman + bathrooms + neighbourhood_cleansed + cancellation_policy + room_type + 1', data=features2).fit()
print(results.summary())

review_scores_rating ~ review_scores_value + Air_Conditioning + price + Dryer + Pets_live_on_this_property + Shampoo + accommodates + Wireless_Internet + Kitchen + cleaning_fee + Fire_Extinguisher + instant_bookable + Doorman + Pool + Essentials + bathrooms + Indoor_Fireplace + Family_Kid_Friendly + neighbourhood_cleansed + Laptop_Friendly_Workspace + cancellation_policy + Dogs + room_type + 1
0.640469039645
                             OLS Regression Results                             
Dep. Variable:     review_scores_rating   R-squared:                       0.637
Model:                              OLS   Adj. R-squared:                  0.632
Method:                   Least Squares   F-statistic:                     121.5
Date:                  Wed, 26 Apr 2017   Prob (F-statistic):               0.00
Time:                          15:45:29   Log-Likelihood:                -8667.8
No. Observations:                  2740   AIC:                         1.742e+04
Df Residuals:        