# Imports

In [1]:
#Enable matplotlib to display in jupyter notebook & import it
%matplotlib inline

import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim #used in filling missing zipcodes
import math


# Reading Files

In [2]:
listings = pd.read_pickle('data/listings_cleaned.pkl')
calendar = pd.read_pickle('data/calendar_cleaned.pkl')


# Creating Quarterly Revenue

In [3]:
quarter_dates = ['2016-09-06','2016-12-06','2017-03-06','2017-06-06','2017-09-06']

quarters = [calendar[calendar.date.isin(pd.date_range(quarter_dates[n], quarter_dates[n+1]))] for n in range(4)]
quarters_revenue = pd.DataFrame(index = pd.Index(calendar.listing_id.unique(), name = 'listing_id'))
quarters_revenue = quarters_revenue.join(listings['host_id'])
for n,q in enumerate(quarters):
    quarters_revenue = (quarters_revenue.join(pd.DataFrame(quarters[n].groupby('listing_id').day_revenue.sum()))).rename(columns={'day_revenue':'q'+str(n+1)+'_revenue'})



In [4]:
amenities = list(set([item for item_list in listings.amenities for item in item_list]))
for val in ['translation missing enhosting_amenity_49','translation missing enhosting_amenity_50', '']:
    amenities.remove(val)

amn_frame = pd.DataFrame(index = listings.index)

#create the dummy for each amenity and rename the column as you go
#for amn in amenities:
#    amn_frame = amn_frame.join(listings.amenities.apply(lambda amns: amn in amns)).rename(columns={'amenities':amn})


# CLEANED REGRESSION OUTPUTS

In [5]:
#IMPORTS
import numpy as np
import math
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy import stats


listings['analysis_table'] = listings.index
listings['analysis_table'] = pd.DataFrame(listings.analysis_table.map(lambda x: amn_frame.loc[x]))

    
#listings['analysis_table'] = listings.index
#listings['analysis_table'] = pd.DataFrame(listings.analysis_table.map(lambda x: amn_frame.loc[x]))
analysis_table = pd.merge(listings, amn_frame, left_index=True, right_index=True)

#Trying to normalize ratings for each neighborhood
#ratings_table = analysis_table.groupby('neighbourhood_cleansed')['review_scores_rating'].agg([np.mean,np.std]).reset_index()
#analysis_table = pd.merge(analysis_table, ratings_table, left_on = 'neighbourhood_cleansed', right_on = 'neighbourhood_cleansed').rename(columns = {'mean':'review_scores_mean','std':'standard_deviation'})
#analysis_table['reviews_neighborhood_zscore'] = (analysis_table.review_scores_rating - analysis_table.review_scores_mean)/analysis_table.standard_deviation




In [6]:
analysis_table = pd.merge(analysis_table, pd.get_dummies(analysis_table.room_type), left_index = True, right_index = True)
#analysis_table = pd.merge(analysis_table, pd.get_dummies(analysis_table.neighbourhood_cleansed), left_index = True, right_index = True)
analysis_table = pd.merge(analysis_table, pd.get_dummies(analysis_table.property_type), left_index = True, right_index = True)
#analysis_table = pd.merge(analysis_table, pd.get_dummies(analysis_table.zipcode), left_index = True, right_index = True)

In [7]:
#Dummy Variables are intended to be n-1 #zipcode: '02472', 'Hyde Park'
cols_to_remove = ['host_id','zipcode','neighbourhood_cleansed', 'latitude', 'longitude', 'amenities', 'first_review', 'room_type','property_type', 'listed_price', 'Carbon Monoxide Detector', 'Entire home/apt','Apartment']
analysis_table['cleaning_fee'] = (analysis_table.cleaning_fee > 0)
#(pd.DataFrame(analysis_table.columns)).to_csv('analysis_table_columns.csv')
for i in cols_to_remove:
    if i in analysis_table.columns:
        del analysis_table[i]

In [13]:
combined_data = quarters_revenue.join(analysis_table, how='left',lsuffix='', rsuffix='_lis')
#combined_data = combined_data[combined_data['reviews_per_month'] > 3]
#CHANGED THIS FOR ANALYSIS
combined_data = combined_data[(combined_data['calculated_host_listings_count'] > 2)]# & (combined_data['reviews_per_month'] > 2.0)]
#plt.scatter(combined_data['calculated_host_listings_count'], combined_data['reviews_per_month'])

analysis_data = pd.DataFrame(combined_data)#, columns=['q1_revenue','number_of_reviews','number_of_reviews', 'review_scores_rating','review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin','review_scores_communication', 'review_scores_location','review_scores_value'])
analysis_data.dropna(inplace=True)

#analysis_data_y = (analysis_data['q1_revenue'].apply(lambda x: math.log(x + 1))).copy()
analysis_data['review_scores_rating_zscore'] = stats.zscore(analysis_data.review_scores_rating)
analysis_data_y = analysis_data['review_scores_rating_zscore'].copy()

#withholding 'number_of_reviews_'
analysis_data_x = analysis_data.copy()
del analysis_data_x['q1_revenue']
del analysis_data_x['q2_revenue']
del analysis_data_x['q3_revenue']
del analysis_data_x['q4_revenue']

#USED FOR ESTIMATING REVIEW_SCORES_RATING
del analysis_data_x['review_scores_rating']
del analysis_data_x['review_scores_rating_zscore']
#del analysis_data_x['review_scores_accuracy']
#del analysis_data_x['review_scores_cleanliness']
#del analysis_data_x['review_scores_checkin']
#del analysis_data_x['review_scores_communication']
#del analysis_data_x['review_scores_location']
#del analysis_data_x['review_scores_value']

Unnamed: 0_level_0,review_scores_rating_zscore
listing_id,Unnamed: 1_level_1
13865867,0.611685
10706413,0.127468
1964878,0.224311
9841140,0.708529
8789821,0.999060
3278624,-0.259907
12979965,0.321155
13924495,0.999060
11987762,-4.811554
7853971,0.321155


In [15]:
#Divided the review_scores_rating by 100 to normalize
#analysis_data_x['review_scores_rating'] = analysis_data_x['review_scores_rating'] / 10
#analysis_data['q1_revenue'] = analysis_data['q1_revenue'].apply(lambda x: math.log(x+1))


#Propensity for columns to replicate- this deals with that issue
analysis_data_x = analysis_data_x.loc[:,~analysis_data_x.columns.duplicated()]

reg = linear_model.LinearRegression()

#partition the data into training and test sets
x_train,x_test,y_train,y_test = train_test_split(analysis_data_x, analysis_data_y, test_size = 0.2, random_state = 4)

output = reg.fit(x_train,y_train)
output.coef_
predicted_answers = reg.predict(x_test)
predicted_answers[0]
y_test.iloc[2]
#mean-squared error
np.mean((predicted_answers-y_test)**2)

ValueError: setting an array element with a sequence.

In [None]:
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('analysis_data_y ~  review_scores_location  +  review_scores_checkin  + review_scores_cleanliness + review_scores_communication + review_scores_value + review_scores_accuracy', data=analysis_data).fit()

# Inspect the results
print(results.summary())

In [None]:
import matplotlib.pylab as plt
from sklearn.linear_model import LassoLarsCV

#lasso regression with 8 cross-validations selected
model=LassoLarsCV(cv=8, precompute=False).fit(x_train,y_train)

#this code is intended to extract the coefficients and their names and then to place them into a table
coefficients = dict(zip(x_train.columns, model.coef_))
coefficients_results = pd.DataFrame([(key, val) for (key, val) in coefficients.items()], 
                                    columns = ['coefficient_name','estimated_value'])

#plots coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
 label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Cofficients Progression for Lasso Paths')

#plots mean square error for each fold of the cross-validation
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
 label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
 label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean Squared Error')
plt.title('Mean Squared Error on each fold')

# MSE from training and test data
train_error = mean_squared_error(y_train, model.predict(x_train))
test_error = mean_squared_error(y_test, model.predict(x_test))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)

# R-square from training and test data
rsquared_train=model.score(x_train,y_train)
rsquared_test=model.score(x_test,y_test)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
print('Features Most Significant')
coefficients_results[coefficients_results['estimated_value'] != 0].sort_values('estimated_value', ascending = False)
#ASSISTANCE FROM: https://cognosworld.wordpress.com/2016/02/15/machine-learning-lasso-regression-using-python/

In [None]:
#analysis_data['review_scores_rating_zscore'].plot(kind='hist')
?plt.plot()

In [None]:
#revenue = analysis_data.q1_revenue.apply(lambda x: math.log(x+1))
plt.scatter(analysis_data.review_scores_rating_zscore, analysis_data.q1_revenue)#analysis_data.q1_revenue)
plt.xlabel('Overall Review Scores Rating (Z-Score)')
plt.ylabel('Quarter 1 Revenue Projection')
plt.title('Overall Ratings (Z-Score) vs. Q1 Revenue')
plt.ylim([0,50000])
plt.xlim([-8,1])



In [None]:
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('q1_revenue ~ review_scores_rating_zscore ', data=analysis_data).fit()

# Inspect the results
print(results.summary())
analysis_data.q1_revenue.median()

In [None]:
#analysis_data_x[analysis_data_x['Wheelchair Accessible']]['Wheelchair Accessible'].shape
combined_data.q1_revenue.plot(kind='hist', bins = 100)

comb_mean = combined_data.q1_revenue.mean()
comb_med = combined_data.q1_revenue.median()
comb_max = combined_data.q1_revenue.max()
y_max = 20
plt.ylim([0,y_max])
plt.vlines(comb_mean, 0, y_max, color='magenta', label = 'mean revenue')
plt.vlines(comb_med, 0, y_max, color='green', label = 'median revenue')
plt.vlines(comb_max, 0, y_max, color = 'red',label='max revenue')
plt.xlabel('Estimated Q1 Revenue ($)')
plt.title('Distribution of Q1 Revenues for Commercial Properties with >2 Reviews Per Month')
plt.legend(loc='upper right')



In [None]:
#-np.log10(model.cv_alphas_)
listings.columns