In [None]:
import pandas as pd
import seaborn as sns
import statsmodels.tsa.api as smt  
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import metrics
import requests
import io
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [None]:
Newyork = pd.read_csv("Newyork.csv")
Newyork.head(2)
pd.set_option('max_columns',1000)
pd.set_option('max_rows',1000)
np.set_printoptions(threshold=np.inf)
pd.set_option('display.width', 1000)

In [97]:
Newyork.shape

(48377, 106)

In [98]:
df = Newyork.copy()

In [99]:
df.shape

(48377, 106)

In [100]:
column_names = df.columns
print(column_names)

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview',
       ...
       'instant_bookable', 'is_business_travel_ready', 'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'reviews_per_month'], dtype='object', length=106)


In [101]:
# id - listing identifier that can be used to create a join with other files
# last_scraped - we will use it to calculate reviews_per_month
# listing_url - interesting if we want to analyse the pictures as well but out of scope otherwise
# scrape_id - same for all the records
# name - textual description already extracted as continous variables in other columns
# summary - as above
# space - as above
# description - as above
# experiences_offered - contains only none value
# neighborhood_overview - requires lot of preprocessing to turn into useful a feature
# notes - requires lot of preprocessing to turn into useful a feature
# transit - requires lot of preprocessing to turn into useful a feature
# access - requires lot of preprocessing to turn into useful a feature
# interaction - requires lot of preprocessing to turn into useful a feature
# house_rules - requires lot of preprocessing to turn into useful a feature
# thumbnail_url - contains no values
# medium_url - contains no values
# picture_url - interesting if we want to analyse the pictures as well but out of scope otherwise
# xl_picture_url - contains no values
# host_id - id that is not used anywhere else

In [102]:
df.drop('listing_url', inplace=True, axis=1) # dropping as it is not usable
df.drop('scrape_id', inplace=True, axis=1) # dropping as it is not usable
df.drop('name',inplace=True, axis=1) # dropping as it is not usable
df.drop('summary',inplace=True, axis=1) # dropping as it is not usable
df.drop('description',inplace=True, axis=1) # dropping as it is not usable
df.drop('experiences_offered',inplace=True, axis=1) # dropping as it is not usable
df.drop('neighborhood_overview',inplace=True, axis=1) # dropping as it is not usable
df.drop('notes',inplace=True, axis=1) # dropping as it is not usable
df.drop('access',inplace=True, axis=1) # dropping as it is not usable
df.drop('interaction',inplace=True, axis=1) # dropping as it is not usable
df.drop('house_rules',inplace=True, axis=1) # dropping as it is not usable
df.drop('thumbnail_url',inplace=True, axis=1) # dropping as it is not usable
df.drop('medium_url',inplace=True, axis=1) # dropping as it is not usable
df.drop('picture_url',inplace=True, axis=1) # dropping as it is not usable
df.drop('xl_picture_url',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_id',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_location',inplace=True, axis=1) # dropping as it is not usable

In [103]:
df.head()

Unnamed: 0,id,last_scraped,space,transit,host_url,host_name,host_since,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3647,2019-09-13,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T...,,https://www.airbnb.com/users/show/4632,Elisabeth,2008-11-25,Make Up Artist National/ (Website hidden by Ai...,a few days or more,0%,,f,https://a0.muscache.com/im/users/4632/profile_...,https://a0.muscache.com/im/users/4632/profile_...,Harlem,1.0,1.0,"['email', 'phone', 'google', 'reviews', 'jumio...",t,t,"New York, NY, United States",Harlem,Harlem,Manhattan,New York,NY,10027,New York,"New York, NY",US,United States,40.80902,-73.9419,t,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",,$150.00,,,$200.00,$75.00,2,$20.00,3,7,3,3,7,7,3.0,7.0,37 months ago,t,30,60,90,365,2019-09-13,0,0,,,,,,,,,,f,,,f,f,strict_14_with_grace_period,t,t,1,0,1,0,
1,3831,2019-09-13,Greetings! We own a double-duplex brownst...,B52 bus for a 10-minute ride to downtown Brook...,https://www.airbnb.com/users/show/4869,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,within an hour,92%,,f,https://a0.muscache.com/im/users/4869/profile_...,https://a0.muscache.com/im/users/4869/profile_...,Clinton Hill,1.0,1.0,"['email', 'phone', 'reviews', 'kba']",t,t,"Brooklyn, NY, United States",Brooklyn,Clinton Hill,Brooklyn,Brooklyn,NY,11238,New York,"Brooklyn, NY",US,United States,40.68514,-73.95976,t,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",500.0,$89.00,$575.00,"$2,100.00",$500.00,,1,$0.00,1,730,1,1,730,730,1.0,730.0,3 days ago,t,1,3,8,192,2019-09-13,279,67,2014-09-30,2019-08-29,90.0,9.0,9.0,10.0,9.0,10.0,9.0,f,,,f,f,moderate,f,f,1,1,0,0,4.62
2,5022,2019-09-13,Loft apartment with high ceiling and wood floo...,,https://www.airbnb.com/users/show/7192,Laura,2009-01-29,I have been a NYer for almost 10 years. I came...,,,,f,https://a0.muscache.com/im/users/7192/profile_...,https://a0.muscache.com/im/users/7192/profile_...,East Harlem,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,t,"New York, NY, United States",East Harlem,East Harlem,Manhattan,New York,NY,10029,New York,"New York, NY",US,United States,40.79851,-73.94399,t,Apartment,Entire home/apt,1,1.0,,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,Elev...",,$80.00,$600.00,"$1,600.00",$100.00,$80.00,1,$20.00,10,120,10,10,120,120,10.0,120.0,6 months ago,t,0,0,0,0,2019-09-13,9,2,2012-03-20,2018-11-19,93.0,10.0,9.0,10.0,10.0,9.0,10.0,f,,,f,f,strict_14_with_grace_period,t,t,1,1,0,0,0.1
3,5099,2019-09-13,I have a large 1 bedroom apartment centrally l...,From the apartment is a 10 minute walk to Gran...,https://www.airbnb.com/users/show/7322,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",within a day,78%,,f,https://a0.muscache.com/im/pictures/user/26745...,https://a0.muscache.com/im/pictures/user/26745...,Flatiron District,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",t,f,"New York, NY, United States",Midtown East,Murray Hill,Manhattan,New York,NY,10016,New York,"New York, NY",US,United States,40.74767,-73.975,f,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",,$200.00,,,$300.00,$125.00,2,$100.00,3,21,3,3,21,21,3.0,21.0,today,t,3,3,13,13,2019-09-13,75,10,2009-04-20,2019-07-21,89.0,10.0,9.0,10.0,10.0,9.0,9.0,f,,,f,f,moderate,t,t,1,1,0,0,0.59
4,5121,2019-09-13,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,,https://www.airbnb.com/users/show/7356,Garon,2009-02-03,"I am an artist(painter, filmmaker) and curato...",a few days or more,0%,,f,https://a0.muscache.com/im/pictures/72a61bea-c...,https://a0.muscache.com/im/pictures/72a61bea-c...,Bedford-Stuyvesant,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'off...",t,f,"Brooklyn, NY, United States",Bedford-Stuyvesant,Bedford-Stuyvesant,Brooklyn,Brooklyn,NY,11216,New York,"Brooklyn, NY",US,United States,40.68688,-73.95596,f,Apartment,Private room,2,,1.0,1.0,Futon,"{Wifi,""Air conditioning"",Kitchen,""Pets live on...",,$60.00,,,$450.00,$0.00,1,$30.00,45,730,45,45,730,730,45.0,730.0,20 months ago,t,0,0,0,0,2019-09-13,49,0,2009-05-28,2017-10-05,90.0,8.0,8.0,10.0,10.0,9.0,9.0,f,,,f,f,strict_14_with_grace_period,f,f,1,0,1,0,0.39


In [104]:
column_names = df.columns
print(column_names)

Index(['id', 'last_scraped', 'space', 'transit', 'host_url', 'host_name', 'host_since', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'm

In [105]:
# From the next 20 columns we will keep the following:

# host_name - can be used to identify words associated with the host in reviews
# host_since - can be used to calculate host experience based on duration since the first listing
# host_location - we can use it to establish if host is local or not
# host_about - since its only a text we will count number of characters
# host_is_superhost - categorical t or f - describing highly rated and relaible hosts - https://www.airbnb.co.uk/superhost
# host_has_profile_pic - categorical t or f - profiles with pictures are seen as more credible
# host_identity_verified - categorical t or f - another credibility metric

# And remove all the below:

# host_url - host profile is out of scope
# host_response_time - this value could be useful but contains high percentage of N/A and is contained within score_communication
# host_response_rate - same as above
# host_acceptance_rate - eaither NA or blank
# host_thumbnail_url - host picture is out of scope
# host_picture_url - host picture is out of scope
# host_neighbourhood - host_location to be instead
# host_listings_count - we will use more accurate calculated_host_listings_count
# host_total_listings_count - as above
# host_verifications - list of host verification methods - information already contained in host_identity_verified
# street - neighbourhood_cleansed will be used instead
# neighbourhood - neighbourhood_cleansed will be used instead

In [106]:
df.drop('host_url', inplace=True, axis=1) # dropping as it is not usable
df.drop('host_response_time', inplace=True, axis=1) # dropping as it is not usable
df.drop('host_response_rate',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_acceptance_rate',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_thumbnail_url',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_picture_url',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_neighbourhood',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_listings_count',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_total_listings_count',inplace=True, axis=1) # dropping as it is not usable
df.drop('host_verifications',inplace=True, axis=1) # dropping as it is not usable
df.drop('neighbourhood',inplace=True, axis=1) # dropping as it is not usable

In [107]:
df.head()

Unnamed: 0,id,last_scraped,space,transit,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,street,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3647,2019-09-13,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T...,,Elisabeth,2008-11-25,Make Up Artist National/ (Website hidden by Ai...,f,t,t,"New York, NY, United States",Harlem,Manhattan,New York,NY,10027,New York,"New York, NY",US,United States,40.80902,-73.9419,t,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",,$150.00,,,$200.00,$75.00,2,$20.00,3,7,3,3,7,7,3.0,7.0,37 months ago,t,30,60,90,365,2019-09-13,0,0,,,,,,,,,,f,,,f,f,strict_14_with_grace_period,t,t,1,0,1,0,
1,3831,2019-09-13,Greetings! We own a double-duplex brownst...,B52 bus for a 10-minute ride to downtown Brook...,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,f,t,t,"Brooklyn, NY, United States",Clinton Hill,Brooklyn,Brooklyn,NY,11238,New York,"Brooklyn, NY",US,United States,40.68514,-73.95976,t,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",500.0,$89.00,$575.00,"$2,100.00",$500.00,,1,$0.00,1,730,1,1,730,730,1.0,730.0,3 days ago,t,1,3,8,192,2019-09-13,279,67,2014-09-30,2019-08-29,90.0,9.0,9.0,10.0,9.0,10.0,9.0,f,,,f,f,moderate,f,f,1,1,0,0,4.62
2,5022,2019-09-13,Loft apartment with high ceiling and wood floo...,,Laura,2009-01-29,I have been a NYer for almost 10 years. I came...,f,t,t,"New York, NY, United States",East Harlem,Manhattan,New York,NY,10029,New York,"New York, NY",US,United States,40.79851,-73.94399,t,Apartment,Entire home/apt,1,1.0,,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,Elev...",,$80.00,$600.00,"$1,600.00",$100.00,$80.00,1,$20.00,10,120,10,10,120,120,10.0,120.0,6 months ago,t,0,0,0,0,2019-09-13,9,2,2012-03-20,2018-11-19,93.0,10.0,9.0,10.0,10.0,9.0,10.0,f,,,f,f,strict_14_with_grace_period,t,t,1,1,0,0,0.1
3,5099,2019-09-13,I have a large 1 bedroom apartment centrally l...,From the apartment is a 10 minute walk to Gran...,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",f,t,f,"New York, NY, United States",Murray Hill,Manhattan,New York,NY,10016,New York,"New York, NY",US,United States,40.74767,-73.975,f,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",,$200.00,,,$300.00,$125.00,2,$100.00,3,21,3,3,21,21,3.0,21.0,today,t,3,3,13,13,2019-09-13,75,10,2009-04-20,2019-07-21,89.0,10.0,9.0,10.0,10.0,9.0,9.0,f,,,f,f,moderate,t,t,1,1,0,0,0.59
4,5121,2019-09-13,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,,Garon,2009-02-03,"I am an artist(painter, filmmaker) and curato...",f,t,f,"Brooklyn, NY, United States",Bedford-Stuyvesant,Brooklyn,Brooklyn,NY,11216,New York,"Brooklyn, NY",US,United States,40.68688,-73.95596,f,Apartment,Private room,2,,1.0,1.0,Futon,"{Wifi,""Air conditioning"",Kitchen,""Pets live on...",,$60.00,,,$450.00,$0.00,1,$30.00,45,730,45,45,730,730,45.0,730.0,20 months ago,t,0,0,0,0,2019-09-13,49,0,2009-05-28,2017-10-05,90.0,8.0,8.0,10.0,10.0,9.0,9.0,f,,,f,f,strict_14_with_grace_period,f,f,1,0,1,0,0.39


In [108]:
column_names = df.columns
print(column_names)

Index(['id', 'last_scraped', 'space', 'transit', 'host_name', 'host_since', 'host_about', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'number_of_reviews_ltm', 'first_review',
       'last_r

In [109]:
# From the next 20 columns we will keep the following:

# neighbourhood_cleansed - we will use only for visualisation due to number of neighbourhoods while we use gruoupped neighbourhoods instead
# neighbourhood_group_cleansed - categorical value which will be used to identify most popular parts of Barclona
# latitude - we will use it later to visualise the data on the map
# longitude - we will use it later to visualise the data on the map
# property_type - categorical variable
# room_type - categorical variable
# accommodates - discrete value describing property
# bathrooms - another discrete value describing property
# bedrooms - another discrete value describing property
# beds - another discrete value describing property
# bed_type - categorical value describing property
# amenities - due to number of unique features (over 100) we will only concentrate on the total number of amenities
# And remove all the below:

# city - we already know the city
# state - and region being Catalonia
# zipcode - we will use neighbourhood
# market - it is mainly Barcelona
# smart_location - it is mainly Barcelona
# country_code - we already know the country
# country - as above
# is_location_exact - unimportant as it could be inacurate up to 150 meters http://insideairbnb.com/about.html#disclaimers

In [110]:
df.drop('city', inplace=True, axis=1) # dropping as it is not usable
df.drop('state', inplace=True, axis=1) # dropping as it is not usable
df.drop('zipcode',inplace=True, axis=1) # dropping as it is not usable
df.drop('market',inplace=True, axis=1) # dropping as it is not usable
df.drop('smart_location',inplace=True, axis=1) # dropping as it is not usable
df.drop('country_code',inplace=True, axis=1) # dropping as it is not usable
df.drop('country',inplace=True, axis=1) # dropping as it is not usable
df.drop('is_location_exact',inplace=True, axis=1) # dropping as it is not usable

In [111]:
df.head()

Unnamed: 0,id,last_scraped,space,transit,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,street,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3647,2019-09-13,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T...,,Elisabeth,2008-11-25,Make Up Artist National/ (Website hidden by Ai...,f,t,t,"New York, NY, United States",Harlem,Manhattan,40.80902,-73.9419,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",,$150.00,,,$200.00,$75.00,2,$20.00,3,7,3,3,7,7,3.0,7.0,37 months ago,t,30,60,90,365,2019-09-13,0,0,,,,,,,,,,f,,,f,f,strict_14_with_grace_period,t,t,1,0,1,0,
1,3831,2019-09-13,Greetings! We own a double-duplex brownst...,B52 bus for a 10-minute ride to downtown Brook...,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,f,t,t,"Brooklyn, NY, United States",Clinton Hill,Brooklyn,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",500.0,$89.00,$575.00,"$2,100.00",$500.00,,1,$0.00,1,730,1,1,730,730,1.0,730.0,3 days ago,t,1,3,8,192,2019-09-13,279,67,2014-09-30,2019-08-29,90.0,9.0,9.0,10.0,9.0,10.0,9.0,f,,,f,f,moderate,f,f,1,1,0,0,4.62
2,5022,2019-09-13,Loft apartment with high ceiling and wood floo...,,Laura,2009-01-29,I have been a NYer for almost 10 years. I came...,f,t,t,"New York, NY, United States",East Harlem,Manhattan,40.79851,-73.94399,Apartment,Entire home/apt,1,1.0,,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,Elev...",,$80.00,$600.00,"$1,600.00",$100.00,$80.00,1,$20.00,10,120,10,10,120,120,10.0,120.0,6 months ago,t,0,0,0,0,2019-09-13,9,2,2012-03-20,2018-11-19,93.0,10.0,9.0,10.0,10.0,9.0,10.0,f,,,f,f,strict_14_with_grace_period,t,t,1,1,0,0,0.1
3,5099,2019-09-13,I have a large 1 bedroom apartment centrally l...,From the apartment is a 10 minute walk to Gran...,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",f,t,f,"New York, NY, United States",Murray Hill,Manhattan,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",,$200.00,,,$300.00,$125.00,2,$100.00,3,21,3,3,21,21,3.0,21.0,today,t,3,3,13,13,2019-09-13,75,10,2009-04-20,2019-07-21,89.0,10.0,9.0,10.0,10.0,9.0,9.0,f,,,f,f,moderate,t,t,1,1,0,0,0.59
4,5121,2019-09-13,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,,Garon,2009-02-03,"I am an artist(painter, filmmaker) and curato...",f,t,f,"Brooklyn, NY, United States",Bedford-Stuyvesant,Brooklyn,40.68688,-73.95596,Apartment,Private room,2,,1.0,1.0,Futon,"{Wifi,""Air conditioning"",Kitchen,""Pets live on...",,$60.00,,,$450.00,$0.00,1,$30.00,45,730,45,45,730,730,45.0,730.0,20 months ago,t,0,0,0,0,2019-09-13,49,0,2009-05-28,2017-10-05,90.0,8.0,8.0,10.0,10.0,9.0,9.0,f,,,f,f,strict_14_with_grace_period,f,f,1,0,1,0,0.39


In [112]:
column_names = df.columns
print(column_names)

Index(['id', 'last_scraped', 'space', 'transit', 'host_name', 'host_since', 'host_about', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'calendar_last_scraped', 'number_of_reviews', 'number_of_reviews_ltm', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_ch

In [113]:
# From the next 20 columns we will keep the following:

# price - price per night for number of included guests
# security_deposit - another continous value assiociated with the cost
# cleaning_fee - additional cost at the top of rent
# guests_included - descrete value which we will use to evaluate the cost per person
# extra_people - cost of additional person per night
# minimum_nights - another discrete value that is cost related. Listing with high value of minimum nights are likely sublettings
# first_review - we will use it to calculate reviews_per_month
# last_review - we will use this field to filter out no longer active listings
# number_of_reviews - total number of reviews in entire listing history
# And remove all the below:

# square_feet - could be used to evaluate the property size but most of the values are missing
# weekly_price - mostly blank so we will use price instead
# monthly_price - mostly blank so we will use price instead
# maximum_nights - most of the values are above 30 days suggesting its used as an open bracket
# calendar_updated - we are not interested in future data that is a subject to daily updates
# has_availability - as above
# availability_30 - as above
# availability_60 - as above
# availability_90 - as above
# availability_365 - as above
# calendar_last_scraped - as above

In [114]:
df.drop('square_feet', inplace=True, axis=1) # dropping as it is not usable
df.drop('weekly_price', inplace=True, axis=1) # dropping as it is not usable
df.drop('monthly_price',inplace=True, axis=1) # dropping as it is not usable
df.drop('maximum_nights',inplace=True, axis=1) # dropping as it is not usable
df.drop('calendar_updated',inplace=True, axis=1) # dropping as it is not usable
df.drop('has_availability',inplace=True, axis=1) # dropping as it is not usable
df.drop('availability_30',inplace=True, axis=1) # dropping as it is not usable
df.drop('availability_60',inplace=True, axis=1) # dropping as it is not usable
df.drop('availability_90',inplace=True, axis=1) # dropping as it is not usable
df.drop('availability_365',inplace=True, axis=1) # dropping as it is not usable
df.drop('calendar_last_scraped',inplace=True, axis=1) # dropping as it is not usable

In [115]:
df.head()

Unnamed: 0,id,last_scraped,space,transit,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,street,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3647,2019-09-13,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T...,,Elisabeth,2008-11-25,Make Up Artist National/ (Website hidden by Ai...,f,t,t,"New York, NY, United States",Harlem,Manhattan,40.80902,-73.9419,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",$150.00,$200.00,$75.00,2,$20.00,3,3,3,7,7,3.0,7.0,0,0,,,,,,,,,,f,,,f,f,strict_14_with_grace_period,t,t,1,0,1,0,
1,3831,2019-09-13,Greetings! We own a double-duplex brownst...,B52 bus for a 10-minute ride to downtown Brook...,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,f,t,t,"Brooklyn, NY, United States",Clinton Hill,Brooklyn,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",$89.00,$500.00,,1,$0.00,1,1,1,730,730,1.0,730.0,279,67,2014-09-30,2019-08-29,90.0,9.0,9.0,10.0,9.0,10.0,9.0,f,,,f,f,moderate,f,f,1,1,0,0,4.62
2,5022,2019-09-13,Loft apartment with high ceiling and wood floo...,,Laura,2009-01-29,I have been a NYer for almost 10 years. I came...,f,t,t,"New York, NY, United States",East Harlem,Manhattan,40.79851,-73.94399,Apartment,Entire home/apt,1,1.0,,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,Elev...",$80.00,$100.00,$80.00,1,$20.00,10,10,10,120,120,10.0,120.0,9,2,2012-03-20,2018-11-19,93.0,10.0,9.0,10.0,10.0,9.0,10.0,f,,,f,f,strict_14_with_grace_period,t,t,1,1,0,0,0.1
3,5099,2019-09-13,I have a large 1 bedroom apartment centrally l...,From the apartment is a 10 minute walk to Gran...,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",f,t,f,"New York, NY, United States",Murray Hill,Manhattan,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",$200.00,$300.00,$125.00,2,$100.00,3,3,3,21,21,3.0,21.0,75,10,2009-04-20,2019-07-21,89.0,10.0,9.0,10.0,10.0,9.0,9.0,f,,,f,f,moderate,t,t,1,1,0,0,0.59
4,5121,2019-09-13,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,,Garon,2009-02-03,"I am an artist(painter, filmmaker) and curato...",f,t,f,"Brooklyn, NY, United States",Bedford-Stuyvesant,Brooklyn,40.68688,-73.95596,Apartment,Private room,2,,1.0,1.0,Futon,"{Wifi,""Air conditioning"",Kitchen,""Pets live on...",$60.00,$450.00,$0.00,1,$30.00,45,45,45,730,730,45.0,730.0,49,0,2009-05-28,2017-10-05,90.0,8.0,8.0,10.0,10.0,9.0,9.0,f,,,f,f,strict_14_with_grace_period,f,f,1,0,1,0,0.39


In [116]:
column_names = df.columns
print(column_names)

Index(['id', 'last_scraped', 'space', 'transit', 'host_name', 'host_since', 'host_about', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews', 'number_of_reviews_ltm', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'requires_license', 'license', 'jurisdiction_names', 'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
   

In [117]:
# From the final set of columns we will keep the following:

# review_scores_accuracy - discrete value - numbers between 2 and 10
# review_scores_cleanliness - discrete value - numbers between 2 and 10
# review_scores_checkin - discrete value - numbers between 2 and 10
# review_scores_communication - discrete value - numbers between 2 and 10
# review_scores_location - discrete value - numbers between 2 and 10
# review_scores_value - discrete value - numbers between 2 and 10
# instant_bookable - categorical value - t or false
# cancellation_policy - ordinal value with 5 categories that can be ordered from lowest to highest level of flexibility
# require_guest_profile_picture - categorical value - t or false
# require_guest_phone_verification categorical value - t or false
# calculated_host_listings_count - continious value which is actual number of host listings - another metric to measure host experience or to distinguish buisness from individual
# And remove all the below:

# review_scores_rating - this value is calculated as weighted sum of other scores
# requires_license - all values are t
# license - textual value that is mostly null
# jurisdiction_names - contains only nulls
# is_business_travel_ready - contains one value of f
# reviews_per_month - we will re-calculate this field using our formula

In [118]:
df.drop('review_scores_rating', inplace=True, axis=1) # dropping as it is not usable
df.drop('requires_license', inplace=True, axis=1) # dropping as it is not usable
df.drop('license',inplace=True, axis=1) # dropping as it is not usable
df.drop('minimum_minimum_nights',inplace=True, axis=1) # dropping as it is not usable
df.drop('maximum_minimum_nights',inplace=True, axis=1) # dropping as it is not usable
df.drop('minimum_maximum_nights',inplace=True, axis=1) # dropping as it is not usable
df.drop('maximum_maximum_nights',inplace=True, axis=1) # dropping as it is not usable
df.drop('minimum_nights_avg_ntm',inplace=True, axis=1) # dropping as it is not usable
df.drop('maximum_nights_avg_ntm',inplace=True, axis=1) # dropping as it is not usable
df.drop('jurisdiction_names',inplace=True, axis=1) # dropping as it is not usable
df.drop('is_business_travel_ready',inplace=True, axis=1) # dropping as it is not usable
df.drop('reviews_per_month',inplace=True, axis=1) # dropping as it is not usable

In [119]:
df.head()

Unnamed: 0,id,last_scraped,space,transit,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,street,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
0,3647,2019-09-13,WELCOME TO OUR INTERNATIONAL URBAN COMMUNITY T...,,Elisabeth,2008-11-25,Make Up Artist National/ (Website hidden by Ai...,f,t,t,"New York, NY, United States",Harlem,Manhattan,40.80902,-73.9419,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",$150.00,$200.00,$75.00,2,$20.00,3,0,0,,,,,,,,,f,strict_14_with_grace_period,t,t,1,0,1,0
1,3831,2019-09-13,Greetings! We own a double-duplex brownst...,B52 bus for a 10-minute ride to downtown Brook...,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,f,t,t,"Brooklyn, NY, United States",Clinton Hill,Brooklyn,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",$89.00,$500.00,,1,$0.00,1,279,67,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,f,moderate,f,f,1,1,0,0
2,5022,2019-09-13,Loft apartment with high ceiling and wood floo...,,Laura,2009-01-29,I have been a NYer for almost 10 years. I came...,f,t,t,"New York, NY, United States",East Harlem,Manhattan,40.79851,-73.94399,Apartment,Entire home/apt,1,1.0,,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,Elev...",$80.00,$100.00,$80.00,1,$20.00,10,9,2,2012-03-20,2018-11-19,10.0,9.0,10.0,10.0,9.0,10.0,f,strict_14_with_grace_period,t,t,1,1,0,0
3,5099,2019-09-13,I have a large 1 bedroom apartment centrally l...,From the apartment is a 10 minute walk to Gran...,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",f,t,f,"New York, NY, United States",Murray Hill,Manhattan,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",$200.00,$300.00,$125.00,2,$100.00,3,75,10,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,t,t,1,1,0,0
4,5121,2019-09-13,HELLO EVERYONE AND THANKS FOR VISITING BLISS A...,,Garon,2009-02-03,"I am an artist(painter, filmmaker) and curato...",f,t,f,"Brooklyn, NY, United States",Bedford-Stuyvesant,Brooklyn,40.68688,-73.95596,Apartment,Private room,2,,1.0,1.0,Futon,"{Wifi,""Air conditioning"",Kitchen,""Pets live on...",$60.00,$450.00,$0.00,1,$30.00,45,49,0,2009-05-28,2017-10-05,8.0,8.0,10.0,10.0,9.0,9.0,f,strict_14_with_grace_period,f,f,1,0,1,0


In [120]:
column_names = df.columns
print(column_names)

Index(['id', 'last_scraped', 'space', 'transit', 'host_name', 'host_since', 'host_about', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'number_of_reviews', 'number_of_reviews_ltm', 'first_review', 'last_review', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'], dtype='object')


In [121]:
df.drop('number_of_reviews_ltm', inplace=True, axis=1) # dropping as it is not usable
df.drop('street', inplace=True, axis=1) # dropping as it is not usable
df.drop('transit',inplace=True, axis=1) # dropping as it is not usable
df.drop('calculated_host_listings_count_entire_homes',inplace=True, axis=1) # dropping as it is not usable
df.drop('calculated_host_listings_count_private_rooms',inplace=True, axis=1) # dropping as it is not usable
df.drop('calculated_host_listings_count_shared_rooms',inplace=True, axis=1) # dropping as it is not usable
df.drop('space',inplace=True, axis=1) # dropping as it is not usable

In [122]:
df.head()

Unnamed: 0,id,last_scraped,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count
0,3647,2019-09-13,Elisabeth,2008-11-25,Make Up Artist National/ (Website hidden by Ai...,f,t,t,Harlem,Manhattan,40.80902,-73.9419,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",$150.00,$200.00,$75.00,2,$20.00,3,0,,,,,,,,,f,strict_14_with_grace_period,t,t,1
1,3831,2019-09-13,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,f,t,t,Clinton Hill,Brooklyn,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",$89.00,$500.00,,1,$0.00,1,279,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,f,moderate,f,f,1
2,5022,2019-09-13,Laura,2009-01-29,I have been a NYer for almost 10 years. I came...,f,t,t,East Harlem,Manhattan,40.79851,-73.94399,Apartment,Entire home/apt,1,1.0,,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,Elev...",$80.00,$100.00,$80.00,1,$20.00,10,9,2012-03-20,2018-11-19,10.0,9.0,10.0,10.0,9.0,10.0,f,strict_14_with_grace_period,t,t,1
3,5099,2019-09-13,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",f,t,f,Murray Hill,Manhattan,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",$200.00,$300.00,$125.00,2,$100.00,3,75,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,t,t,1
4,5121,2019-09-13,Garon,2009-02-03,"I am an artist(painter, filmmaker) and curato...",f,t,f,Bedford-Stuyvesant,Brooklyn,40.68688,-73.95596,Apartment,Private room,2,,1.0,1.0,Futon,"{Wifi,""Air conditioning"",Kitchen,""Pets live on...",$60.00,$450.00,$0.00,1,$30.00,45,49,2009-05-28,2017-10-05,8.0,8.0,10.0,10.0,9.0,9.0,f,strict_14_with_grace_period,f,f,1


In [123]:
column_names = df.columns
print(column_names)

Index(['id', 'last_scraped', 'host_name', 'host_since', 'host_about', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'number_of_reviews', 'first_review', 'last_review', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification', 'calculated_host_listings_count'], dtype='object')


In [124]:
df_sel = df.copy()

In [125]:
df_sel.head()

Unnamed: 0,id,last_scraped,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count
0,3647,2019-09-13,Elisabeth,2008-11-25,Make Up Artist National/ (Website hidden by Ai...,f,t,t,Harlem,Manhattan,40.80902,-73.9419,Apartment,Private room,2,1.0,1.0,1.0,Pull-out Sofa,"{""Cable TV"",Internet,Wifi,""Air conditioning"",K...",$150.00,$200.00,$75.00,2,$20.00,3,0,,,,,,,,,f,strict_14_with_grace_period,t,t,1
1,3831,2019-09-13,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,f,t,t,Clinton Hill,Brooklyn,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",$89.00,$500.00,,1,$0.00,1,279,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,f,moderate,f,f,1
2,5022,2019-09-13,Laura,2009-01-29,I have been a NYer for almost 10 years. I came...,f,t,t,East Harlem,Manhattan,40.79851,-73.94399,Apartment,Entire home/apt,1,1.0,,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,Elev...",$80.00,$100.00,$80.00,1,$20.00,10,9,2012-03-20,2018-11-19,10.0,9.0,10.0,10.0,9.0,10.0,f,strict_14_with_grace_period,t,t,1
3,5099,2019-09-13,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",f,t,f,Murray Hill,Manhattan,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",$200.00,$300.00,$125.00,2,$100.00,3,75,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,t,t,1
4,5121,2019-09-13,Garon,2009-02-03,"I am an artist(painter, filmmaker) and curato...",f,t,f,Bedford-Stuyvesant,Brooklyn,40.68688,-73.95596,Apartment,Private room,2,,1.0,1.0,Futon,"{Wifi,""Air conditioning"",Kitchen,""Pets live on...",$60.00,$450.00,$0.00,1,$30.00,45,49,2009-05-28,2017-10-05,8.0,8.0,10.0,10.0,9.0,9.0,f,strict_14_with_grace_period,f,f,1


In [126]:
df_sel.drop(df_sel[df_sel['number_of_reviews'] <= 0].index, inplace = True) 
# dropping all values less than or equal to 0 as it is equal to NAN or NA

In [127]:
df_sel['number_of_reviews'].unique()

array([279,   9,  75,  49, 443, 118,  94, 161,  54, 198, 171, 116,  27,
       160, 203, 277,  62, 134,  74,  96,  21,  58,  30, 248,  95, 206,
       172,  67, 239, 218, 253,  15,  25,  84, 100,  11, 258,  12, 138,
       121,  80, 349,  19, 108,  20, 298, 142,  23,  44,   5, 143, 192,
         4, 357, 155, 110, 197,  50,  52, 109, 146,  63,  55,  73,  17,
        61, 201,   2, 178, 175,  22, 168, 123,  82,   1,  31, 139,  46,
       339,  88, 164,  29, 177, 342, 469,   7,  38, 331, 356,  16,  14,
       156, 266,  78, 194,   3, 428, 238,  85, 127, 243,  28,  34, 135,
       126, 240, 322, 305, 152,  59, 234, 193,  87, 117, 219,  18, 222,
        33,  53, 128,  51, 208, 113, 290, 378,  10,   6,  39, 369, 236,
       211, 261, 115, 130,  37, 213,  71, 225, 205,  98, 231, 475, 184,
        92, 151,  81, 183, 187, 125, 106, 360, 448,  45, 259, 145,  36,
       292, 233,  64, 227,  72,  56,  99,  65, 133,  48, 122,  40,  32,
        70,  69,  26, 209, 232, 295, 409, 185,   8,  35,  66, 39

In [128]:
df_sel['price']=df_sel['price'].str.replace('$','')
df_sel['price']=df_sel['price'].str.replace(',','')
df_sel['price']=df_sel['price'].str.replace('.','').astype(float)

In [129]:
df_sel['extra_people']=df_sel['extra_people'].str.replace('$','')
df_sel['extra_people']=df_sel['extra_people'].str.replace(',','')
df_sel['extra_people']=df_sel['extra_people'].str.replace('.','').astype(float)

In [130]:
# security_deposit - conversion from $ to numeric values
df_sel['security_deposit']=df_sel['security_deposit'].str.replace('$','')
df_sel['security_deposit']=df_sel['security_deposit'].str.replace(',','')
df_sel['security_deposit']=df_sel['security_deposit'].str.replace('.','').astype(float)

In [131]:
df_sel['cleaning_fee']=df_sel['cleaning_fee'].str.replace('$','')
df_sel['cleaning_fee']=df_sel['cleaning_fee'].str.replace(',','')
df_sel['cleaning_fee']=df_sel['cleaning_fee'].str.replace('.','').astype(float)

In [132]:
df_sel['security_deposit'].isnull().sum()

11838

In [133]:
df_sel['cleaning_fee'].isnull().sum()

6315

In [134]:
df_sel['cleaning_fee'] = df_sel ['cleaning_fee'].fillna(df_sel['cleaning_fee'].mean()).astype(float)

In [135]:
df_sel['cleaning_fee'].isnull().sum()

0

In [136]:
df_sel['security_deposit'] = df_sel ['security_deposit'].fillna(df_sel['security_deposit'].mean()).astype(float)

In [137]:
df_sel['security_deposit'].isnull().sum()

0

In [138]:
df_sel['host_about'].isnull().sum()

13914

In [139]:
df_sel['host_about'] = df_sel.host_about.fillna('')

In [140]:
df_sel['host_about'].isnull().sum()

0

In [141]:
df_sel1 = df_sel.copy()

In [142]:
df_sel = df_sel.dropna()

In [143]:
df_sel.isnull().sum()

id                                  0
last_scraped                        0
host_name                           0
host_since                          0
host_about                          0
host_is_superhost                   0
host_has_profile_pic                0
host_identity_verified              0
neighbourhood_cleansed              0
neighbourhood_group_cleansed        0
latitude                            0
longitude                           0
property_type                       0
room_type                           0
accommodates                        0
bathrooms                           0
bedrooms                            0
beds                                0
bed_type                            0
amenities                           0
price                               0
security_deposit                    0
cleaning_fee                        0
guests_included                     0
extra_people                        0
minimum_nights                      0
number_of_re

In [144]:
df_sel.head()

Unnamed: 0,id,last_scraped,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count
1,3831,2019-09-13,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,f,t,t,Clinton Hill,Brooklyn,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",8900.0,50000.0,6331.088211,1,0.0,1,279,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,f,moderate,f,f,1
3,5099,2019-09-13,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",f,t,f,Murray Hill,Manhattan,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",20000.0,30000.0,12500.0,2,10000.0,3,75,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,t,t,1
5,5178,2019-09-13,Shunichi,2009-03-03,I used to work for a financial industry but no...,f,t,f,Hell's Kitchen,Manhattan,40.76489,-73.98493,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",7900.0,24972.470991,1500.0,1,1200.0,2,443,2009-05-06,2019-08-27,8.0,7.0,9.0,9.0,10.0,8.0,f,strict_14_with_grace_period,f,f,1
6,5203,2019-09-13,MaryEllen,2009-02-05,Welcome to family life with my oldest two away...,f,t,t,Upper West Side,Manhattan,40.80178,-73.96723,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",""Paid parkin...",7900.0,24972.470991,6331.088211,1,0.0,2,118,2009-09-07,2017-07-21,10.0,10.0,10.0,10.0,10.0,10.0,f,flexible,t,t,1
7,5222,2019-09-13,Marilyn,2009-02-06,I'm a writer who came to NYC for graduate scho...,f,t,t,East Village,Manhattan,40.72764,-73.97949,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",11600.0,50000.0,7500.0,2,1500.0,30,94,2009-02-23,2016-06-15,9.0,9.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,f,f,1


In [145]:
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder()
df_sel['host_is_superhost']= label_encoder.fit_transform(df_sel['host_is_superhost'])

In [146]:
df_sel.head()

Unnamed: 0,id,last_scraped,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count
1,3831,2019-09-13,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,0,t,t,Clinton Hill,Brooklyn,40.68514,-73.95976,Guest suite,Entire home/apt,3,1.0,1.0,4.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",8900.0,50000.0,6331.088211,1,0.0,1,279,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,f,moderate,f,f,1
3,5099,2019-09-13,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",0,t,f,Murray Hill,Manhattan,40.74767,-73.975,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",20000.0,30000.0,12500.0,2,10000.0,3,75,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,f,moderate,t,t,1
5,5178,2019-09-13,Shunichi,2009-03-03,I used to work for a financial industry but no...,0,t,f,Hell's Kitchen,Manhattan,40.76489,-73.98493,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",7900.0,24972.470991,1500.0,1,1200.0,2,443,2009-05-06,2019-08-27,8.0,7.0,9.0,9.0,10.0,8.0,f,strict_14_with_grace_period,f,f,1
6,5203,2019-09-13,MaryEllen,2009-02-05,Welcome to family life with my oldest two away...,0,t,t,Upper West Side,Manhattan,40.80178,-73.96723,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,""Air conditioning"",""Paid parkin...",7900.0,24972.470991,6331.088211,1,0.0,2,118,2009-09-07,2017-07-21,10.0,10.0,10.0,10.0,10.0,10.0,f,flexible,t,t,1
7,5222,2019-09-13,Marilyn,2009-02-06,I'm a writer who came to NYC for graduate scho...,0,t,t,East Village,Manhattan,40.72764,-73.97949,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",11600.0,50000.0,7500.0,2,1500.0,30,94,2009-02-23,2016-06-15,9.0,9.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,f,f,1


In [147]:
df_sel['host_has_profile_pic'] = label_encoder.fit_transform(df_sel['host_has_profile_pic'])

In [148]:
df_sel['host_identity_verified'] = label_encoder.fit_transform(df_sel['host_identity_verified'])

In [149]:
df_sel['instant_bookable'] = label_encoder.fit_transform(df_sel['instant_bookable'])

In [150]:
df_sel['require_guest_profile_picture'] = label_encoder.fit_transform(df_sel['require_guest_profile_picture'])

In [151]:
df_sel['require_guest_phone_verification'] = label_encoder.fit_transform(df_sel['require_guest_phone_verification'])

In [152]:
df_sel['cancellation_policy'] = label_encoder.fit_transform(df_sel['cancellation_policy'])

In [153]:
df_sel['bed_type'] = label_encoder.fit_transform(df_sel['bed_type'])

In [154]:
df_sel['room_type'] = label_encoder.fit_transform(df_sel['room_type'])

In [155]:
df_sel['neighbourhood_group_cleansed'] = label_encoder.fit_transform(df_sel['neighbourhood_group_cleansed'])

In [156]:
df_sel['property_type'] = label_encoder.fit_transform(df_sel['property_type'])

In [157]:
df_sel.head()

Unnamed: 0,id,last_scraped,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count
1,3831,2019-09-13,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,0,1,1,Clinton Hill,1,40.68514,-73.95976,17,0,3,1.0,1.0,4.0,4,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",8900.0,50000.0,6331.088211,1,0.0,1,279,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,0,1,0,0,1
3,5099,2019-09-13,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",0,1,0,Murray Hill,2,40.74767,-73.975,1,0,2,1.0,1.0,1.0,4,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",20000.0,30000.0,12500.0,2,10000.0,3,75,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,0,1,1,1,1
5,5178,2019-09-13,Shunichi,2009-03-03,I used to work for a financial industry but no...,0,1,0,Hell's Kitchen,2,40.76489,-73.98493,1,2,2,1.0,1.0,1.0,4,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",7900.0,24972.470991,1500.0,1,1200.0,2,443,2009-05-06,2019-08-27,8.0,7.0,9.0,9.0,10.0,8.0,0,3,0,0,1
6,5203,2019-09-13,MaryEllen,2009-02-05,Welcome to family life with my oldest two away...,0,1,1,Upper West Side,2,40.80178,-73.96723,1,2,1,1.0,1.0,1.0,4,"{Internet,Wifi,""Air conditioning"",""Paid parkin...",7900.0,24972.470991,6331.088211,1,0.0,2,118,2009-09-07,2017-07-21,10.0,10.0,10.0,10.0,10.0,10.0,0,0,1,1,1
7,5222,2019-09-13,Marilyn,2009-02-06,I'm a writer who came to NYC for graduate scho...,0,1,1,East Village,2,40.72764,-73.97949,1,0,2,1.0,1.0,1.0,4,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",11600.0,50000.0,7500.0,2,1500.0,30,94,2009-02-23,2016-06-15,9.0,9.0,10.0,10.0,10.0,9.0,0,3,0,0,1


In [158]:
df_sel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37659 entries, 1 to 48176
Data columns (total 40 columns):
id                                  37659 non-null int64
last_scraped                        37659 non-null object
host_name                           37659 non-null object
host_since                          37659 non-null object
host_about                          37659 non-null object
host_is_superhost                   37659 non-null int32
host_has_profile_pic                37659 non-null int32
host_identity_verified              37659 non-null int32
neighbourhood_cleansed              37659 non-null object
neighbourhood_group_cleansed        37659 non-null int32
latitude                            37659 non-null float64
longitude                           37659 non-null float64
property_type                       37659 non-null int32
room_type                           37659 non-null int32
accommodates                        37659 non-null int64
bathrooms                  

In [159]:
df_sel.select_dtypes(include='object').columns

Index(['last_scraped', 'host_name', 'host_since', 'host_about', 'neighbourhood_cleansed', 'amenities', 'first_review', 'last_review'], dtype='object')

In [160]:
# listing_duration = 
df_sel['last_review']= pd.to_datetime(df_sel['last_review'])
df_sel['first_review']= pd.to_datetime(df_sel['first_review'])
df_sel['listing_duration'] = df_sel['last_review'] - df_sel['first_review']

In [161]:
# hosting_duration = 
df_sel['host_since']= pd.to_datetime(df_sel['host_since'])
df_sel['hosting_duration'] = df_sel['last_review'] - df_sel['host_since']

In [162]:
# host_about_len = 
df_sel['host_about_len']=df_sel['host_about'].str.replace('NA','0')

In [163]:
df.drop('host_about',inplace=True, axis=1) # dropping as it is not usable

In [164]:
# price_per_person - (price/accommodates)
df_sel['price_per_person'] =df_sel['price'] / df_sel['accommodates']

In [165]:
a_longitude= 40.7128
a_latitude= 74.0060

In [166]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

In [167]:
for index, row in df_sel.iterrows():
    df_sel.loc[index, 'distance'] = haversine(a_longitude, a_latitude, row['longitude'], row['latitude'])

In [170]:
df_sel.head(5)

Unnamed: 0,id,last_scraped,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,listing_duration,hosting_duration,host_about_len,price_per_person,distance
1,3831,2019-09-13,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,0,1,1,Clinton Hill,1,40.68514,-73.95976,17,0,3,1.0,1.0,4.0,4,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",8900.0,50000.0,6331.088211,1,0.0,1,279,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,0,1,0,0,1,1794 days,3917 days,Laid-back bi-coastal actor/professor/attorney.,2966.666667,6373.457483
3,5099,2019-09-13,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",0,1,0,Murray Hill,2,40.74767,-73.975,1,0,2,1.0,1.0,1.0,4,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",20000.0,30000.0,12500.0,2,10000.0,3,75,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,0,1,1,1,1,3744 days,3821 days,"I'm an artist, writer, traveler, and a native ...",10000.0,6367.205106
5,5178,2019-09-13,Shunichi,2009-03-03,I used to work for a financial industry but no...,0,1,0,Hell's Kitchen,2,40.76489,-73.98493,1,2,2,1.0,1.0,1.0,4,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",7900.0,24972.470991,1500.0,1,1200.0,2,443,2009-05-06,2019-08-27,8.0,7.0,9.0,9.0,10.0,8.0,0,3,0,0,1,3765 days,3829 days,I used to work for a financial industry but no...,3950.0,6365.626879
6,5203,2019-09-13,MaryEllen,2009-02-05,Welcome to family life with my oldest two away...,0,1,1,Upper West Side,2,40.80178,-73.96723,1,2,1,1.0,1.0,1.0,4,"{Internet,Wifi,""Air conditioning"",""Paid parkin...",7900.0,24972.470991,6331.088211,1,0.0,2,118,2009-09-07,2017-07-21,10.0,10.0,10.0,10.0,10.0,10.0,0,0,1,1,1,2874 days,3088 days,Welcome to family life with my oldest two away...,7900.0,6361.269989
7,5222,2019-09-13,Marilyn,2009-02-06,I'm a writer who came to NYC for graduate scho...,0,1,1,East Village,2,40.72764,-73.97949,1,0,2,1.0,1.0,1.0,4,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",11600.0,50000.0,7500.0,2,1500.0,30,94,2009-02-23,2016-06-15,9.0,9.0,10.0,10.0,10.0,9.0,0,3,0,0,1,2669 days,2686 days,I'm a writer who came to NYC for graduate scho...,5800.0,6369.442641


In [171]:
df_sel['last_scraped']= pd.DatetimeIndex(df_sel.last_scraped)
df_sel['first_review']= pd.DatetimeIndex(df_sel.first_review)
df_sel['last_review']= pd.DatetimeIndex(df_sel.last_review)
df_sel['host_since']= pd.DatetimeIndex(df_sel.host_since)

In [172]:
df_sel.head()

Unnamed: 0,id,last_scraped,host_name,host_since,host_about,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,first_review,last_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,listing_duration,hosting_duration,host_about_len,price_per_person,distance
1,3831,2019-09-13,LisaRoxanne,2008-12-07,Laid-back bi-coastal actor/professor/attorney.,0,1,1,Clinton Hill,1,40.68514,-73.95976,17,0,3,1.0,1.0,4.0,4,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",8900.0,50000.0,6331.088211,1,0.0,1,279,2014-09-30,2019-08-29,9.0,9.0,10.0,9.0,10.0,9.0,0,1,0,0,1,1794 days,3917 days,Laid-back bi-coastal actor/professor/attorney.,2966.666667,6373.457483
3,5099,2019-09-13,Chris,2009-02-02,"I'm an artist, writer, traveler, and a native ...",0,1,0,Murray Hill,2,40.74767,-73.975,1,0,2,1.0,1.0,1.0,4,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Buzzer/w...",20000.0,30000.0,12500.0,2,10000.0,3,75,2009-04-20,2019-07-21,10.0,9.0,10.0,10.0,9.0,9.0,0,1,1,1,1,3744 days,3821 days,"I'm an artist, writer, traveler, and a native ...",10000.0,6367.205106
5,5178,2019-09-13,Shunichi,2009-03-03,I used to work for a financial industry but no...,0,1,0,Hell's Kitchen,2,40.76489,-73.98493,1,2,2,1.0,1.0,1.0,4,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",7900.0,24972.470991,1500.0,1,1200.0,2,443,2009-05-06,2019-08-27,8.0,7.0,9.0,9.0,10.0,8.0,0,3,0,0,1,3765 days,3829 days,I used to work for a financial industry but no...,3950.0,6365.626879
6,5203,2019-09-13,MaryEllen,2009-02-05,Welcome to family life with my oldest two away...,0,1,1,Upper West Side,2,40.80178,-73.96723,1,2,1,1.0,1.0,1.0,4,"{Internet,Wifi,""Air conditioning"",""Paid parkin...",7900.0,24972.470991,6331.088211,1,0.0,2,118,2009-09-07,2017-07-21,10.0,10.0,10.0,10.0,10.0,10.0,0,0,1,1,1,2874 days,3088 days,Welcome to family life with my oldest two away...,7900.0,6361.269989
7,5222,2019-09-13,Marilyn,2009-02-06,I'm a writer who came to NYC for graduate scho...,0,1,1,East Village,2,40.72764,-73.97949,1,0,2,1.0,1.0,1.0,4,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",11600.0,50000.0,7500.0,2,1500.0,30,94,2009-02-23,2016-06-15,9.0,9.0,10.0,10.0,10.0,9.0,0,3,0,0,1,2669 days,2686 days,I'm a writer who came to NYC for graduate scho...,5800.0,6369.442641


In [173]:
df_sel = df_sel.drop(['last_scraped','host_name','host_since','host_about','neighbourhood_cleansed','amenities','first_review','last_review','listing_duration','hosting_duration','host_about_len'], axis=1)

In [174]:
df_sel.head()

Unnamed: 0,id,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,price_per_person,distance
1,3831,0,1,1,1,40.68514,-73.95976,17,0,3,1.0,1.0,4.0,4,8900.0,50000.0,6331.088211,1,0.0,1,279,9.0,9.0,10.0,9.0,10.0,9.0,0,1,0,0,1,2966.666667,6373.457483
3,5099,0,1,0,2,40.74767,-73.975,1,0,2,1.0,1.0,1.0,4,20000.0,30000.0,12500.0,2,10000.0,3,75,10.0,9.0,10.0,10.0,9.0,9.0,0,1,1,1,1,10000.0,6367.205106
5,5178,0,1,0,2,40.76489,-73.98493,1,2,2,1.0,1.0,1.0,4,7900.0,24972.470991,1500.0,1,1200.0,2,443,8.0,7.0,9.0,9.0,10.0,8.0,0,3,0,0,1,3950.0,6365.626879
6,5203,0,1,1,2,40.80178,-73.96723,1,2,1,1.0,1.0,1.0,4,7900.0,24972.470991,6331.088211,1,0.0,2,118,10.0,10.0,10.0,10.0,10.0,10.0,0,0,1,1,1,7900.0,6361.269989
7,5222,0,1,1,2,40.72764,-73.97949,1,0,2,1.0,1.0,1.0,4,11600.0,50000.0,7500.0,2,1500.0,30,94,9.0,9.0,10.0,10.0,10.0,9.0,0,3,0,0,1,5800.0,6369.442641


In [175]:
X=df_sel.drop(['price'],1)
y = df_sel['price']

In [176]:
X.head(5)

Unnamed: 0,id,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,number_of_reviews,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,price_per_person,distance
1,3831,0,1,1,1,40.68514,-73.95976,17,0,3,1.0,1.0,4.0,4,50000.0,6331.088211,1,0.0,1,279,9.0,9.0,10.0,9.0,10.0,9.0,0,1,0,0,1,2966.666667,6373.457483
3,5099,0,1,0,2,40.74767,-73.975,1,0,2,1.0,1.0,1.0,4,30000.0,12500.0,2,10000.0,3,75,10.0,9.0,10.0,10.0,9.0,9.0,0,1,1,1,1,10000.0,6367.205106
5,5178,0,1,0,2,40.76489,-73.98493,1,2,2,1.0,1.0,1.0,4,24972.470991,1500.0,1,1200.0,2,443,8.0,7.0,9.0,9.0,10.0,8.0,0,3,0,0,1,3950.0,6365.626879
6,5203,0,1,1,2,40.80178,-73.96723,1,2,1,1.0,1.0,1.0,4,24972.470991,6331.088211,1,0.0,2,118,10.0,10.0,10.0,10.0,10.0,10.0,0,0,1,1,1,7900.0,6361.269989
7,5222,0,1,1,2,40.72764,-73.97949,1,0,2,1.0,1.0,1.0,4,50000.0,7500.0,2,1500.0,30,94,9.0,9.0,10.0,10.0,10.0,9.0,0,3,0,0,1,5800.0,6369.442641


In [177]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 1)
print(X_train.shape)
print(X_test.shape)
print(y_test.shape)

(26361, 33)
(11298, 33)
(11298,)


In [178]:
lin_reg = LinearRegression()
model = lin_reg.fit(X_train,y_train)
print(f'R^2 score for train: {lin_reg.score(X_train, y_train)}')
print(f'R^2 score for test: {lin_reg.score(X_test, y_test)}')

R^2 score for train: 0.6987829570744384
R^2 score for test: 0.7899394341859203


In [179]:
X.columns

Index(['id', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights', 'number_of_reviews', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'cancellation_policy', 'require_guest_profile_picture', 'require_guest_phone_verification', 'calculated_host_listings_count', 'price_per_person', 'distance'], dtype='object')

In [180]:
import warnings 
warnings.filterwarnings('ignore')
import statsmodels.api as sm
X=df_sel.drop(['price','id'],1)
y = df_sel['price']
X_constant = sm.add_constant(X)
lin_reg = sm.OLS(y,X_constant).fit()
predictions = lin_reg.predict(X_constant)
lin_reg.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.732
Model:,OLS,Adj. R-squared:,0.732
Method:,Least Squares,F-statistic:,3213.0
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,0.0
Time:,11:14:50,Log-Likelihood:,-401170.0
No. Observations:,37659,AIC:,802400.0
Df Residuals:,37626,BIC:,802700.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.026e+10,8.35e+08,12.282,0.000,8.62e+09,1.19e+10
host_is_superhost,314.5517,136.069,2.312,0.021,47.853,581.251
host_has_profile_pic,1309.5827,1254.441,1.044,0.297,-1149.156,3768.322
host_identity_verified,66.2476,110.712,0.598,0.550,-150.751,283.246
neighbourhood_group_cleansed,-237.5717,90.005,-2.640,0.008,-413.984,-61.160
latitude,-1.232e+08,1e+07,-12.284,0.000,-1.43e+08,-1.04e+08
longitude,-2.912e+07,2.37e+06,-12.293,0.000,-3.38e+07,-2.45e+07
property_type,44.8329,6.257,7.165,0.000,32.568,57.098
room_type,-609.5209,65.653,-9.284,0.000,-738.203,-480.839

0,1,2,3
Omnibus:,73157.883,Durbin-Watson:,1.872
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2862431836.224
Skew:,14.192,Prob(JB):,0.0
Kurtosis:,1353.339,Cond. No.,733000000000.0


In [181]:
lin_reg = LinearRegression()

In [182]:
rfe = RFE(lin_reg, 5)

In [183]:
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  
#Fitting the data to model
lin_reg.fit(X_rfe,y)
print(rfe.support_)
print(rfe.ranking_)

[False False False False  True  True False False False  True  True False
 False False False False False False False False False False False False
 False False False False False False False  True]
[16 14 15 11  1  1 20  2  4  1  1  8 12 28 26 18 27 24 22 21  6  7 17  9
 13 10 19  3  5 23 25  1]


In [184]:
nof_list=np.arange(1,32)            
high_score=0
#Variable to store the optimum features
nof=0           
score_list =[]
for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)
    lin_reg = LinearRegression()
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    lin_reg.fit(X_train_rfe,y_train)
    score = lin_reg.score(X_test_rfe,y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

Optimum number of features: 31
Score with 31 features: 0.580514


In [185]:
cols = list(X.columns)
lin_reg = LinearRegression()
#Initializing RFE model
rfe = RFE(lin_reg, 20)             
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  
#Fitting the data to model
lin_reg.fit(X_rfe,y)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

Index(['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_location', 'review_scores_value', 'instant_bookable', 'require_guest_profile_picture', 'require_guest_phone_verification', 'distance'], dtype='object')


In [186]:
X = df_sel[['host_identity_verified', 'neighbourhood_group_cleansed', 'property_type', 'room_type', 
            'accommodates', 'bathrooms', 'bedrooms', 'beds', 'security_deposit', 'cleaning_fee', 'guests_included',
            'extra_people', 'minimum_nights', 'number_of_reviews', 'review_scores_cleanliness', 'cancellation_policy', 
            'require_guest_profile_picture', 'require_guest_phone_verification', 'calculated_host_listings_count',
            'price_per_person']]
y = df_sel.price
X_constant = sm.add_constant(X)
lin_reg = sm.OLS(y, X_constant).fit()
predictions = lin_reg.predict(X_constant)
lin_reg.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.728
Model:,OLS,Adj. R-squared:,0.728
Method:,Least Squares,F-statistic:,5032.0
Date:,"Tue, 19 Nov 2019",Prob (F-statistic):,0.0
Time:,11:15:08,Log-Likelihood:,-401470.0
No. Observations:,37659,AIC:,803000.0
Df Residuals:,37638,BIC:,803200.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.377e+04,526.260,-26.174,0.000,-1.48e+04,-1.27e+04
host_identity_verified,172.9089,109.214,1.583,0.113,-41.153,386.970
neighbourhood_group_cleansed,433.2459,71.323,6.074,0.000,293.452,573.040
property_type,17.0697,6.116,2.791,0.005,5.081,29.058
room_type,-777.1639,65.043,-11.948,0.000,-904.650,-649.678
accommodates,3662.2054,52.857,69.286,0.000,3558.605,3765.806
bathrooms,4146.4633,144.858,28.624,0.000,3862.538,4430.389
bedrooms,1031.7784,106.187,9.717,0.000,823.649,1239.908
beds,-946.2146,75.895,-12.467,0.000,-1094.971,-797.458

0,1,2,3
Omnibus:,70556.871,Durbin-Watson:,1.869
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2815879389.207
Skew:,12.992,Prob(JB):,0.0
Kurtosis:,1342.358,Cond. No.,631000.0


In [187]:
from sklearn.linear_model import Ridge
ridgeReg = Ridge(alpha=1, normalize=True)
ridgeReg.fit(X_train,y_train)
pred = ridgeReg.predict(X_test)

In [188]:
ridgeReg.score(X_test,y_test)

0.7589294931193998

In [189]:
ridgeReg.score(X_train,y_train)

0.5798112807698947

In [190]:
from sklearn.linear_model import Lasso

lassoReg = Lasso(alpha=18, normalize=True)
lassoReg.fit(X_train,y_train)
pred = lassoReg.predict(X_test)
lassoReg.score(X_test,y_test)


0.7220962380181146

In [191]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.gridspec as gridspec
from sklearn.model_selection import cross_val_score, train_test_split
import itertools
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [192]:
# Create the parameter grid 
param_grid = {
    'max_depth':range(5,10,5),
    'min_samples_leaf': range(50, 150, 50),
    'min_samples_split': range(50, 150, 50),
    'criterion': ["mse", "mae"]
}

n_folds = 5

# Instantiate the grid search model
dtree = DecisionTreeRegressor()
grid_search = GridSearchCV(estimator = dtree, param_grid = param_grid, 
                          cv = n_folds, verbose = 1)

# Fit the grid search to the data
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 18.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': range(5, 10, 5), 'min_samples_leaf': range(50, 150, 50), 'min_samples_split': range(50, 150, 50), 'criterion': ['mse', 'mae']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [193]:
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)

best accuracy 0.5224712236399615
DecisionTreeRegressor(criterion='mae', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=50,
           min_samples_split=50, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')


In [196]:
tr=grid_search.best_estimator_
tr

DecisionTreeRegressor(criterion='mae', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=50,
           min_samples_split=50, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [197]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90,],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator =rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:  2.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [80, 90], 'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [198]:
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)

best accuracy 0.5814242819249974
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=90,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=3,
           min_samples_split=8, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)


In [201]:
rft=grid_search.best_estimator_
rft

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=90,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=3,
           min_samples_split=8, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [202]:
clf1 = DecisionTreeRegressor(max_depth=1)
clf2 = LinearRegression()
clf3 = Ridge()
clf4 = Lasso()
bagging1 = BaggingRegressor(base_estimator=clf1, n_estimators=10, max_samples=0.8, max_features=0.8)
bagging2 = BaggingRegressor(base_estimator=clf2, n_estimators=10, max_samples=0.8, max_features=0.8)
bagging3 = BaggingRegressor(base_estimator=clf3, n_estimators=10, max_samples=0.8, max_features=0.8)
bagging4 = BaggingRegressor(base_estimator=clf4, n_estimators=10, max_samples=0.8, max_features=0.8)
label = ['Decision Tree','Bagging Tree','Linear','bagg_lr','Ridge','bagg_ridge','Lasso','bagg_lasso']
clf_list = [clf1,bagging1,clf2,bagging2,clf3,bagging3,clf4,bagging4]
grid = itertools.product([0,1],repeat=4)

for clf, label, grd in zip(clf_list, label, grid):        
    scores =cross_val_score(clf,X_train,y_train, cv=10)
    print ("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
        
    clf.fit(X_train, y_train)

Accuracy: 0.10 (+/- 0.18) [Decision Tree]
Accuracy: 0.24 (+/- 0.15) [Bagging Tree]
Accuracy: 0.66 (+/- 0.21) [Linear]
Accuracy: 0.66 (+/- 0.10) [bagg_lr]
Accuracy: 0.66 (+/- 0.21) [Ridge]
Accuracy: 0.70 (+/- 0.09) [bagg_ridge]
Accuracy: 0.66 (+/- 0.22) [Lasso]
Accuracy: 0.67 (+/- 0.12) [bagg_lasso]


In [205]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.gridspec as gridspec
from sklearn.model_selection import cross_val_score, train_test_split
import itertools
from sklearn.linear_model import LinearRegression

In [206]:
import xgboost
from sklearn.metrics import explained_variance_score

In [207]:
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

In [208]:
xgb.fit(X_train,y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.08, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=0.75, verbosity=1)

In [209]:
predictions = xgb.predict(X_test)
print(explained_variance_score(y_test,predictions))

0.9244371560541532


In [210]:
accuracy = explained_variance_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 92.44%


In [211]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
bag_tree = BaggingRegressor(DecisionTreeRegressor(),
                   max_features=0.8, n_estimators=200, 
                            random_state=0)
dtree= DecisionTreeRegressor()

In [212]:
bag_tree.fit(X_train, y_train)
bag_tree.score(X_test, y_test)

0.8473876996441736

In [213]:
from sklearn.ensemble import AdaBoostRegressor

In [214]:
ada_clf=AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), learning_rate=1.0, loss='linear',
        n_estimators=100, random_state=0)
ada_clf.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=100,
         random_state=0)

In [215]:
ada_clf.score(X_test, y_test)

0.8728494811770628

In [216]:
bag_tree = BaggingRegressor(RandomForestRegressor(),
                   max_features=0.8, n_estimators=200, 
                            random_state=0)
rf= RandomForestRegressor()

In [217]:
bag_tree.fit(X_train, y_train)
bag_tree.score(X_test, y_test)

0.8170396536850479

In [218]:
bag_tree.score(X_train, y_train)

0.9336981886340103

In [219]:
from sklearn.ensemble import AdaBoostRegressor

In [220]:
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 =AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=10)

In [221]:

num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=10)']

In [222]:
print(X.shape)
print(y.shape)

(37659, 20)
(37659,)


In [226]:
clf1 = DecisionTreeRegressor(max_depth=1)
clf2 = LinearRegression()
clf3 = Ridge()
clf4 = Lasso()
boster1 = AdaBoostRegressor(base_estimator=clf1, n_estimators=10)
boster2 = AdaBoostRegressor(base_estimator=clf2, n_estimators=10)
boster3 = AdaBoostRegressor(base_estimator=clf3, n_estimators=10)
boster4 = AdaBoostRegressor(base_estimator=clf4, n_estimators=10)
label = ['Decision Tree','Bos_Tree','Linear','bos_lr','Ridge','bos_ridge','Lasso','bos_lasso']
clf_list = [clf1,boster1,clf2,boster2,clf3,boster3,clf4,boster4]
grid = itertools.product([0,1],repeat=4)

for clf, label, grd in zip(clf_list, label, grid):        
    scores =cross_val_score(clf,X_train,y_train, cv=10)
    print ("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
        
    clf.fit(X_train, y_train)
    

Accuracy: 0.10 (+/- 0.18) [Decision Tree]
Accuracy: 0.13 (+/- 0.34) [Bos_Tree]
Accuracy: 0.66 (+/- 0.21) [Linear]
Accuracy: -1.30 (+/- 1.51) [bos_lr]
Accuracy: 0.66 (+/- 0.21) [Ridge]
Accuracy: -0.82 (+/- 1.10) [bos_ridge]
Accuracy: 0.66 (+/- 0.22) [Lasso]
Accuracy: -1.45 (+/- 1.36) [bos_lasso]
