data preprocessing

In [84]:
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime

In [2]:
# read in data
df_orig = pd.read_csv('data/2019aug/dlistings.csv', low_memory=False)

In [3]:
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48864 entries, 0 to 48863
Columns: 106 entries, id to reviews_per_month
dtypes: float64(22), int64(21), object(63)
memory usage: 39.5+ MB


In [4]:
# check for null values
n_nulls = defaultdict()
for col in df_orig.columns:
    n_nulls[col] = sum(pd.isnull(df_orig[col]).values)

In [8]:
# use this line to see all the columns
cols = df_orig.columns.tolist()

In [37]:
# remove unwanted columns
to_rm = ['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'thumbnail_url','experiences_offered', 
        'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url','host_name',
        'host_location','host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
        'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 
         'availability_90', 'availability_365', 'calendar_last_scraped','requires_license', 'license', 
         'calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms', 
         'calculated_host_listings_count_shared_rooms']

In [38]:
df_eda = df_orig.drop(to_rm, axis=1)

In [40]:
df_eda.columns

Index(['summary', 'space', 'description', 'neighborhood_overview', 'notes',
       'transit', 'access', 'interaction', 'house_rules', 'host_since',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet',
       'price', 'weekly_price', 'monthly_price', 'security_deposit',
       'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       

In [35]:
np.unique(df_orig['requires_license'], return_counts=True)

(array(['f', 't'], dtype=object), array([48863,     1]))

In [41]:
# test textblob functionality
from textblob import TextBlob

In [54]:
blob = TextBlob(df_eda['summary'][0])

In [55]:
blob.sentiment

Sentiment(polarity=0.2826530612244898, subjectivity=0.5806122448979592)

In [57]:
df_eda['summary'].fillna(value='', axis=0, inplace=True)

In [59]:
# string cols
str_cols = ['summary', 'space', 'description', 'neighborhood_overview', 'notes', 
            'transit', 'access', 'interaction', 'house_rules']

In [64]:
# modify all str_cols to use textblob
for col in str_cols:
    df_eda[col].fillna(value='', axis=0, inplace=True)

In [65]:
df_eda['summary_p'] = df_eda['summary'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [67]:
df_eda['summary_s'] = df_eda['summary'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [74]:
# get polarity and subjectivity from str_cols
for string in str_cols:
    df_eda[add_p(string)] = df_eda[string].apply(lambda x: TextBlob(x).sentiment.polarity)
    df_eda[add_s(string)] = df_eda[string].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [73]:
# help naming
def add_p(string):
    return string + '_p'

def add_s(string):
    return string + '_s'

In [76]:
df_eda['access_p']

0        0.350000
1        0.000000
2        0.221875
3        0.000000
4        0.000000
           ...   
48859    0.000000
48860    0.000000
48861    0.083333
48862   -0.062500
48863    0.083333
Name: access_p, Length: 48864, dtype: float64

In [79]:
df_eda.drop(str_cols, axis=1, inplace=True)

In [167]:
df_eda[df_eda['host_since'].isna()]

Unnamed: 0,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,...,transit_p,transit_s,access_p,access_s,interaction_p,interaction_s,house_rules_p,house_rules_s,host_about_p,host_about_s


In [95]:
# drop corrupted rows
df_eda.drop(df_eda[df_eda['host_since'].isna()].index, axis=0, inplace=True)

In [98]:
# turn host_since to datetime object
df_eda['host_since'] = df_eda['host_since'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

In [99]:
df_eda['host_since'][0]

Timestamp('2008-09-09 00:00:00')

In [101]:
df_eda['host_about'].fillna('', inplace=True)

In [104]:
df_eda['host_about_p'] = df_eda['host_about'].apply(lambda x: TextBlob(x).sentiment.polarity)
df_eda['host_about_s'] = df_eda['host_about'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [109]:
df_eda.drop('host_about', axis=1, inplace=True)

In [359]:
# use this line to drop cols
df_eda.drop(['is_business_travel_ready'], axis=1, inplace=True)

In [315]:
df_eda

Unnamed: 0,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,...,transit_p,transit_s,access_p,access_s,interaction_p,interaction_s,house_rules_p,house_rules_s,host_about_p,host_about_s
0,2008-09-09,within a few hours,0.90,0,5.0,5.0,"['email', 'phone', 'reviews', 'kba', 'work_ema...",1,1,Manhattan,...,-0.077778,0.372222,0.350000,0.550000,0.370455,0.497727,0.000000,0.000000,0.445202,0.664899
1,2008-11-25,within a day,1.00,0,1.0,1.0,"['email', 'phone', 'google', 'reviews', 'jumio...",1,1,Manhattan,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.057792,0.488312,0.133939,0.396667
2,2008-12-07,within an hour,0.90,0,1.0,1.0,"['email', 'phone', 'reviews', 'kba']",1,1,Brooklyn,...,-0.040000,0.245000,0.221875,0.525000,0.216667,0.344444,0.125000,0.191667,0.000000,0.000000
3,2009-01-29,0,0.00,0,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",1,1,Manhattan,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.193056,0.633333,0.250000,0.300000
4,2009-02-02,within a few hours,0.90,0,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",1,0,Manhattan,...,0.151190,0.430952,0.000000,0.000000,0.075000,0.325000,0.500000,0.700000,0.468182,0.677273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48859,2019-08-05,0,0.00,0,0.0,0.0,"['email', 'phone']",1,0,Queens,...,0.000000,0.000000,0.000000,0.000000,0.375000,0.375000,0.000000,0.000000,0.000000,0.000000
48860,2010-11-26,0,0.00,0,0.0,0.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",1,1,Manhattan,...,0.200000,0.433333,0.000000,0.000000,0.347273,0.510000,0.000000,0.666667,0.285795,0.687500
48861,2017-01-26,within a few hours,0.96,0,1.0,1.0,"['email', 'phone', 'google', 'reviews', 'jumio...",1,0,Manhattan,...,0.223485,0.443182,0.083333,0.416667,0.275000,0.712500,0.142500,0.505556,0.000000,0.000000
48862,2013-08-04,0,0.00,0,0.0,0.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",1,1,Manhattan,...,1.000000,1.000000,-0.062500,0.687500,0.800000,1.000000,0.133333,0.577778,0.000000,0.000000


In [153]:
# turn 0 values in host_response_time back to '' or nan
# df_eda['host_response_time'].apply(lambda x: '' if x == 0)

In [155]:
# turn str to numerical
df_eda['host_response_rate'] = df_eda['host_response_rate'].apply(lambda x: int(x.replace('%', ''))/100 if type(x) != float else 0.0)

In [165]:
df_eda.drop('host_acceptance_rate', axis=1, inplace=True)

In [184]:
# turn cols with bool values to 0 n 1
def is_bool(df):
    for col in df.columns:
        if len(set(df_eda[col].values)) == 2:
            df_eda[col] = df_eda[col].apply(lambda x: 1 if x == 't' else 0)

In [185]:
is_bool(df_eda)

In [170]:
df_eda['host_has_profile_pic'][0]

't'

In [210]:
# fill na with 0
df_eda['bathrooms'].fillna(0, inplace=True)
df_eda['bedrooms'].fillna(0, inplace=True)
df_eda['beds'].fillna(0, inplace=True)
df_eda['security_deposit'].fillna(0, inplace=True)
df_eda['cleaning_fee'].fillna(0, inplace=True)
df_eda['reviews_per_month'].fillna(0, inplace=True)

In [273]:
# change some other cols to bool
to_bool = ['square_feet', 'weekly_price', 'monthly_price']

for col in to_bool:
    df_eda[col].fillna(0, inplace=True)
    df_eda[col] = df_eda[col].apply(lambda x: 1 if x != 0 else 0)

In [276]:
rename_cols = {'square_feet': 'show_sqft', 'weekly_price': 'offer_weekly', 'monthly_price': 'offer_monthly'}

In [279]:
df_eda.rename(columns=rename_cols, inplace=True)

In [294]:
# fill all nan values in review scores with mean
# should consider adding some kind of noise or randomness
# current rationale: some customer might just dont like leaving reviews if there isn't anything in their experience with the listing to report

review_scores = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 
                 'review_scores_value']

for col in review_scores:
    df_eda[col].fillna(np.mean(df_eda[df_eda[col].notna()][col].values), inplace=True)

In [293]:
df_eda['review_scores_rating'].describe()

count    37746.000000
mean        93.930430
std          8.631339
min         20.000000
25%         92.000000
50%         96.000000
75%        100.000000
max        100.000000
Name: review_scores_rating, dtype: float64

In [770]:
# check for any nulls values left
n_nulls = defaultdict()
for col in df_eda.columns:
    n_nulls[col] = sum(pd.isnull(df_eda[col]).values)
    
n_nulls

defaultdict(None,
            {'host_since': 0,
             'host_response_rate': 0,
             'host_is_superhost': 0,
             'host_listings_count': 0,
             'host_total_listings_count': 0,
             'host_has_profile_pic': 0,
             'host_identity_verified': 0,
             'accommodates': 0,
             'bathrooms': 0,
             'bedrooms': 0,
             'beds': 0,
             'show_sqft': 0,
             'offer_weekly': 0,
             'offer_monthly': 0,
             'security_deposit': 0,
             'cleaning_fee': 0,
             'guests_included': 0,
             'extra_people': 0,
             'minimum_nights': 0,
             'maximum_nights': 0,
             'minimum_minimum_nights': 0,
             'maximum_minimum_nights': 0,
             'minimum_maximum_nights': 0,
             'maximum_maximum_nights': 0,
             'minimum_nights_avg_ntm': 0,
             'maximum_nights_avg_ntm': 0,
             'number_of_reviews': 0,
            

In [482]:
# get all cols with dtype Object
obj_cols = df_eda.dtypes.values == 'O'

df_eda.columns[obj_cols]

Index(['host_verifications', 'amenities'], dtype='object')

In [443]:
get_dummies = ['host_response_time', 'neighbourhood_group_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy']
dummy_prefix = ['response_time', 'neighbor_group', 'property_type', 'room_type', 'bed_type', 'cancellation']

df_eda[get_dummies]

Unnamed: 0,host_response_time,neighbourhood_group_cleansed,property_type,room_type,bed_type,cancellation_policy
0,within a few hours,Manhattan,Apartment,Entire home/apt,Real Bed,strict_14_with_grace_period
1,within a day,Manhattan,Apartment,Private room,Pull-out Sofa,strict_14_with_grace_period
2,within an hour,Brooklyn,Guest suite,Entire home/apt,Real Bed,moderate
3,,Manhattan,Apartment,Entire home/apt,Real Bed,strict_14_with_grace_period
4,within a few hours,Manhattan,Apartment,Entire home/apt,Real Bed,strict_14_with_grace_period
...,...,...,...,...,...,...
48859,,Queens,Apartment,Shared room,Real Bed,flexible
48860,,Manhattan,Apartment,Private room,Real Bed,flexible
48861,within a few hours,Manhattan,Apartment,Entire home/apt,Real Bed,strict_14_with_grace_period
48862,,Manhattan,Apartment,Private room,Real Bed,strict_14_with_grace_period


In [370]:
for col in get_dummies:
    types = set(df_eda[col].values)
    print(len(types), '    ', types, '\n')

5      {'a few days or more', 'N/A', 'within an hour', 'within a few hours', 'within a day'} 

5      {'Staten Island', 'Bronx', 'Brooklyn', 'Manhattan', 'Queens'} 

37      {'Nature lodge', 'Guesthouse', 'Apartment', 'House', 'Casa particular (Cuba)', 'Aparthotel', 'Bungalow', 'Boat', 'Resort', 'Boutique hotel', 'Houseboat', 'Hotel', 'Dome house', 'Earth house', 'Bus', 'Cottage', 'Tent', 'Condominium', 'Townhouse', 'Lighthouse', 'Barn', 'Timeshare', 'Loft', 'Tiny house', 'Castle', 'Serviced apartment', 'Cabin', 'Treehouse', 'Other', 'Cave', 'Farm stay', 'Yurt', 'Villa', 'Hostel', 'Guest suite', 'Bed and breakfast', 'Camper/RV'} 

3      {'Private room', 'Shared room', 'Entire home/apt'} 

5      {'Futon', 'Real Bed', 'Airbed', 'Couch', 'Pull-out Sofa'} 

6      {'strict_14_with_grace_period', 'super_strict_30', 'moderate', 'strict', 'flexible', 'super_strict_60'} 



In [452]:
# get dummies
for i, col in enumerate(get_dummies):
    df_eda = pd.concat([df_eda, pd.get_dummies(df_eda[col], drop_first=True, prefix=dummy_prefix[i], prefix_sep=' : ')], axis=1)

In [476]:
df_eda.drop(df_eda[col_list[69:124]], axis=1, inplace=True)

In [481]:
df_eda.drop(get_dummies, axis=1, inplace=True)

In [369]:
df_eda['host_response_time'].replace(0, 'N/A', inplace=True)

In [423]:
# convert string to int for currency
to_num = ['price', 'security_deposit', 'cleaning_fee', 'extra_people']

for col in to_num:
    df_eda[col] = df_eda[col].apply(lambda x: str_to_float(x))

In [484]:
col_list = list(df_eda.columns)

In [422]:
money_test = '$2,250.00'

def str_to_float(x):
    if type(x) == int or type(x) == float:
        return float(x)
    x = x.replace('$', '')
    x = x.replace(',', '')
    return float(x)
str_to_float(money_test)

2250.0

In [715]:
# now we deal with these two
unpack = ['host_verifications', 'amenities']

In [716]:
df_eda[unpack]

Unnamed: 0,host_verifications,amenities
0,"{phone, work_email, email, reviews, kba}","{Ethernet connection, Fire extinguisher, Hair ..."
1,"{google, jumio, email, government_id, reviews,...","{Buzzer/wireless intercom, Kitchen, Smoke dete..."
2,"{reviews, kba, email, phone}","{Fire extinguisher, Hair dryer, Lock on bedroo..."
3,"{phone, facebook, email, reviews, kba}","{Buzzer/wireless intercom, Kitchen, Smoke dete..."
4,"{jumio, email, government_id, reviews, phone}","{Buzzer/wireless intercom, Fire extinguisher, ..."
...,...,...
48859,"{email, phone}","{Kitchen, Smoke detector, Essentials, Pets all..."
48860,"{jumio, email, government_id, reviews, phone}","{Kitchen, Smoke detector, Essentials, Private ..."
48861,"{google, offline_government_id, jumio, email, ...","{Crib, Ethernet connection, Fire extinguisher,..."
48862,"{jumio, work_email, email, government_id, revi...","{Kitchen, Smoke detector, Essentials, Laptop f..."


In [497]:
import ast

In [506]:
txt_to_lst_test = df_eda['host_verifications'][0]

ast.literal_eval(txt_to_lst_test)

['email', 'phone', 'reviews', 'kba', 'work_email']

In [542]:
txt_to_json_test = df_eda['amenities'][10]

In [543]:
def extract_lst(text):
    return text.replace('"', "").replace('{', "").replace('}', "").split(sep=',')

extract_lst(txt_to_json_test)

['Internet',
 'Wifi',
 'Air conditioning',
 'Kitchen',
 'Doorman',
 'Elevator',
 'Buzzer/wireless intercom',
 'Heating',
 'Smoke detector',
 'Carbon monoxide detector',
 'Fire extinguisher',
 'Essentials',
 'Shampoo',
 '24-hour check-in',
 'Hair dryer',
 'Iron',
 'Laptop friendly workspace',
 'Self check-in',
 'Lockbox',
 'Hot water',
 'Long term stays allowed']

In [573]:
# convert host verifications from str to list
df_eda['host_verifications'] = df_eda['host_verifications'].apply(set)

In [579]:
# convert amenities to list
df_eda['amenities'] = df_eda['amenities'].apply(set)

In [660]:
empty_set = {''}

In [674]:
df_eda['amenities'] = df_eda['amenities'].apply(lambda x: empty_set if x == None else x)

In [720]:
# use to get all unique items
unique_verifications = set()
for item in df_eda['host_verifications'].values:
    unique_verifications = unique_verifications | item

unique_amenities = set()
for item in df_eda['amenities'].values:
    unique_amenities = unique_amenities | item

In [721]:
print(unique_verifications, '\n')
print(unique_amenities)

{'google', 'offline_government_id', 'sent_id', 'facebook', 'manual_online', 'selfie', 'email', 'reviews', 'government_id', 'manual_offline', 'phone', 'jumio', 'sesame', 'weibo', 'work_email', 'identity_manual', 'sesame_offline', 'zhima_selfie', 'kba'} 

{'', 'Crib', 'Paid parking on premises', 'Wide entrance', 'Cat(s)', 'Long term stays allowed', 'Children’s dinnerware', 'Single level home', 'Fireplace guards', 'Heating', 'Air purifier', 'Patio or balcony', 'Self check-in', 'Bed linens', 'Internet', 'Waterfront', 'Lockbox', 'Wide clearance to shower', 'Kitchen', 'Essentials', 'Changing table', 'Pool with pool hoist', 'Cable TV', 'Baby monitor', 'Children’s books and toys', 'Family/kid friendly', 'Pack ’n Play/travel crib', 'Beachfront', 'Kitchenette', 'Room-darkening shades', 'Oven', 'Wide hallways', 'Outlet covers', 'Free parking on premises', 'Private entrance', 'Suitable for events', 'Dishes and silverware', 'Lock on bedroom door', 'Breakfast', 'Flat path to guest entrance', 'High c

In [737]:
# create dummies of host verifications
for i in unique_verifications:
    col_name = 'verification_' + i
    df_eda[col_name] = df_eda['host_verifications'].apply(lambda x: 1 if i in x else 0)

In [739]:
# create dummies of amenities
for i in unique_amenities:
    col_name = 'amenities_' + i
    df_eda[col_name] = df_eda['amenities'].apply(lambda x : 1 if i in x else 0)

In [736]:
# df_eda.drop(unique_verifications, axis=1, inplace=True)

In [711]:
# use this to fix accidental Nones
# df_eda['amenities'] = df_eda['amenities'].apply(lambda x: empty_set if x == None else x)

In [725]:
check_this = df_eda.iloc[946,5]

In [726]:
check_this

{'email', 'government_id', 'jumio', 'kba', 'phone', 'reviews'}

In [299]:
df_eda.drop(df_eda[df_eda['cancellation_policy'].isna()].index, axis=0, inplace=True)

In [227]:
df_orig[df_orig['cancellation_policy'].isna()]['host_url']

48818    https://www.airbnb.com/users/show/278187111
Name: host_url, dtype: object

In [782]:
df_eda['host_since'] = df_eda['host_since'].apply(lambda x: 2020 - x.year)

In [817]:
df_eda.to_csv('data/2019aug/cleaned_dlistings.csv', index=False)

In [742]:
y = df_eda.pop('price')

In [851]:
y.to_csv('data/2019aug/prices.csv')

In [824]:
X = df_eda

In [755]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [825]:
X1 = sm.add_constant(X)

In [826]:
olsmodel = sm.OLS(y, X1.astype(float))

olsmodel = olsmodel.fit()

In [848]:
olsmodel.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.214
Model:,OLS,Adj. R-squared:,0.209
Method:,Least Squares,F-statistic:,50.77
Date:,"Wed, 26 Aug 2020",Prob (F-statistic):,0.0
Time:,09:13:36,Log-Likelihood:,-330450.0
No. Observations:,48845,AIC:,661400.0
Df Residuals:,48584,BIC:,663700.0
Df Model:,260,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-70.0617,63.855,-1.097,0.273,-195.218,55.095
host_since,1.4502,0.624,2.326,0.020,0.228,2.672
host_response_rate,-38.2092,12.965,-2.947,0.003,-63.620,-12.799
host_is_superhost,11.5820,2.890,4.007,0.000,5.917,17.247
host_listings_count,0.0276,0.010,2.861,0.004,0.009,0.046
host_total_listings_count,0.0276,0.010,2.859,0.004,0.009,0.046
host_has_profile_pic,22.1533,21.386,1.036,0.300,-19.763,64.069
host_identity_verified,4.9262,2.951,1.669,0.095,-0.857,10.710
accommodates,28.0105,0.964,29.058,0.000,26.121,29.900

0,1,2,3
Omnibus:,116045.953,Durbin-Watson:,1.936
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1561917044.991
Skew:,24.193,Prob(JB):,0.0
Kurtosis:,877.704,Cond. No.,5.7e+17


In [832]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, 
from sklearn import tree

In [830]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [834]:
reg = LinearRegression()
reg.fit(X_train, y_train)

mse_reg = mean_squared_error(y_test,reg.predict(X_test))
mse_reg
# lol

35982.205288017365

In [835]:
r2_reg = r2_score(y_test, reg.predict(X_test))
r2_reg

0.24445967702555438

In [838]:
params = {'n_estimators': 1000, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}

gbt = GradientBoostingRegressor(**params)
gbt.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=4, n_estimators=1000)

In [845]:
gbt.feature_importances_

array([6.01349550e-03, 2.76721758e-03, 8.66749982e-05, 9.22484777e-04,
       5.64651115e-04, 4.51113533e-06, 4.45269235e-05, 1.18043071e-01,
       6.70406630e-02, 4.41174900e-02, 1.41721486e-02, 4.58860981e-07,
       6.00038313e-06, 6.31449946e-05, 1.44708054e-02, 6.73177544e-02,
       5.90803072e-04, 1.43739628e-02, 3.42737447e-02, 6.81155163e-03,
       5.74203461e-02, 1.42970322e-02, 7.72487783e-03, 7.07444621e-03,
       9.98960162e-03, 5.24428070e-03, 2.98141970e-03, 3.00728837e-04,
       3.29188875e-02, 1.81793292e-04, 3.53115872e-03, 5.67200266e-03,
       1.43620289e-05, 3.08042776e-03, 8.37664485e-04, 6.68314877e-04,
       1.60018512e-03, 1.44282420e-03, 2.46025244e-03, 1.48438688e-02,
       1.54452738e-02, 9.65611247e-03, 1.00070036e-02, 1.14422803e-02,
       4.81469863e-02, 5.20387732e-03, 8.64824363e-03, 8.50419722e-04,
       1.33047233e-04, 4.79559689e-05, 6.99650656e-04, 1.97807923e-03,
       3.58585066e-02, 7.19185460e-04, 3.06851997e-03, 5.95189052e-04,
      

In [840]:
mse_gbt = mean_squared_error(y_test, gbt.predict(X_test))
mse_gbt

33379.71408592212

In [852]:
r2_gbt = r2_score(y_train, gbt.predict(X_train))
r2_gbt

0.6171124905662771