In [5]:
import re
import numpy as np
import pandas as pd
from requests_html import HTMLSession

import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import IPython.display
from IPython.display import display, clear_output

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

import matplotlib.pyplot as plt

# Address of the Inside Airbnb "get the data" page.
# NOTE(review): the functions visible in this notebook do not read this constant;
# get_cities and get_city_data each hard-code the same address as their default.
URL = "http://insideairbnb.com/get-the-data"

In [98]:
def get_cities(URL="http://insideairbnb.com/get-the-data"):
    '''Scrape the data page and return the sorted list of city slugs it links to.

    Parameters
    ----------
    URL : str
        Page whose links are inspected.

    Returns
    -------
    list of str
        Unique city slugs in alphabetical order. The 'data-requests' entry is
        never included.

    Notes
    -----
    Performs a network request on every call.
    '''
    session = HTMLSession()
    try:
        links = session.get(URL).html.links
    finally:
        # Release the session even when the request or the parsing fails.
        session.close()

    excluded_words = ('http', 'about', 'explore')
    cities = set()  # a set, so "/city" and "/city/" do not yield duplicates
    for link in links:
        if len(link) <= 1 or any(word in link for word in excluded_words):
            continue
        segments = link.split('/')
        # The slug is read from the second '/'-separated segment; links without
        # one (or with an empty one) are skipped instead of raising IndexError.
        if len(segments) > 1 and segments[1]:
            cities.add(segments[1])

    # discard() rather than remove(): the entry may legitimately be absent.
    cities.discard('data-requests')

    return sorted(cities)

def get_city_data(city, URL="http://insideairbnb.com/get-the-data"):
    '''Download the 'listings.csv.gz' file of one city as a DataFrame.

    Parameters
    ----------
    city : str
        City slug to look for in the download links (e.g. a value returned by
        get_cities).
    URL : str
        Page whose links are searched.

    Returns
    -------
    pandas.DataFrame
        The listings table read from the selected link.

    Raises
    ------
    IndexError
        If no 'listings.csv.gz' link mentions `city`.

    Notes
    -----
    Performs a network request for the page and another for the CSV.
    '''
    session = HTMLSession()
    try:
        links = session.get(URL).html.links
    finally:
        # Release the session even when the request or the parsing fails.
        session.close()

    candidates = [link for link in links
                  if city in link and 'listings.csv.gz' in link]
    # Prefer links where the slug is a whole path segment, so that e.g. "york"
    # does not select a "new-york-city" file; fall back to substring matches.
    exact = [link for link in candidates if f'/{city}/' in link]
    matches = exact or candidates
    if not matches:
        raise IndexError(
            f"no 'listings.csv.gz' link found for city {city!r} at {URL}")

    # The links come without a defined order, so taking "the first" would be
    # arbitrary. max() makes the choice deterministic.
    # NOTE(review): presumably the URLs embed an ISO date, which would make this
    # the most recent snapshot - confirm against the page.
    data_url = max(matches)

    return pd.read_csv(data_url)

In [119]:
def _split_list_field(value):
    '''Split a stringified list such as "['email', 'phone']" into its raw items.

    Missing (non-string) values and empty lists ("[]") give an empty list, so
    the item counts derived from the result are 0 rather than 1.
    '''
    if not isinstance(value, str):
        return []
    inner = value.strip('][')
    return inner.split(', ') if inner.strip() else []


def _bathroom_count(text):
    '''Return the leading number of a bathrooms_text value, or 0.5 without one.'''
    token = str(text).split(" ")[0]
    return float(token) if token[:1].isdigit() else 0.5


def clean(df):
    '''Return a cleaned, feature-enriched copy of a raw listings DataFrame.

    The input frame is not modified. Steps, in order:

    * drop unnecessary/redundant columns (those that are absent are skipped);
    * turn literal "N/A" strings into NaN;
    * fill missing free-text fields with '' and strip HTML tags from
      'description' and 'host_about';
    * parse the date columns;
    * split 'host_verifications' / 'amenities' into lists and count their items;
    * parse 'price' into a float and derive 'min_price' / 'max_price';
    * convert the 't'/'f' flag columns to booleans;
    * fill missing review scores and host rates with the column mean;
    * add whitespace-separated word counts of the free-text fields;
    * encode 'host_response_time' as 0-3 and fill gaps with the most common code;
    * derive 'bathrooms' and 'is_shared_bath' from 'bathrooms_text' if present;
    * one-hot encode 'room_type' into 'encoded_room_type_<i>' float columns
      (categories in sorted order);
    * drop every row that still contains a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings table.

    Returns
    -------
    pandas.DataFrame
        The cleaned table; it keeps the index labels of the rows that survive.
    '''
    to_drop = ['listing_url', 'scrape_id', 'last_scraped', 'source', 'host_id', 'host_url',
               'host_name', 'host_thumbnail_url', 'host_picture_url',
               'neighbourhood_group_cleansed', 'bathrooms', 'minimum_minimum_nights',
               'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
               'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated',
               'host_neighbourhood', 'neighbourhood', 'license']
    # errors='ignore' so a frame lacking some of these columns does not raise
    # KeyError. drop() returns a new frame, so the caller's data stays untouched.
    df = df.drop(columns=to_drop, errors='ignore')

    # Done first (and assigned back) so that "N/A" placeholders take part in the
    # imputation below instead of breaking the float conversions.
    df = df.replace("N/A", np.nan)

    # Fill the text fields before using .str, which fails on an all-NaN column.
    for col in ['neighborhood_overview', 'host_about', 'description']:
        df[col] = df[col].fillna('')
    html_tag = r'<[^<>]*>'
    df['description'] = df['description'].str.replace(html_tag, '', regex=True)
    df['host_about'] = (df['host_about']
                        .str.replace('\r\n', '', regex=False)
                        .str.replace(html_tag, '', regex=True))

    for col in ['host_since', 'calendar_last_scraped', 'first_review', 'last_review']:
        df[col] = pd.to_datetime(df[col])

    df['host_verifications'] = df['host_verifications'].apply(_split_list_field)
    df['num_verifications'] = df['host_verifications'].apply(len)
    df['amenities'] = df['amenities'].apply(_split_list_field)
    df['num_amenities'] = df['amenities'].apply(len)

    df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)
    df['min_price'] = df['price'] * df['minimum_nights']
    df['max_price'] = df['price'] * df['maximum_nights']

    # Anything other than 't' (including a missing value) counts as False.
    for col in ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified']:
        df[col] = df[col].eq('t').fillna(False).astype(bool)

    imputate_scores = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                       'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                       'review_scores_value', 'reviews_per_month']
    for col in imputate_scores:
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which is chained assignment and may not update the frame.
        df[col] = df[col].fillna(df[col].mean())

    # split() without an argument: empty text has 0 words and repeated
    # whitespace is not counted as extra words.
    for col in ['neighborhood_overview', 'host_about', 'description']:
        df[col + '_wordcount'] = df[col].apply(lambda text: len(text.split()))

    # Encode only this column; unknown or missing answers become the most
    # common code. If no row has a known answer the values stay NaN.
    response_codes = {'within an hour': 0, 'within a few hours': 1,
                      'within a day': 2, 'a few days or more': 3}
    response_time = df['host_response_time'].map(response_codes)
    most_common = response_time.mode()
    if not most_common.empty:
        response_time = response_time.fillna(most_common.iloc[0])
    df['host_response_time'] = response_time

    for col in ['host_response_rate', 'host_acceptance_rate']:
        rate = df[col].replace(r'[%,]', '', regex=True).astype(float) / 100
        df[col] = rate.fillna(rate.mean())

    for col in ['has_availability', 'instant_bookable']:
        df[col] = df[col].eq('t').fillna(False).astype(bool)

    if 'bathrooms_text' in df.columns:
        df['bathrooms'] = df['bathrooms_text'].apply(_bathroom_count)
        df['is_shared_bath'] = df['bathrooms_text'].apply(lambda text: "hared" in str(text))

    # get_dummies keeps df's index, so the concat below lines up row by row
    # even when the index is not the default 0..n-1 range.
    room_types = pd.get_dummies(df['room_type']).astype(float)
    room_types.columns = ['encoded_room_type_' + str(i) for i in range(room_types.shape[1])]
    df = pd.concat([df, room_types], axis=1)

    df = df.dropna()

    return df

In [121]:
def get_basic_Xy(cleaned_data):
    '''Split a cleaned listings frame into a feature table X and the target y.

    Identifier, free-text and list-valued columns are removed, and datetime
    columns are converted to proleptic Gregorian ordinals (integers). The
    input frame is not modified.

    Parameters
    ----------
    cleaned_data : pandas.DataFrame
        Frame containing a 'price' column.

    Returns
    -------
    X : pandas.DataFrame
        Every remaining column except 'price'.
        NOTE(review): columns computed from the price upstream (if any) are
        not removed here and would leak the target - confirm with the caller.
    y : pandas.Series
        The 'price' column.
    '''
    non_features = ['id', 'name', 'description', 'neighborhood_overview',
                    'picture_url', 'host_location', 'host_about',
                    'host_verifications', 'property_type', 'room_type',
                    'bathrooms_text', 'amenities']
    # errors='ignore': some of these columns (e.g. 'bathrooms_text') may be
    # absent from the input; a missing one should not raise KeyError.
    df = cleaned_data.drop(columns=non_features, errors='ignore')

    for col in df.select_dtypes(include='datetime').columns.tolist():
        df[col] = df[col].apply(pd.Timestamp.toordinal)

    y = df['price']
    X = df.loc[:, df.columns != 'price']

    return X, y

def var_filter(X, num_features=38):
    '''Keep the `num_features` numeric columns of X with the largest variance.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; non-numeric columns are never selected.
    num_features : int
        Maximum number of columns to keep.

    Returns
    -------
    pandas.DataFrame
        The selected columns of X, ordered by decreasing variance.
    '''
    # numeric_only=True: without it, var() raises on non-numeric columns in
    # pandas >= 2.0 instead of skipping them.
    variances = X.var(numeric_only=True).sort_values(ascending=False)
    new_cols = variances.index[:num_features].to_list()
    return X[new_cols]

def make_poly(X, y, degree=2):
    '''Build a polynomial-regression pipeline and fit it on X and y.

    The pipeline expands X with PolynomialFeatures of the given `degree` and
    feeds the result to a LinearRegression. The value returned is whatever
    the pipeline's fit call returns.
    '''
    steps = [
        ("polynomial_features", PolynomialFeatures(degree=degree)),
        ("linear_regression", LinearRegression()),
    ]
    pipeline = Pipeline(steps)
    fitted = pipeline.fit(X, y)
    return fitted

In [136]:
# Module-level placeholder. get_loc_data below binds its own local `df` and
# never assigns to this global.
df = None
# Fetch the list of cities once (this performs a network request); it fills
# the city dropdown.
cities = get_cities()

# Controls: which city to load (nothing selected at start), which rows to
# preview, and whether to show the preprocessed features or the raw table.
dropdown_locations = widgets.Dropdown(options=cities, value=None, description="City:")
dropdown_show = widgets.Dropdown(options=['head', 'tail', 'random'], value='random', description="Display: ")
is_cleaned = widgets.Checkbox(value=True, description='Preprocessed')

def _show_preview(frame, show, rows=5):
    '''Display the first, the last or a random selection of `rows` rows of `frame`.'''
    if show == 'head':
        display(frame.head(rows))
    elif show == 'tail':
        display(frame.tail(rows))
    else:
        # sample() raises when asked for more rows than exist, so cap the request.
        display(frame.sample(min(rows, len(frame))))


def get_loc_data(city, show, cleaned):
    '''Download the listings of `city` and display a five-row preview.

    Parameters
    ----------
    city : str or None
        City to load; None only prints a hint and loads nothing.
    show : str
        'head' or 'tail' for the first/last rows; any other value shows a
        random sample.
    cleaned : bool
        If true, preview the preprocessed features together with the target
        column; otherwise preview the raw table.

    Returns
    -------
    None
        The shapes are printed and the preview is rendered with display().
    '''
    if city is None:
        print("Choose City of Interest")
        print("It may take a while to scrape data...")
        return

    listings = get_city_data(city)
    if cleaned:
        X, y = get_basic_Xy(clean(listings))
        print(f'Features: {X.shape}, Target: {y.shape}')
        _show_preview(pd.concat([X, y], axis=1), show)
    else:
        print(f'Data Shape: {listings.shape}')
        _show_preview(listings, show)

# Re-run get_loc_data whenever one of the three controls changes and capture
# what it prints/displays in an output widget.
out1 = widgets.interactive_output(get_loc_data, {'city': dropdown_locations, 'show': dropdown_show, 'cleaned':is_cleaned})
# Layout: the controls side by side in one row, with the output underneath.
display(widgets.VBox([widgets.HBox([dropdown_locations, dropdown_show, is_cleaned]), out1]))

VBox(children=(HBox(children=(Dropdown(description='City:', options=('amsterdam', 'antwerp', 'asheville', 'ath…