In [9]:
import pandas as pd
listings = pd.read_csv('listings.csv.gz')

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
def wrangle(df):

    df = df.copy()

    
    ''' Casting to new formats '''
    
    
    # As cost is in yen, drop the $ sign and convert all to numeric
    df['price'] = df['price'].str.replace('$','').str.replace(',','').astype(float)
    
    # Cast security deposit as floats. Replace the small amount of nulls with zero as zero accounts for half 
    # of all security deposits. Similarily clean the cleaning_fee column and extra_people
    df['security_deposit'] = df['security_deposit'].fillna('0').str.replace('$','').str.replace(',','').astype(float)
    df['cleaning_fee'] = df['cleaning_fee'].fillna('0').str.replace('$','').str.replace(',','').astype(float)
    df['extra_people'] = df['extra_people'].str.replace('$','').str.replace(',','').astype(float)

    # Keep this categorical feature but only those neighbourhoods with a higher frequency
    most_popular_neighbourhoods = df['neighbourhood_cleansed'].value_counts()[:23].index.tolist()
    df = df[df['neighbourhood_cleansed'].isin(most_popular_neighbourhoods)]

    # These columns are in string format but have innate hierarchy that could be useful 
    df['cancellation_policy'] = df['cancellation_policy'].map(
        {'flexible': 0, 'moderate': 1, 'strict_14_with_grace_period': 2, 'strict': 2, 'super_strict_30': 3, 
         'super_strict_60': 4})

    df['host_response_time'] = df['host_response_time'].fillna('within a day').map(
        {'within an hour': 1, 'within a few hours': 2, 'within a day': 3, 'a few days or more': 4})
    
    df['host_is_superhost'] = df['host_is_superhost'].map({'t': 1,'f': 0})

    df['instant_bookable'] = df['instant_bookable'].map({'t':1, 'f':0})
    
    
    
    ''' Fill or drop null values '''
    
    #Fill nulls
    df['host_response_rate'] = df['host_response_rate'].str.replace('%','').astype(float).fillna(100)
    df['host_identity_verified'] = df['host_identity_verified'].fillna('f').map({'t':1, 'f':0})
    
    # Drop small amount of nulls from columns
    df = df[df['review_scores_rating'].notnull()]
    df = df[df['bedrooms'].notnull()]
    df = df[df['beds'].notnull()]
    df = df[df['first_review'].notnull()]
    df = df[df['last_review'].notnull()]
    df['host_since'] = df['host_since'].dropna()
    
    
    
    ''' Engineer new features '''
    
    # Query for hosts residing in Tokyo and create new column for local (Tokyo) hosts
    tokyo_strings = ['Tokyo', 'Tōkyō-to', '東京', 'tokyo', 'TOKYO', 'Tokyo-to', 'Tokyo To', 'tokyo-to']
    df['local_host'] = df['host_location'].fillna('tokyo').str.contains('|'.join(tokyo_strings)).astype(int)
    
    # Create a new column that estimates the duration the airbnb has existed
    df['first_review'] = pd.to_datetime(df['first_review'])
    df['last_review'] = pd.to_datetime(df['last_review'])
    df['airbnb_duration'] = (df['last_review'] - df['first_review']).dt.days 

    # Create a new column estimating how long one has been a host
    df['host_since'] = pd.to_datetime(df['host_since'])
    df['host_duration'] = (df['last_review'] - df['host_since']).dt.days
    
    # Estimate next month's airbnb income based on current availability 
    # (cannot account for additional bookings or cancelling)
    df['income_estimate'] = df['price'] * (30 - df['availability_30']) 
    
    # Cleaner, useable version of amenities
    def beautiful_amenities(df):

        # strip excess characters from column, convert each row to a list
        df['amenities'] = df['amenities'].str.replace('"','').str.replace('{','').str.replace('}','').str.split(',')        

        # Create new columns for 69 most commonly found amenities (there's probably a cleaner way)
        df['hot_tub'] = [1 if 'Hot tub' in row else 0 for row in df['amenities']]
        df['wifi'] = [1 if 'Wifi' or 'Internet' in row else 0 for row in df['amenities']]
        df['ac'] = [1 if 'Air conditioning' in row else 0 for row in df['amenities']]
        df['kitchen'] = [1 if 'Kitchen' in row else 0 for row in df['amenities']]
        df['washer'] = [1 if 'Washer' in row else 0 for row in df['amenities']]
        df['tv'] = [1 if 'TV' in row else 0 for row in df['amenities']]
        df['intercom'] = [1 if 'Buzzer/wireless intercom' in row else 0 for row in df['amenities']]
        df['heating'] = [1 if 'Heating' in row else 0 for row in df['amenities']]
        df['kid_friendly'] = [1 if 'Family/kid friendly' in row else 0 for row in df['amenities']]
        df['dryer'] = [1 if 'Dryer' in row else 0 for row in df['amenities']]
        df['smoke_detector'] = [1 if 'Smoke detector' in row else 0 for row in df['amenities']]
        df['cm_detector'] = [1 if 'Carbon monoxide detector' in row else 0 for row in df['amenities']]
        df['fire_extinguisher'] = [1 if 'Fire extinguisher' in row else 0 for row in df['amenities']]
        df['essentials'] = [1 if 'Essentials' in row else 0 for row in df['amenities']]
        df['shampoo'] = [1 if 'Shampoo' in row else 0 for row in df['amenities']]
        df['hangers'] = [1 if 'Hangers' in row else 0 for row in df['amenities']]
        df['hair_dryer'] = [1 if 'Hair dryer' in row else 0 for row in df['amenities']]
        df['iron'] = [1 if 'Iron' in row else 0 for row in df['amenities']]
        df['workspace'] = [1 if 'Laptop friendly workspace' in row else 0 for row in df['amenities']]
        df['self_check_in'] = [1 if 'Self check-in' in row else 0 for row in df['amenities']]
        df['lockbox'] = [1 if 'Lockbox' in row else 0 for row in df['amenities']]
        df['dishware'] = [1 if 'Dishes and silverware' in row else 0 for row in df['amenities']]
        df['safety_card'] = [1 if 'Safety card' in row else 0 for row in df['amenities']]
        df['shades'] = [1 if 'Room-darkening shades' in row else 0 for row in df['amenities']]
        df['hot_water'] = [1 if 'Hot water' in row else 0 for row in df['amenities']]
        df['bed_linens'] = [1 if 'Bed linens' in row else 0 for row in df['amenities']]
        df['microwave'] = [1 if 'Microwave' in row else 0 for row in df['amenities']]
        df['coffee_maker'] = [1 if 'Coffee maker' in row else 0 for row in df['amenities']]
        df['refrigerator'] = [1 if 'Refrigerator' in row else 0 for row in df['amenities']]
        df['cooking_basics'] = [1 if 'Cooking basics' in row else 0 for row in df['amenities']]
        df['stove'] = [1 if 'Stove' in row else 0 for row in df['amenities']]
        df['patio_or_balcony'] = [1 if 'Patio or balcony' in row else 0 for row in df['amenities']]
        df['long_term'] = [1 if 'Long term stays allowed' in row else 0 for row in df['amenities']]
        df['no_stairs'] = [1 if 'No stairs or steps to enter' in row else 0 for row in df['amenities']]
        df['first_aid'] = [1 if 'First aid kit' in row else 0 for row in df['amenities']]
        df['fire_extinguisher'] = [1 if 'Fire extinguisher' in row else 0 for row in df['amenities']]
        df['twenty_four_hour'] = [1 if '24-hour check-in' in row else 0 for row in df['amenities']]
        df['private_entrance'] = [1 if 'Private entrance' in row else 0 for row in df['amenities']]
        df['luggage_dropoff'] = [1 if 'Luggage dropoff allowed' in row else 0 for row in df['amenities']]
        df['well_lit'] = [1 if 'Well-lit path to entrance' in row else 0 for row in df['amenities']]
        df['other'] = [1 if 'Other' in row else 0 for row in df['amenities']]
        df['paid_parking'] = [1 if 'Paid parking off premises' in row else 0 for row in df['amenities']]
        df['pets_allowed'] = [1 if 'Pets allowed' in row else 0 for row in df['amenities']]
        df['extra_pillow'] = [1 if 'Extra pillows and blankets' in row else 0 for row in df['amenities']]
        df['ethernet'] = [1 if 'Ethernet connection' in row else 0 for row in df['amenities']]
        df['wide_hallways'] = [1 if 'Wide hallways' in row else 0 for row in df['amenities']]
        df['flat_path'] = [1 if 'Flat path to guest entrance' in row else 0 for row in df['amenities']]
        df['wide_entrance'] = [1 if 'Wide entrance' in row else 0 for row in df['amenities']]
        df['extra_space'] = [1 if 'Extra space around bed' in row else 0 for row in df['amenities']]
        df['wide_clearance'] = [1 if 'Wide clearance to shower' in row else 0 for row in df['amenities']]
        df['extra_space'] = [1 if 'Extra space around bed' in row else 0 for row in df['amenities']]
        df['shower_head'] = [1 if 'Handheld shower head' in row else 0 for row in df['amenities']]
        df['bedroom_lock'] = [1 if 'Lock on bedroom door' in row else 0 for row in df['amenities']]
        df['greet'] = [1 if 'Host greets you' in row else 0 for row in df['amenities']]
        df['keypad'] = [1 if 'Key pad' in row else 0 for row in df['amenities']]
        df['crib'] = [1 if 'Pack ’n Play/travel crib' in row else 0 for row in df['amenities']]
        df['wheelchair'] = [1 if 'Wheelchair accessible' in row else 0 for row in df['amenities']]
        df['cleaning'] = [1 if 'Cleaning before checkout' in row else 0 for row in df['amenities']]
        df['kitchenette'] = [1 if 'Kitchenette' in row else 0 for row in df['amenities']]
        df['full_kitchen'] = [1 if 'Full kitchen' in row else 0 for row in df['amenities']]
        df['pocket_wifi'] = [1 if 'Pocket wifi' in row else 0 for row in df['amenities']]
        df['events'] = [1 if 'Suitable for events' in row else 0 for row in df['amenities']]
        df['smart_lock'] = [1 if 'Smart lock' in row else 0 for row in df['amenities']]
        df['private_living'] = [1 if 'Private living room' in row else 0 for row in df['amenities']]
        df['garden_backyard'] = [1 if 'Garden or backyard' in row else 0 for row in df['amenities']]
        df['smoking'] = [1 if 'Smoking allowed' in row else 0 for row in df['amenities']]
        df['oven'] = [1 if 'Oven' in row else 0 for row in df['amenities']]
        df['single_level'] = [1 if 'Single level home' in row else 0 for row in df['amenities']]
        df['crib'] = [1 if 'Crib' in row else 0 for row in df['amenities']]

        return df

    # Apply cleaned amenities function
    df = beautiful_amenities(df)   
    
    
    
    ''' Drop unnecessary columns '''
    
    
    # Columns to drop
    all_or_none = ['thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'has_availability', 'requires_license',
                   'neighbourhood_group_cleansed', 'weekly_price', 'monthly_price', 'listing_url', 'scrape_id', 
                   'last_scraped', 'experiences_offered']
    strings = ['neighborhood_overview', 'interaction', 'access', 'transit', 'notes', 'house_rules', 'description', 
               'space', 'summary', 'name', 'license', 'amenities']
    irrelevant = ['host_location', 'host_about', 'host_acceptance_rate', 'host_thumbnail_url', 'host_picture_url', 
                  'is_location_exact', 'calendar_updated', 'calendar_last_scraped']
    redundant = ['city', 'state','zipcode', 'market', 'smart_location', 'country_code', 'country', 
                 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 
                 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'host_neighbourhood', 
                 'bathrooms', 'bedrooms']
    null_or_uniform = ['jurisdiction_names', 'is_business_travel_ready', 'require_guest_profile_picture', 
                       'require_guest_phone_verification', 'calculated_host_listings_count_entire_homes', 
                       'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 
                       'square_feet', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 
                       'street', 'neighbourhood','host_has_profile_pic', 'reviews_per_month', 'host_since',
                       'first_review', 'last_review']

    # Drop unnecessary columns
    df = df.drop(columns=all_or_none)
    df = df.drop(columns=strings)
    df = df.drop(columns=null_or_uniform)
    df = df.drop(columns=redundant)
    df = df.drop(columns=irrelevant)

    return df

In [11]:
listings = wrangle(listings)

In [12]:
pd.set_option('display.max_columns', None)
listings.describe()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,latitude,longitude,accommodates,beds,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,local_host,airbnb_duration,host_duration,income_estimate,hot_tub,wifi,ac,kitchen,washer,tv,intercom,heating,kid_friendly,dryer,smoke_detector,cm_detector,fire_extinguisher,essentials,shampoo,hangers,hair_dryer,iron,workspace,self_check_in,lockbox,dishware,safety_card,shades,hot_water,bed_linens,microwave,coffee_maker,refrigerator,cooking_basics,stove,patio_or_balcony,long_term,no_stairs,first_aid,twenty_four_hour,private_entrance,luggage_dropoff,well_lit,other,paid_parking,pets_allowed,extra_pillow,ethernet,wide_hallways,flat_path,wide_entrance,extra_space,wide_clearance,shower_head,bedroom_lock,greet,keypad,crib,wheelchair,cleaning,kitchenette,full_kitchen,pocket_wifi,events,smart_lock,private_living,garden_backyard,smoking,oven,single_level
count,11622.0,11622.0,11622.0,11612.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11612.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0,11622.0
mean,28222210.0,1.156858,98.560661,0.326042,0.210893,35.697541,139.739956,4.4254,2.810015,18524.94,11523.10721,5885.233953,2.300723,1491.259164,3.108673,699.488556,14.126914,31.359663,51.743934,151.144381,31.418431,16.715367,92.659955,9.54655,9.357684,9.658665,9.681552,9.43736,9.226295,0.814834,1.590432,14.248494,0.66985,382.467217,834.156993,297671.1,0.122268,1.0,0.989761,0.825675,0.854156,0.699793,0.057305,0.829719,0.232146,0.313199,0.90931,0.609448,0.789623,0.971003,0.974015,0.870676,0.984512,0.607899,0.727758,0.451557,0.273189,0.581311,0.159783,0.234985,0.75185,0.463259,0.646274,0.082516,0.67768,0.378248,0.483308,0.150835,0.476854,0.073309,0.30296,0.071244,0.511014,0.268887,0.117708,0.056187,0.253227,0.017983,0.187575,0.108329,0.066254,0.046205,0.043882,0.038289,0.022457,0.086388,0.357081,0.082946,0.0,0.022543,0.011272,0.027792,0.000344,0.002065,0.264756,0.063758,0.037171,0.127087,0.028653,0.044915,0.088109,0.03631
std,8398703.0,0.477625,5.398919,0.468783,0.40796,0.042932,0.055326,3.120791,2.567839,49889.66,29118.679091,4692.081028,1.78844,1435.422989,10.555453,485.150177,9.781222,18.299887,27.703922,98.309511,41.685343,18.527282,7.844841,0.760002,0.913999,0.717042,0.689231,0.740246,0.825693,0.388449,0.658587,19.743529,0.470287,398.770412,593.163393,857318.2,0.327609,0.0,0.100674,0.379405,0.352965,0.458367,0.232435,0.375895,0.42222,0.463815,0.28718,0.487895,0.407594,0.167804,0.159098,0.335572,0.123488,0.48824,0.445133,0.497669,0.445616,0.493365,0.366421,0.424008,0.431958,0.49867,0.478146,0.275161,0.467385,0.484971,0.499743,0.357903,0.499485,0.260655,0.459558,0.257243,0.4999,0.4434,0.322276,0.230292,0.434878,0.132896,0.390389,0.310809,0.248736,0.209939,0.204842,0.191902,0.148172,0.280948,0.47916,0.275813,0.0,0.148449,0.105573,0.164384,0.01855,0.045398,0.441222,0.244333,0.189188,0.333084,0.166835,0.207126,0.283465,0.18707
min,35303.0,1.0,0.0,0.0,0.0,35.53424,139.57507,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,20.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23559170.0,1.0,100.0,0.0,0.0,35.685085,139.700265,2.0,1.0,7007.0,0.0,3000.0,1.0,0.0,1.0,120.0,6.0,17.0,32.0,74.0,5.0,4.0,90.0,9.0,9.0,9.0,10.0,9.0,9.0,1.0,1.0,3.0,0.0,91.0,341.75,63063.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29819570.0,1.0,100.0,0.0,0.0,35.702745,139.728525,4.0,2.0,10948.0,0.0,5500.0,2.0,1500.0,1.0,1125.0,14.0,33.0,58.0,145.0,17.0,13.0,95.0,10.0,10.0,10.0,10.0,10.0,9.0,1.0,2.0,8.0,1.0,270.0,681.0,164220.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,34580600.0,1.0,100.0,1.0,0.0,35.72506,139.78518,6.0,4.0,19049.0,20000.0,8000.0,3.0,2400.0,2.0,1125.0,23.0,47.0,75.0,220.0,40.0,25.0,98.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,2.0,17.0,1.0,497.0,1297.25,314958.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40276450.0,4.0,100.0,1.0,1.0,35.81595,139.91197,16.0,50.0,1063900.0,560000.0,60000.0,16.0,20000.0,365.0,1125.0,30.0,60.0,90.0,365.0,876.0,874.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,4.0,134.0,1.0,2968.0,3216.0,29998950.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
