In [44]:
# Imports
import pandas as pd
import re
from datetime import datetime, date
from geopy.geocoders import Nominatim

In [8]:
def parse_num_strings(series):
    """Convert listing-count strings such as '3 guests' to float values.

    Handles the guests / bedrooms / beds / baths columns of a scraped Airbnb
    listing. Any string containing 'studio' (case-insensitive) counts as 1.
    Otherwise the number at the start of the string is used, so values such
    as '1.5 shared baths' or '16+ guests' are parsed as well.

    Parameters
    ----------
    series : pandas.Series
        Strings, or already-numeric values. Missing values are allowed.

    Returns
    -------
    pandas.Series
        Float series with the same index and name as ``series``. A float
        input is returned unchanged; missing values stay NaN.

    Raises
    ------
    TypeError
        If an element is neither a string, a number, nor missing.
    ValueError
        If a string neither mentions 'studio' nor starts with a number.

    Example
    -------
    Input:
    0    3 guests
    1    4 guests
    2    6 guests
    Name: guests, dtype: object

    Output:
    0    3.0
    1    4.0
    2    6.0
    Name: guests, dtype: float64
    """
    # Numeric columns need no parsing; only normalise the dtype.
    if pd.api.types.is_numeric_dtype(series.dtype):
        if pd.api.types.is_float_dtype(series.dtype):
            return series
        return series.astype(float)

    leading_number = re.compile(r'\s*(\d+(?:\.\d+)?)')

    def _to_float(val):
        """Convert a single cell to a float (NaN for missing values)."""
        if isinstance(val, str):
            if 'studio' in val.lower():
                return 1.0
            found = leading_number.match(val)
            if found is None:
                raise ValueError(
                    'cannot find a leading number in {!r}'.format(val))
            return float(found.group(1))
        # Check scalar-ness first: pd.isna on a list returns an array.
        if pd.api.types.is_scalar(val) and pd.isna(val):
            return float('nan')
        if pd.api.types.is_number(val):
            return float(val)
        raise TypeError('series must be of type str, int, or float.')

    # map() keeps the index and name; astype guarantees a float dtype even
    # for an empty input.
    return series.map(_to_float).astype(float)
        

In [9]:
def word_counts(series):
    """Return the number of whitespace-separated words in each element.

    Parameters
    ----------
    series : pandas.Series
        Series of strings. Missing values are allowed.

    Returns
    -------
    pandas.Series or None
        Float series of word counts with the same index as ``series``
        (NaN where the input is missing). If any non-missing element is not
        a string, a message is printed and None is returned.
    """
    present = series.dropna()
    if not present.map(lambda item: isinstance(item, str)).all():
        print('series must be of type str.')
        return None
    if present.empty:
        # Nothing to count; also avoids the .str accessor on a non-string
        # dtype (e.g. an all-NaN float column).
        return pd.Series(float('nan'), index=series.index,
                         name=series.name, dtype=float)
    # split() without arguments splits on any run of whitespace, so newlines
    # and repeated spaces do not inflate or hide words.
    return series.str.split().str.len().astype(float)
    

In [10]:
def find_prob_indices(series):
    """Flag the entries that do NOT contain the substring 'bedroom'.

    Parameters
    ----------
    series : pandas.Series
        Series of strings. Missing values are allowed.

    Returns
    -------
    pandas.Series
        Boolean series with the same index as ``series``: True marks a
        problem entry (no 'bedroom' in the string, or a missing value),
        False marks an entry that contains 'bedroom'.
    """
    # na=False treats missing values as "does not contain", so they are
    # reported as problems instead of raising.
    has_bedroom = series.str.contains('bedroom', regex=False, na=False)
    return ~has_bedroom.astype(bool)

In [48]:
# Read in the scraped listings, using the first CSV column as the index.
# The 'lat_lon' column is kept: the cells below parse coordinates out of it.
# NOTE(review): the preview shows 'guests' ended up as the index because of
# index_col=0 -- confirm that is intended.
# Alternate input (Tokyo scrape):
# rms = pd.read_csv('../data/initial_tokyo_data_scrape-10-07-19--00-55-48.csv', index_col=0)
rms = pd.read_csv('../data/initial_data_scrape.csv', index_col=0)
# Disabled: rm.drop('lat_lon', axis=1, inplace=True)
rms.head(3)

Unnamed: 0_level_0,bedrooms,beds,baths,cleaning_fee,service_fee,amenities,superhost,lat_lon,price
guests,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4 guests,1 bedroom,1 bed,1 bath,$55,$85,Show all 40 amenities,True,https://maps.googleapis.com/maps/api/js/Viewpo...,$100
2 guests,studio,1 bed,1 bath,$45,$95,Show all 28 amenities,False,https://maps.googleapis.com/maps/api/js/Viewpo...,$115
3 guests,1 bedroom,2 beds,1 bath,$25,$65,Show all 19 amenities,True,https://maps.googleapis.com/maps/api/js/Viewpo...,$80


In [49]:
# Show the full viewport URL of the first listing. The frame is indexed by
# the 'guests' strings, so select by position rather than by the label 0.
rms['lat_lon'].iloc[0]

'https://maps.googleapis.com/maps/api/js/ViewportInfoService.GetViewportInfo?1m6&1m2&1d47.576650785194346&2d-122.42010372955906&2m2&1d47.63943551868126&2d-122.26617986681697&2u14&4sen&5e0&6sm%40487000000&7b0&8e0&callback=_xdc_._acm37v&key=AIzaSyAytC_TusuhG7kpNQ19hMrCzXDIUjd307o&token=82406'

In [59]:
# Pull the two "1d<lat>&2d<lon>" pairs out of each viewport URL (first five
# listings only). Sign and digit count are left open so the same pattern
# works for any region, e.g. positive three-digit longitudes.
try5 = rms.head()['lat_lon'].str.extract(
    r'1d(-?\d+\.\d+)&2d(-?\d+\.\d+).*1d(-?\d+\.\d+)&2d(-?\d+\.\d+)'
)

In [60]:
# Preview the raw regex captures (columns 0-3).
try5.head(n=5)

Unnamed: 0_level_0,0,1,2,3
guests,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4 guests,47.576650785194346,-122.42010372955906,47.63943551868126,-122.26617986681696
2 guests,47.58139476049791,-122.42159239293886,47.64417506428916,-122.2676793900958
3 guests,47.58464018115919,-122.37786089998347,47.64741675131903,-122.22193199990905
2 guests,47.58662432355578,-122.43127205219172,47.64939816840813,-122.27737488416663
6 guests,47.58835317156296,-122.42125114456326,47.651123954829,-122.2653366183863


In [61]:
# Cast every captured coordinate column to float in one step.
try5 = try5.astype(float)

In [62]:
# Name the four captures: first (lat, lon) pair, then second (lat, lon) pair.
corner_labels = ['lat1', 'lon1', 'lat2', 'lon2']
try5.columns = corner_labels

In [63]:
# Midpoint of the two coordinate pairs (presumably the viewport corners --
# confirm), used as the approximate listing location.
# NOTE: the 'lonitude' column name is misspelled; kept as-is in case other
# code relies on it.
try5['latitude'] = try5['lat1'].add(try5['lat2']).div(2)
try5['lonitude'] = try5['lon1'].add(try5['lon2']).div(2)

In [64]:
# Preview the frame including the derived midpoint columns.
try5.head(n=5)

Unnamed: 0_level_0,lat1,lon1,lat2,lon2,latitude,lonitude
guests,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4 guests,47.576651,-122.420104,47.639436,-122.26618,47.608043,-122.343142
2 guests,47.581395,-122.421592,47.644175,-122.267679,47.612785,-122.344636
3 guests,47.58464,-122.377861,47.647417,-122.221932,47.616028,-122.299896
2 guests,47.586624,-122.431272,47.649398,-122.277375,47.618011,-122.354323
6 guests,47.588353,-122.421251,47.651124,-122.265337,47.619739,-122.343294


In [45]:
# Geocoder client used for the reverse lookup below; the user agent string
# identifies this notebook in requests.
geocoder_user_agent = "GreatScott"
nom = Nominatim(user_agent=geocoder_user_agent)

In [66]:
# Reverse-geocode one computed midpoint (the second row of the preview above)
# as a sanity check that the coordinates land on a real address.
sample_lat, sample_lon = 47.612785, -122.344636
nom.reverse('{},{}'.format(sample_lat, sample_lon))

Location(The Rivoli, 2127, 2nd Avenue, Pike Place Market Area, Belltown, Seattle, King County, Washington, 98121, USA, (47.6127721, -122.34431030899, 0.0))

In [28]:
# Inspect characters 100-199 of the first five URLs to see where the
# coordinate fields sit.
rms['lat_lon'].head().str.slice(100, 200)

0    552&2d139.5542348304566&2m2&1d35.7725078623588...
1    549&2d140.49389930834946&2m2&1d36.356121627266...
2    997&2d138.85097549235445&2m2&1d35.340363226963...
3    224&2d139.4067283266436&2m2&1d35.3645402173937...
4    139&2d139.54797799073185&2m2&1d35.179935444668...
Name: lat_lon, dtype: object

In [6]:
# Convert the scraped string columns to numeric values so they can be used
# in a linear regression.
# NOTE(review): `rm` is not assigned in any visible cell (the load cell
# creates `rms`) -- confirm which frame this cell is meant to operate on.
# NOTE: not idempotent -- the `.str` steps fail once a column is numeric.
for count_col in ('guests', 'bedrooms', 'beds', 'baths'):
    rm[count_col] = parse_num_strings(rm[count_col])

# Strip thousands separators before extracting the digits, so that a value
# like '$1,200' becomes 1200.0 rather than 1.0.
for amount_col in ('cleaning_fee', 'service_fee', 'price', 'amenities'):
    rm[amount_col] = (
        rm[amount_col]
        .str.replace(',', '', regex=False)
        .str.extract(r'(\d+)', expand=False)
        .astype(float)
    )

rm['superhost'] = rm['superhost'].astype(int)

# Free-text columns are reduced to their word counts.
for text_col in ('narrative', 'rules'):
    rm[text_col] = word_counts(rm[text_col])

rm.head(3)

Unnamed: 0,id,guests,bedrooms,beds,baths,cleaning_fee,service_fee,amenities,superhost,narrative,rules,price
0,5262658,2.0,1.0,1.0,1.0,52.0,69.0,13.0,0,40.0,27.0,80.0
1,14677019,7.0,1.0,7.0,1.0,9.0,28.0,27.0,1,85.0,20.0,35.0
2,16186430,4.0,2.0,4.0,1.0,19.0,39.0,19.0,1,85.0,19.0,48.0


In [7]:
# Persist the cleaned frame under a city- and timestamp-tagged file name.
CITY = 'tokyo'
run_stamp = datetime.now().strftime('%m-%d-%Y--%H-%M-%S')
output_path = '../data/{}-clean-{}.csv'.format(CITY, run_stamp)
rm.to_csv(output_path, index=False)