In [7]:
# Imports
import pandas as pd
import re
from datetime import datetime, date

In [8]:
def parse_num_strings(series):
    """Converts a series of strings representing how many guests, bedrooms or
    beds can be or are at an Airbnb property to a single floatvalue.
    
    Input: pandas series
    output: pandas series
    
    Example:
    Input:
    0    3 guests
    1    4 guests
    2    6 guests
    3    5 guests
    4    8 guests
    Name: guests, dtype: object
    
    Output:
    0    3.0
    1    4.0
    2    6.0
    3    5.0
    4    8.0
    Name: guests, dtype: int
    
    """
    guests_numeric = pd.Series(index=series.index, dtype=float)
    try:
        assert type(series[0]) in [str, int, float]
        if type(series[0]) == str:
            studio = re.compile(r'studio')
            for idx, val in series.iteritems():
                if studio.search(val):
                    guests_numeric[idx] = 1
                else:
                    num, rest = val.split(' ')
                    guests_numeric[idx] = float(num)
            return guests_numeric
        elif type(series[0]) == int:
            for idx, val in series.iteritems():
                guests_numeric[idx] = float(num)
            return guests_numeric
        else:
            return series
    except TypeError:
        print('series must be of type str, int, or float.')
        

In [9]:
def word_counts(series):
    """Returns a series of the word counts of each element of a series."""
    try:
        assert type(series[0]) == str
        words = pd.Series(index=series.index)
        for idx, item in series.iteritems():
            words[idx] = len(item.split(' '))
        return words
    except AssertionError:
        print('series must be of type str.')
    

In [10]:
def find_prob_indices(series):
    """Returns a series of booleans that tells whether or not the word
    'bedroom' is in the string."""
    problems = pd.Series(index=series.index)
    br = re.compile(r'bedroom')
    for idx, item in series.iteritems():
        if br.search(item):
            problems[idx] = False
        else:
            problems[idx] = True
    return problems

In [11]:
# Read in the data frame, drop the lat_lon columns since we won't use it.
rms = pd.read_csv('../data/initial_tokyo_data_scrape-10-07-19--00-55-48.csv', index_col=0)
# rm.drop('lat_lon', axis=1, inplace=True)
rms.head(3)

Unnamed: 0,id,guests,bedrooms,beds,baths,cleaning_fee,service_fee,amenities,superhost,lat_lon,narrative,rules,price
0,5262658,2 guests,1 bedroom,1 bed,1 bath,$52,$69,Show all 13 amenities,False,https://maps.googleapis.com/maps/api/js/Viewpo...,It takes 5 min walk to Fujimidai Sta.. You can...,House rules\nNot safe or suitable for infants ...,$80
1,14677019,7 guests,1 bedroom,7 beds,1 bath,$9,$28,Show all 27 amenities,True,https://maps.googleapis.com/maps/api/js/Viewpo...,Our house was renovated recently. 13 - 15 mins...,"House rules\nNo pets\nNo smoking, parties, or ...",$35
2,16186430,4 guests,2 bedrooms,4 beds,1 bath,$19,$39,Show all 19 amenities,True,https://maps.googleapis.com/maps/api/js/Viewpo...,Translate this description to English\n･it's 1...,House rules\nNo pets\nNo parties or events\nCh...,$48


In [13]:
rms['lat_lon'][0]

'https://maps.googleapis.com/maps/api/js/ViewportInfoService.GetViewportInfo?1m6&1m2&1d35.69691496912552&2d139.5542348304566&2m2&1d35.772507862358864&2d139.71029757777706&2u14&4sen&5e0&6sm%40487000000&7b0&8e0&callback=_xdc_._5lat0&key=AIzaSyAytC_TusuhG7kpNQ19hMrCzXDIUjd307o&token=122073'

In [30]:
try5 = rms.head()['lat_lon'].str.extract(r'1d(\d{2}\.\d+).*2d(\d{3}\.\d+).*1d(\d{2}\.\d+).*2d(\d{3}\.\d+)')

In [35]:
try5.head()

Unnamed: 0,lat1,lon1,lat2,lon2
0,35.696915,139.554235,35.772508,139.710298
1,36.281086,140.493899,36.356122,140.648812
2,35.264362,138.850975,35.340363,139.00788
3,35.288561,139.406728,35.36454,139.561137
4,35.103783,139.547978,35.179935,139.702739


In [32]:
for column in try5.columns:
    try5[column] = try5[column].astype(float)

In [34]:
try5.columns = ['lat1', 'lon1', 'lat2', 'lon2']

In [39]:
try5['latitude'] = (try5['lat1'] + try5['lat2']) / 2
try5['lonitude'] = (try5['lon1'] + try5['lon2']) / 2

In [40]:
try5.head()

Unnamed: 0,lat1,lon1,lat2,lon2,latitude,lonitude
0,35.696915,139.554235,35.772508,139.710298,35.734711,139.632266
1,36.281086,140.493899,36.356122,140.648812,36.318604,140.571356
2,35.264362,138.850975,35.340363,139.00788,35.302363,138.929428
3,35.288561,139.406728,35.36454,139.561137,35.326551,139.483933
4,35.103783,139.547978,35.179935,139.702739,35.141859,139.625359


In [28]:
rms['lat_lon'].head().str[100:200]

0    552&2d139.5542348304566&2m2&1d35.7725078623588...
1    549&2d140.49389930834946&2m2&1d36.356121627266...
2    997&2d138.85097549235445&2m2&1d35.340363226963...
3    224&2d139.4067283266436&2m2&1d35.3645402173937...
4    139&2d139.54797799073185&2m2&1d35.179935444668...
Name: lat_lon, dtype: object

In [6]:
# These lines convert the columns from strings to integer or float values so they
# can be used in a linear regression.
rm['guests'] = parse_num_strings(rm['guests'])
rm['bedrooms'] = parse_num_strings(rm['bedrooms'])
rm['beds'] = parse_num_strings(rm['beds'])
rm['baths'] = parse_num_strings(rm['baths'])
rm['cleaning_fee'] = rm['cleaning_fee'].str.extract(r'(\d+)').astype(float)
rm['service_fee'] = rm['service_fee'].str.extract(r'(\d+)').astype(float)
rm['price'] = rm['price'].str.extract(r'(\d+)').astype(float)
rm['amenities'] = rm['amenities'].str.extract(r'(\d+)').astype(float)
rm['superhost'] = rm['superhost'].astype(int)
rm['narrative'] = word_counts(rm['narrative'])
rm['rules'] = word_counts(rm['rules'])
rm.head(3)

Unnamed: 0,id,guests,bedrooms,beds,baths,cleaning_fee,service_fee,amenities,superhost,narrative,rules,price
0,5262658,2.0,1.0,1.0,1.0,52.0,69.0,13.0,0,40.0,27.0,80.0
1,14677019,7.0,1.0,7.0,1.0,9.0,28.0,27.0,1,85.0,20.0,35.0
2,16186430,4.0,2.0,4.0,1.0,19.0,39.0,19.0,1,85.0,19.0,48.0


In [7]:
# Write the resulting data to a csv file.
CITY = 'tokyo'
rm.to_csv('../data/{}-clean-{}.csv'.format(CITY, datetime.now().strftime('%m-%d-%Y--%H-%M-%S')), index=False)