In [146]:
import pandas as pd
import re
from datetime import datetime, date

In [107]:
def parse_num_strings(series):
    """Convert count strings such as "3 guests" or "1 bedroom" to floats.

    Each element is expected to describe how many guests, bedrooms, beds or
    baths an Airbnb property has. The leading number of each string is
    extracted; any string containing "studio" (case-insensitive) counts as 1.

    Parameters
    ----------
    series : pandas.Series
        Strings to parse, or an already numeric series.

    Returns
    -------
    pandas.Series
        float64 series with the same index and name as ``series``.
        * float-dtype input is returned unchanged;
        * other numeric input is cast to float;
        * missing values and strings without a leading number become NaN.

    Raises
    ------
    TypeError
        If the values are neither strings nor numbers (booleans are rejected).

    Example
    -------
    Input:
    0    3 guests
    1    4 guests
    2    6 guests
    Name: guests, dtype: object

    Output:
    0    3.0
    1    4.0
    2    6.0
    Name: guests, dtype: float64
    """
    type_error = 'series must be of type str, int, or float.'

    # Decide on the dtype first: elements of a numeric Series are NumPy
    # scalars, so comparing type(series[0]) against int/float never matches.
    if pd.api.types.is_bool_dtype(series):
        raise TypeError(type_error)
    if pd.api.types.is_float_dtype(series):
        return series
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)

    # Object/string data: inspect the first non-missing value, by position, so
    # a leading NaN or an index that lacks the label 0 cannot break the check.
    present = series.dropna()
    if present.empty:
        return pd.Series(index=series.index, dtype=float, name=series.name)

    sample = present.iloc[0]
    if isinstance(sample, bool):
        raise TypeError(type_error)
    if isinstance(sample, (int, float)):
        # Object-dtype column that holds plain Python numbers.
        return series.astype(float)
    if not isinstance(sample, str):
        raise TypeError(type_error)

    # na=False: a missing value is simply "not a studio".
    is_studio = series.str.contains('studio', case=False, na=False)
    # Take only the leading number so that multi-word descriptions such as
    # "1 private bath" or "2.5 baths" parse instead of failing to unpack.
    leading_number = series.str.extract(r'^\s*(\d+(?:\.\d+)?)', expand=False)
    parsed = pd.to_numeric(leading_number, errors='coerce').astype(float)
    parsed = parsed.mask(is_studio, 1.0)
    parsed.name = series.name
    return parsed
        

In [133]:
def word_counts(series):
    """Return the number of space-separated words in each element of a series.

    Parameters
    ----------
    series : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series or None
        float64 series aligned with ``series``; missing values stay NaN.
        If the first non-missing value is not a string, a message is printed
        and None is returned.

    Notes
    -----
    Elements are split on a single space character only, so consecutive
    spaces yield empty "words" and newlines do not separate words.
    """
    # Check by position on the first non-missing value: label lookup
    # (series[0]) fails on empty data or an index that lacks the label 0.
    present = series.dropna()
    if present.empty:
        return pd.Series(index=series.index, dtype=float, name=series.name)
    if not isinstance(present.iloc[0], str):
        print('series must be of type str.')
        return None

    # Vectorised equivalent of len(item.split(' ')) that skips missing values.
    return series.str.split(' ').str.len().astype(float)
    

In [134]:
def find_prob_indices(series):
    """Flag the elements of a series that do not mention "bedroom".

    Parameters
    ----------
    series : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        Boolean series aligned with ``series``: True where the text does not
        contain the substring "bedroom" (missing values are flagged as True),
        False where it does.
    """
    if series.empty:
        return pd.Series(index=series.index, dtype=bool, name=series.name)
    # na=False treats a missing value as "no match", so it ends up flagged
    # as a problem instead of raising inside the regex search.
    has_bedroom = series.str.contains('bedroom', regex=False, na=False)
    return (~has_bedroom).astype(bool)

In [144]:
# Load the raw scrape, using the first CSV column as the index, and discard
# the 'lat_lon' column in the same expression.
rm = (
    pd.read_csv('../data/initial_bogota_data_scrape-10-04-19--17-11-25.csv', index_col=0)
    .drop(columns='lat_lon')
)
rm.head(3)

Unnamed: 0,id,guests,bedrooms,beds,baths,cleaning_fee,service_fee,amenities,superhost,narrative,rules,price
0,38238821,3 guests,1 bedroom,1 bed,2 baths,0,$212,Show all 18 amenities,True,Translate this description to English\nEs un a...,House rules\nCheck-in time is 3PM - 12AM (midn...,$274
1,31941600,4 guests,1 bedroom,1 bed,1 bath,$5,$10,Show all 29 amenities,False,Lindo apartamento/estudio para 4 personas en u...,"House rules\nNo smoking, parties, or events\nC...",$12
2,32079582,6 guests,1 bedroom,2 beds,1 bath,$8,$10,Show all 32 amenities,False,Lindo apartamento/estudio para 6 personas en u...,"House rules\nNo smoking, parties, or events\nC...",$12


In [145]:
# Count-style text columns ("3 guests", "1 bedroom", ...) -> float.
for col in ['guests', 'bedrooms', 'beds', 'baths']:
    rm[col] = parse_num_strings(rm[col])

# Money columns ("$212") and the amenities label ("Show all 18 amenities")
# -> float. astype(str) keeps this working when pandas inferred a numeric
# dtype for a column or when the cell is re-run. Commas are stripped first so
# that "$1,234" parses as 1234 rather than 1.
# NOTE(review): assumes ',' is a thousands separator, not a decimal mark.
for col in ['cleaning_fee', 'service_fee', 'price', 'amenities']:
    rm[col] = (
        rm[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
        .astype(float)
    )

# Boolean flag -> 0/1.
rm['superhost'] = rm['superhost'].astype(int)

# Free-text columns are reduced to their word counts.
for col in ['narrative', 'rules']:
    rm[col] = word_counts(rm[col])

rm.head(3)

Unnamed: 0,id,guests,bedrooms,beds,baths,cleaning_fee,service_fee,amenities,superhost,narrative,rules,price
0,38238821,3.0,1.0,1.0,2.0,0.0,212.0,18.0,1,95.0,8.0,274.0
1,31941600,4.0,1.0,1.0,1.0,5.0,10.0,29.0,0,76.0,17.0,12.0
2,32079582,6.0,1.0,2.0,1.0,8.0,10.0,32.0,0,76.0,17.0,12.0


In [149]:
# City tag used in the output file name.
CITY = 'bogota'

# Write the cleaned frame (without its index) to a file whose name carries the
# city and the current local time.
rm.to_csv(
    '../data/{}-clean-{}.csv'.format(
        CITY,
        datetime.now().strftime('%m-%d-%Y--%H-%M-%S'),
    ),
    index=False,
)