In [1]:
# https://rstudio-pubs-static.s3.amazonaws.com/407929_afc5ef0f2ad648389447a6ca3f4a7cd4.html

In [2]:
# !pip install sklearn
# !pip install reverse_geocoder

In [3]:
import csv
import logging
import re
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import requests

### Read Data

In [4]:
# Listings

%cd ../../data/airbnbdata
filename = '201702_listings.csv.gz' # 94 cols
# filename = '201702_listings.csv' # 15 cols

df_read = pd.read_csv(filename,
                       encoding='utf-8',
                       compression='gzip',
                       sep=',',
                       index_col = 0,
                       header=0
                       )
df_read['street'] = df_read['street'].apply(lambda x: x.lower())

/usr/local/bin/notebooks/data/airbnbdata


In [5]:
# Places Names

%cd ./geo/regexp

f=open("pattern.txt", "r")
patterns =f.read().replace('\n','').lower()

# patterns
# type(patterns)

/usr/local/bin/notebooks/data/airbnbdata/geo/regexp


In [6]:
# Area Mapping

# Each "name: value" row becomes a lower-cased, whitespace-trimmed entry.
AreaDict = defaultdict(list)
with open("area_mapping.txt", newline='') as f:
    reader = csv.reader(f, delimiter=':')
    for line in reader:
        # Blank or malformed rows have fewer than two fields; skip them
        # instead of failing with IndexError.
        if len(line) < 2:
            continue
        AreaDict[line[0].strip().lower()] = line[1].strip().lower()

# print(AreaDict.items())
# print(AreaDict['ashtown'])

### Pre-processing

In [7]:
# Work on a copy so the frame loaded into df_read is not modified by the
# cleaning steps applied to df.
df = df_read.copy()
# (rows, columns) of the working frame.
print(df.shape)

(6729, 94)


#### Clean monetary values

In [8]:
# Strip "$" and thousands separators, then store the amounts as floats.
for money_col in ('price', 'cleaning_fee'):
    stripped = df[money_col].replace('[$,]', '', regex=True)
    df[money_col] = stripped.astype(float)

#### Dublin City only

In [9]:
# Distinct council areas, then the number of missing values per address column.
print(df['neighbourhood_cleansed'].unique())
df['neighbourhood_cleansed'].describe()
df[['street', 'zipcode', 'neighbourhood_cleansed']].isna().sum()

['Dublin City' 'South Dublin' 'Fingal' 'Dn Laoghaire-Rathdown']


street                       0
zipcode                   4212
neighbourhood_cleansed       0
dtype: int64

In [10]:
# Keep Dublin City listings only. .copy() makes the subset an independent
# frame, so adding columns to it does not raise SettingWithCopyWarning.
df = df[df.neighbourhood_cleansed == 'Dublin City'].copy()
print(df.shape)
print(df.neighbourhood_cleansed.unique())
df[['street', 'zipcode']].isnull().sum(axis = 0)

(5377, 94)
['Dublin City']


street        0
zipcode    3305
dtype: int64

#### RegExp: Extract Postcode

In [11]:
"""
for reference, a full eircode (format Dxx xxxx) can be matched with: re.search(r'd\d{1,2}\s{0,1}[a-z0-9]{4}', x)

in this particular case, however, we can achieve the goal with a more general approach

the search in replace() is meant to catch the following spellings:

d01, d10      (d + two digits)
d 10, d 01    (d + space + two digits)
d6w           (d + one digit + optional w)
d 6w          (d + space + one digit + optional w)
d1, d3        (d + one digit)

the same spellings with "dublin" instead of "d", e.g. dublin 2, dublin 6w

"""
def replace(x):
    r"""Extract a Dublin postal district from an address as 'dublin <code>'.

    Accepted spellings (case-insensitive), each with at most one whitespace
    character between the prefix and the digits:

        d01, d10, d 10, d6w, d 6w, d1          short "d" prefix
        dublin 2, dublin2, dublin 01, dublin 6w  spelled-out prefix

    The prefix must start at a word boundary, so the final "d" of a word
    (as in "road 7") is not read as a district. Whatever follows the
    district, such as the second half of an eircode ("d04 x2y3"), is ignored.

    Parameters
    ----------
    x : str
        Address text to search.

    Returns
    -------
    str
        'dublin ' plus the district without zero padding, e.g. 'dublin 4',
        'dublin 10' or 'dublin 6w'; '' when nothing matches or x is not a
        string.
    """
    if not isinstance(x, str):
        return ''

    # Two digits are tried first so "d10" is not cut to "1"; otherwise one
    # digit with the optional "w" of Dublin 6W.
    found = re.search(r'\b(?:dublin|d)\s?(\d{2}|\dw?)', x, flags=re.IGNORECASE)
    if found is None:
        return ''

    district = found.group(1).lower()
    # Remove zero padding only ("04" -> "4"); "10" and "20" keep their zero.
    district = district.lstrip('0') or district
    return 'dublin ' + district

# Remove only the zero padding of the district ("dublin 04" -> "dublin 4").
# A blanket .replace('0', '') also turns "dublin 10" into "dublin 1".
df['zipcode_new'] = df['street'].apply(
    lambda x: re.sub(r'\b0+(?=\d)', '', replace(x)) if not pd.isnull(x) else np.nan
)
# df = df.sample(frac=1)
df[['street', 'zipcode_new']].sample(5)

Unnamed: 0_level_0,street,zipcode_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6145042,"drumcondra road lower, drumcondra, dublin, ire...",
12246300,"sandymount castle park, sandymount, dublin d04...",dublin 4
8866250,"a northbrook lane, ranelagh, dublin, ireland",
14328938,"longboat quay, dublin, dublin 2ireland, ireland",dublin 2
17165247,"grosvenor square, rathmines, county dublin d06...",dublin 6


In [12]:
# Number of listings whose extracted postcode is still the empty string.
int((df['zipcode_new'] == '').sum())

3194

#### RegExp: use place names file to extract core location

In [13]:
# The whole pattern is wrapped in one outer group; taking column 0 keeps only
# that outer match, so groups inside `patterns` cannot add extra columns that
# would make the single-column assignment fail.
df['street_cleansed'] = df['street'].str.extract(r'('+patterns+')', expand=True)[0]
df[['zipcode_new', 'street_cleansed']].head(10)

Unnamed: 0_level_0,zipcode_new,street_cleansed
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10778114,dublin 4,
14348712,,
15749806,,foxrock
9602076,dublin 2,
15952233,,
14737754,,dorset street
3592153,,ballsbridge
17230584,dublin 2,
10932760,dublin 6,
13168698,,


#### Use the area mapping file to fill in missing postal codes

In [14]:
def mapArea(x):
    """Return the value AreaDict maps x to, or '' when there is none.

    AreaDict is a defaultdict, so indexing it with an unknown key would store
    that key as a side effect; .get() looks the key up without inserting it.
    """
    return AreaDict.get(x) or ''

In [15]:
# Fill postcodes that are still empty by looking up the extracted place name.
# Mapping the single column avoids building a row object for every listing
# and also works when no row is selected.
mask = df.zipcode_new==''
df.loc[mask, 'zipcode_new'] = df.loc[mask, 'street_cleansed'].map(mapArea)

# Shuffle with a fixed seed so the row order is the same on every run.
df = df.sample(frac=1, random_state=42)
df[['street', 'street_cleansed', 'zipcode_new']].head()

Unnamed: 0_level_0,street,street_cleansed,zipcode_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7298739,"ward's hill, dublin, dublin 8, ireland",,dublin 8
6499397,"emmet st, dublin, dublin 01, ireland",,dublin 1
13255271,"new row south, dublin, dublin 8, ireland",,dublin 8
13097361,"liffey street west, dublin, dublin, ireland",,
15911654,"charleville avenue, dublin, county dublin d03 ...",,dublin 3


In [16]:
# Listings that still have no postcode after the area-name lookup.
int((df['zipcode_new'] == '').sum())

2193

### Quantify missing Postal code data

In [17]:
# Entire homes/apartments that still lack a postcode, counted per property type.
df_na = df[df['zipcode_new'] == '']
df_na = df_na[df_na['room_type'] == 'Entire home/apt']
print(len(df_na))

grouped = df_na.groupby('property_type').agg({'host_id': ['count']})
# Flatten the two-level header ('host_id', 'count') into 'host_id_count'.
grouped.columns = ['_'.join(levels) for levels in grouped.columns]
grouped.sort_values(by='host_id_count', ascending=False).head()
# df_na.head()

1087


Unnamed: 0_level_0,host_id_count
property_type,Unnamed: 1_level_1
Apartment,835
House,205
Townhouse,19
Other,9
Cabin,8


### Google Maps API

In [18]:
# API key that geocode() passes to get_google_results(); blank in this notebook.
API_KEY = ''
# Back-off in minutes: geocode() sleeps BACKOFF_TIME * 60 seconds after an
# OVER_QUERY_LIMIT status before retrying.
BACKOFF_TIME = 30
# Passed by geocode() as return_full_response; when True the decoded JSON
# reply is attached to the result under 'response'.
RETURN_FULL_RESULTS = False

def get_google_results(address, api_key=None, return_full_response=False, timeout=10):
    """Geocode one address with the Google Maps Geocoding API.

    Parameters
    ----------
    address : str
        Free-text address to look up.
    api_key : str, optional
        API key sent as the ``key`` query parameter.
    return_full_response : bool
        When True, the decoded JSON reply is added under ``'response'``.
    timeout : float
        Seconds to wait for the HTTP request before ``requests`` raises.

    Returns
    -------
    dict
        ``formatted_address``, ``latitude``, ``longitude``, ``accuracy``,
        ``google_place_id``, ``type`` and ``postcode`` describe the first
        match (all None when there is no match); ``input_string``,
        ``number_of_results`` and ``status`` are always present.
    """
    # Let requests build the query string so spaces, '&', '#' and apostrophes
    # in the address are percent-encoded instead of corrupting the URL.
    response = requests.get(
        "https://maps.googleapis.com/maps/api/geocode/json",
        params={"address": address, "key": api_key},
        timeout=timeout,
    )
    results = response.json()

    # A reply without a 'results' list is treated the same as "no match".
    matches = results.get('results') or []

    output = {
        "formatted_address": None,
        "latitude": None,
        "longitude": None,
        "accuracy": None,
        "google_place_id": None,
        "type": None,
        "postcode": None,
    }
    if matches:
        # Only the first match is reported.
        answer = matches[0]
        geometry = answer.get('geometry') or {}
        location = geometry.get('location') or {}
        components = answer.get('address_components') or []
        output.update({
            "formatted_address": answer.get('formatted_address'),
            "latitude": location.get('lat'),
            "longitude": location.get('lng'),
            "accuracy": geometry.get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types') or []),
            "postcode": ",".join(
                part['long_name'] for part in components
                if 'postal_code' in (part.get('types') or [])
            ),
        })

    # Bookkeeping that is present whether or not a match was found.
    output['input_string'] = address
    output['number_of_results'] = len(matches)
    output['status'] = results.get('status')
    if return_full_response is True:
        output['response'] = results

    return output

def geocode(address):
    """Return Google's formatted address for `address`, or None on failure.

    While the API answers OVER_QUERY_LIMIT the call is retried after a pause
    of BACKOFF_TIME minutes. If the lookup raises, the error is logged and the
    address is skipped by returning None.
    """
    log = logging.getLogger(__name__)
    while True:
        try:
            geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception:
            # There is no result to inspect after a failure, so stop here
            # instead of falling through to the status check.
            log.exception("Major error with %s - skipping!", address)
            return None

        if geocode_result.get('status') != 'OVER_QUERY_LIMIT':
            return geocode_result.get('formatted_address')

        # Over the API limit: back off for a while and try again later.
        print("Hit Query Limit! Backing off for a bit.")
        time.sleep(BACKOFF_TIME * 60)

#     # Print status every 100 addresses
#     if len(results) % 100 == 0:
#     	logger.info("Completed {} of {} address".format(len(results), len(addresses)))
            
#     # Every 500 addresses, save progress to file(in case of a failure so you have something!)
#     if len(results) % 500 == 0:
#         pd.DataFrame(results).to_csv("{}_bak".format(output_filename))



In [19]:
# get google api data
# Take an explicit copy of the first 10 listings without a postcode, so the
# assignment below writes to df2 itself and raises no SettingWithCopyWarning.
df2 = df[df.zipcode_new==''].iloc[:10, :].copy()

# Every row in df2 has an empty zipcode_new, so each street is geocoded.
df2['street'] = df2['street'].map(geocode)
df2[['street', 'street_cleansed', 'zipcode_new']].head(10)

Unnamed: 0_level_0,street,street_cleansed,zipcode_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
13097361,"Liffey St W, Arran Quay, Dublin, Ireland",,
11269403,"Shaw St, Dublin, Ireland",,
11144313,"Townsend St, Dublin, Ireland",,
13166732,"Townsend St, Dublin, Ireland",,
11765464,"Hanover Quay, Grand Canal Dock, Dublin, Ireland",,
14649656,"St Kevin's Parade, Wood Quay, Dublin, Ireland",,
8184787,"Harold Rd, Arran Quay, Dublin, Ireland",,
7171042,"Fitzgerald St, Dublin, Ireland",,
14053545,"Mount Argus Way, Harold's Cross, Dublin, Ireland",,
15697664,"Cathedral House, Patrick St, Wood Quay, Dublin...",,


In [20]:
# # use place names file to extract core location
# df['street_cleansed'] = df['street'].str.extract(r'('+patterns+')', expand=True)
# df[['zipcode_new', 'street_cleansed']].head(10)

# # use Area mapping file to get postalcode
# mask = df.zipcode_new==''
# df.loc[mask, 'zipcode_new'] = df[mask].apply(lambda row: mapArea(row.street_cleansed), axis=1)
# df[['street', 'street_cleansed', 'zipcode_new']]

In [21]:
# len(df[df.zipcode_new==''])
# Spot check: listings on one street and the postcode assigned to each.
# Swap the search text (e.g. "terenure") to inspect another area.
tmp = df.copy()
tmp = tmp.loc[tmp['street'].str.contains("george's quay")]
tmp.loc[:, ['street', 'street_cleansed', 'zipcode_new']].head()

Unnamed: 0_level_0,street,street_cleansed,zipcode_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7276073,"george's quay, dublin, dublin, ireland",,
16156397,"george's quay, dublin, county dublin, ireland",,
8072359,"george's quay, dublin, dublin dublin 2, ireland",,dublin 2
15978026,"george's quay, dublin, county dublin, ireland",,
7784872,"george's quay, dublin, dublin 2, ireland",,dublin 2


### Write to file

In [22]:
# %cd ../../
# df.to_csv('201702_listings_preproc.csv', index=False)