In [1]:
# https://rstudio-pubs-static.s3.amazonaws.com/407929_afc5ef0f2ad648389447a6ca3f4a7cd4.html

In [2]:
# !pip install sklearn
# !pip install reverse_geocoder

In [3]:
import pandas as pd
import numpy as np
import csv
import re
from collections import defaultdict
import requests

### Read Data

In [4]:
# Listings

%cd ../../data/airbnbdata
filename = '201702_listings.csv.gz' # 94 cols
# filename = '201702_listings.csv' # 15 cols

df_read = pd.read_csv(filename,
                       encoding='utf-8',
                       compression='gzip',
                       sep=',',
                       index_col = 0,
                       header=0
                       )
df_read['street'] = df_read['street'].apply(lambda x: x.lower())

/usr/local/bin/notebooks/data/airbnbdata


In [5]:
# Places Names

%cd ./geo/regexp

f=open("pattern.txt", "r")
patterns =f.read().replace('\n','').lower()

# patterns
# type(patterns)

/usr/local/bin/notebooks/data/airbnbdata/geo/regexp


In [6]:
# Area Mapping

# Maps the lower-cased text before the ':' on each line to the lower-cased text
# after it. Kept as a defaultdict(list): looking up an unknown name with
# AreaDict[name] yields a falsy empty list instead of raising KeyError.
AreaDict = defaultdict(list)
with open("area_mapping.txt") as f:
    reader = csv.reader(f, delimiter=':')
    for line in reader:
        # csv.reader yields [] for a blank line and a one-element list for a
        # line without ':'; indexing those would raise IndexError.
        if len(line) < 2:
            continue
        AreaDict[line[0].strip().lower()] = line[1].strip().lower()

# print(AreaDict.items())
# print(AreaDict['ashtown'])

### Pre-processing

In [7]:
# Work on a copy so that df_read keeps the listings exactly as they were loaded.
df = df_read.copy()
# (rows, columns) of the working frame
print(df.shape)

(6729, 94)


#### Clean monetary values

In [8]:
# Strip '$' and ',' from the two money columns and store them as floats.
for money_col in ('price', 'cleaning_fee'):
    stripped = df[money_col].replace('[$,]', '', regex=True)
    df[money_col] = stripped.astype(float)

#### Dublin City only

In [9]:
# Show the distinct neighbourhood_cleansed values, then count the missing
# entries in the three address-related columns.
print(df['neighbourhood_cleansed'].unique())
df['neighbourhood_cleansed'].describe()
df.loc[:, ['street', 'zipcode', 'neighbourhood_cleansed']].isnull().sum()

['Dublin City' 'South Dublin' 'Fingal' 'Dn Laoghaire-Rathdown']


street                       0
zipcode                   4212
neighbourhood_cleansed       0
dtype: int64

In [10]:
# Keep only the rows whose neighbourhood_cleansed is 'Dublin City'.
# The explicit .copy() makes df an independent frame, so the columns added to
# it in later cells are not written to a slice of the previous frame
# (which pandas reports as SettingWithCopyWarning).
df = df[df['neighbourhood_cleansed'] == 'Dublin City'].copy()
print(df.shape)
print(df['neighbourhood_cleansed'].unique())
# Missing values left in the two address columns after filtering.
df[['street', 'zipcode']].isnull().sum(axis=0)

(5377, 94)
['Dublin City']


street        0
zipcode    3305
dtype: int64

#### RegExp: Extract Postcode

In [11]:
"""
for reference, eircode regexp: re1 = re.search(r'd\d{1,2}\s{0,1}[a-z0-9]{4}', x) # eircode format Dxx xxxx

in this particular case, however, the goal can be achieved with a more general approach:
only the postal district is extracted, written in any of the formats below

d01, d10 >> 'd' followed by two digits
d 10, d 01 >> 'd', one whitespace, two digits
d6w >> 'd', one digit and an optional 'w'
d 6w, d 7 >> 'd', one whitespace, then the district
d1, d3 >> 'd' followed by a single digit

the same formats are accepted with 'dublin' instead of 'd'

"""
def replace(x):
    r"""Extract the Dublin postal district from a street address.

    Two spellings are recognised, each anchored at the start of a word and
    followed by at most one whitespace character and then the district:

    * ``d``      -- ``d1``, ``d01``, ``d 10``, ``d6w``, ``d 6w``; this also
      picks up the first part of an Eircode such as ``d12 v769``
    * ``dublin`` -- ``dublin 7``, ``dublin7``, ``dublin 12``, ``dublin 6w``

    The district is either two digits, or one digit optionally followed by
    ``w``. A ``d`` spelling anywhere in the address takes precedence over a
    ``dublin`` spelling.

    Parameters
    ----------
    x : str
        Street address; matching is done on its lower-cased form.

    Returns
    -------
    str
        ``'dublin <district>'`` with leading zeros removed from the district
        (``'d01'`` -> ``'dublin 1'``), or ``''`` when ``x`` is not a string or
        contains no usable district.
    """
    if not isinstance(x, str):
        return ''

    # Two digits are tried before "one digit + optional w" so that "d12" is
    # read as district 12 and "d02w..." as district 2, not as "2w".
    district = r'(\d{2}|\dw?)'
    text = x.lower()

    # The leading \b keeps words that merely end in "d" ("road 12", "rd 7")
    # from being mistaken for the "d 12" / "d 7" spelling.
    found = (re.search(r'\bd\s?' + district, text)
             or re.search(r'\bdublin\s?' + district, text))
    if not found:
        return ''

    # Only leading zeros are dropped, so "10" and "20" stay intact.
    code = found.group(1).lstrip('0')
    if not code[:1].isdigit():
        # Nothing but zeros (and possibly a "w") was matched.
        return ''
    return 'dublin ' + code

# Derive the postal district for every listing; a missing street stays NaN.
# Only *leading* zeros are stripped ("dublin 01" -> "dublin 1"). Removing every
# '0' would also turn "dublin 10" into "dublin 1" and "dublin 20" into "dublin 2".
df['zipcode_new'] = df['street'].apply(
    lambda x: re.sub(r'\b0+(?=\d)', '', replace(x)) if not pd.isnull(x) else np.nan
)
# Random spot check of the extraction (unseeded, so it differs on every run).
df[['street', 'zipcode_new']].sample(5)

Unnamed: 0_level_0,street,zipcode_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1
12954373,"red cow lane, dublin, dublin dublin 7, ireland",dublin 7
15373806,"dominick street lower, dublin, county dublin, ...",
7353069,"fairview avenue lower, fairview, dublin, ireland",
4213654,"daniel street, dublin, dublin, ireland",
7297942,"townsend st, dublin, dublin, ireland",


In [12]:
# Number of listings for which no postal district was found in the street text.
len(df.loc[df['zipcode_new'] == ''])

3194

#### RegExp: use the place-names file to extract the core location

In [13]:
# Wrap the place-name pattern in one capture group and keep the text it
# matches in each street as street_cleansed (NaN when nothing matches).
df['street_cleansed'] = df['street'].str.extract('({})'.format(patterns), expand=True)
df.loc[:, ['zipcode_new', 'street_cleansed']].head(10)

Unnamed: 0_level_0,zipcode_new,street_cleansed
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10778114,dublin 4,
14348712,,
15749806,,foxrock
9602076,dublin 2,
15952233,,
14737754,,dorset street
3592153,,ballsbridge
17230584,dublin 2,
10932760,dublin 6,
13168698,,


#### Use the area mapping file to get the postal code

In [14]:
def mapArea(x, mapping=None):
    """Return the value stored for place name ``x``, or ``''`` if there is none.

    Parameters
    ----------
    x : hashable
        Place name to look up; missing values such as NaN are simply not found.
    mapping : mapping, optional
        Lookup table to use. Defaults to the module-level ``AreaDict``.

    Returns
    -------
    The stored value when it is truthy, otherwise ``''``.
    """
    if mapping is None:
        mapping = AreaDict
    # .get() never triggers a defaultdict's factory, so unknown names (and every
    # NaN passed in) are not inserted into the table as a side effect, which
    # indexing with mapping[x] would do.
    return mapping.get(x) or ''

In [15]:
# Fill the postal districts that are still empty from the place name that was
# extracted from the street.
mask = df['zipcode_new'] == ''
# Series.map calls mapArea once per place name; a row-wise
# DataFrame.apply(axis=1) would build a full row object for every listing just
# to read one column, and misbehaves when no row is selected.
df.loc[mask, 'zipcode_new'] = df.loc[mask, 'street_cleansed'].map(mapArea)

# Shuffle the rows so that head() shows a varied sample; the fixed seed keeps
# the row order identical on every run.
df = df.sample(frac=1, random_state=42)
df[['street', 'street_cleansed', 'zipcode_new']].head()

Unnamed: 0_level_0,street,street_cleansed,zipcode_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11293171,"north strand road, dublin, dublin, ireland",north,
17156981,"brookfield, kimmage, county dublin d12 v769, i...",kimmage,dublin 12
11312270,"military rd, dublin, dublin d8, ireland",,dublin 8
16081960,"reuben street, dublin, ireland, dublin d8, ire...",,dublin 8
6741086,"dame street, dublin, dublin 2, ireland",dame street,dublin 2


In [16]:
# Listings still without a postal district after the area-mapping step.
len(df.loc[df['zipcode_new'] == ''])

2193

### Quantify missing Postal code data

In [17]:
# Listings without a postal district, restricted to room_type 'Entire home/apt'.
df_na = df.loc[df['zipcode_new'] == '']
df_na = df_na.loc[df_na['room_type'] == 'Entire home/apt']
print(len(df_na))

# Count the non-null host_id values per property_type, flatten the two-level
# column header to 'host_id_count' and show the five largest groups.
grouped = df_na.groupby('property_type').agg({'host_id': ['count']})
grouped.columns = ['_'.join(levels) for levels in grouped.columns]
grouped.sort_values(by='host_id_count', ascending=False).head()

1087


Unnamed: 0_level_0,host_id_count
property_type,Unnamed: 1_level_1
Apartment,835
House,205
Townhouse,19
Other,9
Cabin,8


In [18]:
# Listings without a postal district.
len(df.loc[df['zipcode_new'] == ''])

2193

In [19]:
# Listings for which zipcode_new is anything other than the empty string.
len(df.loc[df['zipcode_new'] != ''])

3184

In [20]:
# Spot check: every listing whose street mentions George's Quay, with the
# place name and postal district it received.
# (swap in e.g. "terenure" to inspect another location)
tmp = df.copy()
tmp = tmp.loc[tmp['street'].str.contains("george's quay")]
tmp.loc[:, ['street', 'street_cleansed', 'zipcode_new']].head()

Unnamed: 0_level_0,street,street_cleansed,zipcode_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8072359,"george's quay, dublin, dublin dublin 2, ireland",,dublin 2
7784872,"george's quay, dublin, dublin 2, ireland",,dublin 2
15978026,"george's quay, dublin, county dublin, ireland",,
16156397,"george's quay, dublin, county dublin, ireland",,
7276073,"george's quay, dublin, dublin, ireland",,


In [21]:
# Distinct street strings among the listings that have a non-empty zipcode_new.
tmp = df.loc[df['zipcode_new'] != '']
tmp['street'].unique()

array(['brookfield, kimmage, county dublin d12 v769, ireland',
       'military rd, dublin, dublin d8, ireland',
       'reuben street, dublin, ireland, dublin d8, ireland', ...,
       'longeadow apartments, dublin, dublin d8, ireland',
       'brickfield walk, irishtown, county dublin d4, ireland',
       'bridgefoot street, dublin, dublin dublin 8, ireland'],
      dtype=object)

In [22]:
# Every distinct place name that the pattern extracted (NaN = no match).
df['street_cleansed'].unique()

array(['north', 'kimmage', nan, 'dame street', 'capel street',
       'drumcondra', 'cabra', 'ringsend', 'south circular road',
       'rathmines', 'fairview', 'portobello', 'leeson street', 'pembroke',
       'finglas', 'artane', 'sandymount', 'mountjoy', 'grange',
       'mary street', 'raheny', 'kilmainham', 'merrion', 'smithfield',
       'donnybrook', 'ranelagh', 'ballsbridge', 'crumlin', 'rathgar',
       "harold's cross", 'talbot street', 'temple bar',
       'marlborough street', 'harmonstown', 'rathfarnham', 'santry',
       'ballybough', 'glasnevin', 'clontarf', 'parnell square', 'grace',
       'arbour hill', 'dorset street', 'phibsboro', 'phoenix',
       'palmerston', 'donnycarney', 'terenure', 'marino', 'shangan',
       'balgriffin', 'oxmantown', 'botanic', 'swords', "o'connell street",
       'stoneybatter', 'whitehall', 'east wall', 'drimnagh',
       'wexford street', 'milltown', 'malahide', 'donaghmede',
       'arran quay', 'inchicore', 'abbey street', 'islandbridge

### Write to file

In [23]:
# Step back up two directories and write the processed listings there as CSV.
# NOTE(review): index=False leaves the DataFrame index out of the file. The
# index was taken from the first column of the listings CSV (index_col=0 when
# reading), so that column is not written -- confirm this is intended.
%cd ../../
df.to_csv('201702_listings_preproc.csv', index=False)

/usr/local/bin/notebooks/data/airbnbdata
