In [1]:
# https://rstudio-pubs-static.s3.amazonaws.com/407929_afc5ef0f2ad648389447a6ca3f4a7cd4.html

In [2]:
# !pip install sklearn
# !pip install reverse_geocoder

In [3]:
import pandas as pd
import numpy as np
import csv
from sklearn import preprocessing
# import reverse as rg 
import pprint 
import re
from collections import defaultdict

In [4]:
# Switch the kernel's working directory to the Airbnb data folder so the
# relative file names used in later cells resolve, then list its contents.
# The path is relative, so re-running this cell without restarting the kernel
# resolves it against the already-changed working directory.
%cd ../../data/airbnbdata
!ls

/usr/local/bin/notebooks/data/airbnbdata
201702_calendar.csv		    201811_listings.csv
201702_calendar.csv.gz		    201811_listings.csv.gz
201702_listings.csv		    201811_neighbourhoods.csv
201702_listings.csv.gz		    201811_reviews.csv
201702_listings_preproc.csv	    201811_reviews.csv.gz
201702_listings_to_be_geocoded.csv  column_names.txt
201702_neighbourhoods.csv	    core
201702_reviews.csv		    df_na_geocoded.csv
201702_reviews.csv.gz		    geo
201811_calendar.csv		    housing
201811_calendar.csv.gz


In [5]:
# Gzipped listings export with the full 94 columns; the uncompressed
# '201702_listings.csv' variant carries only 15 columns.
filename = '201702_listings.csv.gz'

# The first row supplies the column names and column 0 becomes the index.
read_options = dict(
    sep=',',
    header=0,
    index_col=0,
    encoding='utf-8',
    compression='gzip',
)
df_read = pd.read_csv(filename, **read_options)

In [6]:
# Work on a copy so df_read stays untouched, and discard listing 4745260,
# the datapoint with a 753-night minimum stay.
OUTLIER_LISTING_ID = 4745260
df = df_read.copy().drop(index=[OUTLIER_LISTING_ID])

print(df.shape)

(6728, 94)


#### clean monetary values

In [7]:
# Remove '$' signs and ',' separators from the money columns, then store the
# amounts as floats.
for money_column in ('price', 'cleaning_fee'):
    df[money_column] = (
        df[money_column]
        .replace('[$,]', '', regex=True)
        .astype(float)
    )

#### Geo

In [8]:
neighbourhoods = df['neighbourhood_cleansed']
print(neighbourhoods.unique())
# Not the last expression of the cell, so this summary is computed but not shown.
neighbourhoods.describe()
# Missing-value count per location column.
df[['street', 'zipcode', 'neighbourhood_cleansed']].isna().sum()

['Dublin City' 'South Dublin' 'Fingal' 'Dn Laoghaire-Rathdown']


street                       0
zipcode                   4212
neighbourhood_cleansed       0
dtype: int64

In [9]:
# Dublin City only
df_dub = df[df.neighbourhood_cleansed == 'Dublin City']
print(df_dub.shape)
print(df_dub.neighbourhood_cleansed.unique())
df_dub.neighbourhood_cleansed.describe()
df_dub[['street', 'zipcode']].isnull().sum(axis = 0)

(5376, 94)
['Dublin City']


street        0
zipcode    3305
dtype: int64

#### read pattern.txt

In [10]:
%cd ./geo/regexp
!ls
f=open("pattern.txt", "r")
patterns =f.read().replace('\n','').lower()
patterns
type(patterns)

/usr/local/bin/notebooks/data/airbnbdata/geo/regexp
area_mapping.txt  pattern.txt


str

#### new zipcode column - part 1: use regexp and pattern file

In [11]:
# Start from a copy of the Dublin City subset and lower-case the street text,
# since the pattern string was lower-cased when it was read.
df = df_dub.copy()
street_lower = df['street'].str.lower()
df['street'] = street_lower

# Wrap the pattern string in a single capture group and keep what it matches.
zip_regex = r'(' + patterns + ')'
df['zipcode_new'] = street_lower.str.extract(zip_regex, expand=True)
df['zipcode_new'].head()

id
10778114    dublin 4
14348712         NaN
15749806     foxrock
9602076     dublin 2
15952233         NaN
Name: zipcode_new, dtype: object

#### new zipcode column - part 2: look for zipcodes that could be used

For every NaN in the new zipcode column, fall back to the value in the listing's original `zipcode` column.
Note: in the run recorded below, the before/after counts show that this step fills some of the gaps (NaNs in `zipcode_new` drop from 2141 to 1875).

In [12]:
# before
print(df[['zipcode_new', 'zipcode']].isnull().sum(axis = 0))

mask = pd.isnull(df['zipcode_new'])
df['zipcode_new'] = np.where(mask, 
                                 df['zipcode'], 
                                 df['zipcode_new'])

# after
print(df[['zipcode_new']].isnull().sum(axis = 0))

zipcode_new    2141
zipcode        3305
dtype: int64
zipcode_new    1875
dtype: int64


#### new zipcode column - part 3: use regexp and area mapping file

In [13]:
# setup dict
AreaDict = defaultdict(list)
with open("area_mapping.txt") as f:
    reader = csv.reader(f, delimiter=':')
    for line in reader:
         AreaDict[line[0].strip().lower()] = line[1].strip().lower()

# print(AreaDict.items())
# print(AreaDict['ashtown'])

In [14]:
# regexp function

def replace(x):
    """Normalise a raw postcode/area value to a 'dublin <district>' label.

    The value is stripped and lower-cased, then checked against these rules
    in order; the first one that matches decides the result:

    1. Eircode such as 'd02 x285' or 'd6w f123' -> district taken from the
       routing key, leading zeros removed ('dublin 2', 'dublin 6w').
    2. A zero followed by a digit anywhere ('04') -> that digit.
    3. 'dublin' directly followed by one or two digits ('dublin15').
    4. 'd' directly followed by one or two digits ('d7').
    5. Any one or two digits, optionally followed by 'w' ('dublin 6w').
    6. Otherwise the value is looked up in the module-level AreaDict.

    Parameters
    ----------
    x : str or missing value
        Candidate postcode text. Non-string values are converted with str().

    Returns
    -------
    str or float
        The normalised label, or np.nan when x is missing or nothing matches.
    """
    if pd.isnull(x):
        return np.nan

    # Lower-case here as well as strip: values that fall back to the raw
    # zipcode column are not lower-cased upstream, while every pattern below
    # and every AreaDict key is lower-case.
    text = str(x).strip().lower()

    # Capture the whole routing key. Indexing the third character of the match
    # is only right for zero-padded keys: it turns 'd12 x285' into 'dublin 2'
    # and 'd4 abcd' into 'dublin ' plus a blank.
    tmp = re.search(r'd(\d{1,2}w?)\s?[a-z0-9]{4}', text)
    if tmp:
        return 'dublin ' + (tmp.group(1).lstrip('0') or '0')

    tmp = re.search(r'0(\d)', text)
    if tmp:
        return 'dublin ' + tmp.group(1)

    tmp = re.search(r'dublin(\d{1,2}w?)', text)
    if tmp:
        return 'dublin ' + tmp.group(1)

    tmp = re.search(r'd(\d{1,2}w?)', text)
    if tmp:
        return 'dublin ' + tmp.group(1)

    tmp = re.search(r'\d{1,2}w*', text)
    if tmp:
        digits = tmp.group(0)
        # A leading zero can only survive to this point when no digit follows it.
        return 'dublin ' + (digits[1:] if digits[0] == '0' else digits)

    # .get() keeps a miss from inserting an empty list into the defaultdict,
    # and makes the "no mapping" case explicit instead of relying on a bare
    # except around [].lower().
    area = AreaDict.get(text)
    if isinstance(area, str):
        return area.lower()
    return np.nan


In [15]:
# Run every candidate value through replace(), then list the distinct labels
# that remain.
df['zipcode_new'] = df['zipcode_new'].map(replace)
df['zipcode_new'].unique()


array(['dublin 4', 'dublin 1', 'dublin 18', 'dublin 2', nan, 'dublin 8',
       'dublin 6', 'dublin 3', 'dublin 11', 'dublin 9', 'dublin 7',
       'dublin 5', 'dublin 12', 'dublin 13', 'dublin 15', 'dublin 14',
       'dublin 10', 'dublin 20', 'dublin 35', 'dublin 6w', 'dublin 17',
       'dublin 0', 'dublin 16'], dtype=object)

In [16]:
# NaNs
# print(df_dub.shape)
# print(len(df[pd.isnull(df.zipcode_new)]))
# df = df[['street', 'zipcode_new']]
# df[pd.isnull(df.zipcode_new)].tail(10)

In [17]:
# Listings that still have no normalised zipcode, restricted to rentals of an
# entire home or apartment.
zip_missing = df['zipcode_new'].isna()
whole_home = df['room_type'] == 'Entire home/apt'
df_na = df[zip_missing & whole_home]
print(len(df_na))

# Count host_id values per property type, then flatten the two-level
# ('host_id', 'count') column label to 'host_id_count'.
grouped = df_na.groupby('property_type').agg({'host_id': ['count']})
grouped.columns = ['_'.join(levels) for levels in grouped.columns]
grouped.sort_values(by='host_id_count', ascending=False).head()

1139


Unnamed: 0_level_0,host_id_count
property_type,Unnamed: 1_level_1
Apartment,879
House,206
Townhouse,20
Other,13
Cabin,8


In [18]:
# Step two levels up from geo/regexp, back to the airbnbdata folder, so the
# file written by the next cell lands next to the input files; then list it.
%cd ../../
!ls

/usr/local/bin/notebooks/data/airbnbdata
201702_calendar.csv		    201811_listings.csv
201702_calendar.csv.gz		    201811_listings.csv.gz
201702_listings.csv		    201811_neighbourhoods.csv
201702_listings.csv.gz		    201811_reviews.csv
201702_listings_preproc.csv	    201811_reviews.csv.gz
201702_listings_to_be_geocoded.csv  column_names.txt
201702_neighbourhoods.csv	    core
201702_reviews.csv		    df_na_geocoded.csv
201702_reviews.csv.gz		    geo
201811_calendar.csv		    housing
201811_calendar.csv.gz


#### write to file

In [19]:
# Keep the index when writing: the listings were loaded with index_col=0, so
# the listing id lives in the index and index=False would drop it from the
# file, leaving rows that cannot be joined back to the source data.
df.to_csv('201702_listings_preproc.csv', index=True)