In [41]:
# !pip install memory_profiler

In [42]:
import pandas as pd
import numpy as np
# Show up to 30 edge items per axis, avoid line wrapping, and render every
# float with three significant digits when arrays are printed.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda value: "%.3g" % value},
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
%cd ../../data/airbnbdata
filename = '201702_listings_preproc.csv' # 15 cols

df = pd.read_csv(filename,
                       encoding='utf-8',
                       sep=',',
#                        index_col = 'host_id',
                       header=0
                       )
print(len(df))
df['street'] = df['street'].apply(lambda x: x.split(',')[0])

/usr/local/bin/notebooks/data/airbnbdata
5377


### Split Dataset: known vs. unknown Zip

In [44]:
# Partition the listings by whether `zipcode_new` is missing.
zip_is_missing = df['zipcode_new'].isnull()
df_unknown_zip = df[zip_is_missing]
df_known_zip = df[~zip_is_missing]

n_unknown = len(df_unknown_zip)
n_known = len(df_known_zip)
print('# unknown zip: {}'.format(n_unknown))
print('# known zip: {}'.format(n_known))
# The two parts together should add up to the full row count.
print(n_known + n_unknown)

# unknown zip: 2193
# known zip: 3184
5377


### Generate Matrix X of street names with known Zip

In [45]:

# Street-related columns of the known-zip listings, re-indexed from 0.
known_zip_columns = ['street', 'street_cleansed', 'zipcode_new']
streets_known_zip = df_known_zip[known_zip_columns].reset_index(drop=True)
streets_known_zip.head()

Unnamed: 0,street,street_cleansed,zipcode_new
0,brookfield,kimmage,dublin 12
1,military rd,,dublin 8
2,reuben street,,dublin 8
3,dame street,dame street,dublin 2
4,capel street,capel street,dublin 1


#### Create corpus of street names (known Zip)

In [46]:
# One document per known-zip listing: its street name.
corpus = streets_known_zip['street'].tolist()
corpus[:5]

['brookfield', 'military rd', 'reuben street', 'dame street', 'capel street']

#### Compile Document-Term Matrix

In [59]:
# Word-level TF-IDF vectoriser. norm=None leaves the row vectors
# un-normalised, and idf smoothing / sublinear tf scaling are switched off.
tf = TfidfVectorizer(
    analyzer='word',
    norm=None,
    smooth_idf=False,
    sublinear_tf=False,
)

# fit() learns the vocabulary and idf weights from the known-zip street names
# and returns the vectoriser itself, so `train_vocab` and `tf` are the same
# object.
train_vocab = tf.fit(corpus)

# Document-term matrix of the known-zip street names (sparse).
X = train_vocab.transform(corpus)

# Preview the first five rows; slice first so only those rows are densified.
X[:5].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### Generate Matrix W of street names with unknown Zip

In [48]:
# Street-related columns of the unknown-zip listings, re-indexed from 0.
unknown_zip_columns = ['street', 'street_cleansed', 'zipcode_new']
streets_unknown_zip = df_unknown_zip[unknown_zip_columns].reset_index(drop=True)
streets_unknown_zip.head()

Unnamed: 0,street,street_cleansed,zipcode_new
0,north strand road,north,
1,dublin,,
2,exchange street upper,,
3,donnellan avenue,,
4,richmond street south,,


In [49]:
# Rebind `streets_unknown_zip` to a full copy of the unknown-zip rows
# (all columns, original index) and collect their street names as queries.
streets_unknown_zip = df_unknown_zip.copy()
q = streets_unknown_zip['street'].tolist()
q[:5]

['north strand road',
 'dublin',
 'exchange street upper',
 'donnellan avenue',
 'richmond street south']

In [60]:
# Encode the unknown-zip street names with the already-fitted vectoriser `tf`
# (vocabulary and idf are not re-learned here), then densify.
query_term_matrix = tf.transform(q)
W = query_term_matrix.todense()
W[:5]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [61]:
# Dot product of every row of X with every row of W:
# res[i, j] is the score between row i of X and row j of W.
res = X @ W.transpose()
res[:5]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 7.48, 0, 7.48, 0, 0, 0, 7.48, 0, 0, 7.48, 0, 7.48, 0, 7.48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.48, 7.48, 0, 0, 0, ..., 7.48, 7.48, 0, 0, 0, 0, 0, 0, 0, 7.48, 0, 0, 7.48, 0, 0, 0, 7.48, 0, 0, 7.48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.48],
        [0, 0, 7.48, 0, 7.48, 0, 0, 0, 7.48, 0, 0, 7.48, 0, 7.48, 0, 7.48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.48, 7.48, 0, 0, 0, ..., 7.48, 7.48, 0, 0, 0, 0, 0, 0, 0, 7.48, 0, 0, 7.48, 0, 0, 0, 7.48, 0, 0, 7.48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.48],
        [0, 0, 7.48, 0, 7.48, 0, 0, 0, 7.48, 0, 0, 7.48, 0, 7.48, 0, 7.48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.48, 7.48, 0, 0, 0, ..., 7.

#### argmax

In [52]:
# For every column of `res`, the row position holding the largest score
# (axis=0 scans down each column). When several rows tie, np.argmax returns
# the first one.
#
# np.asarray(...).ravel() flattens the result to a plain list of ints whether
# `res` is an np.matrix (argmax comes back as a 1 x n matrix) or a regular 2-D
# ndarray (argmax is already 1-D); flattening it as a list of lists only works
# for the matrix case.
ix = np.asarray(np.argmax(res, axis=0)).ravel().tolist()
ix[:10]

[440, 278, 1157, 17, 29, 278, 43, 263, 1181, 2614]

#### max()

In [53]:
# Largest score in each column of `res`, as a plain Python list.
best_score_per_column = pd.DataFrame(res).max(axis=0)
scores = best_score_per_column.tolist()
scores[:5]

[69.26373880863443,
 34.18761624250555,
 72.60867920408916,
 15.174215577237268,
 57.49574786281826]

#### select argmax in df of interest

In [54]:
# Rows of the known-zip frame picked by position, in the order given by `ix`
# (a position may repeat when several queries share the same best match).
tmp = df_known_zip.iloc[ix].copy()
tmp.loc[:, ['street', 'street_cleansed', 'zipcode_new']].head()

Unnamed: 0,street,street_cleansed,zipcode_new
734,north strand road,north,dublin 3
457,dublin,,dublin 2
1933,exchange street upper,,dublin 8
24,richmond avenue,fairview,dublin 3
44,richmond avenue south,rathmines,dublin 6


In [55]:
# streets_unknown_zip['street']

In [56]:
# Write the matched zip codes and scores onto the unknown-zip rows.
# Plain lists are assigned so values are placed by position; `tmp` carries the
# index of the known-zip rows, which must not be used for alignment here.
matched_zipcodes = tmp['zipcode_new'].tolist()
streets_unknown_zip['zipcode_new'] = matched_zipcodes
streets_unknown_zip['scores'] = scores

streets_unknown_zip.loc[:, ['street', 'street_cleansed', 'zipcode_new', 'scores']]

Unnamed: 0,street,street_cleansed,zipcode_new,scores
0,north strand road,north,dublin 3,69.263739
12,dublin,,dublin 2,34.187616
13,exchange street upper,,dublin 8,72.608679
14,donnellan avenue,,dublin 3,15.174216
18,richmond street south,,dublin 6,57.495748
20,dublin,,dublin 2,34.187616
22,spencer dock,north,dublin 1,76.180570
25,portland row,,dublin 1,85.355490
26,aungier street,,dublin 8,53.219239
29,villa park avenue,,dublin 7,72.376604


#### Manual Inspection / Spot Check

In [57]:
# Spot check: rows whose best score is below 20.
# NOTE: this rebinds `tmp`, which previously held the matched known-zip rows.
has_low_score = streets_unknown_zip['scores'] < 20
tmp = streets_unknown_zip.loc[has_low_score].copy()
tmp.loc[:, ['street', 'street_cleansed', 'zipcode_new', 'scores']]

Unnamed: 0,street,street_cleansed,zipcode_new,scores
14,donnellan avenue,,dublin 3,15.174216
91,lennox street,,dublin 2,14.953796
92,captain's road,,dublin 9,12.677369
134,cecilia street,,dublin 2,14.953796
221,fitzgerald street,,dublin 2,14.953796
242,knockmaree,,dublin 12,0.000000
311,lomond avenue,,dublin 3,15.174216
357,munster street,,dublin 2,14.953796
392,gilbert road,,dublin 9,12.677369
506,killeen road,,dublin 9,12.677369


In [58]:
# Reject weak matches: blank out the assigned zip code wherever the best
# score is below 16, then report how many rows are left without a zip code.
#
# Series.mask replaces the flagged entries with NaN in one vectorised step.
# Unlike a row-wise apply(axis=1), it also yields a Series when the frame is
# empty, and rows with a NaN score compare False and keep their zip code.
low_confidence = streets_unknown_zip['scores'] < 16
streets_unknown_zip['zipcode_new'] = streets_unknown_zip['zipcode_new'].mask(low_confidence)
print(int(streets_unknown_zip['zipcode_new'].isnull().sum()))

114
