In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
%cd ../../data/airbnbdata
filename = '201702_listings_preproc.csv' # 15 cols

df = pd.read_csv(filename,
                       encoding='utf-8',
                       sep=',',
#                        index_col = 'host_id',
                       header=0
                       )
print(len(df))
df['street'] = df['street'].apply(lambda x: x.split(',')[0])

/usr/local/bin/notebooks/data/airbnbdata
5377


### Split Dataset: known vs. unknown Zip

In [3]:
df_unknown_zip = df[pd.isnull(df.zipcode_new)]
df_known_zip = df[~pd.isnull(df.zipcode_new)]

print('# unknown zip: {}'.format(len(df_unknown_zip)))
print('# known zip: {}'.format(len(df_known_zip)))
print(len(df_known_zip)+len(df_unknown_zip))

# unknown zip: 2193
# known zip: 3184
5377


### Generate Matrix X of street names with know Zip

In [4]:

streets_known_zip = df_known_zip.reset_index().copy()
streets_known_zip = streets_known_zip[['street', 'street_cleansed', 'zipcode_new']]
streets_known_zip.head()

Unnamed: 0,street,street_cleansed,zipcode_new
0,brookfield,kimmage,dublin 12
1,military rd,,dublin 8
2,reuben street,,dublin 8
3,dame street,dame street,dublin 2
4,capel street,capel street,dublin 1


#### Create corpus of street names (known Zip)

In [5]:
corpus = streets_known_zip.street.values.tolist()
corpus[:5]

['brookfield', 'military rd', 'reuben street', 'dame street', 'capel street']

#### Compile Document-Term Matrix

In [6]:
# init
count_vec = CountVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)

# Transforms the data into a bag of words
train_vocab = count_vec.fit(corpus)
X = count_vec.transform(corpus)
X.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

### Generate Matrix W of street names with unknow Zip

In [7]:
streets_unknown_zip = df_unknown_zip.reset_index().copy()#.head()
streets_unknown_zip = streets_unknown_zip[['street', 'street_cleansed', 'zipcode_new']]
streets_unknown_zip.head()

Unnamed: 0,street,street_cleansed,zipcode_new
0,north strand road,north,
1,dublin,,
2,exchange street upper,,
3,donnellan avenue,,
4,richmond street south,,


In [8]:
streets_unknown_zip = df_unknown_zip.copy()#.head()
q = streets_unknown_zip.street.values.tolist()
q[:5]

['north strand road',
 'dublin',
 'exchange street upper',
 'donnellan avenue',
 'richmond street south']

In [9]:
W = count_vec.transform(q).todense()
W

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [10]:
res = X @ W.T
res

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 1]], dtype=int64)

#### argmax

In [11]:
ix = np.argmax(res, axis=0) # axis=0 > run through each column
ix = [item for sublist in ix.tolist() for item in sublist]
ix[:10]

[440, 278, 1157, 17, 29, 278, 43, 263, 141, 507]

#### max()

In [12]:
scores = pd.DataFrame(res).max().values.tolist()
scores[:5]

[3, 1, 3, 1, 2]

#### combine argmax and max

In [13]:
# ix_final = [i for i,j in zip(ix, scores) if j > 1]
# ix_final[:5]

#### select argmax in df of interest

In [14]:
tmp = df_known_zip.iloc[ix, :].copy()
tmp[['street', 'street_cleansed', 'zipcode_new']]#.head()

Unnamed: 0,street,street_cleansed,zipcode_new
734,north strand road,north,dublin 3
457,dublin,,dublin 2
1933,exchange street upper,,dublin 8
24,richmond avenue,fairview,dublin 3
44,richmond avenue south,rathmines,dublin 6
457,dublin,,dublin 2
68,spencer dock,,dublin 1
427,portland row,,dublin 1
226,castle street\ncastle street,,dublin 2
848,grace park avenue,grace,dublin 3


In [15]:
# streets_unknown_zip['street']

In [16]:
streets_unknown_zip['zipcode_new'] = tmp.zipcode_new.tolist()
streets_unknown_zip['scores'] = scores

streets_unknown_zip[['street', 'street_cleansed', 'zipcode_new', 'scores']]

Unnamed: 0,street,street_cleansed,zipcode_new,scores
0,north strand road,north,dublin 3,3
12,dublin,,dublin 2,1
13,exchange street upper,,dublin 8,3
14,donnellan avenue,,dublin 3,1
18,richmond street south,,dublin 6,2
20,dublin,,dublin 2,1
22,spencer dock,north,dublin 1,2
25,portland row,,dublin 1,2
26,aungier street,,dublin 2,2
29,villa park avenue,,dublin 3,2


In [17]:
streets_unknown_zip['zipcode_new'] = streets_unknown_zip.apply(lambda row: np.nan if row['scores'] == 1 else streets_unknown_zip['zipcode_new'], axis=1)
streets_unknown_zip[['street', 'street_cleansed', 'zipcode_new', 'scores']]

Unnamed: 0,street,street_cleansed,zipcode_new,scores
0,north strand road,north,dublin 3,3
12,dublin,,,1
13,exchange street upper,,dublin 3,3
14,donnellan avenue,,,1
18,richmond street south,,dublin 3,2
20,dublin,,,1
22,spencer dock,north,dublin 3,2
25,portland row,,dublin 3,2
26,aungier street,,dublin 3,2
29,villa park avenue,,dublin 3,2


In [18]:
len(streets_unknown_zip[pd.isnull(streets_unknown_zip.zipcode_new)])

643