In [16]:
import pandas as pd
import numpy as np
# Wide, compact array printing: 30 edge items per axis, effectively no line
# wrapping, and floats rendered with three significant digits.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={'float': lambda value: "%.3g" % value},
)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [2]:
%cd ../../data/airbnbdata
filename = '201702_listings_preproc.csv' # 15 cols

df = pd.read_csv(filename,
                       encoding='utf-8',
                       sep=',',
#                        index_col = 'host_id',
                       header=0
                       )

# Isolate known Zip rows
df = df[~pd.isnull(df.zipcode_new)]
print('# known zip: {}'.format(len(df)))

# select column subset
df = df[['street', 'zipcode_new']]
df.head()

/usr/local/bin/notebooks/data/airbnbdata
# known zip: 3184


Unnamed: 0,street,zipcode_new
1,"brookfield, kimmage, county dublin d12 v769, i...",dublin 12
2,"military rd, dublin, dublin d8, ireland",dublin 8
3,"reuben street, dublin, ireland, dublin d8, ire...",dublin 8
4,"dame street, dublin, dublin 2, ireland",dublin 2
5,"capel street, dublin, county dublin, ireland",dublin 1


In [3]:
# keep only street name
df['street'] = df['street'].apply(lambda x: x.split(',')[0])
df.head()

Unnamed: 0,street,zipcode_new
1,brookfield,dublin 12
2,military rd,dublin 8
3,reuben street,dublin 8
4,dame street,dublin 2
5,capel street,dublin 1


#### train/test split

In [4]:
# Features (street text) and target (zip code label).
X = df.copy()
y = X.pop('zipcode_new')

# Fix the seed so the split -- and every number reported further down -- is
# reproducible when the notebook is restarted and run from the top.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

# Reshape y_train into a single-column 2-D array of shape (n_train, 1).
y_train = y_train.values.reshape((len(y_train), 1))
y_train.shape

(2388, 1)

#### use X_train to compile Document-Term Matrix X

In [5]:
# init tf-idf
tf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')

corpus = X_train.street.values.tolist()
print('Corpus sample: {}'.format(corpus[:5]))

# Transforms the data into a bag of words
train_vocab = tf.fit(corpus)
X = tf.transform(corpus)
X.todense()[:5, :]

Corpus sample: ['mountjoy street middle', 'parnell st', 'brunswick street north', 'howth road', 'a lea road']


matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

#### use X_test to generate Matrix W

In [6]:
# Query addresses: the street strings of the held-out test rows.
q = X_test['street'].tolist()
q[:5]

['carndonagh lawn',
 'de courcy square',
 "st audeon's terrace",
 'milltown road',
 'dangan park']

In [7]:
# Encode the queries with the vectorizer fitted on the training corpus and
# convert the result to a dense matrix (one row per query address).
W = tf.transform(q).todense()
W[:5]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

#### matrix mult. to get scores

In [8]:
# Similarity scores: rows follow the training corpus (rows of X), columns follow
# the query addresses (rows of W).
R = X @ W.transpose()
R[:5]

matrix([[0, 0, 0, 0, 0, 7.75, 0, 7.75, 0, 0, 7.75, 7.75, 7.75, 0, 0, 7.75, 7.75, 0, 0, 0, 0, 0, 0, 7.75, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 7.75, 0, 0, 0, 7.75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.75, 0, 0, 0, 0, 0, 0, 7.75, 0, 0, 0],
        [0, 0, 16.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16.2, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 16.2, 0, 0, 0, 0, 0, 0, 0, 0, 16.2, 0, 0, 0, 0, 16.2, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 7.75, 0, 7.75, 0, 0, 7.75, 7.75, 7.75, 0, 0, 7.75, 7.75, 0, 0, 0, 0, 0, 0, 7.75, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 7.75, 0, 0, 0, 7.75, 0, 0, 0, 0, 0, 0, 0, 0, 24.7, 0, 7.75, 0, 24.7, 0, 0, 0, 0, 7.75, 0, 0, 0],
        [0, 0, 0, 6.39, 0, 0, 0, 0, 6.39, 0, 0, 0, 0, 6.39, 0, 0, 0, 6.39, 6.39, 0, 0, 6.39, 0, 0, 6.39, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 6.39, 0, 0, 0, 0, 6.39, 0, 6.39, 0, 0, 6.39, 0, 0, 0, 0, 6.39, 0, 6.39, 0, 6.39, 0, 0, 6.39, 6.39, 6.39],
        [0, 0, 0, 6.39, 0, 0, 0, 0, 6.39, 0, 0, 0, 0, 6.39, 0, 0, 0, 6.39, 6

#### argmax(): get the index of the largest value in each column of R

Each column of R holds the product of the corpus tf-idf matrix with the tf-idf vector representation of one query address.

In [9]:
# Row index of the best-scoring training street for every query (column of R).
best_rows = np.argmax(R, axis=0)  # axis=0 -> scan down each column

# argmax over the matrix R yields a single-row result; flatten it into a plain
# list of integer row indices.
ix = np.asarray(best_rows).ravel().tolist()
ix[:10]

[298, 13, 349, 341, 56, 728, 2328, 595, 29, 2305]

#### max(): get the largest value in each column of R

In [10]:
# Highest similarity score in each column of R, i.e. one score per query address.
scores = np.asarray(R).max(axis=0).tolist()
scores[:5]

[77.05699669126278,
 20.518329146640006,
 99.47873887416908,
 49.70136698688268,
 16.06027145384625]

#### lookup most similar Zipcode

* The index of the largest value in R corresponds to the row in X which is most similar to the query address. 
* Thus we can get the most similar Zipcode from y_train.
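* Note that we get a prediction for every query address in the test set, even when no training street shares a word with it (score 0): in that case `argmax` simply returns row 0.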

In [11]:
# Look up the training label at each winning row index. y_train is a
# single-column 2-D array, so column 0 holds the zip code labels.
y_pred = y_train[ix, 0]
y_pred[:5]

array(['dublin 13', 'dublin 11', 'dublin 8', 'dublin 6', 'dublin 5'], dtype=object)

In [12]:
# Attach the true label, the predicted label and the best similarity score to
# each test row. assign() builds a new frame instead of writing columns into the
# frame returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning. y_test is aligned on the index; y_pred and scores are
# positional, exactly as with plain column assignment.
X_test = X_test.assign(zipcode=y_test, zipcode_pred=y_pred, scores=scores)
X_test.head()

Unnamed: 0,street,zipcode,zipcode_pred,scores
1749,carndonagh lawn,dublin 13,dublin 13,77.056997
2320,de courcy square,dublin 11,dublin 11,20.518329
3963,st audeon's terrace,dublin 8,dublin 8,99.478739
2344,milltown road,dublin 6,dublin 6,49.701367
1519,dangan park,dublin 6w,dublin 5,16.060271


In [13]:
# Fraction of test addresses whose predicted zip code equals the true one.
accuracy_score(y_test, y_pred)

0.7474874371859297

In [14]:
# Test rows whose best similarity score is exactly zero.
X_test.loc[X_test['scores'] == 0]

Unnamed: 0,street,zipcode,zipcode_pred,scores
2190,winter garden,dublin 2,dublin 7,0.0
2232,springfield,dublin 7,dublin 7,0.0
4209,stella townhouse,dublin 4,dublin 7,0.0
3500,a bóthar bhinn éadair,dublin 5,dublin 7,0.0
636,islandbridge,dublin 8,dublin 7,0.0
3988,mountbrown,dublin 8,dublin 7,0.0
3091,ifsc,dublin 1,dublin 7,0.0
1938,bettyglen,dublin 5,dublin 7,0.0
1183,shanagarry,dublin 6,dublin 7,0.0
2098,brooklands,dublin 4,dublin 7,0.0


In [15]:
# Drop the zero-score queries, then recompute accuracy on the remaining rows.
has_match = X_test['scores'].ne(0)
X_test = X_test.loc[has_match]
y_test = X_test['zipcode'].tolist()
y_pred = X_test['zipcode_pred'].tolist()
accuracy_score(y_test, y_pred)

0.7634961439588689