In [1]:
# https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments
# %pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np
# Print floats with three significant digits and use a very large line width
# so that matrix rows are not wrapped.
def _format_float_3g(value):
    """Render a float with three significant digits (e.g. 2.0986 -> '2.1')."""
    return "%.3g" % value


np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": _format_float_3g},
)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Toy corpus.
# CountVectorizer can lowercase letters and disregard punctuation and
# stop words, but it cannot lemmatize or stem.
txt = [
    "He is ::having a great Time, at the park time?",
    "She, unlike most women, is a big player on the park's grass.",
    "she can't be going",
]
txt

['He is ::having a great Time, at the park time?',
 "She, unlike most women, is a big player on the park's grass.",
 "she can't be going"]

In [4]:
# Build the document-term matrix for the toy corpus.

# TF-IDF vectorizer on word tokens with IDF smoothing, sublinear TF scaling
# and row normalisation all switched off.
tf = TfidfVectorizer(
    analyzer='word',
    norm=None,
    smooth_idf=False,
    sublinear_tf=False,
)

# Learn the vocabulary from the corpus, then encode the same corpus as a
# sparse document-term matrix.
count_train = tf.fit(txt)
bag_of_words = tf.transform(txt)

In [5]:
# Show every learned feature (in column order) and the dense TF-IDF matrix.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf, "get_feature_names_out"):
    feature_names = tf.get_feature_names_out().tolist()
else:
    feature_names = tf.get_feature_names()

# Densify once and reuse the result for printing and for the cells below.
vecs = bag_of_words.todense()

print("Every feature:\n{}".format(feature_names))
print('Bag of Words:\n {}'.format(vecs))

# First document's row vector.
vecs[0]

Every feature:
['at', 'be', 'big', 'can', 'going', 'grass', 'great', 'having', 'he', 'is', 'most', 'on', 'park', 'player', 'she', 'the', 'time', 'unlike', 'women']
Bag of Words:
 [[2.1 0 0 0 0 0 2.1 2.1 2.1 1.41 0 0 1.41 0 0 1.41 4.2 0 0]
 [0 0 2.1 0 0 2.1 0 0 0 1.41 2.1 2.1 1.41 2.1 1.41 1.41 0 2.1 2.1]
 [0 2.1 0 2.1 2.1 0 0 0 0 0 0 0 0 0 1.41 0 0 0 0]]


matrix([[2.1, 0, 0, 0, 0, 0, 2.1, 2.1, 2.1, 1.41, 0, 0, 1.41, 0, 0, 1.41, 4.2, 0, 0]])

In [6]:
# Inspect the fitted vocabulary: a mapping from token to column index.
vocab = count_train.vocabulary_
print(
    "Vocabulary size: {}".format(len(vocab)),
    "Vocabulary content:\n {}".format(vocab),
    sep="\n",
)

Vocabulary size: 19
Vocabulary content:
 {'he': 8, 'is': 9, 'having': 7, 'great': 6, 'time': 16, 'at': 0, 'the': 15, 'park': 12, 'she': 14, 'unlike': 17, 'most': 10, 'women': 18, 'big': 2, 'player': 13, 'on': 11, 'grass': 5, 'can': 3, 'be': 1, 'going': 4}


### Test on unseen data

In [7]:
# Encode a sentence whose tokens are not in the fitted vocabulary.
unseen_docs = ['lets go for a swim today']
tf.transform(unseen_docs).todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

#### Matrix multiplication

In [8]:
# Pairwise dot products between the document vectors.
doc_similarity = vecs @ vecs.T
doc_similarity

matrix([[41.2, 5.93, 0],
        [5.93, 38.7, 1.98],
        [0, 1.98, 15.2]])

### Addresses

In [9]:
# Load a small sample of Airbnb listing addresses.
#
# The file is addressed by a path relative to the notebook instead of `%cd`:
# a relative `%cd` changes the kernel's working directory, so running this
# cell a second time would resolve '../../data/airbnbdata' from the wrong
# place and fail.
# NOTE(review): any cell that relied on the changed working directory should
# build its paths from DATA_DIR as well.
DATA_DIR = '../../data/airbnbdata'
filename = '201702_listings_preproc.csv' # 15 cols

# Only the two address columns are kept, so the former host_id index round
# trip (index_col='host_id' followed by reset_index()) is not needed.
df_read = pd.read_csv(
    '{}/{}'.format(DATA_DIR, filename),
    encoding='utf-8',
    sep=',',
    header=0,
)[['street', 'zipcode_new']].head()
df_read

/usr/local/bin/notebooks/data/airbnbdata


Unnamed: 0,street,zipcode_new
0,"north strand road, dublin, dublin, ireland",
1,"brookfield, kimmage, county dublin d12 v769, i...",dublin 12
2,"military rd, dublin, dublin d8, ireland",dublin 8
3,"reuben street, dublin, ireland, dublin d8, ire...",dublin 8
4,"dame street, dublin, dublin 2, ireland",dublin 2


In [10]:
# Collect the street addresses as a plain Python list.
streets_list = df_read['street'].tolist()
streets_list

['north strand road, dublin, dublin, ireland',
 'brookfield, kimmage, county dublin d12 v769, ireland',
 'military rd, dublin, dublin d8, ireland',
 'reuben street, dublin, ireland, dublin d8, ireland',
 'dame street, dublin, dublin 2, ireland']

In [11]:
# Fit a TF-IDF model on the street addresses.
#
# A fresh vectorizer is bound to `tf` here. Calling fit() again on the
# instance created for the toy corpus would overwrite its vocabulary in place
# (fit() returns the vectorizer itself, so `count_train` would change too) and
# leave `bag_of_words` / `vecs` out of sync with the model on a re-run.
tf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')

# Learn a vocabulary dictionary of all tokens in the raw documents.
train_vocab = tf.fit(streets_list)
print("Vocabulary index:\n {}".format(train_vocab.vocabulary_))

# Encode the same addresses with the vocabulary that was just learned.
train_data_matrix = tf.transform(streets_list)
train_data_matrix.todense()

Vocabulary index:
 {'north': 9, 'strand': 13, 'road': 12, 'dublin': 5, 'ireland': 6, 'brookfield': 0, 'kimmage': 7, 'county': 1, 'd12': 2, 'v769': 15, 'military': 8, 'rd': 10, 'd8': 3, 'reuben': 11, 'street': 14, 'dame': 4}


matrix([[0, 0, 0, 0, 0, 2, 1, 0, 0, 2.61, 0, 0, 2.61, 2.61, 0, 0],
        [2.61, 2.61, 2.61, 0, 0, 1, 1, 2.61, 0, 0, 0, 0, 0, 0, 0, 2.61],
        [0, 0, 0, 1.92, 0, 2, 1, 0, 2.61, 0, 2.61, 0, 0, 0, 0, 0],
        [0, 0, 0, 1.92, 0, 2, 2, 0, 0, 0, 0, 2.61, 0, 0, 1.92, 0],
        [0, 0, 0, 0, 2.61, 2, 1, 0, 0, 0, 0, 0, 0, 0, 1.92, 0]])

In [12]:
# Query addresses to match against the listing streets. A single assignment
# replaces the previous pair, whose first value was overwritten before use.
q = ['dermot hurley avenue', 'effra road, rathgar']

# Encode the queries with the vocabulary fitted on the listing streets;
# tokens that are not in that vocabulary contribute nothing to the vector.
q_vector = tf.transform(q).todense()
q_vector

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.61, 0, 0, 0]])

In [13]:
# Score every listing street (rows) against every query (columns) with a
# dot product of their TF-IDF vectors.
street_vectors = train_data_matrix.todense()
res = street_vectors @ q_vector.T
res

matrix([[0, 6.81],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]])

#### argmax

In [14]:
# Row index of the best-scoring street for each query (axis=0 scans each
# column). np.asarray drops the matrix wrapper so the result flattens to a
# plain 1-D list of ints.
# Note: a query with no vocabulary overlap scores 0 everywhere, and argmax
# then reports row 0 even though nothing matched.
ix = np.asarray(np.argmax(res, axis=0)).ravel().tolist()
ix

[0, 0]

In [15]:
# df_read.iloc[[0, 0], :]
# df_read.iloc[ix, :]

#### max()

In [16]:
# Best score per query: the column-wise maximum of the score matrix.
score_frame = pd.DataFrame(res)
scores = score_frame.max(axis=0).tolist()
scores

[0.0, 6.8091662188484365]

#### Select the argmax rows from the DataFrame of interest

In [18]:
# Look up the best-matching listing row for each query and attach its score.
tmp = df_read.iloc[ix].assign(scores=scores)
tmp

Unnamed: 0,street,zipcode_new,scores
0,"north strand road, dublin, dublin, ireland",,0.0
0,"north strand road, dublin, dublin, ireland",,6.809166
