In [1]:
# https://www.kaggle.com/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments
# !pip install sklearn

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
"""
CountVectorizer can lowercase letters, disregard punctuation and stopwords, 
but it can't LEMMATIZE or STEM
"""
txt = ["He is ::having a great Time, at the park time?",
       "She, unlike most women, is a big player on the park's grass.",
       "she can't be going"]
txt

['He is ::having a great Time, at the park time?',
 "She, unlike most women, is a big player on the park's grass.",
 "she can't be going"]

In [4]:
# Compile Document-Term Matrix

# init
count_vec = CountVectorizer(stop_words="english", analyzer='word', 
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)

# Transforms the data into a bag of words
count_train = count_vec.fit(txt)
bag_of_words = count_vec.transform(txt)

In [5]:
# Print the first 10 features of the count_vec
print("Every feature:\n{}".format(count_vec.get_feature_names()))
print("\nEvery 3rd feature:\n{}".format(count_vec.get_feature_names()[::3]))
print('Bag of Words: {}'.format(bag_of_words.todense()))
# type(bag_of_words)
vecs = bag_of_words.todense()
vecs[0]

Every feature:
['big', 'going', 'grass', 'great', 'having', 'park', 'player', 'time', 'unlike', 'women']

Every 3rd feature:
['big', 'great', 'player', 'women']
Bag of Words: [[0 0 0 1 1 1 0 2 0 0]
 [1 0 1 0 0 1 1 0 1 1]
 [0 1 0 0 0 0 0 0 0 0]]


matrix([[0, 0, 0, 1, 1, 1, 0, 2, 0, 0]])

In [6]:
# Vocabulary and vocabulary ID
print("Vocabulary size: {}".format(len(count_train.vocabulary_)))
print("Vocabulary content:\n {}".format(count_train.vocabulary_))

Vocabulary size: 10
Vocabulary content:
 {'having': 4, 'great': 3, 'time': 7, 'park': 5, 'unlike': 8, 'women': 9, 'big': 0, 'player': 6, 'grass': 2, 'going': 1}


### Test Unseen data

In [7]:
count_vec.transform(['lets go for a swim today']).todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

#### Matrix mult.

In [8]:
vecs @ vecs.T

matrix([[7, 1, 0],
        [1, 6, 0],
        [0, 0, 1]])

### Addresses

In [9]:
%cd ../../data/airbnbdata
filename = '201702_listings_preproc.csv' # 15 cols

df_read = pd.read_csv(filename,
                       encoding='utf-8',
                       sep=',',
                       index_col = 'host_id',
                       header=0
                       )

df_read = df_read.reset_index()
df_read = df_read[['street', 'zipcode_new']].head()
df_read

/usr/local/bin/notebooks/data/airbnbdata


Unnamed: 0,street,zipcode_new
0,"north strand road, dublin, dublin, ireland",
1,"brookfield, kimmage, county dublin d12 v769, i...",dublin 12
2,"military rd, dublin, dublin d8, ireland",dublin 8
3,"reuben street, dublin, ireland, dublin d8, ire...",dublin 8
4,"dame street, dublin, dublin 2, ireland",dublin 2


In [10]:
streets_list = df_read.street.values.tolist()
streets_list

['north strand road, dublin, dublin, ireland',
 'brookfield, kimmage, county dublin d12 v769, ireland',
 'military rd, dublin, dublin d8, ireland',
 'reuben street, dublin, ireland, dublin d8, ireland',
 'dame street, dublin, dublin 2, ireland']

In [11]:
# Learn a vocabulary dictionary of all tokens in the raw documents.
train_vocab = count_vec.fit(streets_list)
print("Vocabulary index:\n {}".format(train_vocab.vocabulary_))

train_data_matrix = count_vec.transform(streets_list)
train_data_matrix.todense()

Vocabulary index:
 {'north': 9, 'strand': 13, 'road': 12, 'dublin': 5, 'ireland': 6, 'brookfield': 0, 'kimmage': 7, 'county': 1, 'd12': 2, 'v769': 15, 'military': 8, 'rd': 10, 'd8': 3, 'reuben': 11, 'street': 14, 'dame': 4}


matrix([[0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0],
        [0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]])

In [12]:
q = ['dermot hurley avenue, dublin']
q = ['dermot hurley avenue', 'effra road, rathgar']

q_vector = count_vec.transform(q).todense()
q_vector

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [13]:
res = train_data_matrix.todense() @ q_vector.T
res

matrix([[0, 1],
        [0, 0],
        [0, 0],
        [0, 0],
        [0, 0]])

#### argmax

In [14]:
ix = np.argmax(res, axis=0) # axis=0 > run through each column
ix = [item for sublist in ix.tolist() for item in sublist]
ix

[0, 0]

In [15]:
df_read.iloc[[0, 0], :]
df_read.iloc[ix, :]

Unnamed: 0,street,zipcode_new
0,"north strand road, dublin, dublin, ireland",
0,"north strand road, dublin, dublin, ireland",


#### max()

In [16]:
ix = pd.DataFrame(res).max().values.tolist()
ix

[0, 1]

#### select argmax in df of interest

In [17]:
df_read.iloc[ix, :]

Unnamed: 0,street,zipcode_new
0,"north strand road, dublin, dublin, ireland",
1,"brookfield, kimmage, county dublin d12 v769, i...",dublin 12
