In [123]:
# System imports
import sys
import argparse
import pprint

# API imports
from googleapiclient.discovery import build
from sklearn.feature_extraction.text import CountVectorizer


apikey, cseid = ("AIzaSyAlKLHe1eAmug6XeTlQ1DxzOsPI4zax7Ms", "006096712590953604068:qoxtr78cjow")
precision, query = ('10', 'per se')

print('\nParameters:\nClient key = ', apikey, '\nEngine key = ', cseid,
        '\nQuery      = ', query, '\nPrecision  = ', precision)



Parameters:
Client key =  AIzaSyAlKLHe1eAmug6XeTlQ1DxzOsPI4zax7Ms 
Engine key =  006096712590953604068:qoxtr78cjow 
Query      =  per se 
Precision  =  10


In [124]:
# Setup search engine with above parameters
service = build("customsearch", "v1", developerKey=apikey)
cse = service.cse().list(q=query, cx=cseid)

# Run the query
resdict = cse.execute()

# For fast testing of 'per se' query
relevant_items = [resdict['items'][0], resdict['items'][5],
                  resdict['items'][6], resdict['items'][8],
                  resdict['items'][9]]
# Display search results and get relevance feedback lists
#relevant_items, nonrelevant_items = get_rf_data(resdict)

#print('\nRelevant: {}\tNonrelevant: {}\n'.format(len(relevant_items),
#        len(nonrelevant_items)))

# Construct document-text list
# TODO: try adding title text along with snippet's
docs = [item['snippet'] for item in relevant_items]

#Add query to docs to vectorize
docs.append(query)

# Read minimal stopword list from local file
from pathlib import Path
p = Path('.') / 'minimal-stop-pylist.txt'
stopwords = eval(p.read_text())

# Create CountVectorizer object anc construct doc-term matrix/index
vectorizer = CountVectorizer(stop_words=stopwords)
dtindex = vectorizer.fit_transform(docs)
dtindex.shape

(6, 67)

In [125]:
dtindex.dtype

dtype('int64')

In [126]:
type(dtindex)

scipy.sparse.csr.csr_matrix

In [127]:
# display nonzero terms in each document
vectorizer.inverse_transform(dtindex)

[array(['oysters', 'vegetables', 'fish', 'salon', 'right', 'pastries',
        'filled', 'meat', 'mini', 'left', 'chocolate', 'dipped', 'nut',
        'macadamia', 'center', 'entrance', 'se', 'per'],
       dtype='<U11'),
 array(['re', 'why', 'obvious', 'restaurants', 'worlds', 'many', 'dining',
        'pleasure', 've', 'best', 'reviews', '1287', 'se', 'per'],
       dtype='<U11'),
 array(['identity', 'restaurant', 'intentional', 'could', 'napkin', 'new',
        'bring', 'failure', 'wondered', 'briefly', 'mystique', 'such',
        '2016', '12', 'jan', 'se', 'per'],
       dtype='<U11'),
 array(['city', 'york', 'manhattan', 'circle', 'columbus', '10', 'warner',
        'time', 'floor', 'fourth', 'located', 'french', 'american',
        'restaurant', 'new', 'center', 'se', 'per'],
       dtype='<U11'),
 array(['table', 'find', 'size', 'party', 'date', 'select', 'ny',
        'reservation', 'make', 'york', 'time', 'restaurant', 'new', 'se',
        'per'],
       dtype='<U11'),
 array(

In [128]:
# Show doc-term sparse matrix
dtindex.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
        1, 2, 0, 0, 0, 0, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 3, 1, 1, 0, 0, 1, 1, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0,

In [129]:
# list the entire vocabulary 
terms = vectorizer.get_feature_names()
pprint.pprint(terms, compact=True)

['10', '12', '1287', '2016', 'american', 'best', 'briefly', 'bring', 'center',
 'chocolate', 'circle', 'city', 'columbus', 'could', 'date', 'dining', 'dipped',
 'entrance', 'failure', 'filled', 'find', 'fish', 'floor', 'fourth', 'french',
 'identity', 'intentional', 'jan', 'left', 'located', 'macadamia', 'make',
 'manhattan', 'many', 'meat', 'mini', 'mystique', 'napkin', 'new', 'nut', 'ny',
 'obvious', 'oysters', 'party', 'pastries', 'per', 'pleasure', 're',
 'reservation', 'restaurant', 'restaurants', 'reviews', 'right', 'salon', 'se',
 'select', 'size', 'such', 'table', 'time', 've', 'vegetables', 'warner', 'why',
 'wondered', 'worlds', 'york']


In [130]:
# length of each doc in matrix should be same as length of feature list
print('doclen: {}, num terms: {}'.format(len(dtindex.toarray()[0]), len(terms)))

doclen: 67, num terms: 67


In [131]:
# Create term-freq list
termfreq = zip(count_vect_2.get_feature_names()[:60], dtindex.toarray()[0])
pprint.pprint(list(termfreq), compact=True)

[('10', 0), ('12', 0), ('1287', 0), ('2016', 0), ('american', 0), ('best', 0),
 ('briefly', 0), ('bring', 0), ('center', 2), ('chocolate', 1), ('circle', 0),
 ('city', 0), ('columbus', 0), ('date', 0), ('dining', 0), ('dipped', 0),
 ('entrance', 1), ('failure', 1), ('filled', 0), ('fish', 1), ('floor', 0),
 ('fourth', 1), ('french', 0), ('identity', 0), ('intentional', 0), ('jan', 0),
 ('left', 0), ('located', 0), ('macadamia', 1), ('make', 0), ('manhattan', 1),
 ('meat', 0), ('mini', 0), ('mystique', 0), ('napkin', 1), ('new', 1),
 ('nut', 0), ('ny', 0), ('obvious', 0), ('oysters', 1), ('party', 0),
 ('pastries', 0), ('pleasure', 1), ('reservation', 0), ('restaurant', 1),
 ('restaurants', 2), ('reviews', 0), ('right', 0), ('salon', 0), ('se', 0),
 ('select', 0), ('size', 0), ('table', 2), ('time', 1), ('ve', 2),
 ('vegetables', 0), ('warner', 0), ('wondered', 0), ('worlds', 0), ('york', 0)]


In [99]:
# Test on paper example
docs = ["large dogs eat large dinners", "Dogs have mouths", "Large dinners for large mouths"]
query = 'large dogs'

#Add query to docs to vectorize
docs.append(query)

# Create CountVectorizer object and construct doc-term matrix/index
vectorizer = CountVectorizer(stop_words=stopwords)
dtindex = vectorizer.fit_transform(docs)
dtindex.shape

(4, 5)

In [100]:
terms = vectorizer.get_feature_names()
pprint.pprint(terms, compact=True)

['dinners', 'dogs', 'eat', 'large', 'mouths']


In [101]:
dtindex.toarray()

array([[1, 1, 1, 2, 0],
       [0, 1, 0, 0, 1],
       [1, 0, 0, 2, 1],
       [0, 1, 0, 1, 0]], dtype=int64)

In [104]:
dt = dtindex[:3]
dt.toarray()

array([[1, 1, 1, 2, 0],
       [0, 1, 0, 0, 1],
       [1, 0, 0, 2, 1]])

In [116]:
t = dt.transpose() * dt
t

<5x5 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Column format>

In [117]:
t.toarray()

array([[2, 1, 1, 4, 1],
       [1, 2, 1, 2, 1],
       [1, 1, 1, 2, 0],
       [4, 2, 2, 8, 2],
       [1, 1, 0, 2, 2]], dtype=int64)

In [119]:
q = dtindex[3]
q.shape

(1, 5)

In [120]:
q2 = t * q.transpose()

In [122]:
q2.toarray()

array([[ 5],
       [ 4],
       [ 3],
       [10],
       [ 3]], dtype=int64)

In [109]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD()
x2 = svd.fit_transform(dtindex[:3])
x2

array([[ 2.51079322, -0.32467158],
       [ 0.49204466,  1.30316641],
       [ 2.27679545,  0.07640876]])

In [110]:
svd = TruncatedSVD(n_components=3)
x3 =svd.fit_transform(dtindex[:3])
x3

array([[ 2.51079322, -0.32467158, -0.76844374],
       [ 0.49204466,  1.30316641, -0.24423219],
       [ 2.27679545,  0.07640876,  0.90020229]])

In [112]:
svd = TruncatedSVD(n_components=4)
x4 =svd.fit_transform(dtindex[:3])
x4

array([[ 2.51079322, -0.32467158, -0.76844374],
       [ 0.49204466,  1.30316641, -0.24423219],
       [ 2.27679545,  0.07640876,  0.90020229]])