In [1]:
import nltk
from nltk.corpus import twitter_samples, stopwords
import pickle
import numpy as np
from package import process_tweets, cosine_similarity

In [2]:
# Load both labelled samples in a fixed order: positive first, then negative.
positive_tweets, negative_tweets = (
    twitter_samples.strings(file_name)
    for file_name in ('positive_tweets.json', 'negative_tweets.json')
)

# Concatenate so positive tweets occupy the leading indices of the combined list.
all_tweets = positive_tweets + negative_tweets

In [3]:
# Load the English word embeddings. The context manager closes the file handle
# as soon as loading finishes instead of leaving it open for the kernel's lifetime.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'en_embeddings.p' if it comes from a trusted source.
with open('en_embeddings.p', 'rb') as embeddings_file:
    en_embeddings = pickle.load(embeddings_file) #english word embeddings

In [4]:
def get_document_embedding(tweet, en_embeddings):
    """Return the sum of the word embeddings of a tweet's processed tokens.

    Args:
        tweet: Raw tweet text; it is tokenised with ``process_tweets``.
        en_embeddings: Mapping from word to embedding vector. Only ``.get``
            is used, and words missing from the mapping contribute 0.

    Returns:
        A NumPy array of length 300 holding the accumulated embedding
        (all zeros when no token has an embedding).
    """
    tokens = process_tweets(tweet)

    # Accumulator is created with the fixed length of 300 used by this notebook.
    embedding_sum = np.zeros(300)
    for token in tokens:
        embedding_sum += en_embeddings.get(token, 0)

    return embedding_sum

In [5]:
# Sample tweet containing a retweet marker, handles, an emoticon, hashtags and a URL.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# Embed the sample tweet and display the last five components of its vector.
tweet_embedding = get_document_embedding(custom_tweet, en_embeddings)
tweet_embedding[-5:]

array([-0.00268555, -0.15378189, -0.55761719, -0.07216644, -0.32263184])

In [6]:
def get_document_vecs(docs, embeddings):
    """Embed every document and return the embeddings in two layouts.

    Args:
        docs: Iterable of raw documents, each passed to
            ``get_document_embedding``.
        embeddings: Word-embedding mapping forwarded to
            ``get_document_embedding``.

    Returns:
        A tuple ``(doc_matrix, doc_dict)``: ``doc_matrix`` stacks the document
        embeddings as rows in input order, and ``doc_dict`` maps each
        document's position in ``docs`` to the same embedding array.
    """
    # Each document is embedded exactly once; both outputs share these arrays.
    vectors = [get_document_embedding(text, embeddings) for text in docs]
    index_to_vector = dict(enumerate(vectors))

    return np.vstack(vectors), index_to_vector

In [7]:
# Embed every tweet: doc_matrix stacks one embedding per row, doc_dict maps tweet index -> embedding.
doc_matrix, doc_dict = get_document_vecs(all_tweets, en_embeddings)

In [8]:
# Sanity check: get_document_vecs creates one dictionary entry and one matrix row per tweet.
print("Length of dictionary: ", len(doc_dict))
print("Document vecs shape: ", doc_matrix.shape)

Length of dictionary:  10000
Document vecs shape:  (10000, 300)


In [9]:
# Query tweet; the bare call below displays the tokens process_tweets returns for it.
my_tweet = "i am sad"
process_tweets(my_tweet)

['sad']

In [10]:
# Embed the query tweet (this rebinds tweet_embedding from the earlier sample-tweet cell).
tweet_embedding = get_document_embedding(my_tweet, en_embeddings)

In [11]:
# Brute-force baseline: compare the query embedding against the whole doc_matrix
# and print the tweet at the arg-max position.
# NOTE(review): presumably cosine_similarity returns one score per row of
# doc_matrix -- confirm against its definition in `package`.
max_id = np.argmax(cosine_similarity(doc_matrix, tweet_embedding))
print(all_tweets[max_id])

@zoeeylim sad sad sad kid :( it's ok I help you watch the match HAHAHAHAHA


In [12]:
# Read both sizes from the stacked embedding matrix itself. The previous
# len(doc_dict[1]) lookup depended on an arbitrary key and raised KeyError
# when the corpus held a single document.
N_VECS, N_DIMS = doc_matrix.shape #no. of vectors, dimension in each vector
print(N_VECS,N_DIMS)

10000 300


In [13]:
N_PLANES = 10 # number of hyperplanes per universe; make_hash_table allocates 2**N_PLANES buckets
N_UNIVERSES = 25 # number of independent plane sets ("universes") the hashing is repeated with

In [14]:
# Fixed seed so the random planes, and every hash value derived from them, are reproducible.
np.random.seed(11)
# One (N_DIMS, N_PLANES) matrix of normally distributed values per universe;
# hash_value_of_vector treats each column as one plane.
planes_l = [np.random.normal(size=(N_DIMS, N_PLANES)) for _ in range(N_UNIVERSES)]

In [15]:
def hash_value_of_vector(v, planes):
    """Hash a single vector by the side of each plane it falls on.

    Plane ``i`` (column ``i`` of ``planes``) contributes ``2**i`` to the hash
    when the dot product of ``v`` with that column is greater than or equal
    to zero, so a zero dot product counts as the non-negative side.

    Args:
        v: One vector, shaped ``(n_dims,)`` or ``(1, n_dims)``.
        planes: Array of shape ``(n_dims, n_planes)``, one plane per column.

    Returns:
        The bucket index as a Python ``int`` in ``[0, 2**n_planes)``.

    Raises:
        ValueError: If ``v`` does not describe exactly one vector, i.e. the
            dot product does not yield one value per plane.
    """
    n_planes = planes.shape[1]

    # Flatten with ravel instead of squeeze: squeeze turns the single-plane
    # result into a 0-d array, which cannot be indexed per plane.
    side_bits = np.ravel(np.dot(v, planes) >= 0)

    if side_bits.size != n_planes:
        raise ValueError(
            f"expected one vector giving {n_planes} plane signs, "
            f"got {side_bits.size} values"
        )

    # Python-int shifts keep the result exact for any number of planes.
    return int(sum(1 << i for i, bit in enumerate(side_bits) if bit))

In [16]:
np.random.seed(11)
ids = 1
planes = planes_l[ids]  # get one 'universe' of planes to test the function
# Size the test vector with N_DIMS, the same constant the planes were built
# with, so the dot product stays valid if the embedding dimension changes.
vec = np.random.rand(1, N_DIMS)
print(f" The hash value for this vector,",
      f"and the set of planes at index {ids},",
      f"is {hash_value_of_vector(vec, planes)}")

 The hash value for this vector, and the set of planes at index 1, is 43


In [17]:
def make_hash_table(vec, planes):
    """Distribute vectors into buckets keyed by their plane-based hash.

    Args:
        vec: Iterable of vectors (for example the rows of a 2-D array); each
            item is hashed with ``hash_value_of_vector``.
        planes: Array whose second dimension is the number of planes; it is
            forwarded unchanged to ``hash_value_of_vector``.

    Returns:
        A tuple ``(hash_table, id_table)``. Both dictionaries contain every
        key from 0 to ``2**n_planes - 1``; ``hash_table[h]`` lists the vectors
        hashed to ``h`` and ``id_table[h]`` lists their positions in ``vec``.
    """
    bucket_count = 2 ** planes.shape[1]

    # Pre-create every bucket so empty buckets are present as empty lists.
    hash_table = {}
    id_table = {}
    for bucket in range(bucket_count):
        hash_table[bucket] = []
        id_table[bucket] = []

    for position, vector in enumerate(vec):
        bucket = hash_value_of_vector(vector, planes)
        hash_table[bucket].append(vector)
        id_table[bucket].append(position)

    return hash_table, id_table

In [18]:
np.random.seed(11)
planes = planes_l[0]
# Size the test vector with N_DIMS, the same constant the planes were built
# with, instead of repeating the literal 300.
vec = np.random.rand(1, N_DIMS)
print(planes.shape, '')

# Smoke-test make_hash_table on a single random vector.
hash_table_temp, id_table_temp = make_hash_table(vec, planes)

(300, 10) 


In [19]:
# make_hash_table pre-allocates every bucket: planes has 10 columns, so 2**10 = 1024 entries.
len(hash_table_temp)

1024

In [20]:
# The id table is pre-allocated with the same 2**10 = 1024 bucket keys as the hash table.
len(id_table_temp)

1024

In [21]:
# Build one hash table and one id table per universe by hashing every row of
# doc_matrix with that universe's planes; position u in both lists corresponds
# to planes_l[u].
hash_tables = []
id_tables = []
for universe in range(N_UNIVERSES):
    print("Working on hash universe : #", universe)
    planes = planes_l[universe]
    hash_table, id_table = make_hash_table(doc_matrix, planes)
    hash_tables.append(hash_table)
    id_tables.append(id_table)

Working on hash universe : # 0
Working on hash universe : # 1
Working on hash universe : # 2
Working on hash universe : # 3
Working on hash universe : # 4
Working on hash universe : # 5
Working on hash universe : # 6
Working on hash universe : # 7
Working on hash universe : # 8
Working on hash universe : # 9
Working on hash universe : # 10
Working on hash universe : # 11
Working on hash universe : # 12
Working on hash universe : # 13
Working on hash universe : # 14
Working on hash universe : # 15
Working on hash universe : # 16
Working on hash universe : # 17
Working on hash universe : # 18
Working on hash universe : # 19
Working on hash universe : # 20
Working on hash universe : # 21
Working on hash universe : # 22
Working on hash universe : # 23
Working on hash universe : # 24


In [22]:
def approximate_knn(doc_id, v, planes_l, k=1, num_universes_to_use=N_UNIVERSES):
    
    vecs_to_consider_l = list()
    ids_to_consider_l = list()
    
    ids_to_consider_set =  set()
    
    for universe_id in range(num_universes_to_use):
        
        planes = planes_l[universe_id]
        
        hash_value_of_vector = (v, planes)
        
        hash_table = hash_tables[universe_id]
        
        document_

IndentationError: expected an indented block after function definition on line 1 (212121665.py, line 2)