In [1]:
with open('corpus.txt') as f:
    contents = f.read()

In [2]:
contents = [contents]

In [11]:
from re import sub
from gensim.utils import simple_preprocess

query_strings = ['Ambrette Seed',
'Apple Cinnamon Granola',
'Arizona Seasoning',
'Americano Coffee',
'Baby Abalone',
'Cadbury Double Decker Chocolate Bar',
'Campari Tomato',
'Celery Soup',
'Chia Meal',
'Crunch Bars',
'Cardamom',
'Giardiniera',
'Hog Maw',
'Mccormick Montreal Steak Seasoning',
'Muesli',
'Mulberry',
'Munch Chocolate',
'Murukku Packet',
'Mango',
'Organic Maize',
'Organic Peruvian Groundcherry',
'Organic Tartar Cream',
'Orange Extract',
'Pickled Cauliflower',
'Pork Chump Chops',
'Pork Lungs',
'Pork Tripe',
'Peanut Butter',
'Smokies Sausage',
'Snickers Spread',
'Strawberry Gelatin',
'Salmon',
'Tomato',
'Tamarind',
'Vegan Carob Chips',
'Vegan Chicken Strips',
'Vegan Chorizo',
'Vegan Marshmallow',
'Vegan Puff Pastry Sheet',
'Vegan Semisweet Chocolate Chips',
'Vegan White Cake',
'Vegetable Stock',
'Vinegar']

documents = contents

stopwords = ['the', 'and', 'are', 'a']

# From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    doc = sub(r'<[^<>]+(>|$)', " ", doc)
    doc = sub(r'\[img_assist[^]]*?\]', " ", doc)
    doc = sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', " url_token ", doc)
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in documents]
queries = []
for query in query_strings:
  queries.append(preprocess(query))

In [15]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 1.1 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0


In [12]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

# Load the model: this is a big file, can take a while to download and open
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)

# Build the term dictionary, TF-idf model
for query in queries:
  dictionary = Dictionary(corpus+[query])
  tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

100%|██████████| 420/420 [00:08<00:00, 46.90it/s]


In [6]:
import numpy as np

In [17]:
doc_similarity_scores = []

In [19]:
for query in queries:
  query_tf = tfidf[dictionary.doc2bow(query)]

  index = SoftCosineSimilarity(
              tfidf[[dictionary.doc2bow(document) for document in corpus]],
              similarity_matrix)

  doc_similarity_scores.append(int(index[query_tf]))
print(doc_similarity_scores)


[array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array([0.56932014], dtype=float32), array(0., dtype=float32), array([0.7922004], dtype=float32), array(0., dtype=float32), array([0.95291597], dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array([0.99999994], dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array([0.7068612], dtype=float32), array([0.7068612], dtype=float32), array([0.7068612], dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array(0., dtype=float32), array([0.7922004], dtype=float32), array(0., dtype=float32), array([0.92930907], dtype=float32), arra

In [20]:
Z = [x for _,x in sorted(zip(doc_similarity_scores,query_strings))]
print(Z) 

['Ambrette Seed', 'Americano Coffee', 'Apple Cinnamon Granola', 'Arizona Seasoning', 'Cadbury Double Decker Chocolate Bar', 'Cardamom', 'Celery Soup', 'Crunch Bars', 'Giardiniera', 'Hog Maw', 'Mango', 'Mccormick Montreal Steak Seasoning', 'Muesli', 'Mulberry', 'Murukku Packet', 'Orange Extract', 'Peanut Butter', 'Pickled Cauliflower', 'Pork Chump Chops', 'Pork Lungs', 'Pork Tripe', 'Salmon', 'Smokies Sausage', 'Snickers Spread', 'Strawberry Gelatin', 'Tamarind', 'Vegetable Stock', 'Baby Abalone', 'Organic Maize', 'Organic Peruvian Groundcherry', 'Organic Tartar Cream', 'Vinegar', 'Campari Tomato', 'Tomato', 'Vegan Carob Chips', 'Vegan Chicken Strips', 'Vegan Chorizo', 'Vegan Marshmallow', 'Vegan Puff Pastry Sheet', 'Vegan Semisweet Chocolate Chips', 'Vegan White Cake', 'Chia Meal', 'Munch Chocolate']


In [22]:
#q2

def productExceptSelf(a, n):
 
    prod = 1
    flag = 0
 
    for i in range(n):
        if (a[i] == 0):
            flag += 1
        else:
            prod *= a[i]

    arr = [0 for i in range(n)]
 
    for i in range(n):

        if (flag > 1):
            arr[i] = 0
        elif (flag == 0):
            arr[i] = (prod // a[i])

        elif (flag == 1 and a[i] != 0):
            arr[i] = 0
 
        # If(flag == 1 && a[i] == 0)
        else:
            arr[i] = prod
 
    return arr
 

In [23]:
ans = productExceptSelf( [5, 1, 4, 2], 4)
 
print(*ans)

8 40 10 20


In [24]:
 ans = productExceptSelf( [1, 0, 3, 4],4)
 print(*ans)


0 12 0 0
