In [1]:
import os
from sentence_transformers import SentenceTransformer, util
import torch
import hashlib
import pinecone
from scipy.spatial.distance import cosine
import time
import numpy
import re

model = SentenceTransformer('all-MiniLM-L6-v2')

## pinecone setup

In [18]:
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment="us-west1-gcp")

index_name = "search2"

if index_name in pinecone.list_indexes():
    print(f"Index {index_name} already exists.")
else:
    # dimension is from printing out the shapes of the tensors from below
    pinecone.create_index(index_name, dimension=384, metric="cosine", pod_type="p1")
    print(f"Index {index_name} created successfully.")

index = pinecone.Index(index_name) 

Index search2 created successfully.


## generate the embeddings

In [17]:
folder_path = "read"
sentences1 = []
sentences2 = ["""
1. Thrasio holds the rights to the brands EverCare, Luxe, Temp and more.

2. Thrasio is the proud owner of EverCare, Luxe, Temp and other brands.

3. Thrasio has acquired the brands EverCare, Luxe, Temp and others.

4. Thrasio has taken control of the brands EverCare, Luxe, Temp and more.

5. Thrasio owns the labels EverCare, Luxe, Temp and other.

6. Thrasio has possession of the brands EverCare, Luxe, Temp and beyond.

7. Thrasio has taken ownership of EverCare, Luxe, Temp and other brands.

8. Thrasio holds the titles of EverCare, Luxe, Temp and others.

9. Thrasio has procured the brands EverCare, Luxe, Temp and more.

10. Thrasio is the proprietor of EverCare, Luxe, Temp and other labels."""]

def group_sentences(sentences, max_chars=1000):
    paragraphs = []
    current_paragraph = []
    current_chars = 0
    i = 0
    while i < len(sentences):
        sentence = sentences[i]
        sentence_chars = len(sentence)
        if sentence_chars > max_chars:
            paragraphs.append([sentence])
        elif current_chars + sentence_chars > max_chars:
            paragraphs.append(current_paragraph)
            current_paragraph = []
            current_chars = 0
        else:
            current_paragraph.append(sentence)
            current_chars += sentence_chars
        i += 1
    if current_paragraph:
        paragraphs.append(current_paragraph)
    return paragraphs

# Iterate over all files in the directory
for filename in os.listdir(folder_path):
    # Check if the file is a .txt file
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        # Open the file
        print("Reading", file_path)
        with open(file_path, "r") as file:
            # Read the contents of the file into a variable
            file_contents = file.read()
            # Split the contents of the file by newline character
            rows = file_contents.split("\n")
            # Iterate over each string in the list
            for i in range(len(rows)):
                # Strip leading and trailing spaces
                rows[i] = rows[i].strip()
            # Convert the rows list into an array
            array = list(rows)
            
            print(array)
            
            # Group together
            paragraphs = [" ".join(sublist) for sublist in group_sentences(array)]
            # print(paragraphs)

            # Add the array to the list of sentences
            for item in paragraphs:
                sentences1.append(item)

# Compute embedding for setnences
embeddings1 = model.encode(sentences1, convert_to_tensor=True, show_progress_bar=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True, show_progress_bar=True)

Reading read/0.txt
['https://www.prnewswire.com/news-releases/thrasio-announces-its-three-largest-acquisitions-ever-with-a-combined-value-in-excess-of-100-million-301371402.html', 'BOSTON, Sept.', "8, 2021 /PRNewswire/ --\xa0Thrasio\xa0today announced the acquisitions of category leaders SafeRest\xa0(mattress protectors), Wise Owl Outfitters\xa0(camping equipment) and Danjor Linens\xa0(home bedding), marking Thrasio's three largest acquisitions to date.", 'The brands are expected to add more than $90 million in sales to Thrasio in their first year.', "The acquisitions showcase Thrasio's momentum as the company acquires increasingly larger brands and brings them to far-reaching consumer audiences through a robust omnichannel strategy.", 'BOSTON Sept.', '8, 2021 $90 million "We are very happy with our experience working with Thrasio and their team," said Sarah Douglass, founder of Wise Owl Outfitters.', '"We had several offers from all types of buyers and ultimately decided to go with Th

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# do some testing on the data structurs to insert
# https://github.com/nsbradford/SemanticSearch/blob/822a82c88c5c7d0ff0fc3e1bbb18b7a93082f014/backend/vector.py#L88
# print(embeddings1[0].tolist())

print(numpy.shape(embeddings1[0]))
print(len(embeddings1))

torch.Size([384])
3084


In [19]:
print("Uploading embeddings to Pinecone...")

chunk_size = 1000

for i in range(0, len(embeddings1), chunk_size):
    print(i)
    index.upsert(
        vectors=zip([str(i) for i in range(i, i+chunk_size)], embeddings1[i:i+chunk_size].tolist())
    )

Uploading embeddings to Pinecone...
0
1000


In [21]:
# do a search
query_embedding = embeddings2[0].tolist()
top_k = 25
print(f'Querying Pinecone index={index} for "{sentences2[0]}"')
query_results = index.query(
    vector=query_embedding,
    top_k=top_k,
    include_values=False,
    include_metadata=False,
    # filter={
    #     "active": True,
    # },
)
matches = query_results['matches']
for match in matches:
    # del match['values']
    print(sentences1[int(match['id'])])


Querying Pinecone index=<pinecone.index.Index object at 0xfffee5ae9b20> for "
1. Thrasio holds the rights to the brands EverCare, Luxe, Temp and more.

2. Thrasio is the proud owner of EverCare, Luxe, Temp and other brands.

3. Thrasio has acquired the brands EverCare, Luxe, Temp and others.

4. Thrasio has taken control of the brands EverCare, Luxe, Temp and more.

5. Thrasio owns the labels EverCare, Luxe, Temp and other.

6. Thrasio has possession of the brands EverCare, Luxe, Temp and beyond.

7. Thrasio has taken ownership of EverCare, Luxe, Temp and other brands.

8. Thrasio holds the titles of EverCare, Luxe, Temp and others.

9. Thrasio has procured the brands EverCare, Luxe, Temp and more.

10. Thrasio is the proprietor of EverCare, Luxe, Temp and other labels."
In making the announcement, Thrasio said that these three brands are expected to generate more than $90 million in sales for the company over the course of their first year. Thrasio has experienced exponential growth i