In [35]:
import os
from sentence_transformers import SentenceTransformer, util
import torch
import hashlib
import pinecone
from scipy.spatial.distance import cosine
import time
import numpy

## pinecone setup

In [64]:
# Connect to Pinecone; the API key is read from the PINECONE_API_KEY
# environment variable.
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment="us-west1-gcp")

index_name = "search2"

# Create the index only when it is not listed yet, so re-running this cell
# does not attempt to create it a second time.
if index_name not in pinecone.list_indexes():
    # dimension=384 matches the embedding shape printed further down in the notebook
    pinecone.create_index(index_name, dimension=384, metric="cosine", pod_type="p1")
    print(f"Index {index_name} created successfully.")
else:
    print(f"Index {index_name} already exists.")

# Handle used by the upload and query cells.
index = pinecone.Index(index_name)

Index search2 created successfully.


## generate the embeddings

In [76]:
# Sentence-embedding model used for both the corpus and the query.
model = SentenceTransformer('all-MiniLM-L6-v2')

folder_path = "read"
sentences1 = []                # corpus: one entry per line of every .txt file
sentences2 = ["acquisition"]   # query text(s)

# os.listdir() returns entries in arbitrary order. The position of each line
# in sentences1 is used as its vector id when uploading and when looking up
# query matches, so iterate in sorted order to keep ids stable between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, filename)
    # Explicit encoding so decoding does not depend on the machine's locale.
    with open(file_path, "r", encoding="utf-8") as file:
        # One entry per line with surrounding whitespace stripped.
        # Blank lines are kept as empty strings.
        sentences1.extend(row.strip() for row in file.read().split("\n"))

# print(len(sentences1))

# Compute embeddings for the corpus sentences and for the query
embeddings1 = model.encode(sentences1, convert_to_tensor=True, show_progress_bar=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/396 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [60]:
# Sanity-check the data structures that will be inserted into the index.
# Reference consulted:
# https://github.com/nsbradford/SemanticSearch/blob/822a82c88c5c7d0ff0fc3e1bbb18b7a93082f014/backend/vector.py#L88

# Shape of a single embedding, then the number of embeddings.
print(embeddings1[0].shape)
print(len(embeddings1))

torch.Size([384])
12664


In [65]:
print("Uploading embeddings to Pinecone...")

chunk_size = 1000

# Upsert in batches of chunk_size. Each vector's id is its position in
# embeddings1, rendered as a string.
for start in range(0, len(embeddings1), chunk_size):
    print(start)
    batch = embeddings1[start:start + chunk_size].tolist()
    # The final batch may be shorter than chunk_size; size the ids to match it.
    ids = [str(position) for position in range(start, start + len(batch))]
    index.upsert(vectors=zip(ids, batch))

Uploading embeddings to Pinecone...
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000


In [77]:
# Run a similarity search for the first query sentence.
query_embedding = embeddings2[0].tolist()
top_k = 1000
print(f'Querying Pinecone index={index} for "{sentences2[0]}"')

# Only ids and scores are requested; vector values and metadata are omitted.
query_results = index.query(
    vector=query_embedding,
    top_k=top_k,
    include_values=False,
    include_metadata=False,
)

matches = query_results['matches']
for hit in matches:
    print(hit)
    # The match id is the sentence's position in sentences1.
    sentence_position = int(hit['id'])
    print(sentences1[sentence_position])


Querying Pinecone index=<pinecone.index.Index object at 0xffff2c5b2760> for "acquisition"
{'id': '2789', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '8296', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '2796', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '2803', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '8303', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '8282', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '8289', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '2782', 'score': 0.797878623, 'sparseValues': {}, 'values': []}
Acquisition Style:
{'id': '12458', 'score': 0.693412423, 'sparseValues': {}, 'values': []}
The acquisition process seems to be formulaic and slick.
{'id': '6963', 'score': 0.693412423, 'sparseValues': {}, 'val