In [10]:
# Importing the general modules
import warnings
warnings.filterwarnings('ignore')
import os
import time
from tqdm.auto import tqdm
from pathlib import Path

In [13]:
from bs4 import BeautifulSoup
from pathlib import Path


## 1. Setting up the black box

In [2]:
from sentence_transformers import SentenceTransformer
import torch
# Load the pretrained sentence-embedding model once; the ingestion and query
# cells further down reuse this same `model` object.
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

# Smoke test: encode one short string so the resulting vector can be inspected.
embeddings = model.encode("something")




In [4]:
# Display the vector that model.encode("something") returned, to show what an
# embedding looks like (a NumPy array of floats, per the output below).
embeddings

array([-1.75111294e-01,  1.58490404e-01, -1.18856125e-01, -3.34617615e-01,
        1.56028762e-01, -2.08266422e-01,  1.12331617e+00, -1.84903204e-01,
        1.42543659e-01,  1.68133434e-02,  6.23148493e-02, -2.68896073e-01,
        1.37362897e-01, -9.53574851e-02,  2.44151697e-01, -2.48515680e-01,
        4.20203120e-01,  1.27731428e-01, -6.40652061e-01,  8.50197952e-03,
        1.98105931e-01,  1.62470981e-01, -4.45389599e-02,  1.74844071e-01,
       -1.78510144e-01,  4.46950674e-01, -2.08346993e-02,  1.68394640e-01,
        1.37454523e-02, -1.06811881e-01,  2.34184816e-01,  6.92231283e-02,
        1.08811952e-01, -3.69863398e-02, -2.07060948e-01,  5.38644612e-01,
        1.31701678e-01,  2.02830195e-01,  2.24573135e-01,  1.48830807e-03,
       -2.30970576e-01, -2.48231962e-01,  1.67221889e-01,  2.47285590e-01,
       -2.53734514e-02,  3.99057239e-01, -1.14053465e-01,  6.27188608e-02,
        2.25057438e-01, -3.40958796e-02, -3.25980306e-01, -1.18578561e-01,
       -2.11133555e-01, -

## 2. Create a Vector Database

In [19]:
# Load the Pinecone API key from the local file 'api_key.txt' so the secret is
# never written into the notebook itself. Surrounding whitespace and the
# trailing newline are stripped.
api_key = Path('api_key.txt').read_text().strip()

In [6]:
from pinecone import Pinecone, ServerlessSpec
# Build the Pinecone client from the key loaded earlier, then make sure the
# serverless index used by this notebook exists.
pinecone = Pinecone(api_key=api_key)

index_name = 'vectordbsearch'
# Creating an index whose name is already taken is rejected by Pinecone, so
# only create it when it is missing; this keeps the cell safe to re-run.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        dimension=384,  # length of every vector stored in this index; must equal the length of model.encode output
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

## 3. Making and Storing the Vectors

In [11]:
# Reconnect to Pinecone and open a handle to the index created in section 2.
# The name passed here has to be exactly the one given to create_index
# ('vectordbsearch'); the previous value "vectorsearch" pointed at a different index.
pinecone = Pinecone(api_key=api_key)
index = pinecone.Index("vectordbsearch")

In [14]:
# Folder holding the HTML documents to index.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the documents before running.
html_dir = Path(r"D:\Sameer\descipr\webinars\vector database\html_documents")

# Number of vectors sent per upsert request. Batching avoids one network round
# trip per document while keeping each request small.
UPSERT_BATCH_SIZE = 100

# Step 1: turn every HTML file into an (id, vector) record.
# sorted() makes the processing order deterministic across runs.
records = []
for file_path in sorted(html_dir.glob("*.html")):
    content = file_path.read_text(encoding="utf-8")

    # Keep only the visible text; tags and attributes are dropped.
    soup = BeautifulSoup(content, "html.parser")
    text_content = soup.get_text()

    # Convert text to a vector - refer section 1.
    # NOTE(review): the whole page is encoded as a single input; sentence
    # embedding models presumably truncate long inputs, so long pages may be
    # represented by their opening text only - confirm, and consider chunking.
    embedding = model.encode(text_content).tolist()

    # The file name without its extension serves as the vector id.
    doc_id = file_path.stem
    records.append((doc_id, embedding))

# Step 2: upload the records to Pinecone in batches.
for start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert(records[start:start + UPSERT_BATCH_SIZE])

In [15]:
# Ask Pinecone for the index statistics and print them to check how many
# vectors the index currently holds.
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 15}},
 'total_vector_count': 15}


## 4. Querying some text

In [16]:
# Free-text search phrase; it is embedded with the same model that produced the
# stored document vectors.
query = "AI Simulation"
# Pinecone takes a plain Python list, hence the .tolist() conversion.
query_embedding = model.encode(query).tolist()

# Ask the index for the 5 stored vectors most similar to the query.
# include_values=True also returns each match's stored vector; nothing in this
# cell reads those values.
search_results = index.query(vector=query_embedding, top_k=5, include_values=True)

# Report each hit as "ID: <id>, Score: <similarity score>".
print("Search Results:")
for hit in search_results['matches']:
    hit_id, hit_score = hit['id'], hit['score']
    print(f"ID: {hit_id}, Score: {hit_score}")

Search Results:
ID: document_1, Score: 0.63061434
ID: document_9, Score: 0.541022897
ID: document_7, Score: 0.529140949
ID: document_3, Score: 0.464173108
ID: document_10, Score: 0.417532802
