In [55]:
import json
import os
import time

import pinecone
import requests
from dotenv import load_dotenv

# Load environment variables from .env file.
# load_dotenv() returns a bool; a falsy result means no variables were loaded
# from a .env file, in which case the API keys read later must already be
# present in the process environment. Surface that instead of ignoring it.
if not load_dotenv():
    print("Warning: load_dotenv() loaded no variables; expecting API keys to already be set in the environment.")

False

In [57]:


# Ingest reviews.json into Pinecone: ensure the index exists, embed every
# review through the Hugging Face inference API, upsert the vectors with their
# metadata, then print the index statistics.

# Tunable settings for this cell.
EMBEDDING_DIM = 384        # Must match the vector size produced by `model_id` below
NAMESPACE = "ns1"          # Pinecone namespace the review vectors are written to
UPSERT_BATCH_SIZE = 100    # Send vectors in small batches rather than one large request
HF_TIMEOUT_S = 120         # Upper bound for the embedding request (model may need to load)
INDEX_READY_TIMEOUT_S = 120  # Upper bound when waiting for a newly created index
STATS_WAIT_S = 30          # Upper bound when waiting for upserted vectors to show in stats

# Fail fast with a clear message when a credential is missing, instead of
# passing None as the API key or sending "Bearer None" as the token.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
hf_token = os.getenv("HUGGINGFACE_API_KEY")
missing_vars = [
    name
    for name, value in (
        ("PINECONE_API_KEY", pinecone_api_key),
        ("HUGGINGFACE_API_KEY", hf_token),
    )
    if not value
]
if missing_vars:
    raise EnvironmentError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )

# Initialize Pinecone client
# NOTE(review): `environment` names a GCP pod environment while the index below
# is created as AWS serverless; it is presumably unused by this client class —
# confirm and drop if so.
pinecone_client = pinecone.Pinecone(
    api_key=pinecone_api_key,
    environment='us-east1-gcp'
)

# Define index name
index_name = "rag"

# Check if the index exists, if not, create it.
# list_indexes() yields index description objects, not strings, so membership
# must be tested against the list of names; otherwise the test is always true
# and create_index is re-attempted on every run.
if index_name not in pinecone_client.list_indexes().names():
    pinecone_client.create_index(
        name=index_name,
        dimension=EMBEDDING_DIM,
        metric="cosine",
        spec=pinecone.ServerlessSpec(cloud="aws", region="us-east-1")
    )
    # A newly created index is not immediately usable; wait (bounded) until it
    # reports ready before connecting and writing to it.
    ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pinecone_client.describe_index(index_name).status["ready"]:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(f"Index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S}s.")
        time.sleep(1)

# Connect to the Pinecone index
index = pinecone_client.Index(index_name)

# Load the JSON data from the reviews.json file (explicit encoding so the
# result does not depend on the platform default).
with open("reviews.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract the reviews list from the data
reviews = [review['review'] for review in data['reviews']]
if not reviews:
    raise ValueError("reviews.json contains no reviews to embed.")

# Hugging Face API details
model_id = "sentence-transformers/all-MiniLM-L6-v2"
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

# Get embeddings for the reviews. The timeout keeps the cell from hanging
# forever, and raise_for_status() turns HTTP errors into exceptions instead of
# letting an error payload flow into the validation below.
response = requests.post(
    api_url,
    headers=headers,
    json={"inputs": reviews, "options": {"wait_for_model": True}},
    timeout=HF_TIMEOUT_S,
)
response.raise_for_status()
embeddings = response.json()

# Ensure embeddings are valid and have the correct structure
if not isinstance(embeddings, list) or not all(isinstance(e, list) for e in embeddings):
    raise ValueError("Invalid embeddings format received from Hugging Face.")
# One embedding per review, otherwise ids/metadata would be paired with the wrong vectors.
if len(embeddings) != len(reviews):
    raise ValueError(
        f"Expected {len(reviews)} embeddings but received {len(embeddings)}."
    )
# Every vector must match the index dimension or the upsert will be rejected.
if any(len(e) != EMBEDDING_DIM for e in embeddings):
    raise ValueError(f"Embeddings do not all have dimension {EMBEDDING_DIM}.")

# Prepare the data for Pinecone: one record per review, pairing each source
# entry with its embedding.
processed_data = [
    {
        "id": f"review-{i}",
        "values": embedding,  # Embeddings should be a list of floats
        "metadata": {
            "review": entry['review'],
            "professor": entry['professor'],
            "subject": entry['subject'],
            "stars": entry['stars']
        }
    }
    for i, (entry, embedding) in enumerate(zip(data['reviews'], embeddings))
]

# Upsert the processed data into Pinecone in batches. The error is reported and
# then re-raised so a failed upsert stops the cell rather than continuing on to
# print stats as if nothing had happened.
try:
    for start in range(0, len(processed_data), UPSERT_BATCH_SIZE):
        index.upsert(
            vectors=processed_data[start:start + UPSERT_BATCH_SIZE],
            namespace=NAMESPACE
        )
    print("Upsert operation completed successfully.")
except Exception as e:
    print(f"Error during upsert operation: {e}")
    raise

# Describe and print the index stats.
# Stats read immediately after an upsert can still report zero vectors (as the
# previously captured output of this cell did), so poll for a bounded time
# until the count catches up. This is a heuristic on the index-wide total.
stats = index.describe_index_stats()
stats_deadline = time.monotonic() + STATS_WAIT_S
while stats.total_vector_count < len(processed_data) and time.monotonic() < stats_deadline:
    time.sleep(1)
    stats = index.describe_index_stats()
print(stats)


Upsert operation completed successfully.
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
