In [9]:
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
import os
import json
from pinecone import Pinecone, ServerlessSpec



  from tqdm.autonotebook import tqdm


In [17]:
# Populate the process environment from the .env file
# (call load_dotenv(dotenv_path="/path/to/.env") to point at a specific file),
# then look up the Pinecone key; the result is None when the variable is unset.
load_dotenv()
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

In [18]:
# Initialize the Pinecone client with the API key read from the environment
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet: create_index fails
# if the name is already taken, which would break re-running this cell
# (or a Restart & Run All) after the first successful run.
# dimension=1536 has to match the size of the embedding vectors stored in it.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [31]:
# Load the review data
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit encoding keeps the read independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)
# Last expression of the cell: display the list stored under the "reviews" key
data['reviews']



[{'professor': 'Dr. John Smith',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Great professor, explains concepts clearly and is very approachable.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Calculus I',
  'stars': 5,
  'review': 'Challenging class, but Dr. Emily Johnson is excellent at breaking down difficult topics.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Organic Chemistry',
  'stars': 3,
  'review': 'The course material is tough, but Dr. Michael Brown is always willing to help during office hours.'},
 {'professor': 'Prof. Laura Williams',
  'subject': 'World History',
  'stars': 4,
  'review': 'Engaging lectures, though the exams are a bit difficult.'},
 {'professor': 'Dr. James Taylor',
  'subject': 'Introduction to Computer Science',
  'stars': 5,
  'review': 'Amazing professor, very knowledgeable and passionate about the subject.'},
 {'professor': 'Prof. Sarah Miller',
  'subject': 'English Literature',
  'stars': 4,
  'review': 'Ver

In [25]:
# Accumulator for the per-review records that are later upserted into Pinecone
processed_data = list()
# The client is built with no explicit arguments, so its credentials have to
# come from the environment
client = OpenAI()

In [28]:
# Create embeddings for each review
# Start from an empty list every time so that re-running this cell rebuilds the
# records instead of appending a second copy of each one.
processed_data = []
for review in data["reviews"]:
    # One embeddings request per review text
    response = client.embeddings.create(
        input=review["review"], model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            # NOTE(review): the professor name doubles as the vector id, so two
            # reviews for the same professor would presumably overwrite each
            # other on upsert — confirm names are unique in reviews.json.
            "id": review["professor"],
            # Original text and attributes kept alongside the vector
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        }
    )


In [29]:
# Insert the embeddings into the Pinecone index
# Get a handle on the "rag" index and write every prepared record into
# namespace "ns1" in a single upsert call.
index = pc.Index("rag")
upsert_response = index.upsert(namespace="ns1", vectors=processed_data)

# Report how many vectors the service says it accepted
upserted_count = upsert_response["upserted_count"]
print(f"Upserted count: {upserted_count}")

Upserted count: 39


In [30]:
# Print index statistics
# NOTE(review): the recorded output reports total_vector_count 0 and no
# namespaces right after 39 vectors were upserted — presumably the stats lag
# behind writes; re-run this cell after a short wait to confirm.
index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
