In [13]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [4]:
# Connect to Pinecone with the API key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

INDEX_NAME = "rag"

# Create the index only when it is missing: an unconditional create_index call
# fails once the index exists, which breaks re-running this cell or the whole
# notebook.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,  # length of the embedding vectors stored in this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [11]:
import json

# Load the professor reviews. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection,
# and the explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records as the cell output.
data["reviews"]

[{'professor': 'Dr. Emily Wilkins',
  'subject': 'Introduction to Computer Science',
  'stars': 4,
  'review': 'Dr. Wilkins is an excellent professor who really cares about her students. The course material is challenging but she explains it clearly and is always available to help.'},
 {'professor': 'Professor Michael Chen',
  'subject': 'Organic Chemistry',
  'stars': 3,
  'review': "Professor Chen is knowledgeable, but his lectures can be dry and difficult to follow. The exams are very challenging and don't always align with the course content."},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Introduction to Psychology',
  'stars': 5,
  'review': 'Dr. Johnson is an engaging and enthusiastic professor. Her lectures are always interesting and she does a great job of making the material relevant and applicable to real life.'},
 {'professor': 'Professor David Lee',
  'subject': 'Calculus I',
  'stars': 2,
  'review': "Professor Lee is not the best at explaining the concepts in Calcul

In [14]:
client = OpenAI()
reviews = data["reviews"]

# Embed the review texts in batches instead of issuing one API request per
# review; an empty review list simply results in no requests.
EMBED_BATCH_SIZE = 100
embeddings = []
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input text; order by it
    # so every embedding is paired with the review it was computed from.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    embeddings.extend(item.embedding for item in ordered_items)

# Build one record per review in the {"values", "id", "metadata"} layout that
# is later passed to index.upsert.
# NOTE(review): the professor name is used as the vector id, so two reviews of
# the same professor would share an id — confirm names are unique in the data.
processed_data = [
    {
        "values": embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        },
    }
    for review, embedding in zip(reviews, embeddings)
]

In [15]:
# Show the first prepared record (id, embedding values, metadata) as the cell
# output for a quick visual check.
first_record = processed_data[0]
first_record

{'values': [-0.013189973,
  0.003681061,
  -0.0099489745,
  0.046242215,
  -0.046266004,
  -0.009455391,
  0.015235668,
  -0.009877613,
  -0.011982776,
  -0.005827851,
  0.033254433,
  -0.031541757,
  -0.009318615,
  -0.0075345794,
  0.010626908,
  0.043720778,
  0.0047306693,
  0.016746152,
  -0.014224715,
  0.019802801,
  0.009294828,
  -0.026879478,
  0.006898273,
  -0.017352724,
  -0.040747385,
  -0.012607189,
  0.030899506,
  0.007029102,
  -0.015925495,
  0.054567717,
  0.08182779,
  -0.009996549,
  -0.017614383,
  -0.022003112,
  -0.047265064,
  0.039653175,
  0.0039873207,
  0.023965552,
  0.019112974,
  -0.00560782,
  -0.0057089156,
  -0.004596866,
  -0.041484788,
  0.0076891957,
  0.017685745,
  0.027902326,
  -0.024833784,
  0.009027223,
  0.043911077,
  0.024524549,
  -0.030923292,
  0.05490074,
  0.029591212,
  -0.01068043,
  -0.05732703,
  0.07364501,
  0.03418213,
  0.002844051,
  -0.0038594648,
  -0.013035357,
  0.048454423,
  0.009122372,
  -0.029139256,
  -0.0329452,


In [16]:
# Get a handle to the "rag" index and send every prepared record to the "ns1"
# namespace. The response is the final expression so the cell displays it.
index = pc.Index("rag")
upsert_result = index.upsert(namespace="ns1", vectors=processed_data)
upsert_result

{'upserted_count': 20}

In [17]:
# Fetch the index statistics and display them as the cell output.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}