In [4]:
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os

In [5]:
# Initialize the Pinecone client; the API key is read from the
# PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an already-existing index.
# dimension=1536 must match the size of the embedding vectors stored in it.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [11]:
import json

# Load the review data. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)
data['reviews']

[{'professor': 'Dr. Jane Smith',
  'subject': 'Introduction to Psychology',
  'stars': 5,
  'review': 'Dr. Smith is an amazing professor! Her lectures are engaging, and she genuinely cares about her students.'},
 {'professor': 'Prof. John Doe',
  'subject': 'Calculus I',
  'stars': 3,
  'review': 'Prof. Doe knows his material, but his teaching style is a bit dry. The class can be challenging.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Modern History',
  'stars': 4,
  'review': 'Dr. Johnson is very knowledgeable and presents the material in an interesting way. Sometimes, the workload is a bit heavy.'},
 {'professor': 'Prof. Michael Lee',
  'subject': 'Organic Chemistry',
  'stars': 2,
  'review': 'Prof. Lee is very strict and not very approachable. The course is difficult, and his explanations can be unclear.'},
 {'professor': 'Dr. Karen Davis',
  'subject': 'Computer Science 101',
  'stars': 5,
  'review': 'Dr. Davis is fantastic! She explains complex concepts clearly and is 

In [12]:
processed_data = []
client = OpenAI()

# Number of reviews sent per embeddings request. Batching replaces one network
# round-trip per review with one round-trip per batch.
EMBED_BATCH_SIZE = 100

# Create an embedding for each review and store it together with the review's
# metadata as a record with "values", "id" and "metadata" keys.
reviews = data["reviews"]
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[entry["review"] for entry in batch],
        model="text-embedding-3-small",
    )
    # Each returned item carries the index of the input it was computed from;
    # sort on it so embeddings are matched to reviews explicitly.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        embedding = item.embedding
        processed_data.append(
            {
                "values": embedding,
                # NOTE(review): the professor name is used as the vector id, so
                # two reviews for the same professor would share one id --
                # confirm names are unique in reviews.json.
                "id": review["professor"],
                "metadata": {
                    "review": review["review"],
                    "subject": review["subject"],
                    "stars": review["stars"],
                },
            }
        )

In [13]:
# Inspect the first processed record (embedding values, id, metadata) to check its structure.
processed_data[0]

{'values': [0.013864718,
  0.0032215733,
  -0.019512895,
  0.051100437,
  0.028263122,
  -0.0066210423,
  0.0415608,
  0.01298636,
  0.0068434114,
  -0.0295751,
  0.014109324,
  -0.024860874,
  -0.02581706,
  0.016166238,
  0.025372323,
  0.0012431827,
  -0.005787158,
  -0.01723361,
  0.010868293,
  0.005489739,
  0.039136976,
  -0.019935397,
  0.020835992,
  -0.016900057,
  -0.01795631,
  -0.034978673,
  0.018601181,
  0.03889237,
  0.022626063,
  0.017978547,
  0.06319732,
  -0.005920579,
  -0.016844464,
  -0.023504421,
  -0.033333138,
  0.030909315,
  -0.03179879,
  0.01493209,
  -0.002728192,
  0.0010889141,
  0.028040754,
  0.011907869,
  -0.010434673,
  0.017100189,
  0.038203023,
  -0.014009258,
  -0.0017233611,
  -0.007888546,
  0.029530626,
  0.042895015,
  -0.053679917,
  0.0145095885,
  0.035112094,
  -0.018923616,
  -0.059861783,
  0.030131023,
  0.028418781,
  0.06786707,
  0.010162272,
  -0.045852523,
  0.022425931,
  0.022181325,
  -0.027707199,
  -0.016844464,
  -0.0053

In [14]:
# Insert the embeddings into the Pinecone index.
index = pc.Index("rag")
# batch_size makes the client split the upload into several requests and
# return the combined upserted count, so the cell keeps working when
# processed_data grows beyond what a single request can carry.
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
    batch_size=100,
    show_progress=False,
)


In [15]:
# Show index statistics; the per-namespace vector count confirms the upsert reached "ns1".
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}