In [7]:
from dotenv import load_dotenv
load_dotenv()

import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


In [4]:
# Connect to Pinecone using the API key taken from the environment
# (populated from .env by load_dotenv() in the imports cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index whose name is already taken is rejected by Pinecone, so
# guard the call: this keeps the cell safe to re-run (Restart & Run All).
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [5]:
import json

# Read the reviews file inside a context manager so the handle is closed as
# soon as parsing finishes (the bare open() call left it open until garbage
# collection). The encoding is pinned so parsing does not depend on the
# platform's default locale.
with open('reviews.json', encoding='utf-8') as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review records as the cell output.
data['reviews']

[{'professor': 'Dr. Emily Thompson',
  'subject': 'Computer Science',
  'rating': 5,
  'review': 'Dr. Thompson is an amazing professor. Her lectures are clear, and she always makes time to help students during office hours.'},
 {'professor': 'Dr. John Rivera',
  'subject': 'Mathematics',
  'rating': 4,
  'review': 'Dr. Rivera is very knowledgeable, but his exams are tough. Make sure to attend his review sessions.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Cybersecurity',
  'rating': 3,
  'review': 'Dr. Lee knows her stuff, but sometimes her explanations can be confusing. Overall, a decent class.'},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Physics',
  'rating': 2,
  'review': "Dr. Brown's lectures are hard to follow, and he rarely responds to emails. I struggled in this class."},
 {'professor': 'Dr. Karen Patel',
  'subject': 'Biology',
  'rating': 5,
  'review': 'Dr. Patel is fantastic! Her passion for biology is contagious, and she makes even difficult topics easy to un

In [8]:
# Embed every review and build the records that will be upserted to Pinecone.
#
# The embeddings endpoint accepts a list of inputs, so reviews are sent in
# bounded batches instead of one network round-trip per review. An empty
# review list sends no request at all, so processed_data simply stays empty.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # reviews per request; keeps each request small

client = OpenAI()
reviews = data['reviews']
processed_data = []  # rebuilt from scratch, so re-running the cell is idempotent

for batch_start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review['review'] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the position of its input in the request;
    # order by it explicitly so every embedding is paired with its own review.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        processed_data.append({
            "values": item.embedding,
            # The professor name doubles as the vector id, so two reviews for
            # the same professor would overwrite each other on upsert.
            "id": review["professor"],
            "metadata": {
                "review": review['review'],
                "subject": review['subject'],
                "rating": review['rating']
            }
        })

In [9]:
# Sanity check: show the first prepared record ('values', 'id', 'metadata').
# The 'values' entry is the full embedding vector, so this output is long.
processed_data[0]

{'values': [0.020097485,
  -0.003113999,
  -0.006900795,
  0.05140413,
  0.013369518,
  0.007795799,
  0.029356154,
  0.006135411,
  -0.014517593,
  -0.014641042,
  -0.001217515,
  0.02462806,
  0.009332738,
  -0.04140477,
  -0.0029380843,
  0.040886283,
  0.024442887,
  0.0071353475,
  0.030491883,
  0.06039121,
  0.023109637,
  -0.0069995536,
  0.0051107854,
  -0.030491883,
  -0.03323245,
  -0.03535577,
  0.02160356,
  0.020529555,
  -0.014431179,
  -0.010048742,
  0.08369837,
  -0.01940617,
  0.009906776,
  -0.028245112,
  -0.03755316,
  0.0297265,
  0.003419535,
  0.012813997,
  0.007801972,
  -0.0018609925,
  0.012925101,
  0.021924527,
  0.00085719844,
  0.022517083,
  0.0028393252,
  -0.01831982,
  -0.047700662,
  0.012054787,
  0.028319182,
  0.022640532,
  -0.0245293,
  0.01718409,
  0.016221188,
  -0.002766799,
  -0.07871103,
  0.041750424,
  0.011147438,
  0.026887175,
  0.027109383,
  -0.05466318,
  0.036960606,
  -0.0108202975,
  -0.004138625,
  -0.0153200105,
  -0.0178013

In [10]:
# Get a handle to the 'rag' index created earlier in the notebook.
index = pc.Index('rag')

# Write all prepared records into the "ns1" namespace; the call's return
# value is the last expression, so the notebook displays it.
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [11]:
# Fetch the index statistics to confirm the upsert; the recorded output
# reports the index dimension and the vector count per namespace.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}