In [7]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [5]:
# Connect to Pinecone and make sure the "rag" index exists.
INDEX_NAME = "rag"

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Guard the creation so this cell can be re-run (e.g. Restart & Run All):
# calling create_index for a name that already exists is rejected by the service.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [6]:
import json

# Read the review fixtures. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding avoids depending on the
# platform's locale default when decoding the JSON text.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the parsed list of review records.
data['reviews']

[{'professor': 'Dr. Sarah Johnson',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Engaging lectures and challenging assignments. Dr. Johnson really knows her stuff!'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Excellent teacher! Makes complex concepts easy to understand. Highly recommended.'},
 {'professor': 'Dr. Emily Rodriguez',
  'subject': 'Biology',
  'stars': 3,
  'review': 'Good content, but lectures can be a bit dry. Office hours are helpful.'},
 {'professor': 'Prof. David Thompson',
  'subject': 'History',
  'stars': 4,
  'review': 'Passionate about the subject. Assignments are thought-provoking.'},
 {'professor': 'Dr. Lisa Patel',
  'subject': 'Psychology',
  'stars': 5,
  'review': "Best professor I've had! Engaging, knowledgeable, and always willing to help."},
 {'professor': 'Prof. Robert Williams',
  'subject': 'Mathematics',
  'stars': 2,
  'review': 'Difficult to follow in class. Needs to improve explanation of

In [11]:
# Embed every review text and shape the results into Pinecone upsert records:
# {"values": <embedding>, "id": <professor name>, "metadata": {...}}.
EMBED_BATCH_SIZE = 100  # number of review texts sent per embeddings request

processed_data = []
client = OpenAI()  # no api_key passed here; the client must find it on its own

reviews = data['reviews']
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    # The embeddings endpoint accepts a list of inputs, so one request covers
    # the whole batch instead of one network round-trip per review.
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model="text-embedding-3-small",
    )
    # Every returned item carries the position of its input in `index`;
    # sort on it so each embedding is paired with the review it came from.
    ordered = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered):
        processed_data.append({
            "values": item.embedding,
            "id": review["professor"],  # the professor name doubles as the vector id
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

# zip() would silently drop records if the API returned fewer embeddings than inputs.
assert len(processed_data) == len(reviews), "embedding count does not match review count"

In [12]:
# Spot-check the first prepared record (keys: "values", "id", "metadata").
# NOTE: this displays the complete embedding list, so the output is very long.
processed_data[0]

{'values': [-0.015253101,
  -0.008056973,
  0.028650004,
  0.029080426,
  0.020485425,
  -0.018602327,
  0.033653665,
  0.047884513,
  0.004129367,
  0.03389578,
  0.053991135,
  0.012179614,
  -0.03123254,
  0.013107713,
  0.01304046,
  0.023525285,
  -0.029295638,
  -0.0064798775,
  0.022247467,
  0.051489305,
  0.05444846,
  -0.011924051,
  0.028354088,
  -0.025986765,
  -0.04325747,
  -0.055901136,
  0.013591939,
  -0.0030835744,
  0.013437255,
  -0.0018746922,
  0.070427895,
  -0.0045160744,
  -0.001473693,
  -0.020525778,
  -0.004391656,
  0.03658592,
  0.009691234,
  0.04422592,
  0.019153805,
  0.017795283,
  0.023283172,
  -0.03443381,
  -0.032470006,
  0.010249438,
  0.0025068754,
  -0.0066076596,
  0.0010676498,
  -0.034729723,
  0.065047614,
  0.05108578,
  0.0021083981,
  0.011197713,
  0.027116623,
  -0.047857612,
  -0.04723888,
  -0.0052962154,
  -0.0075592967,
  0.08748339,
  -0.012267044,
  -0.01827951,
  0.017337961,
  -0.014163594,
  0.0017418665,
  0.008709332,
  -0

In [13]:
# Get a handle on the 'rag' index, then write every prepared record into the
# "ns1" namespace. The call's return value is left as the cell's last
# expression so the upsert summary is displayed.
index = pc.Index('rag')
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [14]:
# Fetch the index statistics to confirm the vectors landed in namespace "ns1".
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}