In [10]:
# Environment setup and imports for the notebook.
# load_dotenv() is deliberately called before the OpenAI / Pinecone imports below;
# presumably it populates os.environ from a local .env file so that the
# PINECONE_API_KEY lookup (and the OpenAI client's own credential lookup) succeed
# -- TODO confirm the .env file is present next to the notebook.
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [5]:
# Connect to Pinecone and make sure the "rag" index exists.
# The API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists raises, which would break re-running this
# cell (or a Restart & Run All). Only create it when it is missing.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [9]:
# Load the raw reviews from reviews.json (resolved relative to the working directory).
import json

# Use a context manager so the file handle is closed deterministically, and read
# as UTF-8 explicitly instead of relying on the platform's default encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

data

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Calculus I',
  'stars': 5,
  'review': "Dr. Johnson is an amazing professor! Her lectures are clear, and she's always willing to help during office hours."},
 {'professor': 'Prof. Michael Davis',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Great professor, but his exams are tough. Make sure to keep up with the readings.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Organic Chemistry',
  'stars': 3,
  'review': 'Dr. Lee knows her stuff, but her lectures can be hard to follow sometimes. Study groups are a must.'},
 {'professor': 'Prof. Mark Thompson',
  'subject': 'History of Western Civilization',
  'stars': 2,
  'review': 'Prof. Thompson is knowledgeable, but his lectures are dry and hard to stay awake in.'},
 {'professor': 'Dr. Lisa Brown',
  'subject': 'Data Structures',
  'stars': 5,
  'review': "One of the best professors I've had! She makes complex topics easy to understand and is very approachable."},


In [15]:
# Build the records to upsert: one embedding per review, with the review's
# text, subject and star rating attached as metadata.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_BATCH_SIZE = 100  # reviews sent per embeddings request

processed_data = []
client = OpenAI()

# Request embeddings in batches rather than one network round trip per review.
for batch_start in range(0, len(data), EMBEDDING_BATCH_SIZE):
    review_batch = data[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in review_batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the position of its input; sort on it so every
    # embedding is paired with the review it was computed from.
    ordered_items = sorted(response.data, key=lambda item: item.index)

    for review, item in zip(review_batch, ordered_items):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the professor name is used as the vector id, so two
            # reviews for the same professor would share an id -- confirm names
            # are unique in reviews.json.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        })


In [16]:
# Sanity check: display the first prepared record (embedding values, id, metadata).
first_record = processed_data[0]
first_record

{'values': [-0.0120052,
  -0.0015629915,
  0.030351425,
  0.045574635,
  0.0004289692,
  -0.010740558,
  0.024093522,
  0.03980359,
  -0.016838154,
  0.00503779,
  0.00911374,
  -0.0022012503,
  -0.019557433,
  0.018263103,
  -0.012112071,
  0.044767164,
  -0.01973555,
  0.020258034,
  0.02486537,
  0.059895378,
  0.029781446,
  -0.008514074,
  0.025767839,
  -0.022822943,
  -0.028902726,
  -0.026575308,
  0.0019251662,
  0.011571778,
  0.041751023,
  -0.00926811,
  0.07742226,
  0.007985655,
  -0.012729549,
  -0.018916205,
  -0.046192113,
  0.026195323,
  0.010229951,
  0.0020171942,
  0.014320743,
  0.005325749,
  -0.019593056,
  0.00860907,
  -0.001431629,
  0.0067328867,
  0.031776376,
  0.0030814535,
  -0.005779952,
  -0.0057205786,
  0.05191566,
  0.048472036,
  -0.026052827,
  0.0005406645,
  0.030161433,
  -0.014760102,
  -0.035125006,
  0.0010887505,
  0.028047757,
  0.029567704,
  0.015140089,
  -0.03360506,
  0.048638277,
  -0.004046263,
  -0.0254116,
  -0.02453288,
  -0.002

In [17]:
# Open a handle to the "rag" index and write every prepared record into the
# "ns1" namespace; the cell displays the response returned by the upsert call.
index = pc.Index('rag')
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upsert_response

{'upserted_count': 20}

In [18]:
# Fetch and display the index statistics to confirm the vectors were stored.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}