In [51]:
from dotenv import load_dotenv
load_dotenv()
import os
import itertools
import random
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [52]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [53]:
import json
data = json.load(open('reviews.json'))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Data Structures',
  'stars': 5,
  'review': 'Excellent professor! She made complex topics easy to understand.'},
 {'professor': 'Dr. Mark Thompson',
  'subject': 'Algorithms',
  'stars': 4,
  'review': 'Very knowledgeable, but his lectures can be a bit fast-paced.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Operating Systems',
  'stars': 3,
  'review': 'Solid professor, but sometimes the material was hard to follow.'},
 {'professor': 'Dr. John Davis',
  'subject': 'Database Systems',
  'stars': 5,
  'review': 'Great professor with a lot of real-world experience.'},
 {'professor': 'Dr. Anna Smith',
  'subject': 'Computer Networks',
  'stars': 4,
  'review': 'Good teacher, but the assignments were very challenging.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Software Engineering',
  'stars': 5,
  'review': 'Amazing course! Dr. Brown is very engaging and helpful.'},
 {'professor': 'Dr. Linda Wilson',
  'subject': 'Artificial I

In [54]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values":embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        }
    })

In [55]:
processed_data[0]

{'values': [0.0008857712,
  -0.038897242,
  -0.05448068,
  0.03072514,
  0.00014944992,
  -0.017154051,
  -0.014896294,
  0.045032453,
  -0.0031028832,
  -0.014675426,
  0.02903182,
  -0.009386874,
  -0.0035737609,
  -0.023669647,
  0.0031688367,
  0.004122862,
  -0.016638694,
  0.011828688,
  0.04432077,
  0.031780396,
  0.039927956,
  -0.02871279,
  0.023841433,
  0.013423843,
  -0.030381568,
  -0.048149142,
  0.03074968,
  0.0046290173,
  0.0048130737,
  0.008208913,
  0.06949968,
  -0.00011359728,
  -0.007484958,
  -0.042234797,
  -0.026062379,
  0.01769395,
  -0.02549794,
  0.003020058,
  -0.014528181,
  0.013534277,
  0.028099269,
  0.010393049,
  0.0066260286,
  0.033449173,
  0.048590876,
  0.020197116,
  -0.042553827,
  0.010632322,
  0.035265196,
  0.051241286,
  -0.0538917,
  0.009975854,
  0.047486536,
  0.045449648,
  -0.08221184,
  0.04687302,
  0.020074412,
  0.058799867,
  -0.0014149331,
  -0.06832172,
  0.039265353,
  -0.006132144,
  0.032443,
  0.005672003,
  -0.01847

In [57]:
index = pc.Index('rag')
index.upsert(vectors=processed_data, namespace='ns1')

{'upserted_count': 20}

In [58]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}