In [44]:
from dotenv import load_dotenv
load_dotenv()
import os
import itertools
import random
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [22]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag",
    dimension=100,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [25]:
import json
data = json.load(open('reviews.json'))
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Data Structures',
  'stars': 5,
  'review': 'Excellent professor! She made complex topics easy to understand.'},
 {'professor': 'Dr. Mark Thompson',
  'subject': 'Algorithms',
  'stars': 4,
  'review': 'Very knowledgeable, but his lectures can be a bit fast-paced.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Operating Systems',
  'stars': 3,
  'review': 'Solid professor, but sometimes the material was hard to follow.'},
 {'professor': 'Dr. John Davis',
  'subject': 'Database Systems',
  'stars': 5,
  'review': 'Great professor with a lot of real-world experience.'},
 {'professor': 'Dr. Anna Smith',
  'subject': 'Computer Networks',
  'stars': 4,
  'review': 'Good teacher, but the assignments were very challenging.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Software Engineering',
  'stars': 5,
  'review': 'Amazing course! Dr. Brown is very engaging and helpful.'},
 {'professor': 'Dr. Linda Wilson',
  'subject': 'Artificial I

In [35]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values":embedding,
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        }
    })

In [36]:
processed_data[0]

{'values': [0.000877183,
  -0.038915027,
  -0.05447122,
  0.030719807,
  0.0001925547,
  -0.017187878,
  -0.014893708,
  0.045024637,
  -0.0031100123,
  -0.014672879,
  0.029051319,
  -0.00937911,
  -0.0035884758,
  -0.023677805,
  0.0031590855,
  0.0041252137,
  -0.016648075,
  0.011863439,
  0.044313077,
  0.03177488,
  0.039896492,
  -0.028707806,
  0.023886368,
  0.013421513,
  -0.030351758,
  -0.04816532,
  0.030719807,
  0.0045760735,
  0.0048183724,
  0.0082136225,
  0.06948762,
  -0.000100638346,
  -0.007459122,
  -0.042227466,
  -0.026057854,
  0.017678611,
  -0.02551805,
  0.0030210672,
  -0.014488854,
  0.013507391,
  0.028045319,
  0.010409647,
  0.006606476,
  0.03344337,
  0.048582442,
  0.020218147,
  -0.04262005,
  0.010618208,
  0.03530815,
  0.051281467,
  -0.053882346,
  0.009974123,
  0.047478296,
  0.04549083,
  -0.08219757,
  0.046864882,
  0.02005866,
  0.058789663,
  -0.001380183,
  -0.06830986,
  0.039234,
  -0.006143348,
  0.03243737,
  0.005713958,
  -0.01845

In [46]:
index = pc.Index('rag')
def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))
vector_dim = 100
vector_count = 20

# Example generator that generates many (id, vector) pairs
example_data_generator = map(lambda i: (f'id-{i}', [random.random() for _ in range(vector_dim)]), range(vector_count))

# Upsert data with 100 vectors per upsert request
for ids_vectors_chunk in chunks(example_data_generator, batch_size=200):
    index.upsert(vectors=ids_vectors_chunk, namespace='ns1') 

In [47]:
index.describe_index_stats()

{'dimension': 100,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 10200}},
 'total_vector_count': 10200}