In [1]:
from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import os
import json

In [2]:
# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create a Pinecone index
pc.create_index(
    name="rmp-ai",
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

In [3]:
# Load the review data
with open("reviews.json") as file:
    data = json.load(file)

In [4]:
# Initialize OpenAI client
client = OpenAI()

# Create embeddings for each review
processed_data = []

for review in data:  # `data` is a list, so iterate directly over it
    response = client.embeddings.create(
        input=review['review'], model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

# Print or use `processed_data` as needed
print(processed_data)

[{'values': [0.014487707, 0.0009171546, 0.016261984, 0.043543164, 0.011119248, 0.021157922, 0.030923117, 0.031243287, -0.013227036, 0.012066419, 0.003056626, 0.025253434, 0.0124066, -0.044850525, 0.005019337, 0.042929504, 0.017475963, -0.026213946, 0.03409814, 0.054695763, 0.032417245, -0.005616321, 0.026894307, -0.020544263, -0.046424698, -0.05397538, 0.010318823, 0.023719285, -0.019383645, -0.01333376, 0.07097109, -0.026293987, 0.0050360123, -0.036392692, -0.034284905, 0.04895938, 0.00092215725, 0.024306264, 0.018449815, 0.004689161, 0.033991415, 0.03135001, -0.027401244, 0.01332042, 0.008204364, -0.013400462, -0.026013838, -0.01800958, 0.0347118, 0.023799328, -0.03753997, 0.020224093, 0.01145276, -0.019463688, -0.08906071, 0.02693433, 0.004689161, 0.016888985, 0.003848714, -0.03537882, 0.040581588, -0.018409794, -0.0047025015, -0.0068970025, -0.029108819, 0.009958631, -0.0402881, 0.016862303, -0.003691964, 0.0067269118, -0.00017175805, 0.0047425227, -0.07230513, 0.03351116, -0.03721

In [5]:
processed_data[0]

{'values': [0.014487707,
  0.0009171546,
  0.016261984,
  0.043543164,
  0.011119248,
  0.021157922,
  0.030923117,
  0.031243287,
  -0.013227036,
  0.012066419,
  0.003056626,
  0.025253434,
  0.0124066,
  -0.044850525,
  0.005019337,
  0.042929504,
  0.017475963,
  -0.026213946,
  0.03409814,
  0.054695763,
  0.032417245,
  -0.005616321,
  0.026894307,
  -0.020544263,
  -0.046424698,
  -0.05397538,
  0.010318823,
  0.023719285,
  -0.019383645,
  -0.01333376,
  0.07097109,
  -0.026293987,
  0.0050360123,
  -0.036392692,
  -0.034284905,
  0.04895938,
  0.00092215725,
  0.024306264,
  0.018449815,
  0.004689161,
  0.033991415,
  0.03135001,
  -0.027401244,
  0.01332042,
  0.008204364,
  -0.013400462,
  -0.026013838,
  -0.01800958,
  0.0347118,
  0.023799328,
  -0.03753997,
  0.020224093,
  0.01145276,
  -0.019463688,
  -0.08906071,
  0.02693433,
  0.004689161,
  0.016888985,
  0.003848714,
  -0.03537882,
  0.040581588,
  -0.018409794,
  -0.0047025015,
  -0.0068970025,
  -0.029108819,
  

In [6]:
# Insert the embeddings into the Pinecone index
index = pc.Index("rmp-ai")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)

print(f"Upserted count: {upsert_response['upserted_count']}")

Upserted count: 49


In [7]:
# Print index statistics
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
