#Explanation

Gather reviews from JSON file and convert each review into a vector. Then, insert each vector into the Pinecone vector database.

In [6]:
from dotenv import load_dotenv
load_dotenv()
import os
import openai
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI

In [None]:
pc = Pinecone(api_key = os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name = "rmp", dimension = 1536, metric = "cosine", spec = ServerlessSpec(cloud="aws", region = "us-east-1") 
)

In [None]:
import json 
data = json.load(open("reviews.json"))
data['reviews']

In [8]:
processed_data = []
client = OpenAI()

for review in data['reviews']:
    response = client.embeddings.create(
        input = review['review'],
        model = 'text-embedding-3-small',
    )
    embedding = response.data[0].embedding
    processed_data.append({
        "values" : embedding,
        "id" : review["professor"],
        "metadata" : {
            "review" : review["review"],
            "subject" : review["subject"],
            "stars" : review["stars"],
            "date" : review["date"],
        }
    })

In [9]:
processed_data[0]

{'values': [-0.016116119921207428,
  0.002050961833447218,
  0.010668977163732052,
  0.011643101461231709,
  0.03186114504933357,
  0.01945597305893898,
  -0.03869989141821861,
  0.011152725666761398,
  0.011258753016591072,
  0.05364971235394478,
  0.042490363121032715,
  -0.014499206095933914,
  -0.053861767053604126,
  -0.010993685573339462,
  -0.0018124007619917393,
  -0.006964655127376318,
  0.0005069421022199094,
  0.02860081009566784,
  0.011106339283287525,
  0.030138203874230385,
  0.044266317039728165,
  -0.012444931082427502,
  0.056565459817647934,
  0.017786046490073204,
  -0.03374312445521355,
  -0.07363582402467728,
  -0.007064055651426315,
  -0.02347175031900406,
  -0.013677496463060379,
  0.028680332005023956,
  0.08434455841779709,
  -0.015029341913759708,
  0.0049136932939291,
  -0.013796776533126831,
  -0.04482295736670494,
  0.07941430062055588,
  0.021311447024345398,
  0.024479007348418236,
  0.025419997051358223,
  0.003308376995846629,
  0.005500155966728926,
 

In [10]:
index = pc.Index('rmp')
index.upsert(
    vectors = processed_data,
    namespace = 'ns1'
)

{'upserted_count': 47}

In [11]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 46}},
 'total_vector_count': 46}