In [16]:
from dotenv import load_dotenv
load_dotenv()
import os
import torch
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoTokenizer, AutoModel

In [17]:
# Connect to Pinecone; the API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so that re-running this
# cell (e.g. Restart Kernel -> Run All) does not fail on an already-existing index.
# dimension=384 has to equal the length of the embedding vectors stored in the index.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [18]:
import json

# Parse the review records. The context manager closes the file handle as soon as
# parsing is done instead of leaving it open for the lifetime of the kernel, and the
# explicit encoding keeps the read independent of the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the list stored under the "reviews" key.
data["reviews"]

[{'professor': 'Dr. Jane Doe',
  'subject': 'Physics',
  'stars': 5,
  'review': 'Amazing professor! Makes complex topics easy to understand.'},
 {'professor': 'Dr. John Smith',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Very knowledgeable but sometimes hard to follow.'},
 {'professor': 'Dr. Alice Johnson',
  'subject': 'Chemistry',
  'stars': 3,
  'review': 'Good content, but lectures are a bit dry.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Engaging and practical. One of the best CS professors!'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Biology',
  'stars': 2,
  'review': 'Too much reading and not enough explanation.'},
 {'professor': 'Dr. Michael Wilson',
  'subject': 'History',
  'stars': 4,
  'review': 'Interesting lectures, but exams are tough.'},
 {'professor': 'Dr. Sarah Miller',
  'subject': 'Economics',
  'stars': 3,
  'review': 'Average professor, decent lectures.'},
 {'professor': 'Dr. Daniel Anderso

In [19]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


def _embed_text(text):
    """Encode ``text`` and return the mean of the last hidden state as a flat list.

    The text is tokenized on its own, run through ``model`` without gradient
    tracking, and the resulting ``last_hidden_state`` is averaged over dim 1
    (presumably the token axis -- confirm against the model's output layout).
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = torch.mean(hidden_states, dim=1)
    return pooled.squeeze().tolist()


# One record per review: the embedding as "values", the professor name as the
# record id, and the remaining review fields as metadata.
processed_data = [
    {
        "values": _embed_text(entry["review"]),
        "id": entry["professor"],
        "metadata": {
            "review": entry["review"],
            "subject": entry["subject"],
            "stars": entry["stars"],
        },
    }
    for entry in data["reviews"]
]



In [20]:
# Spot-check the first prepared record (its "values", "id" and "metadata" entries).
processed_data[0]

{'values': [-0.1510464996099472,
  0.1422683596611023,
  -0.06996234506368637,
  -0.06696899980306625,
  -0.24244903028011322,
  -0.29410770535469055,
  0.035952161997556686,
  0.01380439754575491,
  -0.0538240522146225,
  0.11732327193021774,
  -0.1681586652994156,
  0.33571362495422363,
  -0.23730643093585968,
  0.4143395721912384,
  -0.1001080796122551,
  -0.07381176948547363,
  -0.1153927743434906,
  -0.1937093734741211,
  -0.346631795167923,
  -0.42381536960601807,
  0.1563255935907364,
  0.2275293618440628,
  0.11645998805761337,
  -0.31045788526535034,
  -0.11558296531438828,
  0.12664909660816193,
  -0.029945457354187965,
  -0.09121561050415039,
  0.5156161189079285,
  -0.3438089191913605,
  -0.14175739884376526,
  0.47835132479667664,
  0.06757547706365585,
  0.14122770726680756,
  -0.17098529636859894,
  0.09991606324911118,
  0.030365079641342163,
  0.5542579293251038,
  0.26296520233154297,
  0.02020135521888733,
  -0.3231176435947418,
  0.17470693588256836,
  0.23491926491

In [21]:
# Open a handle to the 'rag' index and write every prepared record into namespace "ns1".
index = pc.Index('rag')
upsert_response = index.upsert(namespace="ns1", vectors=processed_data)

# Last expression of the cell, so the notebook displays the upsert response.
upsert_response

{'upserted_count': 20}

In [22]:
# Display the index statistics to check the per-namespace vector counts after the upsert.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}