In [1]:
from dotenv import load_dotenv
# Runs before the remaining imports. Presumably this loads PINECONE_API_KEY and
# GEMINI_API_KEY from a local .env file so the os.getenv() calls in later cells
# can find them -- TODO confirm the .env location.
load_dotenv()
import os
import google.generativeai as genai
# NOTE(review): in the visible cells, HuggingFaceEmbeddings and SentenceTransformer
# are referenced only by the commented-out embedding cell.
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [2]:
# Name of the serverless index that stores the professor-review embeddings.
INDEX_NAME = "rag-prof"

# Pinecone client; the API key is read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Creating an index that already exists is rejected by Pinecone, so only create
# it when it is missing. This keeps the cell re-runnable (Restart Kernel -> Run All).
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [3]:
# Hand the Gemini API key from the environment to the google.generativeai client.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

In [4]:
import json

# Read the review records. The context manager guarantees the file handle is
# closed, and the explicit encoding avoids depending on the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the loaded records as the cell output.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Computer Science',
  'stars': 5,
  'reviews': 'Great lecturer with deep knowledge. Makes complex topics easy to understand.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 4,
  'reviews': 'Very clear explanations, but sometimes moves too fast through material.'},
 {'professor': 'Prof. Michael Brown',
  'subject': 'Physics',
  'stars': 3,
  'reviews': 'The lectures are informative but can be a bit dry. Overall a good professor.'},
 {'professor': 'Dr. Sarah Davis',
  'subject': 'Chemistry',
  'stars': 4,
  'reviews': 'Engaging and always willing to help. Labs are well organized.'},
 {'professor': 'Prof. Robert Wilson',
  'subject': 'History',
  'stars': 5,
  'reviews': 'Passionate about the subject and makes history come alive!'},
 {'professor': 'Dr. Linda Martinez',
  'subject': 'Biology',
  'stars': 2,
  'reviews': 'Lectures are hard to follow and not very interactive. Needs improvement.'},
 {'professor': 'Pro

In [None]:
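# NOTE: disabled alternative that embeds locally with SentenceTransformer instead of Gemini.
# Before re-enabling: all-MiniLM-L6-v2 produces 384-dimensional vectors, while the
# 'rag-prof' index is created with dimension=768, and the records store their text
# under the key 'reviews', not 'review'.
# processed_data = []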
# # hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# for review in data['reviews']:
#     # embedding = hf_embeddings.embed_query(review['review'])
#     embedding = model.encode(review['review'])

#     processed_data.append({
#         "values": embedding,
#         "id": review['professor'],
#         "metadata": {
#             "review": review['review'],
#             "subject": review['subject'],
#             "stars": review['stars']
#         }
#     })

In [7]:
def _embed_document(text):
    """Return the Gemini embedding vector for a single review text."""
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=text,
        task_type="retrieval_document",
        title="Embedding of single string",
    )
    return response['embedding']


# One Pinecone record per review: the embedding goes in "values", the professor
# name is the record id, and the remaining fields are kept as metadata.
processed_data = [
    {
        "values": _embed_document(entry['reviews']),
        "id": entry['professor'],
        "metadata": {
            "reviews": entry['reviews'],
            "subject": entry['subject'],
            "stars": entry['stars'],
        },
    }
    for entry in data['reviews']
]

In [8]:
# Sanity check: display the first prepared record ("values", "id", "metadata").
# NOTE(review): this echoes the entire embedding vector; consider slicing
# "values" to keep the cell output short.
processed_data[0]

{'values': [-0.019308217,
  0.0047932565,
  -0.090842046,
  0.0055260714,
  0.051078595,
  0.0009726668,
  -0.0017784483,
  0.04107562,
  -0.016787557,
  0.025405452,
  0.06420509,
  0.0053377985,
  0.036520492,
  -0.01913948,
  0.005485459,
  -0.086185895,
  0.03598118,
  0.015687043,
  -0.10185411,
  0.030569768,
  0.0068826815,
  -0.040635422,
  0.04722676,
  -0.047823012,
  -0.0166457,
  -0.005362746,
  0.010267265,
  -0.06609747,
  -0.011332266,
  -0.026507149,
  0.049780592,
  0.056296278,
  -0.015164997,
  -0.041057266,
  0.008179926,
  0.05120908,
  0.0021588455,
  -0.0025200448,
  0.053465724,
  -0.053193934,
  -0.031178685,
  0.010406119,
  -0.02579288,
  0.07255492,
  -0.06277538,
  -0.024286633,
  0.0055049546,
  0.038533032,
  -0.0043462734,
  0.053902574,
  0.0299156,
  0.04177621,
  -0.03192737,
  0.053435005,
  -0.0052060783,
  -0.03288378,
  -0.013845121,
  -0.022611331,
  0.012584211,
  -0.0109262895,
  -0.046139244,
  -0.032748584,
  -0.037278134,
  -0.073023334,
  -

In [9]:
# Open a handle to the 'rag-prof' index and write every prepared record into
# namespace "ns1". The upsert call is the last expression of the cell, so its
# response is shown as the cell output.
index = pc.Index('rag-prof')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [10]:
# Ask Pinecone for the index statistics to check what the index currently holds.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}