In [17]:
from dotenv import load_dotenv
load_dotenv()
import os
import google.generativeai as genai
from pinecone import Pinecone,ServerlessSpec


  from .autonotebook import tqdm as notebook_tqdm


In [20]:
import json


In [21]:
data = json.load(open("reviews.json"))
data['reviews']

[{'professor': 'Dr. Sarah Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Chen is an exceptional professor. Her lectures are engaging and she explains complex concepts clearly.'},
 {'professor': 'Prof. Michael Rodriguez',
  'subject': 'History',
  'stars': 4,
  'review': 'Prof. Rodriguez brings history to life with his passionate teaching style. Sometimes his assignments can be overwhelming.'},
 {'professor': 'Dr. Emily Watson',
  'subject': 'Biology',
  'stars': 5,
  'review': "Dr. Watson's labs are incredibly well-organized. She's always available for extra help and genuinely cares about student success."},
 {'professor': 'Prof. David Kim',
  'subject': 'Mathematics',
  'stars': 3,
  'review': 'Prof. Kim is knowledgeable, but his teaching style can be confusing. Office hours are helpful for clarification.'},
 {'professor': 'Dr. Lisa Patel',
  'subject': 'Psychology',
  'stars': 5,
  'review': "Dr. Patel's lectures are fascinating. She incorporates real-world ex

In [23]:
pc=Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [24]:
path=os.getenv('REVIEWS_PATH') or 'reviews.json'

In [3]:
for m in genai.list_models():
  if 'embedContent' in m.supported_generation_methods:
    print(m.name)

models/embedding-001
models/text-embedding-004


In [1]:
pip install pandas


Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp310-cp310-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   -------

In [19]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=768, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [36]:
dat=[]
for review in data['reviews']:
    res=genai.embed_content(
        model='models/text-embedding-004',
        content=review['review']
    )
    embedding=res['embedding']
    dat.append({
    "values":embedding,
    "id":review['professor'],
    "metadata":{
        "review": review['review'],
        "subject": review['subject'],
        "stars":review['stars']
    }
})

In [30]:
dat[0]

{'values': [-0.015511699,
  0.011551123,
  -0.041256137,
  0.014992137,
  0.011100046,
  0.0085054515,
  0.0066481163,
  0.027505651,
  0.03788948,
  0.024503123,
  0.08252927,
  0.01922041,
  0.017241191,
  -0.0059917066,
  0.03941187,
  -0.07449147,
  -0.02580425,
  0.06567825,
  -0.11576736,
  0.03943941,
  0.024415433,
  0.001143901,
  -0.006793255,
  -0.040854897,
  0.0011727796,
  0.016738556,
  0.030107114,
  0.0046591796,
  0.022212079,
  -0.03736389,
  -0.019580798,
  0.03607076,
  -0.030993795,
  -0.01562597,
  -0.023763672,
  0.045189306,
  0.0161754,
  -0.038555946,
  0.07091107,
  0.0034161324,
  -0.005292899,
  -0.03275814,
  0.0072522294,
  0.033717025,
  -0.013894461,
  -0.045606542,
  0.032040488,
  0.0986436,
  -0.01257415,
  0.06297843,
  -0.021305745,
  0.04066609,
  -0.03451076,
  0.031047264,
  0.014022993,
  -0.017953023,
  -0.065339535,
  -0.025270242,
  0.03258478,
  0.033474505,
  -0.01214859,
  0.031427424,
  -0.017568652,
  -0.043816704,
  0.01929464,
  0.01

In [38]:
index= pc.Index("rag")
upsert_response=index.upsert(
    vectors=dat,
    namespace="ns1",
)
print(f"Upserted count:{upsert_response['upserted_count']}")
print(index.describe_index_stats())

Upserted count:20
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 21}},
 'total_vector_count': 21}
