In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import os
import json

In [37]:
# Load environment variables.
# `load_dotenv` comes from the python-dotenv package; presumably it reads a local
# `.env` file into the process environment — confirm the file exists in the repo.
# NOTE(review): no visible cell reads an environment variable, so this call may be
# leftover or intended for a real vector-store client later — confirm.
# NOTE(review): this import lives outside the notebook's import cell; consider
# moving it there so a fresh "Restart & Run All" shows every dependency up front.
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Hugging Face checkpoint used as the example embedding model.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Fetch the tokenizer first, then the model weights, both from the same checkpoint.
tokenizer, model = (
    loader.from_pretrained(model_name) for loader in (AutoTokenizer, AutoModel)
)



In [5]:
# Define a helper that turns text into a sentence embedding, then smoke-test it


def get_embeddings(text):
    """Embed text as the attention-masked mean of the last hidden state.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array produced by averaging the token vectors of each input
        and squeezing size-1 dimensions: one vector for a single input, one
        row per text for a batch of several texts.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    token_vectors = outputs.last_hidden_state
    # padding=True pads the shorter texts of a batch. A plain mean over the
    # sequence axis would average those pad positions in and skew the result,
    # so weight every position by the attention mask (1 = real token, 0 = pad).
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
    summed = (token_vectors * mask).sum(dim=1)
    # Clamp guards against division by zero if a row had no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = (summed / token_counts).squeeze().numpy()
    return embeddings

# Smoke-test the embedding helper on a short sample sentence.
# NOTE(review): printing the whole vector floods the cell output; printing
# `embeddings.shape` and a short slice would be easier to read.
test_text = "Hello, world!"
embeddings = get_embeddings(test_text)
print(embeddings)

[-1.86758161e-01  1.60997123e-01 -2.67068427e-02  7.02959299e-02
 -1.97098866e-01 -5.70063174e-01  1.55012146e-01  9.35192872e-03
 -2.08503664e-01  1.42686889e-01  2.07546368e-01  1.56744406e-01
  1.45996884e-01  5.37145250e-02 -2.63866663e-01 -2.45950162e-01
 -1.14997894e-01  5.27315252e-02 -6.73651159e-01  2.01303456e-02
  1.43493935e-01  3.26979131e-01 -7.52834156e-02  2.36951232e-01
 -4.31217819e-01 -6.22581542e-02  2.02568009e-01  1.99742869e-01
 -2.45356843e-01 -2.84341007e-01  2.38731384e-01  3.37002307e-01
  2.87383080e-01  4.27109487e-02 -7.78701380e-02  4.16504472e-01
 -3.82288069e-01 -3.79202962e-01  1.01378419e-01  7.92204812e-02
  1.59037784e-01 -2.61661261e-01 -3.04415822e-01 -1.18944325e-01
  3.62622924e-02  1.17296197e-01  3.11168134e-02  2.50195861e-01
  3.55965883e-01  1.69502035e-01 -2.67933786e-01 -2.90228158e-01
 -3.50596346e-02  9.85115469e-02  1.75356194e-01  2.73501631e-02
  5.27028181e-02 -2.58113950e-01  4.96397167e-02 -4.27142717e-02
 -3.07286292e-01  1.88174

In [6]:
# Load the review data.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's locale encoding, which can corrupt or reject non-ASCII review
# text on systems whose default is not UTF-8.
with open("reviews.json", encoding="utf-8") as f:
    data = json.load(f)


In [7]:
# Build one vector-store record per review: the embedding of the review text
# as a plain list, the professor as the record id, and the remaining fields
# carried along as metadata.
processed_data = [
    {
        "values": get_embeddings(entry["review"]).tolist(),
        "id": entry["professor"],
        "metadata": {
            "review": entry["review"],
            "subject": entry["subject"],
            "stars": entry["stars"],
        },
    }
    for entry in data["reviews"]
]

In [8]:
# Placeholder function to represent vector store insertion
def insert_into_vector_store(vectors, namespace):
    """Stand-in for a real vector-store upsert.

    Nothing is stored: the function only prints how many vectors it was
    given and the namespace they were addressed to.

    Args:
        vectors: A sized collection of records to "insert".
        namespace: Name of the target namespace, echoed in the message.
    """
    # Swap this body for the actual vector-store client call.
    vector_count = len(vectors)
    message = f"Inserted {vector_count} vectors into the vector store under namespace '{namespace}'."
    print(message)

In [9]:
# Hand the prepared records to the (placeholder) vector-store insert under
# namespace "ns1"; as defined in this notebook it only prints a confirmation.
insert_into_vector_store(processed_data, namespace="ns1")

# Print a summary of how many review records were prepared.
print(f"Processed {len(processed_data)} reviews.")

Inserted 20 vectors into the vector store under namespace 'ns1'.
Processed 20 reviews.
