In [6]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Load the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load your data
# NOTE(review): this is an absolute, machine-specific path; adjust it as
# necessary (ideally derive it from one configurable data directory).
df = pd.read_csv('/Users/praveenkumarkumaresan/code/mlops_homework5/data/6000_all_categories_questions.csv')

# Generate embeddings for the 'prompt' column.
# Missing prompts become empty strings so that a NaN cell (a float) is never
# handed to the encoder.
prompts = df['prompt'].fillna('').astype(str).tolist()

# Encode every prompt in a single call so the library can batch the work,
# instead of invoking the encoder once per row through Series.apply.
prompt_vectors = model.encode(prompts)

# Store one plain Python list per row (same cell format as before), aligned
# explicitly on the frame's own index.
df['embedding'] = pd.Series(prompt_vectors.tolist(), index=df.index)

# Save the embeddings for later use
df.to_pickle('/Users/praveenkumarkumaresan/code/mlops_homework5/data/embeddings.pkl')

In [4]:
# Show the column labels of the loaded frame to check its schema.
column_labels = df.columns
print(column_labels)

Index(['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'], dtype='object')


In [None]:
# Build the inference pipeline

In [7]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Inference-side setup: the encoder checkpoint and the pickled frame that
# carries one stored vector per row in its 'embedding' column.
encoder_name = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings_pickle = '/Users/praveenkumarkumaresan/code/mlops_homework5/data/embeddings.pkl'

model = SentenceTransformer(encoder_name)
df = pd.read_pickle(embeddings_pickle)

def retrieve_similar_excerpts(query, top_k=5):
    """Return the rows of ``df`` whose stored embedding is closest to ``query``.

    The query is encoded with the module-level ``model`` and compared, by
    cosine similarity, against every vector in ``df['embedding']``.

    Args:
        query: Text to search for.
        top_k: Maximum number of rows to return. Values below 1 produce an
            empty result; values above ``len(df)`` are capped at ``len(df)``.

    Returns:
        A DataFrame with exactly the columns ``'excerpt'`` and ``'question'``,
        ordered from most to least similar.

    Raises:
        KeyError: If ``df`` has no column that can supply one of the two
            output columns.
    """
    # Output column -> candidate source columns; the first one present wins.
    # The frame built in this notebook has 'prompt', 'A'..'E', 'answer' and
    # 'embedding' but no 'excerpt'/'question', so selecting those two names
    # directly raised KeyError. 'prompt' is the text that was embedded, so it
    # is used as the fallback.
    # NOTE(review): mapping both outputs to 'prompt' is a stopgap -- confirm
    # which dataset columns should really back 'excerpt' and 'question'.
    column_sources = {
        'excerpt': ('excerpt', 'prompt'),
        'question': ('question', 'prompt'),
    }
    resolved = {}
    for target, candidates in column_sources.items():
        source = next((name for name in candidates if name in df.columns), None)
        if source is None:
            raise KeyError(
                f"none of the columns {candidates} needed for '{target}' exist in df"
            )
        resolved[target] = source

    # Clamp top_k: a slice like [-0:] would otherwise return every row, and a
    # negative value would silently drop rows from the wrong end.
    k = max(0, min(int(top_k), len(df)))
    if k == 0:
        # Also covers an empty frame, which cosine_similarity cannot handle.
        empty = df.iloc[:0]
        return pd.DataFrame({target: empty[source] for target, source in resolved.items()})

    # Generate embedding for the query (a one-element batch).
    query_embedding = model.encode([query])

    # Compute cosine similarities; [0] selects the row for the single query.
    similarities = cosine_similarity(query_embedding, df['embedding'].tolist())[0]

    # Positions of the k highest scores, best first.
    top_indices = similarities.argsort()[::-1][:k]

    # Retrieve the corresponding rows and expose them under the output names.
    results = df.iloc[top_indices]
    return pd.DataFrame({target: results[source] for target, source in resolved.items()})


In [8]:
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
import pandas as pd

# Module-level state read by retrieve_similar_excerpts: the sentence encoder
# and the frame holding the stored embeddings.
embeddings_pkl = '/Users/praveenkumarkumaresan/code/mlops_homework5/data/embeddings.pkl'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
df = pd.read_pickle(embeddings_pkl)

# Application object that the route decorators below register against.
app = FastAPI()

class QueryRequest(BaseModel):
    """Request body accepted by the ``/query`` endpoint."""
    # Free-text search string; passed to retrieve_similar_excerpts as `query`.
    query: str
    # Number of matches requested; 5 is used when the field is omitted.
    top_k: int = 5

class QueryResponse(BaseModel):
    """Response body returned by the ``/query`` endpoint."""
    # Values of the 'excerpt' column of the retrieved rows.
    excerpts: List[str]
    # Values of the 'question' column of the same rows, in the same order.
    questions: List[str]

@app.post("/query", response_model=QueryResponse)
def query_excerpts(request: QueryRequest):
    """Handle ``POST /query``: look up the rows most similar to the query text.

    Args:
        request: Parsed request body carrying ``query`` and ``top_k``.

    Returns:
        A ``QueryResponse`` whose two lists come from the 'excerpt' and
        'question' columns of the frame returned by
        ``retrieve_similar_excerpts``.
    """
    matches = retrieve_similar_excerpts(request.query, request.top_k)

    # Unpack each column into a plain list before building the response model.
    excerpt_values = matches['excerpt'].tolist()
    question_values = matches['question'].tolist()
    return QueryResponse(excerpts=excerpt_values, questions=question_values)
