# Test loading reviews, embedding them with OpenAI and a local Sentence Transformers model, and finding similar reviews by cosine similarity of their embeddings

In [41]:
from path import Path
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import openai
from openai import OpenAI
from typing import List
import time
from scipy import spatial

In [5]:
# Load the prepared sample of Rotten Tomatoes reviews joined with movie metadata.
# NOTE: `path` is an absolute, machine-specific project root.
path = Path('/Users/saghar/Desktop/my-project-2025')
reviews_csv_path = path / 'datasets/rotten-tomatoes-reviews/prep/reviews_w_movies_sample.csv'
reviews_df = pd.read_csv(reviews_csv_path)

In [6]:
def analyze_review_lengths(df: pd.DataFrame) -> None:
    """Print character-length statistics for the ``review_content`` column.

    The report shows the mean, median and maximum length plus the number of
    reviews longer than 1000 and 2000 characters, to inform a chunking strategy.

    Side effect: a ``review_length`` column (character count per review) is
    added to, or overwritten on, ``df`` in place.

    Args:
        df: Frame with a string ``review_content`` column.
    """
    df['review_length'] = df['review_content'].str.len()
    lengths = df['review_length']

    # Each entry is emitted with its own print call, in this order.
    report_lines = (
        "\nReview Length Analysis:",
        f"Mean length: {lengths.mean():.0f} chars",
        f"Median length: {lengths.median():.0f} chars",
        f"Max length: {lengths.max():.0f} chars",
        f"Reviews > 1000 chars: {(lengths > 1000).sum()}",
        f"Reviews > 2000 chars: {(lengths > 2000).sum()}",
    )
    for line in report_lines:
        print(line)

In [7]:
# Print the review-length report for the loaded sample.
# Note: the call also writes a `review_length` column onto reviews_df.
analyze_review_lengths(reviews_df)


Review Length Analysis:
Mean length: 130 chars
Median length: 129 chars
Max length: 257 chars
Reviews > 1000 chars: 0
Reviews > 2000 chars: 0


In [37]:
# Loaded SentenceTransformer models keyed by model name. Loading the weights is
# slow, so each model is built once per kernel instead of once per call.
_ST_MODEL_CACHE = {}


def _load_sentence_transformer(model_name: str = 'all-MiniLM-L6-v2'):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _ST_MODEL_CACHE:
        _ST_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _ST_MODEL_CACHE[model_name]


def get_sentence_transformer_embedding(text: str, verbose: bool = False) -> np.ndarray:
    """Embed ``text`` locally with the ``all-MiniLM-L6-v2`` Sentence Transformers model.

    Args:
        text: Text to embed. Only the first 1000 characters are passed to the model.
        verbose: When True, print the raw embedding returned by the model.

    Returns:
        The value returned by ``SentenceTransformer.encode`` for the (truncated) text.
    """
    st_model = _load_sentence_transformer()

    # Character-level guard only: 1000 characters is not a token limit. The
    # model applies its own maximum sequence length while encoding
    # (reportedly 256 word pieces for all-MiniLM-L6-v2 -- confirm on the model card).
    response = st_model.encode(text[:1000])
    if verbose:
        print("sentence transformer response looks like:")
        print(f"\n{response}")
    return response

    
def get_openai_embedding(text: str, verbose = False) -> np.ndarray:
    """Embed ``text`` with OpenAI's ``text-embedding-3-small`` model.

    A new ``OpenAI()`` client is constructed with no arguments on every call
    (presumably picking up credentials from the environment -- confirm).

    Args:
        text: Text to embed. Only the first 2000 characters are sent.
        verbose: When True, print the full API response object.

    Returns:
        The first embedding in the response, converted to a NumPy array.
    """
    api_client = OpenAI()
    request_kwargs = {
        "model": "text-embedding-3-small",
        "input": text[:2000],
    }
    api_response = api_client.embeddings.create(**request_kwargs)

    if verbose:
        print("OpenAI response looks like:")
        print(f"\n{api_response}")

    first_embedding = api_response.data[0].embedding
    return np.array(first_embedding)



In [19]:
# Pick one review to smoke-test the embedding helpers with.
# A fixed random_state keeps the chosen review stable across Restart & Run All.
sample_review_text = reviews_df.sample(n=1, random_state=42).iloc[0]['review_content']
sample_review_text

'Schwimmer completely lacks what it takes to ever get this movie off the ground. The story is fairly formula and the actors are basically on mark, yet the jokes (most of which are so predictable they are telegraphed a mile in advance) land with a virtually'

In [21]:
# Smoke-test the local Sentence Transformers helper on the sampled review.
# verbose=True also prints the raw embedding before its shape is shown.
emb = get_sentence_transformer_embedding(sample_review_text, verbose=True)
print(emb.shape)

sentence transformer response looks like:

[ 3.62196229e-02 -4.00959216e-02  2.41127312e-02 -9.74512845e-03
  6.88523874e-02  8.61959755e-02 -1.87520962e-02  4.03632931e-02
 -5.45066223e-02 -5.62848747e-02 -1.68221183e-02 -3.69461253e-02
 -7.19902047e-04  1.74872316e-02 -2.34141890e-02 -6.17805310e-03
  3.77200991e-02 -1.25256389e-01  1.13177905e-02 -5.00175497e-03
  7.52101690e-02 -2.91606318e-02  4.94331941e-02  7.43103549e-02
  3.57259326e-02 -3.48515622e-02 -7.07753301e-02  2.31353026e-02
 -7.71256536e-02  1.60581041e-02  1.06387958e-02  1.55146018e-01
 -5.63689172e-02 -3.33475098e-02 -9.08928167e-04  4.87216301e-02
 -9.97862546e-04  1.00982338e-01 -3.35534215e-02 -1.63945220e-02
 -2.70188763e-03 -5.10281213e-02  1.06751584e-02  1.66053288e-02
  4.06176932e-02 -6.34910911e-02  3.22255269e-02 -3.52837401e-03
 -1.58568900e-02  4.72278189e-04 -6.47259131e-02 -4.73665185e-02
  8.21182206e-02 -6.12049773e-02  2.55878828e-02  2.74506547e-02
  7.70138903e-03  2.46798526e-03  5.35224639e-0

In [25]:
# Smoke-test the OpenAI helper on the same sampled review (this issues an API request).
# verbose=True also prints the full response object before the embedding's shape is shown.
emb = get_openai_embedding(sample_review_text, verbose=True)
print(emb.shape)

OpenAI response looks like:

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.02352297678589821, 0.10681765526533127, -0.017504042014479637, 0.01740577444434166, 0.028497809544205666, -0.047610994428396225, 0.023682663217186928, -0.020452091470360756, 0.012713462114334106, 0.00519900768995285, -0.00684807263314724, -0.045056018978357315, -0.048421710729599, 0.030364908277988434, 0.0018517434364184737, -0.003632856532931328, -0.0020421382505446672, -0.03301815316081047, -0.007578943390399218, 0.00516522815451026, 0.0493306927382946, 0.034762416034936905, -0.03758762776851654, -0.036629512906074524, 0.0032551377080380917, -0.016042301431298256, -0.07085145264863968, 0.03930732235312462, 0.014187486842274666, 0.04036370664834976, 0.00304478220641613, -0.014678828418254852, -0.01617741957306862, -0.008401940576732159, 0.05252441018819809, 0.019064052030444145, -0.02389148250222206, -0.023461557924747467, -0.006061926484107971, 0.013045118190348148, -0.008401940576732159, 0.00191316113

In [38]:
def find_similar_reviews(df: pd.DataFrame, query_review_idx: int = 0, top_k: int = 5,
                         sample_size: int = 100, random_state=None):
    """Rank a random sample of reviews by embedding similarity to one of them.

    A sample of reviews is drawn from ``df``, each review is embedded with
    ``get_sentence_transformer_embedding``, and every other sampled review is
    scored against the query review with ``cosine_similarity_from_scratch``.
    The query and the best matches are printed.

    Args:
        df: Frame with ``review_content`` and ``movie_title`` columns.
        query_review_idx: Position of the query review *within the random
            sample* (not within ``df``). Negative values count from the end.
        top_k: Number of most-similar reviews to print and return.
        sample_size: Number of reviews to sample; capped at ``len(df)``.
        random_state: Forwarded to ``DataFrame.sample``; pass a value to make
            the sample (and so the results) reproducible. ``None`` keeps the
            sample unseeded.

    Returns:
        Up to ``top_k`` ``(position_in_sample, similarity)`` tuples, sorted by
        similarity in descending order.

    Raises:
        IndexError: If ``query_review_idx`` is outside the sampled range.
    """
    # Cap the request so a frame smaller than sample_size does not make
    # DataFrame.sample raise.
    n_reviews = min(sample_size, len(df))
    reviews_sample = df.sample(n=n_reviews, random_state=random_state)

    # Resolve negative positions before the comparison loop; otherwise the
    # `i != query_review_idx` filter never matches and the query review would
    # be ranked against itself. Checked before embedding to fail fast.
    requested_idx = query_review_idx
    if query_review_idx < 0:
        query_review_idx += n_reviews
    if not 0 <= query_review_idx < n_reviews:
        raise IndexError(
            f"query_review_idx {requested_idx} is out of range for a sample of {n_reviews} reviews"
        )

    # Embed all sampled reviews (first 500 characters of each)
    print(f"\nEmbedding {n_reviews} reviews...")
    embeddings = np.array([
        get_sentence_transformer_embedding(text[:500])
        for text in reviews_sample['review_content']
    ])

    # Score every other review against the query review
    query_embedding = embeddings[query_review_idx]
    similarities = [
        (i, cosine_similarity_from_scratch(query_embedding, embedding))
        for i, embedding in enumerate(embeddings)
        if i != query_review_idx
    ]

    # Most similar first
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    top_matches = similarities[:top_k]

    # Display results
    query_row = reviews_sample.iloc[query_review_idx]
    print(f"\nQuery Review (idx {query_review_idx}):")
    print(f"Movie: {query_row['movie_title']}")
    print(f"Review: {query_row['review_content'][:200]}...")

    print(f"\nTop {top_k} Similar Reviews:")
    for idx, sim in top_matches:
        match_row = reviews_sample.iloc[idx]
        print(f"\nSimilarity: {sim:.3f}")
        print(f"Movie: {match_row['movie_title']}")
        print(f"Review: {match_row['review_content'][:200]}...")

    return top_matches

In [39]:
def cosine_similarity_from_scratch(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Compute the cosine similarity of two vectors using basic NumPy operations.

    Inputs are converted to float64 arrays first, so narrow integer dtypes
    cannot overflow in the products and plain Python sequences are accepted.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, with the same shape as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a Python float, or ``0.0``
        when the product of the magnitudes is zero.
    """
    a = np.asarray(vec1, dtype=np.float64)
    b = np.asarray(vec2, dtype=np.float64)

    # Step 1: dot product (element-wise multiply, then sum)
    dot_product = np.sum(a * b)

    # Step 2: Euclidean magnitudes
    magnitude_a = np.sqrt(np.sum(a ** 2))
    magnitude_b = np.sqrt(np.sum(b ** 2))

    # Step 3: a zero-length vector has no direction; avoid dividing by zero
    denominator = magnitude_a * magnitude_b
    if denominator == 0:
        return 0.0

    # Step 4: cosine similarity, returned as a built-in float to match the annotation
    return float(dot_product / denominator)

In [44]:
# Find the 3 reviews most similar to the first review of a random sample.
# query_review_idx=0 is a position in the sample that find_similar_reviews draws, not a row of reviews_df.
find_similar_reviews(reviews_df, query_review_idx=0, top_k=3)


Embedding 100 reviews...

Query Review (idx 0):
Movie: 1,000 Times Good Night
Review: This potent Norwegian drama by Erik Poppe, himself a veteran war photographer, opens with a gripping sequence....

Top 3 Similar Reviews:

Similarity: 0.432
Movie: 1,000 Times Good Night
Review: An affecting drama made more poignant by honest-feeling autobiographical elements, Erik Poppe's A Thousand Times Goodnight examines the choice between family and career when that career represents wor...

Similarity: 0.353
Movie: The Situation
Review: A beautifully written and realized behind-the-scenes story aiming to make some sense of all the violence and chaos in Iraq today....

Similarity: 0.336
Movie: Molière
Review: While this film is rather talky and a long, it's also a wonderfully complex blend of wit and cheekiness, underscored by a surprisingly serious romance and a steely view of the battle of the sexes....


[(65, 0.43150902), (84, 0.35308316), (38, 0.33605778)]

In [43]:
# Cross-check the from-scratch cosine similarity against NumPy and SciPy references.
print("\nTesting cosine similarity implementation...")
vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])

my_similarity = cosine_similarity_from_scratch(vec1, vec2)
numpy_similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
scipy_similarity = 1 - spatial.distance.cosine(vec1, vec2)

print(f"Your implementation: {my_similarity:.6f}")
print(f"NumPy verification: {numpy_similarity:.6f}")
print(f"Scipy verification: {scipy_similarity:.6f}")

# Fail the cell on a mismatch instead of relying on a visual comparison of the printed values.
np.testing.assert_allclose(my_similarity, numpy_similarity, rtol=1e-9)
np.testing.assert_allclose(my_similarity, scipy_similarity, rtol=1e-9)


Testing cosine similarity implementation...
Your implementation: 0.974632
NumPy verification: 0.974632
Scipy verification: 0.974632
