In [4]:
from model.processed_review_repository import ProcessedReviewRepository
from model.review_repository import ReviewRepository
from model.session_factory import get_db_session

# create repositories
session_generator = get_db_session()
session = next(session_generator)
reviewRepository = ReviewRepository(session=session)
processed_review_repo = ProcessedReviewRepository(session=session)



In [2]:
from load_dataset import download_dataset
# save businesses
import json
from load_dataset import load_dataset
path = download_dataset()
print(f"Extracted dataset to {path}")

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\sergiu\.cache\kagglehub\datasets\yelp-dataset\yelp-dataset\versions\4
Extracted dataset to C:\Users\sergiu\.cache\kagglehub\datasets\yelp-dataset\yelp-dataset\versions\4


In [3]:
from datetime import datetime

TARGET_LOW_STARS = 20000    # 1-2 stars
TARGET_MID_STARS = 20000    # 3 stars
TARGET_HIGH_STARS = 20000   # 4-5 stars

# Track counts
low_stars_count = 0   # 1-2 stars
mid_stars_count = 0   # 3 stars
high_stars_count = 0  # 4-5 stars
total_inserted = 0
total_scanned = 0

print(f"Target distribution:")
print(f"  1-2 stars: {TARGET_LOW_STARS}")
print(f"  3 stars: {TARGET_MID_STARS}")
print(f"  4-5 stars: {TARGET_HIGH_STARS}")
print(f"  Total: {TARGET_LOW_STARS + TARGET_MID_STARS + TARGET_HIGH_STARS}\n")

for review in load_dataset(directory_path=path, row_count=None, json_file_name='yelp_academic_dataset_review.json'):
    total_scanned += 1
    stars = review['stars']
    
    # Check if we still need reviews from this star category
    should_insert = False
    if stars in [1, 2] and low_stars_count < TARGET_LOW_STARS:
        should_insert = True
        category = "low"
    elif stars == 3 and mid_stars_count < TARGET_MID_STARS:
        should_insert = True
        category = "mid"
    elif stars in [4, 5] and high_stars_count < TARGET_HIGH_STARS:
        should_insert = True
        category = "high"
    
    if should_insert:
        reviewRepository.create(
            review_id=review['review_id'],
            stars=review['stars'],
            text=review['text'],
            date=datetime.strptime(review['date'], '%Y-%m-%d %H:%M:%S'),  
            # business_id=review['business_id']
        )
        
        # Update counts
        if category == "low":
            low_stars_count += 1
        elif category == "mid":
            mid_stars_count += 1
        elif category == "high":
            high_stars_count += 1
        
        total_inserted += 1
        
        if total_inserted % 1000 == 0:
            print(f"Scanned: {total_scanned:,} | Inserted: {total_inserted:,} | 1-2: {low_stars_count:,} | 3: {mid_stars_count:,} | 4-5: {high_stars_count:,}")
    
    # Check if all categories are full
    if (low_stars_count >= TARGET_LOW_STARS and 
        mid_stars_count >= TARGET_MID_STARS and 
        high_stars_count >= TARGET_HIGH_STARS):
        print(f"\nAll categories filled! Stopping.")
        break

print(f"\n{'='*60}")
print(f"Final Results:")
print(f"  Total scanned: {total_scanned:,}")
print(f"  Total inserted: {total_inserted:,}")
print(f"  1-2 stars: {low_stars_count:,} / {TARGET_LOW_STARS:,}")
print(f"  3 stars: {mid_stars_count:,} / {TARGET_MID_STARS:,}")
print(f"  4-5 stars: {high_stars_count:,} / {TARGET_HIGH_STARS:,}")
print(f"{'='*60}")

Target distribution:
  1-2 stars: 20000
  3 stars: 20000
  4-5 stars: 20000
  Total: 60000

Scanned: 1,000 | Inserted: 1,000 | 1-2: 186 | 3: 126 | 4-5: 688
Scanned: 2,000 | Inserted: 2,000 | 1-2: 377 | 3: 240 | 4-5: 1,383
Scanned: 3,000 | Inserted: 3,000 | 1-2: 572 | 3: 361 | 4-5: 2,067
Scanned: 4,000 | Inserted: 4,000 | 1-2: 754 | 3: 481 | 4-5: 2,765
Scanned: 5,000 | Inserted: 5,000 | 1-2: 940 | 3: 593 | 4-5: 3,467
Scanned: 6,000 | Inserted: 6,000 | 1-2: 1,100 | 3: 700 | 4-5: 4,200
Scanned: 7,000 | Inserted: 7,000 | 1-2: 1,288 | 3: 812 | 4-5: 4,900
Scanned: 8,000 | Inserted: 8,000 | 1-2: 1,477 | 3: 928 | 4-5: 5,595
Scanned: 9,000 | Inserted: 9,000 | 1-2: 1,659 | 3: 1,037 | 4-5: 6,304
Scanned: 10,000 | Inserted: 10,000 | 1-2: 1,842 | 3: 1,139 | 4-5: 7,019
Scanned: 11,000 | Inserted: 11,000 | 1-2: 2,023 | 3: 1,251 | 4-5: 7,726
Scanned: 12,000 | Inserted: 12,000 | 1-2: 2,184 | 3: 1,357 | 4-5: 8,459
Scanned: 13,000 | Inserted: 13,000 | 1-2: 2,383 | 3: 1,487 | 4-5: 9,130
Scanned: 14,000 | 

In [None]:
from review_service import ReviewService
from llm_service import llm_service
from model.processed_review_repository import ProcessedReviewRepository
from sentiment_service import SentimentService

# Initialize services
llm_svc = llm_service(model_name='gpt-3.5-turbo')
sentiment_service = SentimentService(model_path="models/distilbert_sentiment_classifier")

review_service = ReviewService(review_repository=reviewRepository, llm_service=llm_svc, sentiment_service=sentiment_service)

batch_size = 100
total_processed = 0 
start_offset = 0
max_reviews_to_process = 10

while True:
    reviews = reviewRepository.get_all(limit=batch_size, offset=start_offset)
    if total_processed >= max_reviews_to_process:
        print(f"Reached limit of {max_reviews_to_process} reviews. Stopping.")
        break
    if not reviews:
        print("No more reviews to process!")
        break
    print(f"\nProcessing batch starting at offset {start_offset} ({len(reviews)} reviews)...")

    # process in batches 
    for idx, review in enumerate(reviews):
          try:
              processed = review_service.process_review(review)

              if processed:
                  processed_review_repo.create(
                      review_id=processed.review_id,
                      summary=processed.summary,
                      passed_halucination_check=processed.passed_halucination_check,
                      llm_sentiment=processed.llm_sentiment,
                      dl_sentiment=processed.dl_sentiment
                  )
                  total_processed += 1

                  if (total_processed % 10) == 0:
                      print(f"  Processed {total_processed} reviews...")
          except Exception as e:
              print(f"  Error processing review {review.review_id}: {e}")
              continue
                    # Move to next batch
    start_offset += batch_size

print(f"\nFinish. Total processed: {total_processed}")



Processing batch starting at offset 0 (100 reviews)...
LLM Sentiment: Negative
DL Sentiment: neutral
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: Positive
DL Sentiment: neutral
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: Positive
DL Sentiment: neutral
LLM Sentiment: negative
DL Sentiment: negative
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: Negative
DL Sentiment: neutral
LLM Sentiment: Negative
DL Sentiment: neutral
  Processed 10 reviews...
LLM Sentiment: Positive
DL Sentiment: positive
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: Positive
DL Sentiment: neutral
LLM Sentiment: Positive
DL Sentiment: neutral
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: positive
DL Sentiment: positive
LLM Sentiment: Positive
DL Sentiment: positive
LLM Sentiment: positive
DL Senti

In [7]:
processed_review_id = 1
processed_review = processed_review_repo.get_by_id(processed_review_id)
if processed_review is None:
    print(f'No review found with id {processed_review_id}.')
else:
    print(f'Review with id {processed_review_id}:')
    print(f'  Review ID: {processed_review.review_id}')
    print(f'  Summary: {processed_review.summary}')
    print(f' LLM Sentiment: {processed_review.llm_sentiment}')
    print(f'  DL Sentiment: {processed_review.dl_sentiment}')
    print(f'Original review text {processed_review.review.text}')

Review with id 1:
  Review ID: KU_O5udG6zpxOg-VcAEodg
  Summary: The reviewer mentions that the restaurant takes about 2 hours from start to finish, which they find to be too long. They have tried the restaurant multiple times and while the food is good, it takes a very long time to be served. The waitstaff is described as young but usually pleasant. The reviewer has had multiple experiences where they spent too much time waiting and as a result, they often choose to dine at other establishments on weekends for a quicker dining experience.
 LLM Sentiment: Negative
  DL Sentiment: neutral
Original review text If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. 

The food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences 