In [1]:
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import string

# Load variables from a local .env file first so the os.environ lookups
# below can see them.
load_dotenv()

# Root directory for this project's data files. Defaults to the original
# hard-coded location; set DR_PATH in the environment (or in .env) to run the
# notebook on another machine without editing this cell.
dr_path = os.environ.get(
    "DR_PATH",
    "D:/Projects/Agentic Fake Review Bounty Hunter/test_phase",
)

# SQL_ENGINE holds the SQLAlchemy database URL; a missing variable raises
# KeyError here rather than failing later on first use.
_base_url = os.environ['SQL_ENGINE']

# Pick the separator so a URL that already carries a query string stays valid
# once the connection options are appended.
_separator = "&" if "?" in _base_url else "?"

# Shared engine for every cell that talks to the database.
# NOTE(review): keepalives / application_name look like PostgreSQL driver
# options - confirm against the configured backend.
engine = create_engine(
    _base_url + _separator + "keepalives=1&connect_timeout=30",
    isolation_level="AUTOCOMMIT",
    pool_size=5,
    max_overflow=10,
    pool_timeout=60,
    connect_args={"application_name": "FeatureExtractor"}
)

In [2]:
def load_data(source="sql", limit=10000):
    """Load up to ``limit`` reviews from the CSV export or the SQL table.

    Args:
        source: ``"csv"`` reads ``test_phase/data/processed/reviews.csv``
            (relative to the working directory); ``"sql"`` queries the
            ``yelp_data`` table through the module-level ``engine``.
        limit: Maximum number of rows to return.

    Returns:
        A pandas DataFrame with at most ``limit`` rows.

    Raises:
        ValueError: If ``source`` is neither ``"csv"`` nor ``"sql"``.
    """
    if source == "csv":
        return pd.read_csv("test_phase/data/processed/reviews.csv").head(limit)
    if source == "sql":
        # int() guarantees that only a plain number is interpolated into the
        # SQL text. The frame is returned directly; previously it was
        # assigned and dropped, so callers received None.
        return pd.read_sql(f"SELECT * FROM yelp_data LIMIT {int(limit)}", engine)
    raise ValueError(f"Unknown source {source!r}; expected 'csv' or 'sql'.")

In [3]:
from transformers import pipeline
# Shared Hugging Face pipeline used by the sentiment helper functions; built
# once here so the checkpoint is not reloaded on every call.
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

def get_sentiment_label(text):
    """Return the sentiment label the classifier assigns to ``text``.

    The input is coerced to ``str`` and cut to its first 512 characters
    before being passed to the module-level ``sentiment_analyzer``.

    Args:
        text: Review text; any value is accepted and stringified.

    Returns:
        The ``"label"`` field of the first prediction, or ``"NEUTRAL"`` when
        the classifier raises an ordinary exception.
    """
    try:
        return sentiment_analyzer(str(text)[:512])[0]["label"]
    except Exception:
        # Best-effort fallback for rows the model cannot handle. Catching
        # Exception instead of using a bare except lets Ctrl-C / kernel
        # interrupts stop a long .apply() run rather than being recorded as
        # "NEUTRAL".
        return "NEUTRAL"
    
def get_sentiment_score(text):
    """Return a signed confidence score for the sentiment of ``text``.

    The input is coerced to ``str`` and cut to its first 512 characters
    before being passed to the module-level ``sentiment_analyzer``.

    Args:
        text: Review text; any value is accepted and stringified.

    Returns:
        The prediction's ``"score"`` when its label is ``"POSITIVE"``, the
        negated score for any other label, or ``0.0`` when the classifier
        raises an ordinary exception.
    """
    try:
        res = sentiment_analyzer(str(text)[:512])[0]
        return res["score"] if res["label"] == "POSITIVE" else -res["score"]
    except Exception:
        # Best-effort fallback for rows the model cannot handle. Catching
        # Exception instead of using a bare except lets Ctrl-C / kernel
        # interrupts stop a long .apply() run rather than being recorded as
        # a 0.0 score.
        return 0.0

Device set to use cpu


In [4]:
# Pull the full review table into memory.
df = pd.read_sql("SELECT * FROM yelp_data", engine)


def _label_and_score(text):
    """Classify ``text`` once and return ``(label, signed_score)``.

    Applies the same 512-character cut, the same sign convention (positive
    score for "POSITIVE", negated otherwise) and the same fallbacks
    ("NEUTRAL" / 0.0) as get_sentiment_label and get_sentiment_score, but
    with a single model call instead of one call per column.
    """
    try:
        res = sentiment_analyzer(str(text)[:512])[0]
        signed = res["score"] if res["label"] == "POSITIVE" else -res["score"]
        return res["label"], signed
    except Exception:
        return "NEUTRAL", 0.0


# One inference pass per review; both derived columns come from the same
# prediction, which halves the model calls compared with two .apply() passes.
_predictions = [_label_and_score(text) for text in df["text"]]
df["sentiment"] = [label for label, _ in _predictions]
df["sentiment_score"] = [score for _, score in _predictions]

# if_exists="replace" drops and recreates yelp_data with the enriched frame,
# so the source table is overwritten in place.
df.to_sql("yelp_data", engine, if_exists="replace", index=False)
df.to_csv(f"{dr_path}/data/processed/sentiment-analysis.csv", index=False, encoding="utf-8")
print("✅ Feature extraction completed and saved.")

✅ Feature extraction completed and saved.


In [5]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# model_name = "google/flan-t5-base"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# df = pd.read_csv(f"{dr_path}/data/processed/sample_prompts.csv")

# prompts = [
#     "Write a fake positive restaurant review.",
#     "Write a fake negative restaurant review.",
#     "Write a fake neutral restaurant review."
# ]

# generated_reviews = []

# for prompt in df["prompt"]:
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids
#     outputs = model.generate(
#         input_ids,
#         max_length=100,
#         do_sample=True,
#         top_k=50,               # Keeping top 50 likely words
#         top_p=0.92,             # tokens covering 92% of probability mass
#         temperature=0.95,       #more variation
#         repetition_penalty=1.2, # reduce using repeated phrases
#         num_return_sequences=3, # multiple options at once
#     )
#     for i, output in enumerate(outputs):
#         fake_review = tokenizer.decode(output, skip_special_tokens=True)
#         generated_reviews.append({
#             "prompt": prompt,
#             "variation": i+1,
#             "fake_review": fake_review
#         })

# df = pd.DataFrame(generated_reviews)
# df.to_csv(f"{dr_path}/data/processed/generated_fakes.csv", index=False)
# print("✅ Generated reviews saved to generated_fakes.csv")


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

def generate_fake_review(review, target_sentiment):
    """Ask the generation model to rewrite ``review`` in the requested tone.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        review: Original review text, embedded verbatim in the prompt.
        target_sentiment: Tone word inserted into the prompt.

    Returns:
        A ``(generated_text, prompt)`` tuple: the decoded first generated
        sequence and the exact prompt that was sent to the model.
    """
    prompt = f"Rewrite the following review with a {target_sentiment} tone: {review}"

    # The prompt is truncated to at most 256 tokens before generation.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)

    # Sampling configuration for generation.
    sampling_options = dict(
        max_length=100,
        do_sample=True,
        top_k=50,
        top_p=0.92,
        temperature=0.95,
        repetition_penalty=1.2,
    )
    generated = model.generate(encoded.input_ids, **sampling_options)

    rewritten = tokenizer.decode(generated[0], skip_special_tokens=True)
    return rewritten, prompt

# Swap model to BART
# NOTE(review): facebook/bart-large appears to be a base checkpoint rather
# than an instruction-tuned one - confirm that the "Rewrite the following
# review ..." prompt actually yields rewritten text.
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Reviews with the sentiment / sentiment_score columns written by the
# sentiment-analysis cell.
df = pd.read_csv(f"{dr_path}/data/processed/sentiment-analysis.csv")

results = []
for _, row in df.iterrows():
    review = row["text"]
    sentiment = row["sentiment"]

    # The stored labels are upper-case ("POSITIVE" / "NEGATIVE" / "NEUTRAL"),
    # so normalise case before comparing. Comparing the raw value against
    # lower-case strings never matched and sent every review to "positive".
    normalized_label = str(sentiment).strip().lower()

    # Flip the tone: positive reviews get a negative rewrite; negative and
    # any other label (e.g. neutral) get a positive rewrite.
    target_sentiment = "negative" if normalized_label == "positive" else "positive"

    s_score = row["sentiment_score"]
    fake_review, prompt = generate_fake_review(review, target_sentiment)

    results.append({
        "original_review": review,
        "original_sentiment": sentiment,
        "sentiment_score": s_score,
        "target_sentiment": target_sentiment,
        "prompt": prompt,
        "generated_fake_review": fake_review
    })

# Save results
df = pd.DataFrame(results)
df.to_csv(f"{dr_path}/data/processed/generated_fakes_bart.csv", index=False)
print("✅ Generated BART fake reviews saved to generated_fakes_bart.csv")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]