In [None]:
import pandas as pd
import numpy as np
import torch
import cv2
import umap
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from transformers import DistilBertTokenizer, DistilBertModel, pipeline
from sentence_transformers import SentenceTransformer
from textstat import flesch_reading_ease
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from skimage import io
from datasets import load_dataset
from scipy.stats import kurtosis  # Added for sharpness estimation
import fasttext.util
from sklearn.model_selection import train_test_split
from concurrent.futures import ThreadPoolExecutor

# Fetch the NLTK resources used later in the notebook: the sentiment lexicon,
# the tokenizer models, the stop-word list and WordNet for lemmatization.
for nltk_resource in ('vader_lexicon', 'punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
from datasets import load_dataset

category_reviews = "raw_review_All_Beauty"
category_meta = "raw_meta_All_Beauty"

# Load the review and product-metadata configurations from Hugging Face
dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", category_reviews, split="full")
dataset_meta = load_dataset("McAuley-Lab/Amazon-Reviews-2023", category_meta, split="full")

# ✅ Step 1: Remove reviews with 0 helpful votes BEFORE sampling
dataset_reviews = dataset_reviews.filter(lambda x: x["helpful_vote"] > 0)
# Capture the size now: `dataset_reviews` is rebound to the sample below, so reading
# its length later would report the sampled size instead of the filtered size.
num_filtered_reviews = len(dataset_reviews)

# ✅ Step 2: Sample 10% of filtered data (fixed shuffle seed, so the sample is repeatable)
num_samples = int(num_filtered_reviews * 0.1)
dataset_reviews = dataset_reviews.shuffle(seed=42).select(range(num_samples))

# NOTE(review): the metadata is sampled independently of the reviews, so many sampled
# reviews will find no product row in the later left-merge on `parent_asin`. Consider
# keeping the metadata rows whose `parent_asin` appears in the sampled reviews instead.
num_samples_meta = int(len(dataset_meta) * 0.1)
dataset_meta = dataset_meta.shuffle(seed=42).select(range(num_samples_meta))

# Convert to Pandas DataFrame
df_reviews = dataset_reviews.to_pandas()
df_meta = dataset_meta.to_pandas()

# Report the size at each stage
print(f"Filtered dataset size (before sampling): {num_filtered_reviews} reviews")
print(f"Sampled dataset size (after selecting 10%): {df_reviews.shape[0]} reviews")

# Optionally save the final filtered dataset
df_reviews.to_csv("filtered_sampled_reviews.csv", index=False)
print("Final dataset saved as filtered_sampled_reviews.csv")


In [None]:
# Handle missing values
# Reviews without text are dropped; a missing helpful-vote count is treated as 0.
df_reviews = df_reviews.dropna(subset=['text']).fillna({'helpful_vote': 0})

# Prices that cannot be parsed as numbers become NaN instead of raising.
price_numeric = pd.to_numeric(df_meta['price'], errors='coerce')
df_meta['price'] = price_numeric

In [None]:
# Attach product metadata to every review; the left join keeps reviews that have
# no matching metadata row (their metadata columns are NaN).
df = df_reviews.merge(df_meta, how='left', on='parent_asin')

In [None]:
# Scale vote counts by (largest count + 1), so the most-voted review stays below 1.
vote_ceiling = df['helpful_vote'].max() + 1
df['helpfulness_score'] = df['helpful_vote'].div(vote_ceiling)

In [None]:
# Shared NLP helpers used by the text-cleaning and feature cells below
sia = SentimentIntensityAnalyzer()                 # provides polarity_scores()
stop_words = {word for word in stopwords.words('english')}  # set for fast membership tests
lemmatizer = WordNetLemmatizer()

In [None]:
# Text Preprocessing
def clean_text(text):
    """Normalize a review string for embedding.

    Lower-cases the text, strips every character that is neither a word
    character nor whitespace, tokenizes it, drops tokens found in
    ``stop_words`` and lemmatizes the rest.

    Args:
        text: The raw review string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    kept_tokens = []
    for token in word_tokenize(without_punctuation):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)

In [None]:
# Build the cleaned-text column from the raw review text
df['cleaned_text'] = (
    df['text']
    .astype(str)        # clean_text calls .lower(), so every entry must be a string
    .apply(clean_text)
)

In [None]:
# Load FastText Model
# Fetch the English model file; `if_exists='ignore'` presumably skips the download
# when the file is already on disk (confirm against the fasttext.util docs).
fasttext.util.download_model('en', if_exists='ignore')  # Download model
# `ft` is the module-level model handle read by get_fasttext_embedding.
ft = fasttext.load_model('cc.en.300.bin')  # Load model

In [None]:
# Get FastText Embeddings
def get_fasttext_embedding(text):
    """Return the mean FastText word vector of a whitespace-tokenized text.

    Args:
        text: A string; it is split on whitespace and each piece is looked up
            with ``ft.get_word_vector``.

    Returns:
        The element-wise mean of the word vectors, or a 300-element zero
        vector when the text contains no tokens.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to average: fall back to an all-zero vector
        return np.zeros(300)
    return np.mean([ft.get_word_vector(token) for token in tokens], axis=0)

In [None]:
# One averaged FastText vector per cleaned review
df['fasttext_embedding'] = (
    df['cleaned_text']
    .apply(get_fasttext_embedding)
)

In [None]:
# Compute Other Text Features
# Sentiment and readability are computed on the RAW review text. The cleaned text has
# had its punctuation removed and its stop words dropped (the NLTK English list
# includes negations such as "not"), which removes the sentence boundaries a
# readability formula needs and can flip the polarity of phrases like "not good".
raw_text = df['text'].astype(str)
df['sentiment'] = raw_text.apply(lambda t: sia.polarity_scores(t)['compound'])
df['readability'] = raw_text.apply(flesch_reading_ease)

# Length in characters of the cleaned text
df['review_length'] = df['cleaned_text'].apply(len)

# Count every punctuation character, not just periods. The pattern matches the
# character class that clean_text strips.
df['punctuation_count'] = raw_text.apply(lambda t: len(re.findall(r'[^\w\s]', t)))

In [None]:
import cv2
import numpy as np
from scipy.ndimage import sobel

def estimate_image_quality(img):
    """Score an image from Laplacian variance, mean brightness and Sobel edge strength.

    Args:
        img: Image array. Accepted layouts are 2-D grayscale, or 3-D with
            1, 3 (RGB) or 4 (RGBA) channels on the last axis.

    Returns:
        The weighted sum ``-0.1 * blur_score + 0.7 * sharpness + 0.4 * brightness``.
    """
    img = np.asarray(img)

    # Reduce to a single channel; a plain RGB2GRAY conversion rejects grayscale
    # and RGBA inputs, so each layout is handled explicitly.
    if img.ndim == 2:
        gray = img
    elif img.shape[2] == 1:
        gray = np.ascontiguousarray(img[:, :, 0])
    elif img.shape[2] == 4:
        gray = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Blur detection (Laplacian variance with slight Gaussian smoothing for noise reduction)
    # NOTE(review): a higher Laplacian variance usually indicates a sharper image, yet it
    # carries a negative weight below. Confirm the intended sign.
    blur_score = cv2.Laplacian(cv2.GaussianBlur(gray, (3, 3), 0), cv2.CV_64F).var()

    # Brightness (mean pixel value)
    brightness = np.mean(gray)

    # Sobel edge magnitude. The filter is run on a float copy: on an integer image the
    # output keeps the integer dtype, so negative gradients and the squares below wrap.
    gray_float = gray.astype(np.float64)
    sobel_x = sobel(gray_float, axis=0, mode='constant')
    sobel_y = sobel(gray_float, axis=1, mode='constant')
    sharpness = np.mean(np.sqrt(sobel_x**2 + sobel_y**2))

    # Weighted quality score (adjust weights if needed)
    quality_score = (blur_score * (-0.1)) + (sharpness * 0.7) + (brightness * 0.4)

    return quality_score


In [None]:
# Process images
def process_image(img_url):
    """Download one image and return its quality score.

    Args:
        img_url: Location passed to ``io.imread``.

    Returns:
        The value of ``estimate_image_quality`` for the image, or ``None``
        when the image cannot be read or scored.
    """
    try:
        img = io.imread(img_url)
        return estimate_image_quality(img)
    except Exception:
        # Best-effort: a broken URL or undecodable image yields None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
        # so a long image run can still be interrupted.
        return None

In [None]:
# Function to extract image URLs safely
def extract_image_urls(image_dict):
    """Collect one URL per available resolution from an image record.

    Args:
        image_dict: Mapping that may contain the keys ``'hi_res'``, ``'large'``
            and ``'thumb'``. Each value is either a single URL or a list/array
            of URLs, of which only the first is used.

    Returns:
        A list of URLs in ``hi_res``, ``large``, ``thumb`` order, or ``None``
        when the input is not a dict or holds no usable URL. Empty lists and
        ``None`` entries are skipped.
    """
    if not isinstance(image_dict, dict):
        return None

    urls = []
    for key in ['hi_res', 'large', 'thumb']:
        if key not in image_dict:
            continue
        value = image_dict[key]
        if isinstance(value, (list, np.ndarray)):
            if len(value) == 0:
                # No image for this resolution; indexing [0] would raise IndexError
                continue
            value = value[0]
        if value is None:
            # A missing URL cannot be downloaded, so leave it out
            continue
        urls.append(value)
    return urls if urls else None

In [None]:
# `images_y` is the metadata-side `images` column after the merge; keep its
# URLs as one list per row (None when nothing usable is found).
df['image_urls'] = (
    df['images_y']
    .apply(extract_image_urls)
)

In [None]:
def compute_average_quality(urls):
    """Average the quality scores of the images behind ``urls``.

    Args:
        urls: A list of image URLs, or a falsy value (``None`` / empty list).

    Returns:
        The mean of the scores that ``process_image`` produced, or ``None``
        when there are no URLs or no image could be scored.
    """
    if not urls:
        return None

    # Score the images concurrently; images that failed (None) are left out
    with ThreadPoolExecutor() as pool:
        valid_scores = [score for score in pool.map(process_image, urls) if score is not None]

    if not valid_scores:
        return None
    return np.mean(valid_scores)

In [None]:
from tqdm import tqdm

# Register tqdm's pandas integration so `progress_apply` is available
tqdm.pandas(desc="Processing Images")

# Score every row's image list, showing a progress bar while it runs
image_url_lists = df['image_urls']
df['avg_quality_score'] = image_url_lists.progress_apply(compute_average_quality)


In [None]:
# Metadata Features
# Store the purchase flag as an integer column
df['verified_purchase'] = df['verified_purchase'].astype(int)
# 1 where an average image-quality score exists, 0 where it is missing
df['has_image'] = (~df['avg_quality_score'].isna()).astype(int)

In [None]:

# Install CLIP and image libraries
# NOTE(review): this install runs after `transformers` has already been imported at the
# top of the notebook, and the packages are unpinned. Consider moving it to the first
# cell and pinning versions so a fresh "Restart & Run All" is reproducible.
%pip install transformers torchvision pillow


In [None]:

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO

# The model and its preprocessor are loaded from the same checkpoint
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

def get_image_embedding(url):
    """Download an image and return its CLIP image-feature vector.

    Args:
        url: HTTP(S) location of the image.

    Returns:
        A NumPy array with the squeezed output of ``clip_model.get_image_features``.
        On any failure (network error, HTTP error status, undecodable image) the
        error is printed and a 512-element zero vector is returned instead.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole run
        response = requests.get(url, timeout=10)
        # Fail fast on 4xx/5xx instead of trying to decode an error page as an image
        response.raise_for_status()
        # `response.content` is the fully downloaded, content-decoded body
        image = Image.open(BytesIO(response.content)).convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            embedding = clip_model.get_image_features(**inputs)
        return embedding.squeeze().numpy()
    except Exception as e:
        print(f"Image error for {url}: {e}")
        return np.zeros(512)


In [None]:

from transformers import AutoTokenizer

# Module-level tokenizer that chunk_text uses to turn review text into token ids
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def chunk_text(text, chunk_size=128, stride=32):
    """Split a text into overlapping windows of token ids.

    Args:
        text: The string to tokenize with the module-level ``tokenizer``.
        chunk_size: Maximum number of token ids per chunk.
        stride: Number of token ids shared by consecutive chunks; each window
            starts ``chunk_size - stride`` ids after the previous one.

    Returns:
        A list of chunks, each a list of token ids. The last chunk may be shorter.

    Raises:
        ValueError: If ``stride`` is negative or not smaller than ``chunk_size``.
            Such values would skip tokens, never advance, or silently produce
            no chunks.
    """
    if not 0 <= stride < chunk_size:
        raise ValueError(
            f"stride must satisfy 0 <= stride < chunk_size (got stride={stride}, chunk_size={chunk_size})"
        )

    # NOTE(review): the tokenizer is called with its defaults, which presumably add the
    # special start/end tokens only around the whole text, so interior chunks carry
    # none. Confirm before feeding the chunks to a model.
    tokens = tokenizer(text, truncation=False)["input_ids"]

    step = chunk_size - stride
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(list(tokens[start:start + chunk_size]))
        if start + chunk_size >= len(tokens):
            # This window already reached the end; a further one would be a pure repeat
            break
    return chunks


In [None]:

from sklearn.preprocessing import MinMaxScaler

# Convert review time to datetime and days since
# The review frame built earlier in this notebook is accessed through `text` and
# `helpful_vote`; it has no `review_date`, `review_body` or `total_vote` columns.
# NOTE(review): per the dataset card the review time is stored in `timestamp` as a Unix
# epoch in milliseconds. Confirm the unit against a few rows.
df_reviews['review_date'] = pd.to_datetime(df_reviews['timestamp'], unit='ms', errors='coerce')
df_reviews['days_since_review'] = (datetime.now() - df_reviews['review_date']).dt.days

# Review length in whitespace-separated words
review_text = df_reviews['text'].astype(str)
df_reviews['review_length'] = review_text.apply(lambda x: len(x.split()))

# No total-vote count is available, so use the same scaling as df['helpfulness_score']:
# helpful votes divided by (largest count + 1).
df_reviews['helpfulness_score'] = df_reviews['helpful_vote'] / (df_reviews['helpful_vote'].max() + 1)

# Apply readability and sentiment score on the raw review text
df_reviews['flesch'] = review_text.apply(flesch_reading_ease)
sia = SentimentIntensityAnalyzer()
df_reviews['sentiment'] = review_text.apply(lambda x: sia.polarity_scores(x)['compound'])


In [None]:

# Optional: embed the first high-resolution image of each product when the metadata
# exposes an `imageURLHighRes` column; otherwise this step is skipped.
if 'imageURLHighRes' in df_meta.columns:
    # Cells may hold Python lists or NumPy arrays (as handled in extract_image_urls)
    df_meta['image_embedding'] = df_meta['imageURLHighRes'].apply(
        lambda x: get_image_embedding(x[0])
        if isinstance(x, (list, np.ndarray)) and len(x) > 0
        else np.zeros(512)
    )

# Merge metadata with reviews on `parent_asin`, the key already used to build `df`
df_merged = df_reviews.merge(df_meta, on='parent_asin', how='left')

# Chunk the review text (the review body lives in the `text` column)
df_merged['text_chunks'] = df_merged['text'].astype(str).apply(chunk_text)


In [None]:
# CLIP-based image-text similarity (alignment) score
def image_text_similarity(text, image):
    """Return the cosine similarity between CLIP's embeddings of a text and an image.

    Args:
        text: The text to compare (a single string).
        image: The image to compare, in a form ``clip_processor`` accepts.

    Returns:
        The cosine similarity of ``image_embeds`` and ``text_embeds`` as a Python float.
    """
    # truncation=True cuts over-long reviews to the model's maximum text length
    # instead of letting the forward pass fail on them.
    inputs = clip_processor(text=[text], images=image, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip gradient tracking, as get_image_embedding does
    with torch.no_grad():
        outputs = clip_model(**inputs)
    return torch.cosine_similarity(outputs.image_embeds, outputs.text_embeds).item()


In [None]:
# Reviewer-level features (computed only when the reviewer columns are available)
# NOTE(review): `df` is built earlier from columns such as `text` and `helpful_vote`;
# `reviewerID`, `reviewText` and `helpful` look like a different review schema.
# Confirm which columns should feed these features.
reviewer_columns = {"reviewerID", "reviewText", "helpful"}
if reviewer_columns.issubset(df.columns):
    reviewer_stats = df.groupby("reviewerID").agg({
        "reviewText": "count",
        # Each `helpful` entry is read as a (helpful, total) pair. The per-review ratios
        # are averaged (they were summed before), matching the column name.
        "helpful": lambda pairs: np.mean([h[0] / h[1] if h[1] != 0 else 0 for h in pairs])
    }).rename(columns={
        "reviewText": "review_count_by_user",
        "helpful": "avg_helpfulness_by_user"
    })

    df = df.merge(reviewer_stats, on="reviewerID", how="left")
else:
    missing_reviewer_columns = sorted(reviewer_columns - set(df.columns))
    print(f"Skipping reviewer-level features; missing columns: {missing_reviewer_columns}")


In [None]:
# Add target labels
def label_helpfulness(row):
    """Map a row's vote counts to a helpfulness class.

    Args:
        row: Mapping-like record with ``'total_votes'`` and ``'helpful_votes'``.

    Returns:
        ``"no_votes"`` when there are no votes at all; otherwise ``"helpful"``
        for a helpful share of at least 0.75, ``"somewhat_helpful"`` for at
        least 0.5, and ``"not_helpful"`` below that.
    """
    votes_cast = row['total_votes']
    if votes_cast == 0:
        return "no_votes"

    helpful_share = row['helpful_votes'] / votes_cast
    if helpful_share >= 0.75:
        return "helpful"
    if helpful_share >= 0.5:
        return "somewhat_helpful"
    return "not_helpful"

# Classification target: one label per row
# NOTE(review): this cell reads `helpful_votes` / `total_votes`, while `df` is built
# earlier from a `helpful_vote` column. Confirm these columns exist in `df`.
df["helpfulness_label"] = df.apply(label_helpfulness, axis=1)
# Regression target: helpful share, with zero totals replaced by 1 to avoid dividing by zero
df["helpfulness_regression"] = df["helpful_votes"].div(df["total_votes"].replace(0, 1))


In [None]:
# Filter bad data
# NOTE(review): `total_votes`, `reviewText` and `imageURLHighRes` are not among the
# columns created on `df` earlier in this notebook (`text`, `helpful_vote`,
# `image_urls`). Confirm the intended column names.
df = df[df['total_votes'] > 0]           # keep rows that received at least one vote
df = df[df['reviewText'].str.len() > 5]  # keep reviews longer than 5 characters

# Keep rows with a non-missing, non-empty image entry. The comparison is parenthesized
# because `&` binds tighter than `>`: without the parentheses the expression evaluates
# as `(notna & len) > 0`, a bitwise AND against the raw lengths.
has_image_urls = df['imageURLHighRes'].notna() & (df['imageURLHighRes'].str.len() > 0)
df = df[has_image_urls]
