In [1]:
import torch
print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

GPU available: True
GPU name: NVIDIA GeForce RTX 4070


## Amazon Reviews Data

In [12]:
import pandas as pd
file_path = 'Data/amazon_reviews.json'  
amazon_df = pd.read_json(file_path, lines=True, compression=None)

In [13]:
amazon_df.shape

(8201231, 12)

In [14]:
amazon_df.columns

Index(['overall', 'vote', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'image',
       'style'],
      dtype='object')

In [18]:
amazon_df.isna().sum()

overall                 0
vote              7239598
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName          502
reviewText           7117
summary              2531
unixReviewTime          0
image             7999253
style             6211781
dtype: int64

In [21]:
# Word count (good proxy for token count)
amazon_df['review_len_words'] = amazon_df['reviewText'].astype(str).apply(lambda x: len(x.split()))

# Character count
amazon_df['review_len_chars'] = amazon_df['reviewText'].astype(str).apply(len)

# Stats
amazon_df[['review_len_words', 'review_len_chars']].describe()

Unnamed: 0,review_len_words,review_len_chars
count,8201231.0,8201231.0
mean,36.50901,193.5326
std,57.10543,309.6152
min,0.0,1.0
25%,8.0,41.0
50%,21.0,109.0
75%,43.0,223.0
max,5905.0,32563.0


In [25]:
import re

def clean_text(text):
    text = str(text).strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<.*?>', '', text)  # if HTML present
    return text

In [27]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
model.max_seq_length = 256  # safe default for 75–90% of your reviews


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
#clean_reviews = [clean_text(t) for t in reviews if isinstance(t, str) and len(t.strip()) > 10]
#embeddings = model.encode(clean_reviews, batch_size=128, show_progress_bar=True)

In [29]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import re

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
model.max_seq_length = 256

# Clean function
def clean_text(text):
    text = str(text).strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<.*?>', '', text)
    return text

# Clean the review text
amazon_df['reviewText_clean'] = amazon_df['reviewText'].apply(lambda x: clean_text(x) if pd.notnull(x) else None)

# Initialize embedding list
embeddings = []

# Generate embeddings in batches
BATCH_SIZE = 128
texts = amazon_df['reviewText_clean'].tolist()

for i in tqdm(range(0, len(texts), BATCH_SIZE)):
    batch = texts[i:i+BATCH_SIZE]
    
    # Separate valid (non-null) and null
    valid_indices = [j for j, t in enumerate(batch) if t]
    valid_texts = [batch[j] for j in valid_indices]
    
    # Generate embeddings for non-null
    if valid_texts:
        batch_embeddings = model.encode(valid_texts)
    else:
        batch_embeddings = []
    
    # Insert into full list with placeholder None for nulls
    batch_result = [None] * len(batch)
    for j, idx in enumerate(valid_indices):
        batch_result[idx] = batch_embeddings[j]
    
    embeddings.extend(batch_result)


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|████████████████████████████████████████████████████████████████████████████| 64073/64073 [47:31<00:00, 22.47it/s]


In [30]:
amazon_df['embedding'] = embeddings

# Save as Parquet (efficient binary format)
amazon_df.to_parquet("amazon_reviews_with_embeddings.parquet", index=True)

In [33]:
amazon_df['embedding'].shape

(8201231,)

In [37]:
embeddings[0]

array([-1.65801088e-03,  1.09449439e-02,  2.27515437e-02, -7.84764439e-02,
       -1.71768758e-02, -2.43904255e-02,  3.96318696e-02,  3.94715294e-02,
        8.01011994e-02,  2.62124576e-02, -3.35558429e-02,  5.36860414e-02,
        2.68798973e-02,  1.24948928e-02, -1.79120973e-02,  1.41302152e-02,
        9.96875092e-02, -1.13082975e-01, -1.67907160e-02,  2.17386484e-02,
        2.89422162e-02, -9.44497064e-02,  2.92110862e-03, -7.31409639e-02,
       -4.73110657e-03,  3.81219126e-02, -2.46449895e-02,  3.96338813e-02,
       -1.10096419e-02, -9.60111171e-02, -4.38720360e-03,  8.51178765e-02,
       -3.05877271e-04,  1.90579649e-02, -4.34743538e-02, -2.97613237e-02,
       -2.21696496e-02,  5.40518016e-03, -3.09613328e-02, -4.86020148e-02,
       -3.72039154e-02,  9.26720276e-02,  3.07666417e-02,  8.48193988e-02,
        1.22561073e-02, -3.92790809e-02, -5.93561344e-02, -2.35227831e-02,
        1.46206506e-02, -2.80780308e-02, -3.84305194e-02,  1.67447962e-02,
        3.05327345e-02,  