In [7]:
import sys

# Upgrade pip safely inside Anaconda
!{sys.executable} -m pip install --upgrade pip

# Required packages
!{sys.executable} -m pip install selenium
!{sys.executable} -m pip install webdriver-manager
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install pandas




In [8]:
# =========================================================
# üßæ BeMinimalist Review Scraper ‚Äì Include Rating Column
#   Product: Alpha Arbutin 2%
# =========================================================

# !pip install selenium beautifulsoup4 pandas webdriver-manager

import os, time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.common.exceptions import WebDriverException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager

# ---------------- Setup Chrome ----------------
chrome_options = ChromeOptions()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")

# If Chrome binary exists locally, set it
possible_paths = [
    r"C:\Program Files\Google\Chrome\Application\chrome.exe",
    r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
]
for p in possible_paths:
    if os.path.exists(p):
        chrome_options.binary_location = p
        break

driver = webdriver.Chrome(
    service=ChromeService(ChromeDriverManager().install()),
    options=chrome_options
)

# ---------------- Target Product URL (Updated) ----------------
url = "https://beminimalist.co/collections/skin/products/alpha-arbutin-2"

print("üåê Loading BeMinimalist product page...")
driver.get(url)
time.sleep(10)

# Scroll to load reviews
driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);")
time.sleep(5)

# ---------------- Pagination Loop ----------------
page = 1
max_pages = 50
collected_html = ""

while page <= max_pages:
    print(f"üìÑ Scraping page {page}...")
    time.sleep(4)

    collected_html += driver.page_source

    try:
        next_link = driver.find_element(By.CSS_SELECTOR, "a[aria-label='Navigate to next page']")
        if next_link.get_attribute("aria-disabled") == "true":
            print("‚úÖ Reached last available page of reviews.")
            break

        driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
        time.sleep(2)
        ActionChains(driver).move_to_element(next_link).click().perform()
        page += 1
        time.sleep(5)

    except Exception:
        print("‚úÖ No further 'Next' pagination link found ‚Äî finished.")
        break

print(f"\nüõë Stopped at page {page} (limit reached or end of pages).")

# ---------------- Parse All Reviews ----------------
soup = BeautifulSoup(collected_html, "html.parser")
review_blocks = soup.select("div.yotpo-review")

reviews = []

for i, r in enumerate(review_blocks, 1):
    name = r.select_one(".yotpo-reviewer-name")
    date = r.select_one(".yotpo-review-date")
    rating_div = r.select_one(".yotpo-star-rating.yotpo-review-star-rating")
    title = r.select_one(".yotpo-review-title strong, .yotpo-review-title")
    text = r.select_one(".yotpo-read-more-text, .content-review, .yotpo-review-content")

    rating_text = rating_div.get("aria-label") if rating_div and rating_div.has_attr("aria-label") else ""
    rating = rating_text.split()[0] if rating_text else ""

    reviews.append({
        "S.No": i,
        "Name": name.get_text(strip=True) if name else "Anonymous",
        "Date": date.get_text(strip=True) if date else "",
        "Rating": rating,
        "Title": title.get_text(strip=True) if title else "",
        "Review": text.get_text(strip=True) if text else "",
    })

driver.quit()

# ---------------- Save to CSV ----------------
df = pd.DataFrame(reviews)
df.to_csv("minimalist_reviews_alpha_arbutin_2.csv", index=False, encoding="utf-8-sig")

print(f"\n‚úÖ Extracted {len(df)} total reviews.")
print("üíæ Saved as 'minimalist_reviews_alpha_arbutin_2.csv'")


üåê Loading BeMinimalist product page...
üìÑ Scraping page 1...
üìÑ Scraping page 2...
üìÑ Scraping page 3...
üìÑ Scraping page 4...
üìÑ Scraping page 5...
üìÑ Scraping page 6...
üìÑ Scraping page 7...
üìÑ Scraping page 8...
üìÑ Scraping page 9...
üìÑ Scraping page 10...
üìÑ Scraping page 11...
üìÑ Scraping page 12...
üìÑ Scraping page 13...
üìÑ Scraping page 14...
üìÑ Scraping page 15...
üìÑ Scraping page 16...
üìÑ Scraping page 17...
üìÑ Scraping page 18...
üìÑ Scraping page 19...
üìÑ Scraping page 20...
üìÑ Scraping page 21...
üìÑ Scraping page 22...
üìÑ Scraping page 23...
üìÑ Scraping page 24...
üìÑ Scraping page 25...
üìÑ Scraping page 26...
üìÑ Scraping page 27...
üìÑ Scraping page 28...
üìÑ Scraping page 29...
üìÑ Scraping page 30...
üìÑ Scraping page 31...
üìÑ Scraping page 32...
üìÑ Scraping page 33...
üìÑ Scraping page 34...
üìÑ Scraping page 35...
üìÑ Scraping page 36...
üìÑ Scraping page 37...
üìÑ Scraping page 38...
üìÑ Scraping pag

In [9]:
df = pd.DataFrame(reviews)
df.to_csv("minimalist_reviews_alpha_arbutin_2.csv", index=False, encoding="utf-8-sig")
print(f"\n‚úÖ Extracted {len(df)} total reviews.")
print("üíæ Saved as 'minimalist_reviews_alpha_arbutin_2.csv'")



‚úÖ Extracted 250 total reviews.
üíæ Saved as 'minimalist_reviews_alpha_arbutin_2.csv'


In [10]:
# ================================================
# üì¶ INSTALLS (run once per environment)
# ================================================
import sys
pip = sys.executable

# Safer installs inside Anaconda/Jupyter
!{pip} -m pip install --upgrade pip
!{pip} -m pip install pandas numpy scikit-learn nltk spacy gensim langdetect deep-translator emoji matplotlib
!{pip} -m spacy download en_core_web_sm

# Optional Hindi tokenizer (for better pre-translation cleanup; rule-based)
# (No Transformer models are used.)


Collecting spacy
  Downloading spacy-3.8.8-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ------------------- ---------------- 524.3/981.5 kB 635.4 kB/s eta 0:00:01
     ------------------- ---------------- 524.3/981.5 kB 635.4 kB/s eta 0:00:01
     ---------------------------- ------- 786.4/981.5 kB 588.0 kB/s eta 0:00:01
     --------------------------------------- 981.5/981.5 kB 602.6 kB/s  0:00:01
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel:

In [15]:
import sys, os
# Core libs (non-Transformer)
!{sys.executable} -m pip install --quiet langdetect deep-translator spacy gensim scikit-learn matplotlib

# NLTK is optional; we handle offline gracefully if corpora can't download
!{sys.executable} -m pip install --quiet nltk

# Try to fetch a small spaCy model (non-Transformer). If your network blocks this, we'll fall back.
try:
    import spacy
    spacy.load("en_core_web_sm")
except Exception:
    try:
        !{sys.executable} -m spacy download en_core_web_sm
    except Exception as e:
        print("spaCy model download failed; will use rule-based fallbacks.")


In [21]:
import sys, os
# Core libs (non-Transformer)
!{sys.executable} -m pip install --quiet langdetect deep-translator spacy gensim scikit-learn matplotlib

# NLTK is optional; we handle offline gracefully if corpora can't download
!{sys.executable} -m pip install --quiet nltk

# Try to fetch a small spaCy model (non-Transformer). If your network blocks this, we'll fall back.
try:
    import spacy
    spacy.load("en_core_web_sm")
except Exception:
    try:
        !{sys.executable} -m spacy download en_core_web_sm
    except Exception as e:
        print("spaCy model download failed; will use rule-based fallbacks.")


In [22]:
# STEP 1 ‚Äî load data
import os, pandas as pd

INPUT_CSV = "minimalist_reviews_alpha_arbutin_2.csv"  # change if needed
assert os.path.exists(INPUT_CSV), f"CSV not found at {INPUT_CSV}"

df = pd.read_csv(INPUT_CSV)
expected_cols = {"Review","Rating","Name","Date","Title"}
missing = expected_cols - set(df.columns)
print("Columns:", list(df.columns))
print("Missing expected:", missing)

# keep the columns you need
df = df.rename(columns={c:c.strip() for c in df.columns})
df = df[[c for c in ["S.No","Name","Date","Rating","Title","Review"] if c in df.columns]]
df = df.dropna(subset=["Review"]).reset_index(drop=True)
print("Rows:", len(df))
df.head(3)


Columns: ['S.No', 'Name', 'Date', 'Rating', 'Title', 'Review']
Missing expected: set()
Rows: 250


Unnamed: 0,S.No,Name,Date,Rating,Title,Review
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products


In [25]:
import pandas as pd
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 42

df = pd.read_csv("minimalist_reviews_alpha_arbutin_2.csv")

# Ensure column name is correct
assert "Review" in df.columns, "CSV must contain a column named 'Review'"

def detect_lang_safe(t):
    try:
        return detect(str(t)) if str(t).strip() else "unknown"
    except:
        return "unknown"

df["lang"] = df["Review"].astype(str).map(detect_lang_safe)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en


In [26]:
from deep_translator import GoogleTranslator

def translate_to_en(text, lang):
    if lang == "en" or lang == "unknown":
        return text
    try:
        return GoogleTranslator(source=lang, target="en").translate(text)
    except:
        return text

df["text_en"] = df.apply(lambda r: translate_to_en(r["Review"], r["lang"]), axis=1)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...


In [27]:
import re

def clean_text(s):
    s = str(s).lower()
    s = re.sub(r"http\S+|www.\S+", " ", s)
    s = re.sub(r"[^a-z0-9\s.,!?]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["clean"] = df["text_en"].map(clean_text)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...


In [28]:
def tokenize(s):
    return re.findall(r"[a-z]+", s)

df["tokens"] = df["clean"].map(tokenize)
df.head()



Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]"
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]"
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]"
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m..."
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go..."


In [29]:
STOPWORDS = set("""
a an the and or but if while is are was were be been being am to for from in on at by of with as into through during
about over under again further then once here there when where why how all any both each few more most other some such
no nor not only own same so than too very can will just should now 
i me my we you he she it they this that these those them her his him
""".split())

df["tokens_nostop"] = df["tokens"].map(lambda toks: [t for t in toks if t not in STOPWORDS])
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens,tokens_nostop
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]","[sach, good, serums]"
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]","[s, really, good]"
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]","[great, products]"
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m...","[see, difference, skin, skin, used, dull, has,..."
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go...","[say, product, good, good, enough, individually]"


In [30]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

df["stemmed"] = df["tokens_nostop"].map(lambda toks: [ps.stem(t) for t in toks])
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens,tokens_nostop,stemmed
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]","[sach, good, serums]","[sach, good, serum]"
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]","[s, really, good]","[s, realli, good]"
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]","[great, products]","[great, product]"
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m...","[see, difference, skin, skin, used, dull, has,...","[see, differ, skin, skin, use, dull, ha, chang..."
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go...","[say, product, good, good, enough, individually]","[say, product, good, good, enough, individu]"


In [31]:
df.to_csv("processed_reviews.csv", index=False)
print("‚úÖ Saved: processed_reviews.csv")


‚úÖ Saved: processed_reviews.csv


In [32]:
import pandas as pd

df = pd.read_csv("processed_reviews.csv")
print(df.head())


   S.No            Name                    Date  Rating  \
0     1   Anamika M. üáÆüá≥  Published date02/10/25       5   
1     2    Mahima K. üáÆüá≥  Published date30/09/25       5   
2     3     Nancy R. üáÆüá≥  Published date30/09/25       5   
3     4    Meghna M. üáÆüá≥  Published date13/07/25       5   
4     5  Vaishali k. üáÆüá≥  Published date26/08/25       3   

                     Title                                             Review  \
0  Sach a good serums. . .                            Sach a good serums. . .   
1         It's really good                                   It's really good   
2           Great products                                     Great products   
3          Its really good  I can see the difference in my skin. My skin u...   
4               My opinion  I can only say that the product is good but no...   

  lang                                            text_en  \
0   en                            Sach a good serums. . .   
1   en

In [33]:
import re

# Try spaCy
try:
    import spacy
    try:
        nlp = spacy.load("en_core_web_sm")  # non-Transformer
        SPACY_OK = True
    except:
        nlp = spacy.blank("en")             # tokenizer only
        SPACY_OK = False
except:
    nlp = None
    SPACY_OK = False

def pos_tag_safe(text):
    if SPACY_OK:
        doc = nlp(str(text))
        return [(t.text, t.pos_) for t in doc]
    else:
        words = re.findall(r"[a-zA-Z]+", str(text))
        return [(w, "NOUN" if w[0].isupper() else "X") for w in words]  # rule-based fallback

df["pos"] = df["clean"].map(pos_tag_safe)
df[["clean","pos"]].head()


Unnamed: 0,clean,pos
0,sach a good serums. . .,"[(sach, VERB), (a, DET), (good, ADJ), (serums,..."
1,it s really good,"[(it, PRON), (s, VERB), (really, ADV), (good, ..."
2,great products,"[(great, ADJ), (products, NOUN)]"
3,i can see the difference in my skin. my skin u...,"[(i, PRON), (can, AUX), (see, VERB), (the, DET..."
4,i can only say that the product is good but no...,"[(i, PRON), (can, AUX), (only, ADV), (say, VER..."


In [34]:
import re

def ner_safe(text):
    if SPACY_OK:
        doc = nlp(str(text))
        return [(ent.text, ent.label_) for ent in doc.ents]
    else:
        pattern = r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"
        return [(m.group(1), "ENTITY") for m in re.finditer(pattern,str(text))]

df["entities"] = df["clean"].map(ner_safe)
df[["clean","entities"]].head()


Unnamed: 0,clean,entities
0,sach a good serums. . .,[]
1,it s really good,[]
2,great products,[]
3,i can see the difference in my skin. my skin u...,"[(3, CARDINAL)]"
4,i can only say that the product is good but no...,[]


In [37]:
# FIX ‚Äî Replace NaN with a safe empty string
df["clean"] = df["clean"].fillna("")

# Also drop rows where clean text is empty (optional but recommended)
df = df[df["clean"].str.strip() != ""].reset_index(drop=True)

print("Rows after removing empty:", len(df))
df["clean"].head()


Rows after removing empty: 249


0                              sach a good serums. . .
1                                     it s really good
2                                       great products
3    i can see the difference in my skin. my skin u...
4    i can only say that the product is good but no...
Name: clean, dtype: object

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(min_df=2, max_df=0.9)
X_bow = bow.fit_transform(df["clean"])

print("BOW Shape:", X_bow.shape)


BOW Shape: (249, 332)


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(df["clean"])

print("TF-IDF Shape:", X_tfidf.shape)



TF-IDF Shape: (249, 672)


In [40]:
import os, re, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np


In [41]:
df = pd.read_csv("minimalist_reviews_alpha_arbutin_2.csv")

assert "Review" in df.columns, "CSV must contain a 'Review' column"
df = df.dropna(subset=["Review"]).reset_index(drop=True)

print("Loaded rows:", len(df))
df.head()


Loaded rows: 250


Unnamed: 0,S.No,Name,Date,Rating,Title,Review
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...


In [42]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 42

def detect_lang_safe(text):
    try:
        return detect(str(text)) if str(text).strip() else "unknown"
    except:
        return "unknown"

df["lang"] = df["Review"].map(detect_lang_safe)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en


In [43]:
from deep_translator import GoogleTranslator

def translate_to_en(text, lang):
    if lang in ("en","unknown"):
        return text
    try:
        return GoogleTranslator(source=lang, target="en").translate(text)
    except:
        return text   # keep original if blocked

df["text_en"] = df.apply(lambda r: translate_to_en(r["Review"], r["lang"]), axis=1)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...


In [44]:
def clean_text(s):
    s = str(s).lower()
    s = re.sub(r"http\S+|www\.\S+"," ", s)
    s = re.sub(r"[^a-z0-9\s.,!?]", " ", s)
    s = re.sub(r"\s+"," ", s).strip()
    return s

df["clean"] = df["text_en"].map(clean_text)

# FIX: Remove NaN and empty rows
df["clean"] = df["clean"].fillna("")
df = df[df["clean"].str.strip() != ""].reset_index(drop=True)

print("Rows after cleaning:", len(df))
df.head()


Rows after cleaning: 249


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...


In [45]:
def tokenize_regex(s):
    return re.findall(r"[a-z]+", str(s))

df["tokens"] = df["clean"].map(tokenize_regex)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]"
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]"
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]"
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m..."
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go..."


In [46]:
STOPWORDS = set("""
a an the and or but if while is are was were be been being am to for from in on at by of with as into through during
about over under again further then once here there when where why how all any both each few more most other some such
no nor not only own same so than too very can will just should now 
i me my we you he she it they this that these those them her his him
""".split())

df["tokens_nostop"] = df["tokens"].map(lambda toks: [t for t in toks if t not in STOPWORDS])
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens,tokens_nostop
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]","[sach, good, serums]"
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]","[s, really, good]"
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]","[great, products]"
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m...","[see, difference, skin, skin, used, dull, has,..."
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go...","[say, product, good, good, enough, individually]"


In [47]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

df["stemmed"] = df["tokens_nostop"].map(lambda toks: [ps.stem(t) for t in toks])
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens,tokens_nostop,stemmed
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]","[sach, good, serums]","[sach, good, serum]"
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]","[s, really, good]","[s, realli, good]"
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]","[great, products]","[great, product]"
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m...","[see, difference, skin, skin, used, dull, has,...","[see, differ, skin, skin, use, dull, ha, chang..."
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go...","[say, product, good, good, enough, individually]","[say, product, good, good, enough, individu]"


In [48]:
try:
    import spacy
    try:
        nlp = spacy.load("en_core_web_sm")
        SPACY_OK = True
    except:
        nlp = spacy.blank("en")  # tokenizer only
        SPACY_OK = False
except:
    nlp = None
    SPACY_OK = False

def pos_safe(text):
    if SPACY_OK:
        return [(t.text, t.pos_) for t in nlp(text)]
    else:
        # fallback rule-based
        words = re.findall(r"[A-Za-z]+", text)
        return [(w, "NOUN" if w[0].isupper() else "X") for w in words]

df["pos"] = df["clean"].map(pos_safe)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens,tokens_nostop,stemmed,pos
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]","[sach, good, serums]","[sach, good, serum]","[(sach, VERB), (a, DET), (good, ADJ), (serums,..."
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]","[s, really, good]","[s, realli, good]","[(it, PRON), (s, VERB), (really, ADV), (good, ..."
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]","[great, products]","[great, product]","[(great, ADJ), (products, NOUN)]"
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m...","[see, difference, skin, skin, used, dull, has,...","[see, differ, skin, skin, use, dull, ha, chang...","[(i, PRON), (can, AUX), (see, VERB), (the, DET..."
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go...","[say, product, good, good, enough, individually]","[say, product, good, good, enough, individu]","[(i, PRON), (can, AUX), (only, ADV), (say, VER..."


In [49]:
def ner_safe(text):
    if SPACY_OK:
        return [(e.text, e.label_) for e in nlp(text).ents]
    else:
        pattern = r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"
        return [(m.group(1),"ENTITY") for m in re.finditer(pattern,text)]

df["entities"] = df["clean"].map(ner_safe)
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens,tokens_nostop,stemmed,pos,entities
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]","[sach, good, serums]","[sach, good, serum]","[(sach, VERB), (a, DET), (good, ADJ), (serums,...",[]
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]","[s, really, good]","[s, realli, good]","[(it, PRON), (s, VERB), (really, ADV), (good, ...",[]
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]","[great, products]","[great, product]","[(great, ADJ), (products, NOUN)]",[]
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m...","[see, difference, skin, skin, used, dull, has,...","[see, differ, skin, skin, use, dull, ha, chang...","[(i, PRON), (can, AUX), (see, VERB), (the, DET...","[(3, CARDINAL)]"
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go...","[say, product, good, good, enough, individually]","[say, product, good, good, enough, individu]","[(i, PRON), (can, AUX), (only, ADV), (say, VER...",[]


In [50]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(min_df=2, max_df=0.9)
X_bow = bow.fit_transform(df["clean"])

print("BOW shape:", X_bow.shape)


BOW shape: (249, 332)


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(df["clean"])

print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (249, 672)


In [52]:
from gensim.models import Word2Vec

sentences = df["stemmed"].tolist()

w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=2, epochs=20)

print("similar words to 'skin':")
try:
    print(w2v.wv.most_similar("skin"))
except:
    print("not found")


similar words to 'skin':
[('serum', 0.9994896650314331), ('use', 0.9993718266487122), ('result', 0.9992791414260864), ('acn', 0.9992438554763794), ('after', 0.9992356300354004), ('product', 0.9991717338562012), ('see', 0.99917072057724), ('bottl', 0.9991689920425415), ('packag', 0.9991459846496582), ('work', 0.9991435408592224)]


In [53]:
from sklearn.decomposition import TruncatedSVD

lsa = TruncatedSVD(n_components=5, random_state=42)
lsa_matrix = lsa.fit_transform(X_tfidf)

terms = tfidf.get_feature_names_out()

for i, comp in enumerate(lsa.components_):
    idx = comp.argsort()[::-1][:10]
    print(f"TOPIC {i+1}:", [terms[j] for j in idx])


TOPIC 1: ['good', 'product', 'good product', 'it', 'the', 'and', 'my', 'this', 'for', 'skin']
TOPIC 2: ['good', 'good product', 'very good', 'product', 'good one', 'it good', 'very', 'good for', 'feel', 'good results']
TOPIC 3: ['product', 'good product', 'the', 'this product', 'the product', 'this', 'nice product', 'nice', 'love', 'product is']
TOPIC 4: ['the', 'very', 'the product', 'very good', 'effective', 'product is', 'really', 'is', 'love', 'it really']
TOPIC 5: ['skin', 'and', 'very', 'my skin', 'my', 'good for', 'good product', 'very good', 'glowing', 'skin and']


In [54]:
!pip install afinn --quiet

from afinn import Afinn
af = Afinn()

df["sentiment"] = df["clean"].map(lambda s: af.score(str(s)))
df.head()


Unnamed: 0,S.No,Name,Date,Rating,Title,Review,lang,text_en,clean,tokens,tokens_nostop,stemmed,pos,entities,sentiment
0,1,Anamika M. üáÆüá≥,Published date02/10/25,5,Sach a good serums. . .,Sach a good serums. . .,en,Sach a good serums. . .,sach a good serums. . .,"[sach, a, good, serums]","[sach, good, serums]","[sach, good, serum]","[(sach, VERB), (a, DET), (good, ADJ), (serums,...",[],3.0
1,2,Mahima K. üáÆüá≥,Published date30/09/25,5,It's really good,It's really good,en,It's really good,it s really good,"[it, s, really, good]","[s, really, good]","[s, realli, good]","[(it, PRON), (s, VERB), (really, ADV), (good, ...",[],3.0
2,3,Nancy R. üáÆüá≥,Published date30/09/25,5,Great products,Great products,en,Great products,great products,"[great, products]","[great, products]","[great, product]","[(great, ADJ), (products, NOUN)]",[],3.0
3,4,Meghna M. üáÆüá≥,Published date13/07/25,5,Its really good,I can see the difference in my skin. My skin u...,en,I can see the difference in my skin. My skin u...,i can see the difference in my skin. my skin u...,"[i, can, see, the, difference, in, my, skin, m...","[see, difference, skin, skin, used, dull, has,...","[see, differ, skin, skin, use, dull, ha, chang...","[(i, PRON), (can, AUX), (see, VERB), (the, DET...","[(3, CARDINAL)]",1.0
4,5,Vaishali k. üáÆüá≥,Published date26/08/25,3,My opinion,I can only say that the product is good but no...,en,I can only say that the product is good but no...,i can only say that the product is good but no...,"[i, can, only, say, that, the, product, is, go...","[say, product, good, good, enough, individually]","[say, product, good, good, enough, individu]","[(i, PRON), (can, AUX), (only, ADV), (say, VER...",[],1.0


In [55]:
from sklearn.cluster import AgglomerativeClustering

X_dense = X_tfidf.toarray()

n_clusters = min(5, max(2, len(df)//100))

clust = AgglomerativeClustering(
    n_clusters=n_clusters,
    metric="cosine",
    linkage="average"
)

df["cluster"] = clust.fit_predict(X_dense)
df["cluster"].value_counts()


ValueError: Cosine affinity cannot be used when X contains zero vectors

In [None]:
from sklearn.preprocessing import normalize
import numpy as np

V = normalize(X_dense)

representatives = []

for c in sorted(df["cluster"].unique()):
    idx = df.index[df["cluster"] == c].tolist()
    centroid = V[idx].mean(axis=0)
    sims = V[idx] @ centroid
    rep_index = idx[np.argmax(sims)]
    representatives.append((c, df.loc[rep_index,"clean"]))

representatives


In [None]:
df.to_csv("final_processed_reviews.csv", index=False)
print("‚úÖ final_processed_reviews.csv saved successfully!")
