In [2]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
   ---------------------------------------- 0.0/608.4 kB ? eta -:--:--
   ----------------- ---------------------- 262.1/608.4 kB ? eta -:--:--
   ---------------------------------------- 608.4/608.4 kB 1.5 MB/s  0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.15.0


In [4]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/624.3 kB ? eta -:--:--
   ---------------------------------------- 624.3/624.3 kB 1.4 MB/s  0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.19.0


In [6]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [8]:
!pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.4-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Downloading wordcloud-1.9.4-cp311-cp311-win_amd64.whl (299 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4


In [10]:
! pip install gradio

Collecting gradio
  Downloading gradio-5.47.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading Brotli-1.1.0-cp311-cp311-win_amd64.whl.metadata (5.6 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.117.1-py3-none-any.whl.metadata (28 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.1-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.13.2 (from gradio)
  Downloading gradio_client-1.13.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting huggingface-hub<1.0,>=0.33.5 (from gradio)
  Downloading huggingface_hub-0.35.1-py3-none-any.whl.metadata (14 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.11.3-cp311-cp311-win_amd64.whl.metadata (43 kB)
Collecting pydantic<2.12,>=2.0 (from gradio)
  D

In [1]:
from __future__ import annotations
import os, re, json, math, argparse, random, string, warnings
from pathlib import Path
from typing import List, Tuple, Optional, Dict

import numpy as np
import pandas as pd
from tqdm import tqdm

# Text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import emoji as emoji_lib

# Sentiment toolkits
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Viz
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# UI
import gradio as gr

warnings.filterwarnings("ignore")

# -------------- Utils --------------
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
HTML_TAG_RE = re.compile(r"<[^>]+>")
NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
PUNCT_TABLE = str.maketrans('', '', string.punctuation)

lemmatizer = WordNetLemmatizer()


def ensure_dir(p: str | Path) -> Path:
    p = Path(p)
    p.mkdir(parents=True, exist_ok=True)
    return p


def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    t = text
    # HTML unescape & strip tags
    t = BeautifulSoup(t, "html5lib").get_text(" ")
    # URLs
    t = URL_RE.sub(" ", t)
    # emoji -> text alias (optional)
    t = emoji_lib.demojize(t, delimiters=(" ", " "))
    # lower
    t = t.lower()
    # remove punctuation
    t = t.translate(PUNCT_TABLE)
    # remove non-ascii
    t = NON_ASCII_RE.sub(" ", t)
    # collapse whitespace
    t = re.sub(r"\s+", " ", t).strip()
    return t


def tokenize_and_lemmatize(t: str, stop_words: set[str]) -> List[str]:
    toks = [lemmatizer.lemmatize(w) for w in t.split() if w not in stop_words and len(w) > 2]
    return toks


def rule_sentiment_score(text: str, use_vader=True, use_textblob=True) -> float:
    """Return compound polarity in [-1,1]. Combines VADER + TextBlob by averaging when both used."""
    scores = []
    if use_vader:
        vs = SentimentIntensityAnalyzer()
        scores.append(vs.polarity_scores(text).get('compound', 0.0))
    if use_textblob:
        try:
            scores.append(TextBlob(text).sentiment.polarity)
        except Exception:
            pass
    if not scores:
        return 0.0
    return float(np.mean(scores))


def score_to_label(score: float, pos=0.2, neg=-0.2) -> str:
    if score >= pos:
        return "positive"
    if score <= neg:
        return "negative"
    return "neutral"


# -------------- Data Loading --------------

def load_dataset(path: str | Path, text_col: str, rating_col: Optional[str] = None,
                 id_col: Optional[str] = None, category_col: Optional[str] = None,
                 label_col: Optional[str] = None) -> pd.DataFrame:
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Data not found: {path}")
    if path.suffix.lower() == ".json":
        df = pd.read_json(path, lines=True)
    else:
        df = pd.read_csv(path)
    # Normalize column names for safer matching
    df.columns = [c.strip() for c in df.columns]

    # Minimal schema
    req = [text_col]
    for c in req:
        if c not in df.columns:
            raise ValueError(f"Column '{c}' not found. Available: {df.columns.tolist()}")

    keep = {text_col: 'reviewText'}
    if rating_col and rating_col in df.columns:
        keep[rating_col] = 'rating'
    if id_col and id_col in df.columns:
        keep[id_col] = 'productID'
    if category_col and category_col in df.columns:
        keep[category_col] = 'productCategory'
    if label_col and label_col in df.columns:
        keep[label_col] = 'label'

    out = df[list(keep.keys())].rename(columns=keep)
    return out


# -------------- Training --------------

def build_ml_pipeline(max_features=50000, ngram_range=(1,2), C=4.0) -> Pipeline:
    return Pipeline([
        ("tfidf", TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)),
        ("clf", LogisticRegression(C=C, max_iter=200, n_jobs=-1)),
    ])


def prepare_labels_for_ml(df: pd.DataFrame, rating_as_label: bool = True) -> pd.DataFrame:
    out = df.copy()
    if 'label' in out.columns and not rating_as_label:
        # Expect string labels like 'positive'/'negative'
        return out
    # Construct label from rating if present else fallback via rule-based score
    if 'rating' in out.columns and pd.api.types.is_numeric_dtype(out['rating']):
        out['label'] = out['rating'].apply(lambda r: 'positive' if r >= 4 else ('negative' if r <= 2 else 'neutral'))
    else:
        out['label'] = out['reviewText'].fillna("").apply(lambda t: score_to_label(rule_sentiment_score(t)))
    return out


def train_ml(df: pd.DataFrame, outdir: Path, test_size=0.2, seed=42) -> Dict:
    df = df.dropna(subset=['reviewText']).copy()
    df = prepare_labels_for_ml(df)
    # Drop neutral to make a stricter binary classifier if too few neutrals
    label_counts = df['label'].value_counts()
    if label_counts.get('neutral', 0) < 50:
        df = df[df['label'] != 'neutral']

    X_train, X_valid, y_train, y_valid = train_test_split(
        df['reviewText'], df['label'], test_size=test_size, random_state=seed, stratify=df['label'])

    pipe = build_ml_pipeline()
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_valid)
    y_proba = None
    try:
        y_proba = pipe.predict_proba(X_valid)
    except Exception:
        pass

    report = classification_report(y_valid, y_pred, output_dict=True)
    cm = confusion_matrix(y_valid, y_pred, labels=sorted(df['label'].unique()))

    # Save artifacts
    ensure_dir(outdir)
    import joblib
    joblib.dump(pipe, outdir / 'model_tfidf_lr.joblib')

    # Save evals
    pd.DataFrame(report).to_csv(outdir / 'eval_classification_report.csv')
    pd.DataFrame(cm, index=sorted(df['label'].unique()), columns=sorted(df['label'].unique())).to_csv(outdir / 'eval_confusion_matrix.csv')

    return {
        'pipe': pipe,
        'report': report,
        'cm': cm,
        'labels': sorted(df['label'].unique()),
        'X_valid': X_valid.reset_index(drop=True),
        'y_valid': y_valid.reset_index(drop=True),
        'y_pred': pd.Series(y_pred),
        'y_proba': y_proba
    }


# -------------- EDA & Visuals --------------

def make_distribution_plot(df: pd.DataFrame, outdir: Path):
    plt.figure()
    ax = sns.countplot(x='label', data=df, order=['negative','neutral','positive'])
    ax.set_title('Sentiment distribution')
    for c in ax.containers:
        ax.bar_label(c)
    plt.tight_layout()
    plt.savefig(outdir / 'viz_sentiment_distribution.png', dpi=160)
    plt.close()


def make_rating_vs_sentiment_heatmap(df: pd.DataFrame, outdir: Path):
    if 'rating' not in df.columns:
        return
    tmp = df.copy()
    tmp['rating'] = pd.to_numeric(tmp['rating'], errors='coerce')
    tmp = tmp.dropna(subset=['rating','label'])
    pivot = pd.crosstab(tmp['rating'], tmp['label'])
    if pivot.empty:
        return
    plt.figure()
    sns.heatmap(pivot, annot=True, fmt='d')
    plt.title('Rating × Sentiment')
    plt.tight_layout()
    plt.savefig(outdir / 'viz_rating_x_sentiment_heatmap.png', dpi=160)
    plt.close()


def make_wordclouds(df: pd.DataFrame, outdir: Path):
    texts = {
        'positive': ' '.join(df[df['label']=='positive']['reviewText'].astype(str).tolist())[:3_000_000],
        'negative': ' '.join(df[df['label']=='negative']['reviewText'].astype(str).tolist())[:3_000_000],
    }
    for lab, txt in texts.items():
        if not txt:
            continue
        wc = WordCloud(width=1200, height=600, background_color='white').generate(txt)
        wc.to_file(str(outdir / f'viz_wordcloud_{lab}.png'))


def top_keywords(df: pd.DataFrame, label: str, k: int = 20) -> List[Tuple[str,int]]:
    stop = set(stopwords.words('english'))
    words = []
    for t in df[df['label']==label]['reviewText'].dropna():
        t = clean_text(t)
        words.extend([w for w in t.split() if w not in stop and len(w)>2 and not w.isdigit()])
    vc = pd.Series(words).value_counts().head(k)
    return list(vc.items())


def insight_products_with_hidden_issues(df: pd.DataFrame, rating_threshold=4) -> pd.DataFrame:
    # High average rating but with meaningful fraction of negative labels
    if 'productID' not in df.columns:
        return pd.DataFrame()
    agg = df.groupby('productID').agg(
        avg_rating=('rating','mean'),
        n_reviews=('reviewText','count'),
        neg_frac=('label', lambda s: (s=='negative').mean())
    ).reset_index()
    out = agg[(agg['avg_rating']>=rating_threshold) & (agg['neg_frac']>=0.15) & (agg['n_reviews']>=10)]
    return out.sort_values(['neg_frac','n_reviews'], ascending=[False,False]).head(20)


# -------------- Prediction helpers --------------

def predict_rule(df: pd.DataFrame) -> pd.DataFrame:
    vs = SentimentIntensityAnalyzer()
    rows = []
    for t in tqdm(df['reviewText'].fillna(""), desc='Rule sentiment'):
        comp = vs.polarity_scores(t)['compound']
        lab = score_to_label(comp)
        rows.append((comp, lab))
    out = df.copy()
    out['score'] = [r[0] for r in rows]
    out['label'] = [r[1] for r in rows]
    return out


def predict_ml(pipe: Pipeline, texts: List[str]) -> Tuple[List[str], Optional[np.ndarray]]:
    try:
        proba = pipe.predict_proba(texts)
        idx = np.argmax(proba, axis=1)
        labels = pipe.classes_[idx]
        return labels.tolist(), proba
    except Exception:
        labels = pipe.predict(texts)
        return labels.tolist(), None


# -------------- Gradio UI --------------

def launch_app(outdir: Path):
    import joblib
    pipe = None
    model_path = outdir / 'model_tfidf_lr.joblib'
    if model_path.exists():
        pipe = joblib.load(model_path)

    def _predict_one(text):
        if not text or not text.strip():
            return "", 0.0
        if pipe is None:
            comp = rule_sentiment_score(text)
            return score_to_label(comp), float(comp)
        else:
            lab, proba = predict_ml(pipe, [text])
            conf = 0.0
            if proba is not None:
                conf = float(np.max(proba))
            return lab[0], conf

    def _predict_batch(texts):
        items = [t.strip() for t in texts.split('\n') if t.strip()]
        if not items:
            return ""
        out = []
        for t in items:
            lab, score = _predict_one(t)
            out.append({"text": t, "sentiment": lab, "confidence": score})
        return pd.DataFrame(out)

    with gr.Blocks(title="ReviewSentiment Analyzer") as demo:
        gr.Markdown("# 🛒 ReviewSentiment Analyzer\nEnter a review to classify sentiment.")
        with gr.Tab("Single"):
            inp = gr.Textbox(label="Review text", lines=4)
            btn = gr.Button("Predict")
            out_lab = gr.Label(num_top_classes=3, label="Sentiment (top)"
                               ) if pipe is not None else gr.Textbox(label="Sentiment")
            out_conf = gr.Number(label="Confidence / Score")
            btn.click(_predict_one, inputs=inp, outputs=[out_lab, out_conf])
        with gr.Tab("Batch"):
            tb = gr.Textbox(label="One review per line", lines=8)
            btnb = gr.Button("Predict batch")
            outdf = gr.Dataframe(interactive=False)
            btnb.click(_predict_batch, inputs=tb, outputs=outdf)
        gr.Markdown("Model: " + ("TF‑IDF + LogisticRegression" if pipe is not None else "Rule‑based (VADER/TextBlob)"))
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)


# -------------- Main CLI --------------

def init_nltk():
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)


def do_eda(df: pd.DataFrame, outdir: Path):
    make_distribution_plot(df, outdir)
    make_rating_vs_sentiment_heatmap(df, outdir)
    make_wordclouds(df, outdir)

    # Keyword tables
    pos_kw = top_keywords(df, 'positive', 25)
    neg_kw = top_keywords(df, 'negative', 25)
    pd.DataFrame(pos_kw, columns=['keyword','count']).to_csv(outdir / 'top_keywords_positive.csv', index=False)
    pd.DataFrame(neg_kw, columns=['keyword','count']).to_csv(outdir / 'top_keywords_negative.csv', index=False)

    # Hidden issues
    hidden = insight_products_with_hidden_issues(df)
    if not hidden.empty:
        hidden.to_csv(outdir / 'products_hidden_issues.csv', index=False)


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('--data', type=str, help='Path to CSV/JSON with reviews')
    ap.add_argument('--text-col', type=str, default='reviewText')
    ap.add_argument('--rating-col', type=str, default=None)
    ap.add_argument('--id-col', type=str, default=None)
    ap.add_argument('--category-col', type=str, default=None)
    ap.add_argument('--label-col', type=str, default=None, help='If provided, supervised labels in {positive,negative,neutral}')

    ap.add_argument('--engine', choices=['rule','ml'], default='ml')
    ap.add_argument('--predict-only', action='store_true', help='Skip training; just score with rule engine')
    ap.add_argument('--do-eda', action='store_true')
    ap.add_argument('--outdir', type=str, default='./sentiment_out')
    ap.add_argument('--seed', type=int, default=42)
    ap.add_argument('--app', action='store_true', help='Launch Gradio app')
    ap.add_argument('--init-nltk', action='store_true')

    args = ap.parse_args()

    outdir = ensure_dir(args.outdir)

    if args.init_nltk:
        init_nltk()
        print('[OK] NLTK resources downloaded.')
        if not args.data and not args.app:
            return

    if args.app:
        launch_app(outdir)
        return

    if not args.data:
        raise SystemExit('Provide --data path to run training/scoring.')

    # Load data and map columns
    df_raw = load_dataset(
        args.data,
        text_col=args.text_col,
        rating_col=args.rating_col,
        id_col=args.id_col,
        category_col=args.category_col,
        label_col=args.label_col,
    )

    # Basic cleaning & labeling
    print(f"[INFO] Loaded {len(df_raw)} rows. Columns: {df_raw.columns.tolist()}")
    df = df_raw.copy()
    df['reviewText'] = df['reviewText'].astype(str).map(clean_text)

    # Sentiment path
    if args.engine == 'rule' or args.predict_only:
        print('[INFO] Using rule-based sentiment (VADER/TextBlob).')
        df_scored = predict_rule(df)
        df_scored.to_csv(outdir / 'predictions_rule_based.csv', index=False)
        print('[OK] Saved predictions_rule_based.csv')
        if args.do_eda:
            do_eda(df_scored, outdir)
        return

    # ML path (train + eval + visuals)
    print('[INFO] Training ML model (TF‑IDF + LogisticRegression) ...')
    res = train_ml(df, outdir, seed=args.seed)

    eval_df = pd.DataFrame({
        'text': res['X_valid'],
        'label_true': res['y_valid'],
        'label_pred': res['y_pred']
    })
    eval_df.to_csv(outdir / 'eval_predictions_valid.csv', index=False)
    print('[OK] Wrote eval artifacts to', outdir)

    # EDA on full dataset using model labels for consistency
    df_for_eda = df.copy()
    # Get labels for whole corpus (might be slow on huge datasets)
    print('[INFO] Scoring entire corpus for EDA ...')
    labels_all, _ = predict_ml(res['pipe'], df_for_eda['reviewText'].tolist())
    df_for_eda['label'] = labels_all

    do_eda(df_for_eda, outdir)
    print('[OK] Visuals exported to', outdir)


if __name__ == '__main__':
    main()


  from .autonotebook import tqdm as notebook_tqdm
usage: ipykernel_launcher.py [-h] [--data DATA] [--text-col TEXT_COL] [--rating-col RATING_COL] [--id-col ID_COL]
                             [--category-col CATEGORY_COL] [--label-col LABEL_COL] [--engine {rule,ml}] [--predict-only] [--do-eda]
                             [--outdir OUTDIR] [--seed SEED] [--app] [--init-nltk]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\NXTWAVE\AppData\Roaming\jupyter\runtime\kernel-0d5d4d2f-5910-4440-8d3b-68d412d0efa7.json


SystemExit: 2