# In [2]:
# Notebook bootstrap: install the runtime dependencies when running under IPython.
try:
    # `get_ipython` is only defined inside IPython/Jupyter; in a plain
    # interpreter the bare name raises NameError and the install is skipped.
    _ipython_shell = get_ipython()
except NameError:
    _ipython_shell = None

if _ipython_shell is not None:
    try:
        # Same effect as the `%pip -q install ...` line magic, spelled as a
        # regular call so this file remains valid Python outside a notebook.
        _ipython_shell.run_line_magic(
            "pip", "-q install pandas numpy scikit-learn vaderSentiment joblib"
        )
    except Exception as exc:
        # Installation is best-effort: report the problem instead of hiding it,
        # and let the imports below fail loudly if a package is really missing.
        print(f"[WARN] Dependency install failed: {exc}")

# %% 1) Imports & paths
from __future__ import annotations
from pathlib import Path
from typing import Iterable, Tuple, Optional
import numpy as np
import pandas as pd
import joblib

# Artifacts/output folder: trained models are looked up here and prediction
# CSVs are written here by default.
OUT_DIR = Path(r"C:\Users\NXTWAVE\Downloads\Review Sentiment Analyzer")

# Candidate model files, listed in the order they are probed:
#   model_tfidf_lr.joblib  - from the full pipeline script
#   model_sentiment.pkl    - from the build-artifacts script
MODEL_PATHS = [
    OUT_DIR.joinpath(model_file)
    for model_file in ("model_tfidf_lr.joblib", "model_sentiment.pkl")
]

def _load_model():
    """Return the first saved model found on disk, or None if there is none.

    The candidates in MODEL_PATHS are checked in order; the first path that
    exists is deserialized with joblib and returned.
    """
    first_existing = next(
        (candidate for candidate in MODEL_PATHS if candidate.exists()),
        None,
    )
    if first_existing is None:
        return None
    return joblib.load(first_existing)

# Fallback sentiment (no model)
# A single analyzer instance is created at import time and reused by
# _rule_score for every call.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
_vs = SentimentIntensityAnalyzer()

def _rule_score(text: str) -> float:
    """Return the VADER "compound" score for *text* as a float.

    Falls back to 0.0 when the analyzer's result has no "compound" entry.
    """
    polarity = _vs.polarity_scores(text)
    compound = polarity.get("compound", 0.0)
    return float(compound)

def _score_to_label(score: float, pos: float=0.2, neg: float=-0.2) -> str:
    """Translate a numeric score into a sentiment label.

    Returns "positive" when score >= pos, "negative" when score <= neg, and
    "neutral" for everything else. The positive threshold is checked first.
    """
    if score >= pos:
        return "positive"
    if score <= neg:
        return "negative"
    return "neutral"

# Load the model once at import time. None means no saved model file was
# found, in which case predict_one uses the VADER fallback.
MODEL = _load_model()
# Identity check rather than truthiness: estimators that define __len__
# (e.g. a scikit-learn Pipeline) could evaluate as falsy even though a model
# was loaded. This also matches the `MODEL is not None` test in predict_one.
print("Model:", type(MODEL).__name__ if MODEL is not None else "None (will use VADER fallback)")

# %% 2) Predict helpers (single, many, from file)
from typing import List

def predict_one(text: str) -> Tuple[str, float]:
    """
    Classify a single review and return (label, confidence/score).

    - Blank or missing text: ("", 0.0), no prediction is attempted.
    - Trained model loaded: the model's label, with the highest value from
      predict_proba as confidence (0.0 if predict_proba fails).
    - No model: VADER fallback, confidence = |compound| score.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return "", 0.0

    if MODEL is None:
        # Fallback
        compound = _rule_score(cleaned)
        return _score_to_label(compound), float(abs(compound))

    label = MODEL.predict([cleaned])[0]
    try:
        confidence = float(np.max(MODEL.predict_proba([cleaned])))
    except Exception:
        # Best-effort: a model that cannot produce probabilities still
        # yields a label, just without a meaningful confidence.
        confidence = 0.0
    return str(label), confidence

def predict_many(texts: Iterable[str]) -> pd.DataFrame:
    """
    Run predict_one over every item of *texts*.

    Returns a DataFrame with one row per input, in input order, and the
    columns ['text', 'sentiment', 'confidence'].
    """
    records = [(text, *predict_one(text)) for text in texts]
    # Name the columns explicitly: without this an empty input produces a
    # frame with no columns at all, and callers selecting
    # ['sentiment', 'confidence'] would hit a KeyError.
    return pd.DataFrame(records, columns=["text", "sentiment", "confidence"])

def predict_from_file(input_path: Path,
                      text_col: str = "reviewText",
                      out_csv: Optional[Path] = None) -> pd.DataFrame:
    """
    Loads CSV/JSON, predicts sentiment for each row, writes predictions CSV.
    Keeps original columns and appends ['sentiment', 'confidence'].

    Args:
        input_path: File to score. A '.json' or '.jsonl' suffix is read as
            JSON Lines; anything else is read as CSV.
        text_col: Column holding the review text. If it is absent, the first
            match among "Text", "text", "review", "content", "reviewText"
            is used instead.
        out_csv: Destination CSV; defaults to OUT_DIR / "predictions.csv".

    Returns:
        The input rows (index reset) with the two prediction columns appended.

    Raises:
        FileNotFoundError: input_path does not exist.
        ValueError: neither text_col nor any fallback column is present.
    """
    # Accept plain string paths as well as Path objects.
    input_path = Path(input_path)
    if not input_path.exists():
        raise FileNotFoundError(input_path)
    if input_path.suffix.lower() in {".json", ".jsonl"}:
        df_in = pd.read_json(input_path, lines=True)
    else:
        df_in = pd.read_csv(input_path)

    # Auto-fallback if text_col not present
    if text_col not in df_in.columns:
        for cand in ["Text", "text", "review", "content", "reviewText"]:
            if cand in df_in.columns:
                text_col = cand
                break
        else:
            raise ValueError(
                f"Text column '{text_col}' not found. Available: {df_in.columns.tolist()}"
            )

    # Blank out missing values BEFORE converting to str. Converting first
    # turns NaN into the literal text "nan", which would then be scored as
    # if it were a real review.
    raw_text = df_in[text_col]
    texts = raw_text.astype(object).where(raw_text.notna(), "").astype(str).tolist()

    preds = predict_many(texts)
    # reindex (not [[...]]) so both columns exist even when there were no rows
    # to score and the predictions frame came back without columns.
    pred_cols = preds.reindex(columns=["sentiment", "confidence"])
    out = pd.concat([df_in.reset_index(drop=True), pred_cols], axis=1)

    out_csv = OUT_DIR / "predictions.csv" if out_csv is None else Path(out_csv)
    # Create the destination folder if needed so the write does not fail on a
    # fresh machine.
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_csv, index=False)
    print(f"[OK] Wrote predictions → {out_csv}")
    return out

# %% 3) Quick ad-hoc test (edit the list and run)
# Sample reviews scored with whichever backend is active (loaded model or
# VADER fallback).
TEST_REVIEWS = [
    "The camera quality is amazing but the battery dies too fast.",
    "Worst purchase ever. Completely stopped working in a week.",
    "Packaging was okay. Works as expected.",
    "Love the sound quality and the build! Totally worth it.",
]
res_df = predict_many(TEST_REVIEWS)
# Bare expression: as the last line of a notebook cell it renders the frame;
# in a plain script it has no effect.
res_df

# %% 4) Predict from your file (CSV/JSON) and save results
# Point to your dataset; set the correct text column name (e.g., 'Text' or 'reviewText').
IN_FILE = Path(r"C:\Users\NXTWAVE\Downloads\Review Sentiment Analyzer\archive\Reviews.csv")
TEXT_COL = "Text"   # change to 'reviewText' if that's your column
# Results are written next to the model artifacts in OUT_DIR.
OUT_FILE = OUT_DIR / "predictions_from_reviews.csv"

# The call is left commented out so that running the cell does not score the
# whole file by accident.
# Uncomment to run:
# file_out = predict_from_file(IN_FILE, text_col=TEXT_COL, out_csv=OUT_FILE)
# display(file_out.head(10))

# %% 5) (Optional) Quick metrics if ground truth exists (Score/Rating → label)
from sklearn.metrics import classification_report

def derive_label_from_rating(s: pd.Series) -> pd.Series:
    """Convert numeric ratings into sentiment labels.

    Ratings >= 4 become "positive", ratings <= 2 become "negative", and
    everything else becomes "neutral". Values that cannot be parsed as numbers
    are coerced to NaN and therefore also end up as "neutral".
    """
    def _bucket(rating):
        if rating >= 4:
            return "positive"
        if rating <= 2:
            return "negative"
        return "neutral"

    numeric_ratings = pd.to_numeric(s, errors="coerce")
    return numeric_ratings.apply(_bucket)

# After running the file prediction above, if the file had 'Score' or 'Rating':
# gt_col = "Score"   # or "Rating"
# if gt_col in file_out.columns:
#     y_true = derive_label_from_rating(file_out[gt_col])
#     y_pred = file_out["sentiment"]
#     print(classification_report(y_true, y_pred))


# Cell output:
# Note: you may need to restart the kernel to use updated packages.
# Model: Pipeline
