# In [1]:
from __future__ import annotations
from pathlib import Path
from typing import Iterable, Tuple, Optional
import json
import numpy as np
import pandas as pd
import joblib

# Artifact directory: keep this identical to the OUT_DIR used when training.
OUT_DIR = Path(r"C:\Users\NXTWAVE\Downloads\Review Sentiment Analyzer")
# Location of the serialized model that is loaded with joblib for prediction.
MODEL_PATH = OUT_DIR.joinpath('model_sentiment.pkl')

# Fallback rule-based utils (copy from training notebook)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared VADER analyzer, created on first use. Building a new analyzer for
# every call repeats loop-invariant setup work in batch runs.
_VADER_ANALYZER = None


def rule_score(text: str) -> float:
    """Return the VADER compound polarity score for *text*.

    The analyzer instance is created lazily on the first call and reused
    afterwards. If the result has no ``'compound'`` key, ``0.0`` is returned.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return float(_VADER_ANALYZER.polarity_scores(text).get('compound', 0.0))


def score_to_label(score: float, pos: float=0.2, neg: float=-0.2) -> str:
    """Map a polarity score to 'positive', 'negative' or 'neutral'.

    A score at or above *pos* is positive; otherwise a score at or below
    *neg* is negative; anything else is neutral.
    """
    if score >= pos:
        return 'positive'
    if score <= neg:
        return 'negative'
    return 'neutral'


# Single-entry cache: {"entry": ((model path, mtime_ns), pipeline)}.
_PIPELINE_CACHE = {}


def _load_pipeline():
    """Return the pipeline stored at MODEL_PATH, deserializing it only when needed.

    The cached object is reused as long as the path and the file's
    modification time are unchanged, so a retrained model is picked up
    automatically. Raises OSError when the model file does not exist.
    """
    key = (str(MODEL_PATH), MODEL_PATH.stat().st_mtime_ns)
    entry = _PIPELINE_CACHE.get("entry")
    if entry is None or entry[0] != key:
        # NOTE(security): joblib.load unpickles arbitrary Python objects;
        # only point MODEL_PATH at files you produced yourself.
        entry = (key, joblib.load(MODEL_PATH))
        _PIPELINE_CACHE["entry"] = entry
    return entry[1]


def predict_one(text: str) -> Tuple[str, float]:
    """Predict a single review. Returns (label, confidence/score).

    Blank or ``None`` input returns ``("", 0.0)`` without touching the model.
    When the trained pipeline can be loaded and predicts successfully, the
    label comes from it and the confidence is the largest value returned by
    ``predict_proba`` (``0.0`` if ``predict_proba`` fails or is unavailable).
    On any failure to load or predict, the VADER rule-based fallback is used
    and the confidence is the absolute compound score.
    """
    text = (text or "").strip()
    if not text:
        return "", 0.0
    try:
        # Loading once and reusing the pipeline avoids deserializing the
        # model for every row of a batch.
        pipe = _load_pipeline()
        lab = pipe.predict([text])[0]
    except Exception:
        # Deliberate best-effort fallback (no usable model): VADER.
        comp = rule_score(text)
        return score_to_label(comp), float(abs(comp))
    try:
        conf = float(np.max(pipe.predict_proba([text])))
    except Exception:
        # Not every estimator exposes predict_proba; report "unknown" as 0.0.
        conf = 0.0
    return lab, conf


def predict_many(texts: Iterable[str]) -> pd.DataFrame:
    """Predict every review in *texts*.

    Returns a DataFrame with one row per input, in input order, and the
    columns ``text``, ``sentiment`` and ``confidence``. The columns are
    present even when *texts* is empty, so callers can always select them.
    """
    records = []
    for t in texts:
        lab, conf = predict_one(t)
        records.append((t, lab, conf))
    # Explicit columns keep the schema stable for an empty input.
    return pd.DataFrame(records, columns=["text", "sentiment", "confidence"])


def predict_from_file(input_path: Path, text_col: str = 'reviewText', out_csv: Optional[Path] = None) -> pd.DataFrame:
    """Predict for each row in a CSV/JSON and write a predictions CSV.

    Args:
        input_path: File to read. A ``.json`` suffix is read as JSON Lines
            (``lines=True``); anything else is read as CSV.
        text_col: Column holding the review text. If it is absent, the first
            of ``Text``, ``text``, ``review``, ``content``, ``reviewText``
            found in the file is used instead.
        out_csv: Destination CSV. Defaults to ``OUT_DIR / 'predictions.csv'``.

    Returns:
        The input rows with ``sentiment`` and ``confidence`` columns appended.

    Raises:
        FileNotFoundError: *input_path* does not exist.
        ValueError: no usable text column could be found.
    """
    input_path = Path(input_path)  # also accept a plain string path
    if not input_path.exists():
        raise FileNotFoundError(input_path)
    if input_path.suffix.lower() == '.json':
        df_in = pd.read_json(input_path, lines=True)
    else:
        df_in = pd.read_csv(input_path)

    if text_col not in df_in.columns:
        for cand in ['Text','text','review','content','reviewText']:
            if cand in df_in.columns:
                text_col = cand
                break
        else:
            raise ValueError(f"Text column '{text_col}' not found. Available: {df_in.columns.tolist()}")

    # Fill missing values BEFORE casting: astype(str) turns NaN into the
    # literal text "nan", which would then be scored as a real review.
    texts = df_in[text_col].fillna("").astype(str).tolist()
    preds = predict_many(texts)
    # reindex (rather than [[...]]) tolerates a prediction frame that lacks
    # the columns, which happens when the input file has no rows.
    pred_cols = preds.reindex(columns=['sentiment','confidence'])
    out = pd.concat([df_in.reset_index(drop=True), pred_cols], axis=1)

    if out_csv is None:
        out_csv = OUT_DIR / 'predictions.csv'
    out_csv = Path(out_csv)
    # Create the destination folder so writing does not fail on a fresh path.
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(out_csv, index=False)
    print(f"[OK] Wrote predictions → {out_csv}")
    return out

# %% [markdown]
# ### A) Single / Batch prediction in Notebook

# %%
# Hand-written sample reviews for a quick in-notebook check of predict_many.
TEST_REVIEWS = [
    "The camera quality is amazing but the battery dies too fast.",
    "Worst purchase ever. Completely stopped working in a week.",
    "Packaging was okay. Works as expected.",
    "Love the sound quality and the build! Totally worth it.",
]
res_df = predict_many(texts=TEST_REVIEWS)
# Bare expression on the last line so the notebook renders the DataFrame.
res_df

# %% [markdown]
# ### B) Predict from a file and save results

# %%
# Example (uncomment and edit the text column name):
# file_out = predict_from_file(
#     input_path=Path(r"C:\Users\NXTWAVE\Downloads\Review Sentiment Analyzer\archive\Reviews.csv"),
#     text_col='Text',
#     out_csv=OUT_DIR / 'predictions_from_reviews.csv'
# )
# display(file_out.head(10))

# %% [markdown]
# ### C) Optional: Quick metrics if ground truth available

# %%
from sklearn.metrics import classification_report

def derive_label_from_rating(s: pd.Series) -> pd.Series:
    """Convert numeric ratings into sentiment labels.

    Values are coerced to numbers first. Ratings of 4 or more become
    'positive', ratings of 2 or less become 'negative', and everything
    else (including values that could not be parsed) becomes 'neutral'.
    """
    def _to_label(rating):
        if rating >= 4:
            return 'positive'
        if rating <= 2:
            return 'negative'
        # Also reached for NaN, because both comparisons above are False.
        return 'neutral'

    numeric = pd.to_numeric(s, errors='coerce')
    return numeric.apply(_to_label)

# Example (uncomment after running section B above):
# gt_col = 'Score'  # or 'Rating'
# if gt_col in file_out.columns:
#     y_true = derive_label_from_rating(file_out[gt_col])
#     y_pred = file_out['sentiment']
#     print(classification_report(y_true, y_pred))

# %% [markdown]
# ---
# ## Standalone Script — `predict_reviews.py`
# Save the following as `predict_reviews.py` and run from PowerShell.
# Usage examples:
# ```powershell
# # Predict a single text
# python predict_reviews.py --text "Battery life is poor but display is gorgeous"
#
# # Predict for a CSV (text column named 'Text') and write predictions CSV
# python predict_reviews.py --file "C:\\Users\\NXTWAVE\\Downloads\\Review Sentiment Analyzer\\archive\\Reviews.csv" --text-col Text --out "C:\\Users\\NXTWAVE\\Downloads\\Review Sentiment Analyzer\\predictions_from_reviews.csv"
# ```

# %%
# Source of the standalone CLI. The literal starts directly with the shebang:
# a leading newline would push it to line 2 of the written file, where it is
# ignored. Inside the script only '#' comments are used, because a
# triple-double-quoted docstring would terminate this literal.
PREDICT_SCRIPT = r"""#!/usr/bin/env python3
# Standalone review-sentiment predictor: single text (--text) or CSV/JSON file (--file).
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

OUT_DIR = Path(r"C:\Users\NXTWAVE\Downloads\Review Sentiment Analyzer")
MODEL_PATH = OUT_DIR / 'model_sentiment.pkl'

# Process-wide singletons so a batch run does not rebuild them for every row.
_ANALYZER = None
_PIPELINE = None
_PIPELINE_TRIED = False

def rule_score(text: str) -> float:
    # VADER compound score; the analyzer is created once and reused.
    global _ANALYZER
    if _ANALYZER is None:
        _ANALYZER = SentimentIntensityAnalyzer()
    return float(_ANALYZER.polarity_scores(text).get('compound', 0.0))

def score_to_label(score: float, pos=0.2, neg=-0.2) -> str:
    return 'positive' if score>=pos else ('negative' if score<=neg else 'neutral')

def _get_pipeline():
    # Load the trained pipeline at most once; None means "use the VADER fallback".
    # NOTE(security): loading a joblib file unpickles arbitrary objects - use trusted files only.
    global _PIPELINE, _PIPELINE_TRIED
    if not _PIPELINE_TRIED:
        _PIPELINE_TRIED = True
        try:
            _PIPELINE = joblib.load(MODEL_PATH)
        except Exception:
            _PIPELINE = None
    return _PIPELINE

def predict_one(text: str):
    # Returns (label, confidence); blank input yields ("", 0.0).
    text = (text or "").strip()
    if not text:
        return "", 0.0
    pipe = _get_pipeline()
    if pipe is not None:
        try:
            lab = pipe.predict([text])[0]
        except Exception:
            lab = None
        else:
            try:
                conf = float(np.max(pipe.predict_proba([text])))
            except Exception:
                conf = 0.0
            return lab, conf
    # Best-effort fallback when the model is missing or fails to predict.
    comp = rule_score(text)
    return score_to_label(comp), float(abs(comp))

def predict_file(file_path: Path, text_col: str, out_path: Path):
    if file_path.suffix.lower() == '.json':
        df_in = pd.read_json(file_path, lines=True)
    else:
        df_in = pd.read_csv(file_path)
    if text_col not in df_in.columns:
        raise SystemExit(f"Column '{text_col}' not in file. Available: {df_in.columns.tolist()}")
    # Fill missing values before casting, otherwise NaN becomes the text "nan".
    rows = [predict_one(t) for t in df_in[text_col].fillna("").astype(str)]
    df_in['sentiment'] = [r[0] for r in rows]
    df_in['confidence'] = [r[1] for r in rows]
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df_in.to_csv(out_path, index=False)
    print(f"[OK] Wrote predictions → {out_path}")

if __name__ == '__main__':
    ap = argparse.ArgumentParser()
    ap.add_argument('--text', type=str, help='Single review text to predict')
    ap.add_argument('--file', type=str, help='CSV/JSON path for batch prediction')
    ap.add_argument('--text-col', type=str, default='reviewText', help='Text column name in the file')
    ap.add_argument('--out', type=str, default=str(OUT_DIR / 'predictions.csv'))
    args = ap.parse_args()

    if args.text:
        lab, conf = predict_one(args.text)
        print(json.dumps({'text': args.text, 'sentiment': lab, 'confidence': conf}, ensure_ascii=False))
    elif args.file:
        predict_file(Path(args.file), args.text_col, Path(args.out))
    else:
        ap.print_help()
"""

# Write the script to disk next to your OUT_DIR for convenience.
# OUT_DIR is created first so this cell also works when the folder is missing.
OUT_DIR.mkdir(parents=True, exist_ok=True)
script_path = OUT_DIR / 'predict_reviews.py'
with open(script_path, 'w', encoding='utf-8') as f:
    f.write(PREDICT_SCRIPT)
print("Wrote:", script_path)


# Output: Wrote: C:\Users\NXTWAVE\Downloads\Review Sentiment Analyzer\predict_reviews.py
