In [1]:
from __future__ import annotations
import argparse, json, sys
from pathlib import Path
from typing import List, Optional, Tuple, Dict

import numpy as np
import pandas as pd
import joblib
import re, string
from bs4 import BeautifulSoup

# ---------------- Configuration ----------------
# OUT_DIR is the default value of the --outdir option. MODEL_PATH is derived
# from this constant, so passing a different --outdir does not move the model.
OUT_DIR   = Path(r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner")
MODEL_PATH = OUT_DIR / 'model_aspect_sentiment.pkl'
# CSV files scored when --files is not supplied; entries that do not exist on
# disk are skipped by main().
DEFAULT_INPUTS = [
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Laptop_Train_v2.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Laptops_Test_Data_PhaseA.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Laptops_Test_Data_PhaseB.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Restaurants_Test_Data_PhaseA.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Restaurants_Train_v2.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\restaurants-trial.csv",
]

# Column candidates across ABSA datasets
# normalize_input() takes, for each role, the first name in the list that is
# present in the file (exact match after whitespace is stripped from headers),
# so list order is the priority order.
TEXT_CANDS      = ["text","Text","sentence","Sentence","Review","review","ReviewText","reviewText"]
CATEGORY_CANDS  = ["category","Category","AspectCategory","aspect","Aspect","target","Target","Term","Aspect Term","Aspect_Term"]
ID_CANDS        = ["id","ID","SentenceID","sentence_id","ReviewID","review_id"]

# ---------------- Helpers ----------------
# Matches "http://" / "https://" links and bare "www." links, each running up
# to the next whitespace character; matching is case-insensitive.
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
# Translation table that deletes every character in string.punctuation
# (characters are removed, not replaced by a space).
PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def ensure_outdir(p: Path):
    """Create directory ``p`` together with any missing parent directories.

    Calling this on a directory that already exists is a no-op.
    """
    p.mkdir(exist_ok=True, parents=True)


def standardize_category(v: str) -> str:
    """Normalise a category label to an upper-case, underscore-separated token.

    Surrounding whitespace is trimmed, each remaining space becomes an
    underscore and the result is upper-cased. Anything that is not a string,
    or that is empty after trimming, is reported as ``"UNCAT"``.
    """
    if isinstance(v, str):
        label = v.strip().replace(' ', '_').upper()
        if label:
            return label
    return "UNCAT"


def clean_text(s: str) -> str:
    """Reduce raw review text to lower-case ASCII letters, digits and single spaces.

    Steps, in order: extract the text from any HTML markup, blank out URLs,
    lower-case, delete ASCII punctuation, replace every remaining character
    that is not ``a-z``, ``0-9`` or whitespace with a space, then collapse
    whitespace runs and trim. Non-string input yields an empty string.
    """
    if not isinstance(s, str):
        return ""
    # Text nodes are joined with a space so adjacent elements do not fuse.
    plain = BeautifulSoup(s, "html5lib").get_text(" ")
    without_urls = URL_RE.sub(" ", plain)
    # Punctuation is deleted outright, so "well-made" becomes "wellmade".
    lowered = without_urls.lower().translate(PUNCT_TABLE)
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()


def pick_col(df: pd.DataFrame, cands: List[str]) -> Optional[str]:
    """Return the first entry of ``cands`` that is a column of ``df``.

    Candidates are tried in list order; ``None`` is returned when none of
    them is present.
    """
    available = df.columns
    return next((name for name in cands if name in available), None)


def normalize_input(df: pd.DataFrame) -> pd.DataFrame:
    """Project a raw input frame onto the standard ``text`` / ``category`` / ``id`` columns.

    Column headers are stripped of surrounding whitespace, then the first
    matching name from TEXT_CANDS, CATEGORY_CANDS and ID_CANDS is selected for
    each role. Only the matched columns are kept (renamed to ``text``,
    ``category`` and ``id``); the input frame itself is not modified.

    Missing values are handled explicitly: a missing text becomes ``""`` and a
    missing category becomes ``"UNCAT"``. Stringifying first would turn them
    into the literal words ``"nan"`` / ``"NAN"``.
    NOTE(review): confirm the training-side preprocessing treats missing
    values the same way, so features stay comparable.

    Args:
        df: Frame as read from an input CSV.

    Returns:
        A new frame with a string ``text`` column, a standardised ``category``
        column (``"UNCAT"`` throughout when the input has no category column)
        and, when an id column was found, ``id``.

    Raises:
        ValueError: If none of the TEXT_CANDS names is a column of ``df``.
    """
    df = df.copy()
    # str() keeps this working for non-string headers (e.g. integer labels).
    df.columns = [str(c).strip() for c in df.columns]

    tcol = pick_col(df, TEXT_CANDS)
    if not tcol:
        raise ValueError(f"No text column found in {list(df.columns)}")
    ccol = pick_col(df, CATEGORY_CANDS)
    icol = pick_col(df, ID_CANDS)

    keep = {tcol: 'text'}
    if ccol:
        keep[ccol] = 'category'
    if icol:
        keep[icol] = 'id'
    out = df[list(keep)].rename(columns=keep)

    # Stringify present values only; rows that were missing get the sentinel
    # instead of the text "nan" that astype(str) would otherwise produce.
    raw_text = out['text']
    out['text'] = raw_text.astype(str).where(raw_text.notna(), '')

    if 'category' in out.columns:
        raw_cat = out['category']
        out['category'] = (
            raw_cat.astype(str)
            .map(standardize_category)
            .where(raw_cat.notna(), 'UNCAT')
        )
    else:
        out['category'] = 'UNCAT'
    return out


def prepare_features(df: pd.DataFrame) -> pd.Series:
    """Build one model-input string per row: ``"[<category>] <cleaned text>"``.

    Expects ``text`` and ``category`` columns; a missing category is written
    as ``UNCAT`` and the text is passed through clean_text().
    """
    cleaned = df['text'].map(clean_text)
    tag = '[' + df['category'].fillna('UNCAT') + '] '
    return tag + cleaned


def load_model(path: Path):
    """Load and return the model stored at ``path`` using joblib.

    Raises:
        SystemExit: If ``path`` does not exist; the message tells the user to
            run the builder script first.
    """
    if path.exists():
        # joblib.load unpickles the file, so only point this at trusted models.
        return joblib.load(path)
    raise SystemExit(f"Model not found: {path}\nRun the builder script first to create model_aspect_sentiment.pkl.")


def _read_csv_with_fallback(file_path: Path) -> pd.DataFrame:
    """Read a CSV with pandas' default encoding, retrying as latin-1 on a decode error."""
    try:
        return pd.read_csv(file_path)
    except UnicodeDecodeError:
        # Only an encoding failure justifies the retry; other errors (missing
        # file, malformed CSV) propagate unchanged instead of being masked.
        return pd.read_csv(file_path, encoding='latin-1')


def _predict_with_confidence(model, feats):
    """Return ``(labels, proba, confidence)``; ``proba`` is None without probability support."""
    try:
        proba = model.predict_proba(feats)
        yhat = model.classes_[np.argmax(proba, axis=1)]
        conf = proba.max(axis=1)
    except (AttributeError, NotImplementedError):
        # Model exposes no usable probability API: fall back to hard labels
        # and report a confidence of 0 for every row.
        yhat = model.predict(feats)
        proba = None
        conf = np.zeros(len(yhat))
    return yhat, proba, conf


def predict_one_file(file_path: Path, model, out_dir: Path) -> Path:
    """Score one CSV file and write its prediction and insight files.

    Two files are written to ``out_dir``:

    * ``predictions_<stem>.csv``: the original columns plus
      ``predicted_polarity``, ``confidence`` and, when the model provides
      probabilities, one ``proba_<class>`` column per class.
    * ``insights_<stem>.json``: the input path and the count of each
      predicted label.

    Args:
        file_path: CSV file to score.
        model: Fitted estimator with ``predict`` and, optionally,
            ``predict_proba`` / ``classes_``.
        out_dir: Existing directory that receives the output files.

    Returns:
        Path of the written predictions CSV.

    Raises:
        SystemExit: If the input cannot be normalised (for example, no text
            column is found); the message is prefixed with the file name.
    """
    df_in = _read_csv_with_fallback(file_path)

    base = file_path.stem
    try:
        X = normalize_input(df_in)
    except Exception as e:
        raise SystemExit(f"{file_path.name}: {e}") from e

    feats = prepare_features(X)
    yhat, proba, conf = _predict_with_confidence(model, feats)

    # Build output frame: original columns + predictions.
    out = df_in.copy()
    # Attach labels on the frame's own index so rows line up by position
    # rather than by an implicit RangeIndex match.
    out['predicted_polarity'] = pd.Series(np.asarray(yhat), index=out.index).astype(str)
    out['confidence'] = conf

    # Include per-class probabilities if available.
    if proba is not None:
        for i, cls in enumerate(model.classes_):
            out[f'proba_{cls}'] = proba[:, i]

    out_path = out_dir / f"predictions_{base}.csv"
    out.to_csv(out_path, index=False)

    # Insights JSON for this file: distribution of predicted labels.
    counts = pd.Series(yhat).value_counts().to_dict()
    insights = {"file": str(file_path), "distribution": counts}
    with open(out_dir / f"insights_{base}.json", 'w', encoding='utf-8') as f:
        json.dump(insights, f, ensure_ascii=False, indent=2)

    print(f"[OK] {file_path.name} → {out_path}")
    return out_path


# ---------------- Main ----------------

def main(argv: Optional[List[str]] = None):
    """Score the requested CSV files and write per-file and combined predictions.

    Options:
        --files   CSV paths to score; when omitted (or empty) the entries of
                  DEFAULT_INPUTS are used.
        --outdir  Directory for all output files (default: OUT_DIR).
        --model   Path of the trained model (default: MODEL_PATH).

    Unrecognised arguments are ignored rather than rejected. Paths that do not
    exist are skipped; for explicitly requested files a warning is printed to
    stderr. When more than one file was scored, the per-file prediction CSVs
    are also concatenated into ``predictions_all.csv`` with a ``__source__``
    column naming the file each row came from.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Raises:
        SystemExit: If the model file is missing, no input file exists, or an
            input file cannot be normalised.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--files', nargs='*', help='Paths to CSV files to score')
    ap.add_argument('--outdir', type=str, default=str(OUT_DIR))
    ap.add_argument('--model', type=str, default=str(MODEL_PATH),
                    help='Path to the trained model file')
    # parse_known_args tolerates extra arguments the script does not define.
    args, _ = ap.parse_known_args(argv)

    outdir = Path(args.outdir)
    ensure_outdir(outdir)

    model = load_model(Path(args.model))

    # Resolve file list: explicit or defaults.
    if args.files:
        candidates = [Path(p) for p in args.files]
        # A path the user asked for but that is absent is most likely a typo:
        # say so instead of dropping it silently.
        for p in candidates:
            if not p.exists():
                print(f"[WARN] Input file not found, skipping: {p}", file=sys.stderr)
    else:
        candidates = [Path(p) for p in DEFAULT_INPUTS]
    files: List[Path] = [p for p in candidates if p.exists()]

    if not files:
        raise SystemExit("No input files found. Pass --files paths or place CSVs in the archive folder.")

    outputs = [predict_one_file(fp, model, outdir) for fp in files]

    # If multiple, also concatenate into predictions_all.csv.
    if len(outputs) > 1:
        frames = []
        for op in outputs:
            try:
                dfp = pd.read_csv(op)
            except Exception as exc:
                # Best effort: keep combining the rest, but report the gap.
                print(f"[WARN] Could not read {op} for the combined file: {exc}", file=sys.stderr)
                continue
            dfp['__source__'] = Path(op).name
            frames.append(dfp)
        if frames:
            all_df = pd.concat(frames, ignore_index=True)
            all_path = outdir / 'predictions_all.csv'
            all_df.to_csv(all_path, index=False)
            print(f"[OK] Combined → {all_path}")


# Script entry point: pass the command-line arguments (without the program
# name) to main(), which ignores any options it does not define.
if __name__ == '__main__':
    main(sys.argv[1:])


[OK] Laptop_Train_v2.csv → C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\predictions_Laptop_Train_v2.csv
[OK] Laptops_Test_Data_PhaseA.csv → C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\predictions_Laptops_Test_Data_PhaseA.csv
[OK] Laptops_Test_Data_PhaseB.csv → C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\predictions_Laptops_Test_Data_PhaseB.csv
[OK] Restaurants_Test_Data_PhaseA.csv → C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\predictions_Restaurants_Test_Data_PhaseA.csv
[OK] Restaurants_Train_v2.csv → C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\predictions_Restaurants_Train_v2.csv
[OK] restaurants-trial.csv → C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\predictions_restaurants-trial.csv
[OK] Combined → C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\predictions_all.csv
