# In [1]:
from __future__ import annotations
import json, yaml, joblib, re, string, warnings
from pathlib import Path
from typing import List, Optional, Tuple, Dict

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Install a global "ignore" filter so no warnings are shown while the script runs.
warnings.filterwarnings("ignore")

# -------------------- CONFIG (edit if needed) --------------------
# CSV inputs handed to load_all() by main(); entries that do not exist on disk
# are skipped there rather than treated as errors.
IN_FILES = [
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Laptop_Train_v2.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Laptops_Test_Data_PhaseA.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Laptops_Test_Data_PhaseB.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Restaurants_Test_Data_PhaseA.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\Restaurants_Train_v2.csv",
    r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\archive\restaurants-trial.csv",
]
# Directory that receives every artifact written by save_everything().
OUT_DIR = Path(r"C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner")

# Candidate column names across ABSA datasets
# pick_first() returns the first listed name that is present in a frame, so the
# order inside each list is the lookup priority.
TEXT_CANDS      = ["text","Text","sentence","Sentence","Review","review","ReviewText","reviewText"]
CATEGORY_CANDS  = ["category","Category","AspectCategory","aspect","Aspect","target","Target","Term","Aspect Term","Aspect_Term"]
POLARITY_CANDS  = ["polarity","Polarity","sentiment","Sentiment","label","Label","Opinion"]
ID_CANDS        = ["id","ID","SentenceID","sentence_id","ReviewID","review_id"]

# -------------------- Utils --------------------
# Matches "http://", "https://" or "www." followed by any run of non-whitespace
# characters, case-insensitively; clean_text() replaces each match with a space.
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
# Translation table that deletes every character in string.punctuation
# (deleted, not replaced by a space, so "well-built" becomes "wellbuilt").
PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def ensure_out():
    """Create ``OUT_DIR`` together with any missing parents.

    Does nothing when the directory already exists.
    """
    OUT_DIR.mkdir(exist_ok=True, parents=True)


def pick_first(df: pd.DataFrame, cands: List[str]) -> Optional[str]:
    """Return the first name in *cands* that is a column of *df*.

    Candidates are tried in list order, so earlier entries take priority.
    Returns ``None`` when none of them is present.
    """
    available = df.columns
    return next((name for name in cands if name in available), None)


def standardize_category(v: str) -> str:
    """Normalize a category label to an upper-case, underscore-joined token.

    Surrounding whitespace is stripped, inner spaces become underscores and
    the result is upper-cased. Anything that is not a string, or that is
    empty after stripping, yields ``"UNCAT"``.
    """
    if isinstance(v, str):
        label = v.strip().replace(' ', '_').upper()
        if label:
            return label
    return "UNCAT"


def clean_text(s: str) -> str:
    """Reduce raw review text to lower-case alphanumeric words.

    Steps, in order: strip HTML markup (tags are replaced by a space), blank
    out URLs, lower-case, delete ASCII punctuation, replace every remaining
    character that is not ``a-z``, ``0-9`` or whitespace with a space, and
    collapse whitespace runs. Non-string input yields an empty string.
    """
    if not isinstance(s, str):
        return ""
    plain = BeautifulSoup(s, "html5lib").get_text(" ")
    without_urls = URL_RE.sub(" ", plain)
    # Punctuation is deleted (not spaced), so hyphenated words are fused.
    lowered = without_urls.lower().translate(PUNCT_TABLE)
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()


def normalize_absa(df: pd.DataFrame) -> pd.DataFrame:
    """Project a raw ABSA frame onto the canonical column names.

    The first matching candidate from ``TEXT_CANDS``, ``CATEGORY_CANDS``,
    ``POLARITY_CANDS`` and ``ID_CANDS`` is renamed to ``text``, ``category``,
    ``polarity`` and ``id`` respectively; all other columns are dropped.
    Only ``text`` is mandatory.

    Missing values are kept as missing instead of being stringified to
    ``"nan"``: missing text stays NaN (so callers can ``dropna`` it), a
    missing category becomes ``"UNCAT"``, and a missing polarity stays NaN so
    the row survives as an unlabeled example.

    Args:
        df: Raw frame as read from one dataset file. It is not modified.

    Returns:
        A new frame with the canonical columns. When a polarity column
        exists, rows whose polarity is neither missing nor one of
        ``positive`` / ``negative`` / ``neutral`` (after alias mapping) are
        removed.

    Raises:
        ValueError: If no text column can be found.
    """
    cols_orig = list(df.columns)
    df = df.copy()
    # str() guards against non-string headers (e.g. integer column labels).
    df.columns = [str(c).strip() for c in df.columns]
    tcol = pick_first(df, TEXT_CANDS)
    ccol = pick_first(df, CATEGORY_CANDS)
    pcol = pick_first(df, POLARITY_CANDS)
    idcol = pick_first(df, ID_CANDS)
    if not tcol:
        raise ValueError(f"No text column found in {cols_orig}")

    keep = {tcol: "text"}
    if ccol:
        keep[ccol] = "category"
    if pcol:
        keep[pcol] = "polarity"
    if idcol:
        keep[idcol] = "id"
    out = df[list(keep)].rename(columns=keep)

    # Standardize. A blanket astype(str) would turn NaN into the literal
    # string "nan", so each column is converted value by value instead.
    out['text'] = out['text'].map(lambda v: v if pd.isna(v) else str(v))

    if 'category' in out.columns:
        out['category'] = out['category'].map(
            lambda v: "UNCAT" if pd.isna(v) else standardize_category(str(v))
        )

    if 'polarity' in out.columns:
        # Aliases are applied in a single lookup; no alias maps onto another
        # alias, so this matches applying them one after the other.
        aliases = {
            'conflict': 'neutral', 'mixed': 'neutral', 'none': 'neutral', 'unknown': 'neutral',
            'pos': 'positive', 'neg': 'negative',
            'neu': 'neutral', 'neutrality': 'neutral',
        }

        def _norm_polarity(v):
            if pd.isna(v):
                return np.nan
            key = str(v).lower().strip()
            return aliases.get(key, key)

        out['polarity'] = out['polarity'].map(_norm_polarity)
        # Keep only three classes for modeling; unlabeled (NaN) rows are retained.
        is_known = out['polarity'].isin(['positive', 'negative', 'neutral'])
        out = out[is_known | out['polarity'].isna()]
    return out


def load_all(paths: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read, normalize and pool the given CSV files.

    Loading is best-effort: a path that is missing, a repeat of an earlier
    file (same file name and byte size), unreadable, or impossible to
    normalize is skipped. Every skip is reported on stdout with a ``[WARN]``
    prefix so that dropped inputs are visible.

    Args:
        paths: CSV file paths to load, in order.

    Returns:
        ``(labeled, unlabeled)``. A file goes to ``labeled`` when its
        normalized frame has a ``polarity`` column with at least one
        non-missing value, otherwise to ``unlabeled``. Each result is the
        row-wise concatenation of its files, or an empty frame with the
        expected columns when no file qualified.
    """
    labeled = []
    unlabeled = []
    seen_fps = set()
    for p in paths:
        fp = Path(p)
        if not fp.exists():
            print(f"[WARN] Input not found, skipping: {fp}")
            continue
        # skip duplicates by name and size
        sig = (fp.name, fp.stat().st_size)
        if sig in seen_fps:
            print(f"[WARN] Duplicate input (same name and size), skipping: {fp}")
            continue
        seen_fps.add(sig)
        try:
            df = pd.read_csv(fp)
        except Exception:
            # Retry with latin-1, which accepts any byte sequence.
            try:
                df = pd.read_csv(fp, encoding='latin-1')
            except Exception as exc:
                print(f"[WARN] Could not read {fp}: {exc}")
                continue
        try:
            dfN = normalize_absa(df)
        except Exception as exc:
            print(f"[WARN] Could not normalize {fp}: {exc}")
            continue
        has_lab = ('polarity' in dfN.columns) and dfN['polarity'].notna().any()
        if has_lab:
            labeled.append(dfN)
        else:
            unlabeled.append(dfN)
    lab = pd.concat(labeled, ignore_index=True, sort=False) if labeled else pd.DataFrame(columns=['text','category','polarity'])
    unlab = pd.concat(unlabeled, ignore_index=True, sort=False) if unlabeled else pd.DataFrame(columns=['text','category'])
    return lab, unlab


# -------------------- Modeling --------------------

def build_pipeline(max_features=100000, ngram=(1,2), C=4.0) -> Pipeline:
    """Assemble the unfitted TF-IDF + logistic-regression pipeline.

    Args:
        max_features: Passed to ``TfidfVectorizer`` as ``max_features``.
        ngram: Passed to ``TfidfVectorizer`` as ``ngram_range``.
        C: Passed to ``LogisticRegression`` as ``C``.

    Returns:
        A two-step ``Pipeline`` with steps named ``"tfidf"`` and ``"clf"``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram)
    classifier = LogisticRegression(
        C=C,
        max_iter=300,
        class_weight='balanced',
        n_jobs=-1,
    )
    return Pipeline([("tfidf", vectorizer), ("clf", classifier)])


def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with ``text_clean`` and ``feat`` columns added.

    ``text_clean`` is ``clean_text`` applied to ``text``. ``feat`` is the
    model input: the cleaned text, prefixed with ``"[<category>] "`` when a
    ``category`` column exists (missing categories read as ``UNCAT``).
    """
    out = df.copy()
    out['text_clean'] = out['text'].map(clean_text)
    if 'category' not in out.columns:
        out['feat'] = out['text_clean']
        return out
    category_tag = '[' + out['category'].fillna('UNCAT') + '] '
    out['feat'] = category_tag + out['text_clean']
    return out


def train_model(df_labeled: pd.DataFrame, seed=42) -> Tuple[Pipeline, Dict, pd.DataFrame]:
    """Fit the sentiment pipeline and evaluate it on a 20% hold-out split.

    Args:
        df_labeled: Frame with ``text`` and ``polarity`` columns (``category``
            optional). Rows missing either ``text`` or ``polarity`` are
            ignored.
        seed: ``random_state`` for the train/validation split.

    Returns:
        ``(pipe, metrics, eval_df)`` where ``pipe`` is the fitted pipeline,
        ``metrics`` holds ``"report"`` (classification report as a dict),
        ``"cm"`` (confusion matrix) and ``"labels"`` (sorted class labels
        giving the matrix row/column order), and ``eval_df`` has one row per
        validation example with columns ``text`` (the model input string),
        ``label_true`` and ``label_pred``.
    """
    df = df_labeled.dropna(subset=['text']).copy()
    df = prepare_features(df)
    # Drop rows without polarity
    df = df.dropna(subset=['polarity'])
    labels = sorted(df['polarity'].unique())

    # Stratified splitting needs at least two rows per class; fall back to a
    # plain random split rather than failing on a singleton class.
    # Small neutral class? keep 3-class anyway; LR with class_weight balances
    stratify_on = df['polarity'] if df['polarity'].value_counts().min() >= 2 else None
    Xtr, Xva, ytr, yva = train_test_split(
        df['feat'], df['polarity'],
        test_size=0.2, random_state=seed, stratify=stratify_on,
    )

    pipe = build_pipeline()
    pipe.fit(Xtr, ytr)
    yhat = pipe.predict(Xva)

    rep = classification_report(yva, yhat, output_dict=True)
    cm = confusion_matrix(yva, yhat, labels=labels)

    # Use plain arrays: Xva/yva still carry their shuffled original index, and
    # mixing them with freshly indexed Series would make pandas align on the
    # index and scramble the rows.
    eval_df = pd.DataFrame({
        'text': Xva.to_numpy(),
        'label_true': yva.to_numpy(),
        'label_pred': yhat,
    })
    return pipe, {"report": rep, "cm": cm, "labels": labels}, eval_df


# -------------------- Insights --------------------

def compute_insights(df_all: pd.DataFrame) -> Dict:
    """Summarize sentiment counts overall and per category.

    Returns a dict that may contain:

    * ``sentiment_distribution`` - polarity -> row count, present when a
      ``polarity`` column has at least one non-missing value.
    * ``top_negative_categories`` - up to 20 categories with the highest
      share of ``negative`` rows (rounded to 4 decimals), present when both
      columns exist and ``negative`` occurs.
    * ``by_category`` - polarity -> {category -> count} for the first 50
      categories, present when both columns exist.
    """
    summary: Dict[str, object] = {}
    columns = df_all.columns
    has_polarity = 'polarity' in columns

    if has_polarity and df_all['polarity'].notna().any():
        summary['sentiment_distribution'] = df_all['polarity'].value_counts().to_dict()

    if not (has_polarity and 'category' in columns):
        return summary

    with_category = df_all.dropna(subset=['category'])
    counts = with_category.groupby(['category', 'polarity']).size().unstack(fill_value=0)

    # Top negative share categories
    if 'negative' in counts.columns:
        row_totals = counts.sum(axis=1)
        # A zero total becomes NaN to avoid dividing by zero, then reads as share 0.
        share = counts['negative'] / row_totals.replace(0, np.nan)
        ranked = share.fillna(0).sort_values(ascending=False)
        summary['top_negative_categories'] = ranked.head(20).round(4).to_dict()

    summary['by_category'] = counts.head(50).to_dict()  # trim size
    return summary


# -------------------- Save artifacts --------------------

def save_everything(outdir: Path, pipe: Pipeline, metrics: Dict, eval_df: pd.DataFrame, df_labeled: pd.DataFrame, df_unlabeled: pd.DataFrame, loaded_files: List[str]):
    """Write the model, processed data, metrics, insights and metadata to *outdir*.

    Files written: ``model_aspect_sentiment.pkl``, ``processed_absa.h5``
    (best-effort), ``metrics_report.csv``, ``confusion_matrix.csv``,
    ``eval_predictions_valid.csv``, ``insights.json`` and
    ``build_metadata.yaml``.

    Args:
        outdir: Target directory; created if missing.
        pipe: Fitted pipeline; it is pickled and also used to predict on the
            labeled rows.
        metrics: Dict with ``"report"``, ``"cm"`` and ``"labels"`` entries.
        eval_df: Validation predictions, written as CSV.
        df_labeled: Labeled rows; stored under HDF5 key ``labeled`` with an
            added ``pred`` column.
        df_unlabeled: Unlabeled rows; stored under key ``unlabeled`` when not
            empty.
        loaded_files: Input paths recorded in the YAML metadata.
    """
    outdir.mkdir(parents=True, exist_ok=True)

    # PKL model
    joblib.dump(pipe, outdir/"model_aspect_sentiment.pkl")

    # HDF5 with labeled (+ optional unlabeled). Still best-effort, but a
    # failure is reported instead of passing silently, so a missing .h5 file
    # can be explained.
    h5_path = outdir/"processed_absa.h5"
    try:
        store_df = df_labeled.copy()
        # Add model predictions on labeled (for reference)
        feats = prepare_features(store_df)['feat']
        store_df['pred'] = pipe.predict(feats)
        store_df.to_hdf(h5_path, key='labeled', mode='w')
        if not df_unlabeled.empty:
            # Unlabeled rows are stored as-is, without predictions.
            df_unlabeled.to_hdf(h5_path, key='unlabeled')
    except Exception as exc:
        print(f"[WARN] Could not write {h5_path.name}: {exc}")

    # Metrics CSVs
    pd.DataFrame(metrics['report']).to_csv(outdir/"metrics_report.csv")
    cm_df = pd.DataFrame(metrics['cm'], index=metrics['labels'], columns=metrics['labels'])
    cm_df.to_csv(outdir/"confusion_matrix.csv")
    eval_df.to_csv(outdir/"eval_predictions_valid.csv", index=False)

    # Insights JSON
    insights = compute_insights(df_labeled)
    with open(outdir/"insights.json", 'w', encoding='utf-8') as f:
        json.dump(insights, f, ensure_ascii=False, indent=2)

    # YAML metadata
    meta = {
        'inputs': loaded_files,
        'schema_labeled': list(df_labeled.columns),
        'schema_unlabeled': list(df_unlabeled.columns),
        'counts': {
            'labeled_rows': int(len(df_labeled)),
            'unlabeled_rows': int(len(df_unlabeled)),
            'labels': df_labeled['polarity'].value_counts().to_dict() if 'polarity' in df_labeled.columns else {}
        },
        'model': 'TFIDF + LogisticRegression (class_weight=balanced)',
    }
    with open(outdir/"build_metadata.yaml", 'w', encoding='utf-8') as f:
        yaml.safe_dump(meta, f, sort_keys=False)


# -------------------- Main --------------------

def main():
    """Run the full build: load inputs, train, save artifacts, list outputs.

    Exits via ``SystemExit`` when no labeled rows are available.
    """
    ensure_out()
    labeled, unlabeled = load_all(IN_FILES)
    # Inputs that exist on disk; recorded in the build metadata.
    loaded = [src for src in IN_FILES if Path(src).exists()]

    if labeled.empty:
        raise SystemExit("No labeled rows with polarity found. Check your train CSVs.")

    # Train model
    pipe, metrics, eval_df = train_model(labeled)

    # Save everything
    save_everything(OUT_DIR, pipe, metrics, eval_df, labeled, unlabeled, loaded)

    print("\n[OK] Artifacts written to:", OUT_DIR)
    artifact_names = (
        "model_aspect_sentiment.pkl",
        "processed_absa.h5",
        "build_metadata.yaml",
        "insights.json",
        "eval_predictions_valid.csv",
        "metrics_report.csv",
        "confusion_matrix.csv",
    )
    for artifact in artifact_names:
        print(" ", OUT_DIR/artifact)


# Run the build only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()



# [OK] Artifacts written to: C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner
#   C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\model_aspect_sentiment.pkl
#   C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\processed_absa.h5
#   C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\build_metadata.yaml
#   C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\insights.json
#   C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\eval_predictions_valid.csv
#   C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\metrics_report.csv
#   C:\Users\NXTWAVE\Downloads\Aspect Based Review Miner\confusion_matrix.csv
