In [None]:
#!/usr/bin/env python3

import os
import sys
import argparse
import shutil

import torch
import torch.nn as nn
import torch.optim as optim

import wandb
from torch.utils.data import DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    confusion_matrix,
    classification_report,
)

from tqdm import tqdm

# ──────────────────────────────────────────────────────────────────────────────
# Inlined MySingleDataset
# ──────────────────────────────────────────────────────────────────────────────
class MySingleDataset(torch.utils.data.Dataset):
    """Map-style dataset over precomputed sentence representations.

    Two objects are read with ``torch.load`` from paths built by plain string
    concatenation: ``b_path + "<mode>_sents.pt"`` and
    ``b_path + "<mode>_labels.pt"``.  No path separator is inserted, so
    ``b_path`` must already end with one when it names a directory.

    Args:
        mode: Split name used as the file-name prefix (e.g. ``"train"``).
        b_path: String prepended verbatim to both file names.
        layer_to_select: Index along dim 1 that is returned when the stored
            representations are 3-D; ignored otherwise.
    """

    def __init__(self, mode, b_path, layer_to_select=-1):
        sents_name = f"{mode}_sents.pt"
        labels_name = f"{mode}_labels.pt"
        self.sents_reps = torch.load(b_path + sents_name)
        self.labels = torch.load(b_path + labels_name)
        # The label tensor's first dimension defines the dataset length.
        self.sample_num = self.labels.shape[0]
        self.layer_to_select = layer_to_select

    def __len__(self):
        return self.sample_num

    def __getitem__(self, index):
        """Return ``(embedding, label)`` for sample ``index``."""
        reps = self.sents_reps
        if reps.dim() != 3:
            # Anything that is not 3-D is indexed as one vector per sample.
            return reps[index, :], self.labels[index]
        # 3-D storage: pick the configured slice along dim 1.
        return reps[index, self.layer_to_select, :], self.labels[index]


# ──────────────────────────────────────────────────────────────────────────────
# Inlined DownstreamModelSingle
# ──────────────────────────────────────────────────────────────────────────────
class DownstreamModelSingle(nn.Module):
    """MLP classification head applied to a single embedding vector.

    Layout: optional projection (``compress``) -> Linear(512) -> BatchNorm ->
    LeakyReLU -> Dropout(0.3) -> Linear(256) -> BatchNorm -> LeakyReLU ->
    Dropout(0.3) -> Linear(class_num).

    Args:
        embed_size: Width of the incoming embedding.
        class_num: Number of output logits.
        common_dim: When truthy and different from ``embed_size``, the input
            is first projected to this width (Linear + ReLU + Dropout);
            otherwise ``compress`` is an identity.
    """

    def __init__(self, embed_size: int, class_num: int, common_dim: int = None):
        super().__init__()

        # Sub-module names and creation order are kept stable: state_dict
        # keys and the random initialisation sequence both depend on them.
        if not common_dim or common_dim == embed_size:
            self.compress = nn.Identity()
            final_input_dim = embed_size
        else:
            projection = [
                nn.Linear(embed_size, common_dim),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
            self.compress = nn.Sequential(*projection)
            final_input_dim = common_dim

        # First hidden stage.
        self.fc1 = nn.Linear(final_input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.act1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.3)

        # Second hidden stage.
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.act2 = nn.LeakyReLU()
        self.dropout2 = nn.Dropout(0.3)

        # Output logits.
        self.fc3 = nn.Linear(256, class_num)

    def forward(self, x):
        """Return raw class logits for the batch ``x``."""
        hidden = self.compress(x)
        hidden = self.dropout1(self.act1(self.bn1(self.fc1(hidden))))
        hidden = self.dropout2(self.act2(self.bn2(self.fc2(hidden))))
        return self.fc3(hidden)


# ──────────────────────────────────────────────────────────────────────────────
# Inlined Train & Test from model_ops_single
# ──────────────────────────────────────────────────────────────────────────────
def Train(dataloader, device, model, loss_fn, optimizer, batch_num, wandb_logger=None):
    """Run one optimisation pass over ``dataloader`` and return epoch metrics.

    Args:
        dataloader: Iterable of ``(embeddings, labels)`` batches.
        device: Device the batches are moved to before the forward pass.
        model: Module mapping an embedding batch to class logits.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        batch_num: Unused; kept so existing call sites keep working.
        wandb_logger: Optional object exposing ``log(dict)``; it receives the
            metrics dict when truthy.

    Returns:
        dict with ``train_loss`` (mean of per-batch losses) and the
        ``train_*`` accuracy / F1 / precision / recall scores computed over
        every sample seen in the pass.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.train()
    all_preds, all_labels = [], []
    total_loss, num_batches = 0.0, 0

    for batch_emb, batch_labels in tqdm(dataloader, desc="Training"):
        # Embeddings are cast to float32 before they reach the model.
        batch_emb = batch_emb.to(device).float()
        batch_labels = batch_labels.to(device)

        logits = model(batch_emb)
        loss = loss_fn(logits, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        pred_y = torch.argmax(logits, dim=1)
        all_preds.extend(pred_y.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

    avg_loss = total_loss / num_batches
    metrics = {
        "train_loss": avg_loss,
        "train_accuracy": accuracy_score(all_labels, all_preds),
        "train_macro_f1": f1_score(all_labels, all_preds, average="macro", zero_division=0),
        "train_micro_f1": f1_score(all_labels, all_preds, average="micro", zero_division=0),
        "train_weighted_f1": f1_score(all_labels, all_preds, average="weighted", zero_division=0),
        # zero_division=0 keeps the value sklearn already returns for classes
        # that were never predicted, without emitting a warning every epoch.
        "train_weighted_precision": precision_score(
            all_labels, all_preds, average="weighted", zero_division=0
        ),
        "train_weighted_recall": recall_score(
            all_labels, all_preds, average="weighted", zero_division=0
        ),
        # "Weighted accuracy" follows the definition used by
        # evaluate_and_save_single (balanced accuracy); it used to be a copy
        # of plain accuracy.
        "train_weighted_accuracy": balanced_accuracy_score(all_labels, all_preds),
    }
    if wandb_logger:
        wandb_logger.log(metrics)
    return metrics


def Test(dataloader, device, model, loss_fn, batch_num, wandb_logger=None, mode="val"):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        dataloader: Iterable of ``(embeddings, labels)`` batches.
        device: Device the batches are moved to before the forward pass.
        model: Module mapping an embedding batch to class logits.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        batch_num: Unused; kept so existing call sites keep working.
        wandb_logger: Optional object exposing ``log(dict)``; it receives the
            metrics dict when truthy.
        mode: Prefix for every metric key (e.g. ``"val"`` or ``"test"``).

    Returns:
        dict with ``<mode>_loss`` (mean of per-batch losses) and the
        ``<mode>_*`` accuracy / F1 / precision / recall scores computed over
        every evaluated sample.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.eval()
    all_preds, all_labels = [], []
    total_loss, num_batches = 0.0, 0

    with torch.no_grad():
        for batch_emb, batch_labels in tqdm(dataloader, desc=f"Evaluating({mode})"):
            # Embeddings are cast to float32 before they reach the model.
            batch_emb = batch_emb.to(device).float()
            batch_labels = batch_labels.to(device)

            logits = model(batch_emb)
            total_loss += loss_fn(logits, batch_labels).item()
            num_batches += 1

            pred_y = torch.argmax(logits, dim=1)
            all_preds.extend(pred_y.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    avg_loss = total_loss / num_batches
    metrics = {
        f"{mode}_loss": avg_loss,
        f"{mode}_accuracy": accuracy_score(all_labels, all_preds),
        f"{mode}_macro_f1": f1_score(all_labels, all_preds, average="macro", zero_division=0),
        f"{mode}_micro_f1": f1_score(all_labels, all_preds, average="micro", zero_division=0),
        f"{mode}_weighted_f1": f1_score(all_labels, all_preds, average="weighted", zero_division=0),
        # zero_division=0 keeps the value sklearn already returns for classes
        # that were never predicted, without emitting a warning on each call.
        f"{mode}_weighted_precision": precision_score(
            all_labels, all_preds, average="weighted", zero_division=0
        ),
        f"{mode}_weighted_recall": recall_score(
            all_labels, all_preds, average="weighted", zero_division=0
        ),
        # "Weighted accuracy" follows the definition used by
        # evaluate_and_save_single (balanced accuracy); it used to be a copy
        # of plain accuracy, so the logged value disagreed with the report.
        f"{mode}_weighted_accuracy": balanced_accuracy_score(all_labels, all_preds),
    }
    if wandb_logger:
        wandb_logger.log(metrics)
    return metrics


# ──────────────────────────────────────────────────────────────────────────────
# 1. LABEL MAPPING (same as before)
# ──────────────────────────────────────────────────────────────────────────────
# Class name -> integer id.  Ids are assigned by position in the tuple, so the
# order below is significant and must not be changed.
LABEL_MAPPING = {
    class_name: class_id
    for class_id, class_name in enumerate(
        (
            "finance, marketing & human resources",
            "information technology & electronics",
            "consumer & supply chain",
            "civil, mechanical & electrical",
            "medical",
            "sports, media & entertainment",
            "education",
            "government, defense & legal",
            "travel, food & hospitality",
            "non-profit",
        )
    )
}
# Integer id -> class name (inverse of LABEL_MAPPING).
REVERSE_LABEL_MAPPING = dict(zip(LABEL_MAPPING.values(), LABEL_MAPPING.keys()))


# ──────────────────────────────────────────────────────────────────────────────
# Evaluation & Save Predictions (same as before)
# ──────────────────────────────────────────────────────────────────────────────
def evaluate_and_save_single(dataloader, model, device, results_dir):
    """Score ``model`` on ``dataloader`` and write evaluation artifacts.

    Files written into ``results_dir`` (created if missing):
        - ``predictions.csv``: true label, predicted label and the softmax
          probabilities for every sample.
        - ``test_results.txt``: accuracy, balanced accuracy and weighted
          precision / recall / F1.
        - ``class_based_results.txt``: sklearn classification report.
        - ``confusion_matrix.png``: heatmap with one row/column per class in
          ``LABEL_MAPPING``.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches.
        model: Module mapping an input batch to class logits.
        device: Device the batches are moved to.
        results_dir: Output directory.
    """
    os.makedirs(results_dir, exist_ok=True)

    model.eval()
    all_probs, all_preds, all_labels = [], [], []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device).float()
            labels = labels.to(device)
            outputs = model(inputs)
            probs = torch.softmax(outputs, dim=1)
            preds = torch.argmax(probs, dim=1)
            all_probs.extend(probs.cpu().numpy().tolist())
            all_preds.extend(preds.cpu().numpy().tolist())
            all_labels.extend(labels.cpu().numpy().tolist())

    df = pd.DataFrame({
        "True Label": [REVERSE_LABEL_MAPPING[l] for l in all_labels],
        "Predicted Label": [REVERSE_LABEL_MAPPING[p] for p in all_preds],
        "Probabilities": all_probs,
    })
    df.to_csv(os.path.join(results_dir, "predictions.csv"), index=False)

    acc = accuracy_score(all_labels, all_preds)
    w_acc = balanced_accuracy_score(all_labels, all_preds)
    w_prec = precision_score(all_labels, all_preds, average="weighted")
    w_rec = recall_score(all_labels, all_preds, average="weighted")
    w_f1 = f1_score(all_labels, all_preds, average="weighted")

    with open(os.path.join(results_dir, "test_results.txt"), "w") as f:
        f.write(f"Overall Accuracy:  {acc:.4f}\n")
        f.write(f"Weighted Accuracy: {w_acc:.4f}\n")
        f.write(f"Weighted Precision: {w_prec:.4f}\n")
        f.write(f"Weighted Recall:   {w_rec:.4f}\n")
        f.write(f"Weighted F1:       {w_f1:.4f}\n")

    report = classification_report(all_labels, all_preds, digits=4)
    with open(os.path.join(results_dir, "class_based_results.txt"), "w") as f:
        f.write(report)

    # Pin the matrix to the full label set.  Without `labels=` sklearn only
    # includes classes that occur in y_true/y_pred, so a class missing from
    # the split would shrink the matrix and shift the tick labels below onto
    # the wrong rows/columns.
    class_ids = sorted(REVERSE_LABEL_MAPPING)
    class_names = [REVERSE_LABEL_MAPPING[class_id] for class_id in class_ids]
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

    plt.figure(figsize=(16, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.savefig(os.path.join(results_dir, "confusion_matrix.png"))
    plt.close()

    print(f"[✔] Evaluation artifacts saved to {results_dir}")


# ──────────────────────────────────────────────────────────────────────────────
# 3. TRAINING LOGIC + PUSH TO HUB
# ──────────────────────────────────────────────────────────────────────────────
def run_single(args):
    """Train the classifier head, evaluate it, and push it to the HF Hub.

    Steps: start a W&B run, load the train/validation/test tensors from
    ``<embeddings_root>/<dataset_subdir>/dataset_tensor/``, train for
    ``args.epochs`` epochs while checkpointing the weights with the lowest
    training loss, evaluate on the test split, write the evaluation
    artifacts, then upload checkpoint + config + README to the Hub.

    Args:
        args: Namespace providing ``embeddings_root``, ``dataset_subdir``,
            ``results_root``, ``model_variant``, ``embed_size``,
            ``common_dim``, ``epochs``, ``batch_size``, ``learning_rate``,
            ``val_check_interval``, ``wandb_project``, ``wandb_entity``,
            ``wandb_name`` and ``log_dir``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dataset_root = os.path.join(args.embeddings_root, args.dataset_subdir, "dataset_tensor/")
    results_dir = os.path.join(args.results_root, args.model_variant + "_single")
    os.makedirs(results_dir, exist_ok=True)
    best_model_fp = os.path.join(results_dir, "best_model.pt")

    # One source of truth for the class count used by the model and by the
    # published config.json.
    num_classes = len(LABEL_MAPPING)

    wandb.init(
        project=args.wandb_project,
        entity=args.wandb_entity,
        name=args.wandb_name,
        dir=args.log_dir,
        config={
            "embed_size": args.embed_size,
            "common_dim": args.common_dim,
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "learning_rate": args.learning_rate,
        },
    )

    # The run is closed even when training or evaluation raises, so a failed
    # notebook cell does not leave a dangling W&B run behind.
    try:
        train_ds = MySingleDataset("train", b_path=dataset_root)
        val_ds = MySingleDataset("validation", b_path=dataset_root)
        test_ds = MySingleDataset("test", b_path=dataset_root)

        # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
        loader_kwargs = {"num_workers": 4, "pin_memory": device == "cuda"}
        train_loader = DataLoader(train_ds, args.batch_size, shuffle=True, **loader_kwargs)
        val_loader = DataLoader(val_ds, args.batch_size, shuffle=False, **loader_kwargs)
        test_loader = DataLoader(test_ds, args.batch_size, shuffle=False, **loader_kwargs)

        model = DownstreamModelSingle(
            args.embed_size, class_num=num_classes, common_dim=args.common_dim
        ).to(device)
        wandb.watch(model, log="all")
        loss_fn = nn.CrossEntropyLoss().to(device)
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

        # NOTE(review): the "best" checkpoint is chosen by *training* loss,
        # which mostly tracks the latest epoch; selecting on validation loss
        # may be what is wanted - confirm before changing.
        best_loss = float("inf")

        for ep in range(1, args.epochs + 1):
            print(f"\n=== Epoch {ep}/{args.epochs} ===")
            # The literal 10 fills the unused `batch_num` parameter.
            train_metrics = Train(train_loader, device, model, loss_fn, optimizer, 10, wandb)
            print("Train:", train_metrics)

            if train_metrics["train_loss"] < best_loss:
                best_loss = train_metrics["train_loss"]
                torch.save(model.state_dict(), best_model_fp)

            if ep % args.val_check_interval == 0:
                val_metrics = Test(val_loader, device, model, loss_fn, 10, wandb, mode="val")
                print("Val:", val_metrics)

        # Final test + saved predictions, using the checkpointed weights.
        if os.path.exists(best_model_fp):
            model.load_state_dict(torch.load(best_model_fp, map_location=device))
        Test(test_loader, device, model, loss_fn, 10, wandb, mode="test")
        evaluate_and_save_single(test_loader, model, device, results_dir)
    finally:
        wandb.finish()

    _push_classifier_to_hub(
        results_dir,
        best_model_fp,
        {
            "embed_size": args.embed_size,
            "common_dim": args.common_dim,
            "class_num": num_classes,
            "architecture": "DownstreamModelSingle",
        },
    )


def _push_classifier_to_hub(results_dir, best_model_fp, model_config):
    """Stage checkpoint, config.json and README.md in a Hub clone and push it."""
    import json

    # NOTE(review): huggingface_hub documents the git-based `Repository`
    # helper as deprecated in favour of `HfApi`; confirm the installed
    # version still provides it.
    from huggingface_hub import Repository

    hf_username = "Shahriar"
    repo_name = "SoACer"
    repo_url = f"https://huggingface.co/{hf_username}/{repo_name}"

    # Create (or re-use) a local clone of the Hub repository.
    local_repo_dir = os.path.join(results_dir, "hf_repo_soacer")
    repo = Repository(
        local_dir=local_repo_dir,
        clone_from=repo_url,
        use_auth_token=True,
    )

    # The raw state_dict is published under the conventional file name.
    shutil.copy(best_model_fp, os.path.join(local_repo_dir, "pytorch_model.bin"))

    # Hyperparameters needed to rebuild the module before loading weights.
    with open(os.path.join(local_repo_dir, "config.json"), "w", encoding="utf-8") as fp:
        json.dump(model_config, fp, indent=2)

    # The README contains non-ASCII characters, hence the explicit encoding.
    with open(os.path.join(local_repo_dir, "README.md"), "w", encoding="utf-8") as fp:
        fp.write(_render_model_card(hf_username, repo_name))

    repo.push_to_hub(commit_message="Upload best SoACer classifier head")
    print(f"[✔] Pushed classifier head to https://huggingface.co/{hf_username}/{repo_name}")


def _render_model_card(hf_username, repo_name):
    """Return the README.md text for the Hub repository."""
    import textwrap

    # dedent() strips the source-code indentation; written as-is, the
    # 4-space prefix made Markdown render the whole card as a code block.
    # The code fence is now closed with a real fence instead of "# ```".
    return textwrap.dedent(
        f"""\
        # SoACer

        This Hugging Face repository contains the best‐performing
        `DownstreamModelSingle` classifier head trained on SoAC embeddings.

        ## Model Files
        - `pytorch_model.bin`: PyTorch `state_dict()` of the classifier head.
        - `config.json`: JSON with model hyperparameters (embed_size, common_dim, etc.).
        - `README.md`: Usage instructions.

        ## How to load

        ```python
        import torch
        from DownstreamModelSingle import DownstreamModelSingle

        # 1) Clone the repo
        #    git clone https://huggingface.co/{hf_username}/{repo_name}

        # 2) Load config.json
        import json
        cfg = json.load(open("{repo_name}/config.json"))

        model = DownstreamModelSingle(
            embed_size=cfg["embed_size"],
            class_num=cfg["class_num"],
            common_dim=cfg["common_dim"]
        )

        # 3) Load weights
        state_dict = torch.load("{repo_name}/pytorch_model.bin", map_location="cpu")
        model.load_state_dict(state_dict)
        model.eval()

        # 4) Use `model` on new precomputed embeddings:
        #    given `new_embedding`: a 1D Tensor of size (embed_size,)
        #    logits = model(new_embedding.unsqueeze(0))  # shape (1, class_num)
        #    probs = torch.softmax(logits, dim=-1)
        ```
        """
    )


# ──────────────────────────────────────────────────────────────────────────────
# 5. ARGPARSE
# ──────────────────────────────────────────────────────────────────────────────
def parse_args(argv=None):
    """Parse the training options.

    Args:
        argv: Optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]`` as before; passing a list
            allows use from notebooks and tests.

    Returns:
        argparse.Namespace with one attribute per option below.
    """
    parser = argparse.ArgumentParser(description="Train single-embedding classifier.")

    # Model / data selection.
    parser.add_argument("--model_variant", type=str, required=True,
                        help="Name used for the results sub-directory (<model_variant>_single).")
    parser.add_argument("--embed_size", type=int, required=True,
                        help="Width of the input embeddings.")
    parser.add_argument("--common_dim", type=int, default=512,
                        help="Projection width; no projection when equal to --embed_size.")
    parser.add_argument("--dataset_subdir", type=str, required=True,
                        help="Sub-directory of --embeddings_root that contains dataset_tensor/.")

    # Optimisation.
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=1e-4)
    parser.add_argument("--val_check_interval", type=int, default=1,
                        help="Run validation every N epochs.")

    # Paths and experiment tracking.
    parser.add_argument("--embeddings_root", type=str, required=True)
    parser.add_argument("--results_root", type=str, required=True)
    parser.add_argument("--wandb_project", type=str, default="SoAC")
    parser.add_argument("--wandb_name", type=str, required=True)
    parser.add_argument("--log_dir", type=str, default="./wandb_logs")
    parser.add_argument("--wandb_entity", type=str, default=None)

    return parser.parse_args(argv)


# def main():
#     args = parse_args()
#     run_single(args)


# if __name__ == "__main__":
#     main()


from types import SimpleNamespace

# Notebook stand-in for the command line: the same fields parse_args() would
# produce, collected in a dict and exposed as attributes.
args = SimpleNamespace(
    **{
        "model_variant": "Meta-Llama-3-8B",
        "embed_size": 4096,
        # Equal to embed_size, so DownstreamModelSingle skips its projection.
        "common_dim": 4096,
        "dataset_subdir": "Meta-Llama-3-8B/",  # e.g., "ablation", "summary", etc.
        "epochs": 10,
        "batch_size": 8,
        "learning_rate": 2e-4,
        "val_check_interval": 1,
        # NOTE(review): "Porjects_code" here vs "Projects_code" in
        # results_root below - confirm which spelling exists on disk.
        "embeddings_root": "/data/sxs7285/Porjects_code/thesis/DocEng/classification/embeddings/model_embeddings/",  # adjust as needed
        # NOTE(review): "resultss" looks like a typo - confirm.
        "results_root": "/data/sxs7285/Projects_code/thesis/SoAC/resultss",
        "wandb_project": "SoAC",
        # NOTE(review): the run name mentions 2048/512 while the sizes above
        # are 4096/4096 - confirm the name is intentional.
        "wandb_name": "ablation_single_2048_512",
        "log_dir": "./wandb_logs",
        "wandb_entity": "shahriar92",  # Optional, if you're part of an org
    }
)

# Kicks off training, evaluation and the Hub upload.
run_single(args)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# from huggingface_hub import HfApi, HfFolder

# # Get your token (assumes you're logged in already)
# token = HfFolder.get_token()

# # Set your repo and user/org
# username = "Shahriar"              # your HF username (not display name)
# repo_name = "SoACer"
# repo_id = f"{username}/{repo_name}"

# # Create the repo using updated API
# api = HfApi()
# repo_url = api.create_repo(
#     repo_id=repo_id,
#     token=token,
#     exist_ok=False,     # raises error if repo exists
#     private=False       # set to True if you want the repo to be private
# )

# print(f"[✔] Repo created: {repo_url}")


[✔] Repo created: https://huggingface.co/Shahriar/SoACer


In [None]:
from transformers import AutoTokenizer, AutoModel
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer as SumyTokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from huggingface_hub import hf_hub_download
from collections import defaultdict
import torch.nn as nn
import torch
import json
import numpy as np
import os
from transformers import (
    AutoConfig, 
    AutoModel, 
    AutoTokenizer,
    AutoModelForCausalLM
)

import requests
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
import logging

class SoACerCrawler:
    """Small same-site crawler that honours robots.txt and returns page text.

    Every page is downloaded at most once per crawl: the same HTML is used
    both for text extraction and for link discovery, and nothing is requested
    unless robots.txt allows it.  Parsed robots.txt files are cached per
    origin for the lifetime of the instance.
    """

    # Substrings matched case-insensitively against the full URL.
    _SKIPPED_KEYWORDS = ("privacy", "terms", "policy")
    _PRIORITY_KEYWORDS = ("about", "service")

    def __init__(self, user_agent="SoACerBot"):
        self.user_agent = user_agent
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': self.user_agent})
        # "<scheme>://<netloc>" -> RobotFileParser that was read successfully.
        self._robots_cache = {}

    def is_allowed_by_robots(self, url):
        """Return True when robots.txt of ``url``'s origin permits fetching it.

        Any failure while loading or evaluating robots.txt is logged and
        treated as "not allowed".  Failures are not cached, so the next call
        for the same origin tries again.
        """
        parsed_url = urlparse(url)
        origin = f"{parsed_url.scheme}://{parsed_url.netloc}"
        try:
            parser = self._robots_cache.get(origin)
            if parser is None:
                parser = RobotFileParser()
                parser.set_url(f"{origin}/robots.txt")
                parser.read()
                self._robots_cache[origin] = parser
            return parser.can_fetch(self.user_agent, url)
        except Exception as e:
            logging.warning(f"Failed to parse robots.txt for {url}: {e}")
            return False

    def extract_content(self, html, url):
        """Return the readable text of ``html``.

        boilerpy3's ArticleExtractor is tried first; if it is unavailable or
        raises, the text is taken from BeautifulSoup with <script>/<style>
        removed and blank lines dropped.  ``url`` is currently unused.
        """
        try:
            from boilerpy3 import extractors
            extractor = extractors.ArticleExtractor()
            doc = extractor.get_doc(html)
            return doc.content
        except Exception:
            # Deliberate best-effort fallback to a plain text dump.
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup(['script', 'style']):
                tag.decompose()
            return '\n'.join(chunk.strip() for chunk in soup.get_text().splitlines() if chunk.strip())

    def _fetch_html(self, url):
        """Return ``(html, None)`` on success, otherwise ``(None, reason)``."""
        try:
            if not self.is_allowed_by_robots(url):
                return None, f"Disallowed by robots.txt: {url}"
            resp = self.session.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            return None, f"Request error on {url}: {str(e)}"
        if resp.status_code != 200:
            return None, f"HTTP error {resp.status_code} on {url}"
        return resp.text, None

    def fetch_and_clean(self, url):
        """Download ``url`` and return ``(clean_text, None)`` or ``(None, error)``."""
        html, error = self._fetch_html(url)
        if html is None:
            return None, error
        return self.extract_content(html, url), None

    @staticmethod
    def _extract_hrefs(html, page_url):
        """Return the absolute target of every ``<a href=...>`` in ``html``."""
        soup = BeautifulSoup(html, "html.parser")
        return [urljoin(page_url, tag.get("href")) for tag in soup.find_all("a", href=True)]

    @staticmethod
    def _prioritize_links(hrefs, base_netloc, max_links):
        """Pick the links to follow next.

        Keeps links on ``base_netloc`` whose URL contains none of the skipped
        keywords, then returns up to ``max_links`` "about"/"service" links
        followed by up to ``max_links`` of the remaining ones, in page order.
        """
        candidates = [
            h for h in hrefs
            if urlparse(h).netloc == base_netloc
            and all(k not in h.lower() for k in SoACerCrawler._SKIPPED_KEYWORDS)
        ]
        prioritized = [
            h for h in candidates
            if any(p in h.lower() for p in SoACerCrawler._PRIORITY_KEYWORDS)
        ]
        prioritized_set = set(prioritized)
        remaining = [h for h in candidates if h not in prioritized_set]
        return prioritized[:max_links] + remaining[:max_links]

    def crawl(self, url, max_links=3, max_depth=4):
        """Breadth-first crawl starting at ``url``; return the combined text.

        Args:
            url: Start URL; ``http://`` is prepended when it does not start
                with ``"http"``.
            max_links: Cap applied separately to prioritized and to other
                links taken from each page.
            max_depth: Number of BFS levels to process, the start page
                included (default 4, the depth that was previously
                hard-coded).

        Returns:
            The page texts joined by newlines, or ``None`` when nothing was
            extracted.
        """
        if not url.startswith("http"):
            url = "http://" + url

        base_netloc = urlparse(url).netloc
        visited = set()
        texts = {}
        queue = [url]

        for _ in range(max_depth):
            next_queue = []
            for link in queue:
                if link in visited:
                    continue
                visited.add(link)

                # Single download per page.  Pages that are disallowed or
                # unreachable contribute neither text nor links.
                html, error = self._fetch_html(link)
                if html is None:
                    logging.warning(error)
                    continue

                text = self.extract_content(html, link)
                if text:
                    texts[link] = text

                try:
                    hrefs = self._extract_hrefs(html, link)
                    next_queue.extend(self._prioritize_links(hrefs, base_netloc, max_links))
                except Exception as e:
                    logging.warning(f"Failed to extract links from {link}: {e}")
            queue = next_queue

        combined = "\n".join(texts.values())
        return combined if combined.strip() else None


# DownstreamModelSingle remains the same as in training
class DownstreamModelSingle(nn.Module):
    """Classifier head used at inference time.

    The sub-module names (``compress``, ``fc1``, ``bn1``, ..., ``fc3``) are
    the keys of the ``state_dict`` this class is loaded with, so they must
    stay as they are.

    Args:
        embed_size: Width of the incoming embedding.
        class_num: Number of output logits.
        common_dim: When truthy and different from ``embed_size``, a
            Linear + ReLU + Dropout projection to this width is applied
            first; otherwise ``compress`` is an identity.
    """

    def __init__(self, embed_size: int, class_num: int, common_dim: int = None):
        super().__init__()

        skip_projection = (not common_dim) or (common_dim == embed_size)
        if skip_projection:
            self.compress = nn.Identity()
            final_input_dim = embed_size
        else:
            self.compress = nn.Sequential(
                nn.Linear(embed_size, common_dim),
                nn.ReLU(),
                nn.Dropout(0.3),
            )
            final_input_dim = common_dim

        # Hidden stage 1: final_input_dim -> 512.
        self.fc1 = nn.Linear(final_input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.act1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.3)

        # Hidden stage 2: 512 -> 256.
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.act2 = nn.LeakyReLU()
        self.dropout2 = nn.Dropout(0.3)

        # Logit layer: 256 -> class_num.
        self.fc3 = nn.Linear(256, class_num)

    def forward(self, x):
        """Return raw class logits for the batch ``x``."""
        stage_one = (self.fc1, self.bn1, self.act1, self.dropout1)
        stage_two = (self.fc2, self.bn2, self.act2, self.dropout2)

        out = self.compress(x)
        for layer in stage_one + stage_two:
            out = layer(out)
        return self.fc3(out)


class SoACerPredictor:
    """End-to-end sector classifier: crawl -> summarize -> embed -> classify.

    Args:
        embedder_name: Hugging Face model id of the embedding language model.
        classifier_source: Hub repo id holding ``config.json`` and
            ``pytorch_model.bin`` of the classifier head.
        classifier_local_dir: Optional local directory with the same two
            files; takes precedence over ``classifier_source``.
        device: Torch device string; defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.
    """

    def __init__(
        self,
        embedder_name="meta-llama/Meta-Llama-3-8B",
        classifier_source="Shahriar/SoACer",
        classifier_local_dir=None,
        device=None
    ):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.embedder_name = embedder_name
        self.classifier_source = classifier_source
        self.classifier_local_dir = classifier_local_dir
        # Position in this list is the class id produced by the classifier.
        self.label_names = [
            "finance, marketing & human resources",
            "information technology & electronics",
            "consumer & supply chain",
            "civil, mechanical & electrical",
            "medical",
            "sports, media & entertainment",
            "education",
            "government, defense & legal",
            "travel, food & hospitality",
            "non-profit",
        ]

        self.embedding_model, self.tokenizer = self.load_embedder()

        self.classifier = self.load_classifier()
        self.classifier.eval()

    def load_embedder(self):
        """Load the embedding LM in float16 and its tokenizer.

        Returns:
            ``(model, tokenizer)``; the model is configured to return hidden
            states and is moved to ``self.device``.
        """
        # NOTE(review): trust_remote_code=True executes code shipped with the
        # model repository; confirm it is required for the models used here.
        tokenizer = AutoTokenizer.from_pretrained(self.embedder_name, trust_remote_code=True)
        # Fall back to EOS only when the tokenizer has no pad token of its own.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        model_config = AutoConfig.from_pretrained(
            self.embedder_name,
            trust_remote_code=True,
            revision="main",
            output_hidden_states=True,  # needed by embed_text
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.embedder_name,
            config=model_config,
            device_map=None,  # do not shard across devices
            torch_dtype=torch.float16,
            attn_implementation="eager",
        )
        model.to(self.device)
        return model, tokenizer

    def load_classifier(self):
        """Build the classifier head from config.json and load its weights.

        Files come from ``classifier_local_dir`` when set, otherwise they are
        downloaded from ``classifier_source``.  The head stays in float32,
        the precision it was trained in; inputs are cast to match it at
        prediction time.
        """
        if self.classifier_local_dir:
            config_path = os.path.join(self.classifier_local_dir, "config.json")
            model_path = os.path.join(self.classifier_local_dir, "pytorch_model.bin")
        else:
            config_path = hf_hub_download(repo_id=self.classifier_source, filename="config.json")
            model_path = hf_hub_download(repo_id=self.classifier_source, filename="pytorch_model.bin")

        with open(config_path) as f:
            config = json.load(f)

        model = DownstreamModelSingle(
            embed_size=config["embed_size"],
            class_num=config["class_num"],
            common_dim=config.get("common_dim", config["embed_size"])
        )
        # NOTE(review): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        state_dict = torch.load(model_path, map_location=self.device)
        model.load_state_dict(state_dict)
        model.to(self.device)
        return model

    def summarize_text(self, text: str, sentences_count=12) -> str:
        """Return a LexRank summary of ``text``.

        The literal string ``"NaN"`` is returned when the summary is shorter
        than 50 characters.
        """
        parser = PlaintextParser.from_string(text, SumyTokenizer("english"))
        summarizer = LexRankSummarizer()
        summarizer.threshold = 0.1  # custom similarity threshold
        summarizer.epsilon = 0.05   # custom convergence epsilon
        summary = summarizer(parser.document, sentences_count)
        summarized_text = ' '.join(str(sentence) for sentence in summary)
        return summarized_text if len(summarized_text) >= 50 else "NaN"

    def embed_text(self, text: str) -> torch.Tensor:
        """Return the mean of the last hidden state for ``text``.

        The average is taken over dim 1 - presumably the token axis of a
        (batch, tokens, hidden) tensor; TODO confirm.
        """
        # Without an explicit max_length, truncation=True is a no-op for
        # tokenizers that define no maximum length, so the model's position
        # limit is used when the config exposes one.
        max_len = getattr(self.embedding_model.config, "max_position_embeddings", None)
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_len,
            padding=True,
        ).to(self.device)
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)

        return torch.mean(outputs.hidden_states[-1], dim=1)

    def _class_probabilities(self, embedding):
        """Return softmax probabilities for the first row of ``embedding``.

        The embedding is cast to the classifier's own device and dtype, so a
        float16 embedding works with the float32 head, and the softmax is
        computed in float32 so small probabilities are not flushed to zero.
        """
        head_param = next(self.classifier.parameters())
        embedding = embedding.to(device=head_param.device, dtype=head_param.dtype)
        with torch.no_grad():
            logits = self.classifier(embedding)
            return torch.softmax(logits.float(), dim=1)[0].cpu().numpy()

    def predict(self, embedding: torch.Tensor, hash_key: str, url: str, summarized_text: str) -> dict:
        """Classify a precomputed embedding.

        Returns:
            ``{hash_key: {...}}`` where ``predicted_label`` lists every class
            whose softmax probability exceeds 0.5.  Softmax probabilities sum
            to one, so the list holds at most one label and may be empty.
        """
        probs = self._class_probabilities(embedding)
        predicted_classes = (probs > 0.5).astype(int)

        result = {
            hash_key: {
                "predicted_label": list(np.array(self.label_names)[np.where(predicted_classes == 1)[0]]),
                "all_probabilities": dict(zip(self.label_names, probs.tolist())),
                "summaries": summarized_text,
                "url": url
            }
        }
        return result

    def __call__(self, input: str):
        """Classify a URL or a raw text.

        When ``input`` has an http/https scheme, the site's root
        (``scheme://netloc``, not the exact page given) is crawled and the
        crawled text is summarized; any other input is summarized directly.

        Returns:
            On success a dict with ``predicted_label`` (arg-max class),
            ``all_probabilities``, ``summaries`` and ``url`` (the original
            input).  On failure a dict with ``error`` and ``url``.
        """
        from urllib.parse import urlparse

        parsed = urlparse(input)
        if parsed.scheme in ["http", "https"]:
            crawler = SoACerCrawler()
            domain_url = f"{parsed.scheme}://{parsed.netloc}"

            try:
                combined_text = crawler.crawl(domain_url)
            except Exception as e:
                return {
                    "error": f"Unhandled scraping failure: {str(e)}",
                    "url": input
                }

            if not combined_text or len(combined_text.strip()) < 50:
                return {
                    "error": "Unable to extract meaningful content from URL",
                    "url": input
                }

            summarized_text = self.summarize_text(combined_text)
        else:
            summarized_text = self.summarize_text(input)

        if summarized_text == "NaN":
            return {
                "error": "Text too short to summarize",
                "url": input
            }

        # Embedding/classification failures are reported to the caller as an
        # error dict rather than raised.
        try:
            embedding = self.embed_text(summarized_text)
            probs = self._class_probabilities(embedding)
            top_label = self.label_names[int(np.argmax(probs))]

            return {
                "predicted_label": top_label,
                "all_probabilities": dict(zip(self.label_names, probs.tolist())),
                "summaries": summarized_text,
                "url": input
            }

        except Exception as e:
            return {
                "error": f"Classification failure: {str(e)}",
                "url": input
            }




In [None]:
# Build the predictor once; later cells reuse this `predictor` object.
predictor = SoACerPredictor(
    embedder_name="meta-llama/Meta-Llama-3-8B",
    classifier_source="Shahriar/SoACer"  # or classifier_local_dir="path/to/local_dir"
)

# Only the last assignment takes effect; the first URL is kept as an
# alternative test case.  SoACerPredictor.__call__ reduces a URL to
# scheme://netloc before crawling, so the site root is crawled rather than
# the privacy page named here.
url_to_scrape = "https://www.interpayafrica.com/Home/Privacy"
url_to_scrape = "https://www.barneshawaii.com/privacy/"

result = predictor(
    input=url_to_scrape  
)

# Plain dict print: either the prediction fields or an "error" entry.
print(result)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


{'predicted_label': 'finance, marketing & human resources', 'all_probabilities': {'finance, marketing & human resources': 1.0, 'information technology & electronics': 0.0, 'consumer & supply chain': 0.0, 'civil, mechanical & electrical': 0.0, 'medical': 0.0, 'sports, media & entertainment': 0.0, 'education': 0.0, 'government, defense & legal': 0.0, 'travel, food & hospitality': 0.0, 'non-profit': 0.0}, 'summaries': 'Alesia Barnes represents the absolute best in the luxury real estate market in Hawaii. I know this from experience because she was dealing\xa0 with a tough personal situation of her own when helping my wife and I to get our dream home, but she still took the time to help us and responded when we called her. Helped me rent a Single Family home in Kailua, HI I don’t like to do reviews, but since my realtor did an AMAZING job, this is the best i can do for her..It was SUCH a pleasure to work with Chelsey (Chelz) Doria. “Not pushy” at all, and\xa0 really understanding our deman

In [None]:
from datasets import load_dataset
import json

# Fetch the training split of the SoAC corpus from Hugging Face.
dataset = load_dataset("Shahriar/SoAC_Corpus", split="train")

# Classify the stored summary of the first record. This reuses the
# `predictor` object built in an earlier cell, i.e.
# SoACerPredictor(embedder_name="meta-llama/Meta-Llama-3-8B",
#                 classifier_source="Shahriar/SoACer").
text = dataset[0]["Website_Summary"]
result = predictor(text)

# Show the result dict as indented JSON.
print(json.dumps(result, indent=2))


{
  "predicted_label": "finance, marketing & human resources",
  "all_probabilities": {
    "finance, marketing & human resources": 1.0,
    "information technology & electronics": 0.0,
    "consumer & supply chain": 7.593631744384766e-05,
    "civil, mechanical & electrical": 0.0,
    "medical": 0.0,
    "sports, media & entertainment": 0.0,
    "education": 0.0,
    "government, defense & legal": 0.0,
    "travel, food & hospitality": 0.0,
    "non-profit": 0.0
  },
  "summaries": "Site Overview: Title: Interpay Content: Who can be an Interpay merchant? Account Holders What are some of the benefits of having an Interpay wallet? You can pay your fees, bills, and invoices instantly and securely anytime and anywhere You reduce the risk of using cash and of exposing your bank account when it comes to making payments You have real time access to a transaction history to help you track all your payments You can top up your Interpay wallet from anytime and from anywhere You\u2019re the firs

In [None]:
# remove hash-key from the pipeline
# check the prediction logic
# make it executable with command line args
# add a simple CLI interface to run the predictor
# add it as a tool to Langchain and AutoGen
# look at the lexrank summarizer implementation
# can I free the GPU memory before feeding it into the linear head?

# you don't scrape exactly like Scrapy

# add async/await support
# add batch processing

In [None]:
def embed_text(self, text: str) -> "torch.Tensor":
    """
    Embed `text` by mean-pooling the last hidden state of the embedding model.

    Args:
        self: Object exposing `tokenizer`, `embedding_model` and `device`.
        text: Input text to embed.

    Returns:
        The last entry of `outputs.hidden_states`, averaged over dimension 1.
        Assumes the model was configured to return hidden states
        (e.g. `output_hidden_states=True`) - TODO confirm for the model in use.
    """
    # The return annotation is quoted and torch is imported locally so that
    # defining this function does not raise NameError when the cell runs
    # before the notebook's import cell.
    import torch

    inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = self.embedding_model(**inputs)
    # Previously the outputs were computed and then discarded.
    return torch.mean(outputs.hidden_states[-1], dim=1)


NameError: name 'torch' is not defined

In [None]:
import asyncio
import aiohttp
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
import logging

from boilerpy3 import extractors
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer as SumyTokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download

import torch
import torch.nn as nn
import numpy as np
import json
import os


class AsyncSoACerCrawler:
    """
    Asynchronous web crawler that:
      - Respects robots.txt
      - Performs a 2-level BFS within the same domain
      - Extracts and cleans text via boilerpy3 or BeautifulSoup
    """
    def __init__(self, user_agent="SoACerBot", max_links=3, concurrency=10):
        self.user_agent = user_agent
        self.max_links = max_links
        # Semaphore to limit concurrent HTTP requests
        self._http_semaphore = asyncio.Semaphore(concurrency)
        # Cache robots.txt parsers per domain
        self._robots_cache = {}
        # aiohttp session (initialized on first use)
        self._session = None

    async def _get_session(self):
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(headers={"User-Agent": self.user_agent})
        return self._session

    async def _fetch_html(self, url: str) -> (str, str):
        """
        Fetch HTML content for a given URL, respecting a semaphore to limit concurrency.
        Returns (html_text, error_msg). If error_msg is not None, html_text will be None.
        """
        session = await self._get_session()

        async with self._http_semaphore:
            try:
                async with session.get(url, timeout=10) as resp:
                    if resp.status != 200:
                        return None, f"HTTP error {resp.status} on {url}"
                    text = await resp.text()
                    return text, None
            except asyncio.TimeoutError:
                return None, f"Timeout fetching {url}"
            except aiohttp.ClientError as e:
                return None, f"Request error on {url}: {str(e)}"
            except Exception as e:
                return None, f"Unexpected error on {url}: {str(e)}"

    async def _allowed_by_robots(self, url: str) -> bool:
        """
        Check robots.txt for the domain of 'url'. Fetch and parse robots.txt if not cached.
        """
        parsed = urlparse(url)
        domain = f"{parsed.scheme}://{parsed.netloc}"
        rp = self._robots_cache.get(domain)

        if rp is None:
            rp = RobotFileParser()
            robots_url = f"{domain}/robots.txt"
            session = await self._get_session()
            try:
                async with session.get(robots_url, timeout=10) as resp:
                    if resp.status == 200:
                        text = await resp.text()
                        rp.parse(text.splitlines())
                    else:
                        # If no robots.txt or inaccessible, allow by default
                        rp.allow_all = True
            except Exception:
                rp.allow_all = True
            self._robots_cache[domain] = rp

        return rp.can_fetch(self.user_agent, url)

    def _extract_content(self, html: str, url: str) -> str:
        """
        Extract and clean main text from HTML using boilerpy3. If that fails, fallback to BeautifulSoup.
        """
        try:
            extractor = extractors.ArticleExtractor()
            doc = extractor.get_doc(html)
            return doc.content
        except Exception:
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup(['script', 'style']):
                tag.decompose()
            text_chunks = [
                chunk.strip()
                for chunk in soup.get_text().splitlines()
                if chunk.strip()
            ]
            return "\n".join(text_chunks)

    async def _crawl_one(self, url: str) -> (str, list, str):
        """
        Crawl a single URL:
          - Check robots.txt
          - Fetch and clean content
          - Extract up to max_links child URLs within same domain
        Returns (cleaned_text, link_list, error_msg).
        """
        # Check robots.txt
        allowed = await self._allowed_by_robots(url)
        if not allowed:
            return None, [], f"Disallowed by robots.txt: {url}"

        # Fetch HTML
        html, error = await self._fetch_html(url)
        if error:
            return None, [], error

        # Clean text
        cleaned = self._extract_content(html, url)

        # Extract child links within same domain, excluding privacy/terms/policy pages
        soup = BeautifulSoup(html, "html.parser")
        parsed_seed = urlparse(url)
        domain_netloc = parsed_seed.netloc

        hrefs = []
        for tag in soup.find_all("a", href=True):
            href = urljoin(url, tag.get("href"))
            parsed_href = urlparse(href)
            if parsed_href.netloc != domain_netloc:
                continue
            lower_href = href.lower()
            if any(keyword in lower_href for keyword in ['privacy', 'terms', 'policy']):
                continue
            hrefs.append(href)

        # Prioritize links containing 'about' or 'service'
        prioritized = [h for h in hrefs if any(p in h.lower() for p in ['about', 'service'])]
        remaining = [h for h in hrefs if h not in prioritized]

        # Take up to max_links from each list
        child_links = prioritized[: self.max_links] + remaining[: self.max_links]
        return cleaned, child_links, None

    async def crawl(self, seed_url: str) -> str:
        """
        Perform a 2-level BFS crawl starting from seed_url. Return concatenated text from all visited pages.
        """
        if not seed_url.startswith("http"):
            seed_url = "http://" + seed_url

        visited = set()
        texts = {}
        queue = [seed_url]

        # Up to 2 “hops” from the seed: depth 0 → 3 (inclusive)
        for depth in range(4):
            tasks = []
            for link in queue:
                if link in visited:
                    continue
                visited.add(link)
                tasks.append(asyncio.create_task(self._crawl_one(link)))

            if not tasks:
                break

            results = await asyncio.gather(*tasks, return_exceptions=True)
            next_queue = []

            for idx, result in enumerate(results):
                if isinstance(result, Exception):
                    # Unexpected exception in _crawl_one
                    logging.warning(f"Exception crawling {queue[idx]}: {str(result)}")
                    continue

                cleaned, child_links, error = result
                current_url = queue[idx]

                if cleaned:
                    texts[current_url] = cleaned
                elif error:
                    logging.warning(error)

                # Enqueue child links for next depth
                for child in child_links:
                    if child not in visited:
                        next_queue.append(child)

            queue = next_queue

        # Close session when done
        if self._session and not self._session.closed:
            await self._session.close()

        # Return combined text
        combined = "\n".join(texts.values())
        return combined if combined.strip() else None


class DownstreamModelSingle(nn.Module):
    """
    Classifier head matching the architecture used in training.

    An optional projection (`compress`) is followed by two
    Linear -> BatchNorm1d -> LeakyReLU -> Dropout(0.3) stages (512 and 256
    units) and a final Linear layer producing `class_num` raw scores.
    """

    def __init__(self, embed_size: int, class_num: int, common_dim: int = None):
        super().__init__()
        # Project only when a target width is given and differs from the input width.
        use_projection = bool(common_dim) and common_dim != embed_size
        if not use_projection:
            self.compress = nn.Identity()
            final_input_dim = embed_size
        else:
            self.compress = nn.Sequential(
                nn.Linear(embed_size, common_dim),
                nn.ReLU(),
                nn.Dropout(0.3),
            )
            final_input_dim = common_dim

        # First hidden stage
        self.fc1 = nn.Linear(final_input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.act1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.3)

        # Second hidden stage
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.act2 = nn.LeakyReLU()
        self.dropout2 = nn.Dropout(0.3)

        # Output layer (no softmax applied here)
        self.fc3 = nn.Linear(256, class_num)

    def forward(self, x):
        """Return the raw output of `fc3` for input `x`."""
        hidden = self.compress(x)
        hidden = self.dropout1(self.act1(self.bn1(self.fc1(hidden))))
        hidden = self.dropout2(self.act2(self.bn2(self.fc2(hidden))))
        return self.fc3(hidden)


class SoACerPredictor:
    """
    Predictor that wraps:
      - AsyncSoACerCrawler for asynchronous crawling
      - Summarization via Sumy (blocking, offloaded to executor)
      - Embedding + classification via HuggingFace (blocking, offloaded to executor)
    """

    def __init__(
        self,
        embedder_name="meta-llama/Meta-Llama-3-8B",
        classifier_source="Shahriar/SoACer",
        classifier_local_dir=None,
        device=None,
        crawler_max_links=3,
        crawler_concurrency=10,
    ):
        """
        Args:
            embedder_name: Hugging Face model id of the causal LM used for embeddings.
            classifier_source: Hub repo id holding `config.json` and `pytorch_model.bin`.
            classifier_local_dir: If set, load those two files from this directory instead.
            device: Torch device string; defaults to "cuda" when available, else "cpu".
            crawler_max_links: Forwarded to AsyncSoACerCrawler as `max_links`.
            crawler_concurrency: Forwarded to AsyncSoACerCrawler as `concurrency`.
        """
        # Device setup for PyTorch
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Label names (must match classifier training)
        self.label_names = [
            "finance, marketing & human resources",
            "information technology & electronics",
            "consumer & supply chain",
            "civil, mechanical & electrical",
            "medical",
            "sports, media & entertainment",
            "education",
            "government, defense & legal",
            "travel, food & hospitality",
            "non-profit",
        ]

        # Initialize the asynchronous crawler
        self.async_crawler = AsyncSoACerCrawler(
            user_agent="SoACerBot",
            max_links=crawler_max_links,
            concurrency=crawler_concurrency,
        )

        # Load embedder + classifier
        self.embedder_name = embedder_name
        self.classifier_source = classifier_source
        self.classifier_local_dir = classifier_local_dir

        self.embedding_model, self.tokenizer = self._load_embedder()
        self.classifier = self._load_classifier()
        self.classifier.eval()

        # Loop used only by the synchronous __call__ wrapper; the async
        # pipeline always uses whichever loop is running it.
        self.loop = asyncio.get_event_loop()

    def _load_embedder(self):
        """
        Load tokenizer and causal LM for embeddings. Returns (model, tokenizer).
        """
        tokenizer = AutoTokenizer.from_pretrained(self.embedder_name, trust_remote_code=True)
        # Reuse EOS as the padding token so that padding=True works.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        config_kwargs = {
            "trust_remote_code": True,
            "revision": "main",
            "use_auth_token": None,
            "output_hidden_states": True,  # embed_text reads outputs.hidden_states
        }
        model_config = AutoConfig.from_pretrained(self.embedder_name, **config_kwargs)

        model = AutoModelForCausalLM.from_pretrained(
            self.embedder_name,
            config=model_config,
            device_map=None,            # No automatic sharding
            torch_dtype=torch.float16,  # Use float16 for efficiency
            attn_implementation="eager",
        )
        model.to(self.device)
        return model, tokenizer

    def _load_classifier(self):
        """
        Load the downstream classifier from either a local directory or HuggingFace Hub.
        Returns the model on `self.device` with float16 weights.
        """
        if self.classifier_local_dir:
            config_path = os.path.join(self.classifier_local_dir, "config.json")
            model_path = os.path.join(self.classifier_local_dir, "pytorch_model.bin")
        else:
            config_path = hf_hub_download(repo_id=self.classifier_source, filename="config.json")
            model_path = hf_hub_download(repo_id=self.classifier_source, filename="pytorch_model.bin")

        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)

        model = DownstreamModelSingle(
            embed_size=config["embed_size"],
            class_num=config["class_num"],
            common_dim=config.get("common_dim", config["embed_size"]),
        )
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source (consider weights_only=True where supported).
        state_dict = torch.load(model_path, map_location=self.device)
        model.load_state_dict(state_dict)
        model.to(self.device)
        model = model.half()  # Convert weights to float16
        return model

    def summarize_text(self, text: str, sentences_count=12) -> str:
        """
        Synchronous summarization using Sumy (LexRank).

        Returns the joined summary sentences, or the sentinel string "NaN"
        when the summary is shorter than 50 characters.
        """
        parser = PlaintextParser.from_string(text, SumyTokenizer("english"))
        summarizer = LexRankSummarizer()
        summarizer.threshold = 0.1
        summarizer.epsilon = 0.05
        summary = summarizer(parser.document, sentences_count)
        summarized_text = " ".join(str(sentence) for sentence in summary)
        return summarized_text if len(summarized_text) >= 50 else "NaN"

    def embed_text(self, text: str) -> torch.Tensor:
        """
        Synchronous embedding: tokenize, run through the LM, and mean-pool the last hidden state.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
        # Mean-pool over sequence dimension
        return torch.mean(outputs.hidden_states[-1], dim=1)

    def _classify_from_embedding(self, embedding: torch.Tensor, url: str, summarized_text: str) -> dict:
        """
        Synchronous classification from an embedding. Returns the result dict
        with keys predicted_label, all_probabilities, summaries and url.
        Only the first row of the classifier output is used.
        """
        with torch.no_grad():
            logits = self.classifier(embedding)
            # Softmax in float32: in float16 the probabilities are rounded so
            # coarsely that they need not sum to 1.
            probs = torch.softmax(logits.float(), dim=1)[0].cpu().numpy()

        top_label_index = int(np.argmax(probs))
        top_label = self.label_names[top_label_index]
        return {
            "predicted_label": top_label,
            "all_probabilities": dict(zip(self.label_names, probs.tolist())),
            "summaries": summarized_text,
            "url": url,
        }

    async def __call_async__(self, input_str: str) -> dict:
        """
        Asynchronous pipeline:
          1. If input_str is a URL: crawl domain (async)
          2. Summarize text (blocking → run_in_executor)
          3. Embed text (blocking → run_in_executor)
          4. Classify embedding (blocking → run_in_executor)
        Returns a dict with keys: predicted_label, all_probabilities, summaries, url OR error.
        """
        # Helper to detect URLs
        def is_url(text: str) -> bool:
            parsed = urlparse(text)
            return parsed.scheme in ("http", "https")

        def get_domain(input_url: str) -> str:
            parsed = urlparse(input_url)
            return f"{parsed.scheme}://{parsed.netloc}"

        # Use the loop that is actually running this coroutine. The loop
        # captured in __init__ may be a different one (e.g. under asyncio.run),
        # and executor futures bound to another loop cannot be awaited here.
        loop = asyncio.get_running_loop()

        # 1. Crawl if URL, else treat input as raw text
        if is_url(input_str):
            # The crawl starts at the site root, not at the given page.
            domain_url = get_domain(input_str)
            try:
                combined_text = await self.async_crawler.crawl(domain_url)
            except Exception as e:
                return {"error": f"Unhandled scraping failure: {str(e)}", "url": input_str}

            if not combined_text or len(combined_text.strip()) < 50:
                return {"error": "Unable to extract meaningful content from URL", "url": input_str}
        else:
            combined_text = input_str

        # 2. Summarize (offload to executor to avoid blocking the event loop)
        try:
            summarized_text = await loop.run_in_executor(None, self.summarize_text, combined_text)
        except Exception as e:
            return {"error": f"Summarization failure: {str(e)}", "url": input_str}
        if summarized_text == "NaN":
            return {"error": "Text too short to summarize", "url": input_str}

        # 3. Embed (offload to executor)
        try:
            embedding = await loop.run_in_executor(None, self.embed_text, summarized_text)
        except Exception as e:
            return {"error": f"Embedding failure: {str(e)}", "url": input_str}

        # 4. Classify (offload to executor)
        try:
            return await loop.run_in_executor(
                None, self._classify_from_embedding, embedding, input_str, summarized_text
            )
        except Exception as e:
            return {"error": f"Classification failure: {str(e)}", "url": input_str}

    def __call__(self, input_str: str) -> dict:
        """
        Synchronous wrapper that runs the async pipeline to completion.

        Raises:
            RuntimeError: If an event loop is already running in this thread
                (for example inside a Jupyter cell); await `__call_async__`
                directly in that case.
        """
        try:
            asyncio.get_running_loop()
            loop_is_running = True
        except RuntimeError:
            loop_is_running = False

        if loop_is_running:
            raise RuntimeError(
                "SoACerPredictor cannot be called synchronously while an event loop "
                "is running; use `await predictor.__call_async__(input_str)` instead."
            )
        return self.loop.run_until_complete(self.__call_async__(input_str))


# Example usage (synchronous):
# predictor = SoACerPredictor(
#     embedder_name="meta-llama/Meta-Llama-3-8B",
#     classifier_source="Shahriar/SoACer",
#     classifier_local_dir=None,
#     device=None,
# )
# result = predictor("https://example.com")
# print(result)


In [None]:
import asyncio
import json
from datasets import load_dataset

# Assume SoACerPredictor and classify_multiple_websites have already been defined/imported
# from the code you added previously.

async def classify_multiple_websites(url_list, predictor=None):
    """
    Run the async classification pipeline on every URL in `url_list` concurrently.

    Args:
        url_list: Iterable of URLs (or raw texts) passed to `__call_async__`.
        predictor: Optional, already constructed predictor exposing
            `__call_async__`. Supplying one avoids building a new
            SoACerPredictor (and reloading its models) on every call. When
            omitted, a predictor with the default settings is created.

    Returns:
        A list of result dicts in the same order as `url_list`.
    """
    if predictor is None:
        predictor = SoACerPredictor(
            embedder_name="meta-llama/Meta-Llama-3-8B",
            classifier_source="Shahriar/SoACer",
            classifier_local_dir=None,
            device=None,
            crawler_max_links=3,
            crawler_concurrency=10,
        )

    # gather() schedules every coroutine as a task and keeps input order.
    return await asyncio.gather(*(predictor.__call_async__(url) for url in url_list))

if __name__ == "__main__":
    # # 1. Load the Hugging Face dataset
    # dataset = load_dataset("Shahriar/SoAC_Corpus", split="train")

    # # 2. Extract the first 10 Privacy_Policy_URL values
    # urls = [dataset[i]["Privacy_Policy_URL"] for i in range(10)]

    # # 3. Run the async classification on those 10 URLs
    # all_results = asyncio.run(classify_multiple_websites(urls))

    # # 4. Pretty-print each result
    # for idx, (url, res) in enumerate(zip(urls, all_results), start=1):
    #     print(f"\n=== Result for URL #{idx}: {url} ===")
    #     print(json.dumps(res, indent=2))

    # 1. Load the Hugging Face dataset
    from datasets import load_dataset
    dataset = load_dataset("Shahriar/SoAC_Corpus", split="train")

    # 2. Extract the first 10 Privacy_Policy_URL values
    urls = [dataset[i]["Privacy_Policy_URL"] for i in range(10)]

    # 3. Call the async function with `await` directly
    all_results = await classify_multiple_websites(urls)

    # 4. Pretty-print each result
    import json
    for idx, (url, res) in enumerate(zip(urls, all_results), start=1):
        print(f"\n=== Result for URL #{idx}: {url} ===")
        print(json.dumps(res, indent=2))





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]




=== Result for URL #1: https://www.interpayafrica.com/Home/Privacy ===
{
  "error": "Unable to extract meaningful content from URL",
  "url": "https://www.interpayafrica.com/Home/Privacy"
}

=== Result for URL #2: https://bizfacility.com/privacy-policy/ ===
{
  "error": "Unable to extract meaningful content from URL",
  "url": "https://bizfacility.com/privacy-policy/"
}

=== Result for URL #3: https://www.barneshawaii.com/privacy/ ===
{
  "error": "Unable to extract meaningful content from URL",
  "url": "https://www.barneshawaii.com/privacy/"
}

=== Result for URL #4: https://bhogalpartners.co.uk/privacy-policy/ ===
{
  "error": "Unable to extract meaningful content from URL",
  "url": "https://bhogalpartners.co.uk/privacy-policy/"
}

=== Result for URL #5: https://bnrmediagroup.ca/privacy-policy/ ===
{
  "error": "Unable to extract meaningful content from URL",
  "url": "https://bnrmediagroup.ca/privacy-policy/"
}

=== Result for URL #6: https://www.copra.org/privacy-policy ===
{
  

In [None]:
import asyncio
import aiohttp
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
import logging

from boilerpy3 import extractors
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer as SumyTokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download

import torch
import torch.nn as nn
import numpy as np
import json
import os


import asyncio
import aiohttp
import logging
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser
from bs4 import BeautifulSoup
from boilerpy3 import extractors

class AsyncSoACerCrawler:
    """
    Asynchronous same-domain web crawler with:
      - One aiohttp session per crawl() call
      - A browser-like default User-Agent
      - Retry with exponential backoff on request errors
      - Optional breadth-first link following (disabled when max_links == 0)
    """

    def __init__(self, user_agent=None, max_links=3, concurrency=3, retries=2, crawl_delay=0.3, max_depth=3):
        """
        Args:
            user_agent: User-Agent header; defaults to a desktop Chrome string.
            max_links: Cap applied separately to the prioritized and to the
                remaining child links of each page; 0 disables link following.
            concurrency: Maximum number of pages crawled at once per crawl() call.
            retries: Extra attempts after the first failed request.
            crawl_delay: Seconds to pause between BFS rounds.
            max_depth: Number of link-following rounds after the seed page.
                The default (3) keeps the previously hard-coded four rounds.
        """
        self.user_agent = user_agent or (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        )
        self.max_links = max_links
        self.concurrency = concurrency
        self.retries = retries
        self.crawl_delay = crawl_delay
        self.max_depth = max_depth
        # robots.txt parsers cached per scheme://host
        self._robots_cache = {}

    async def _fetch_html(self, url: str, session: aiohttp.ClientSession) -> tuple:
        """
        Attempt to fetch HTML with retry and exponential backoff.
        Returns (html, error_message); exactly one of the two is None.
        """
        last_error = f"Request error on {url}: no attempt was made"
        for attempt in range(self.retries + 1):
            try:
                async with session.get(url, timeout=60) as resp:
                    if resp.status != 200:
                        # Non-200 responses are not retried.
                        return None, f"HTTP error {resp.status} on {url}"
                    return await resp.text(), None
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                last_error = f"Request error on {url}: {str(e)}"
                if attempt < self.retries:
                    # Backoff: 0.5s, 1s, 2s, ...
                    await asyncio.sleep(2 ** attempt * 0.5)
            except Exception as e:
                return None, f"Unexpected error on {url}: {str(e)}"
        # All attempts failed (or retries was negative): always return a tuple.
        return None, last_error

    async def _allowed_by_robots(self, url: str, session: aiohttp.ClientSession) -> bool:
        """
        Consult robots.txt for `url` and log a warning when it disallows the fetch.

        NOTE(review): this method always returns True, so robots.txt is only
        advisory here - confirm this is acceptable for the sites being crawled.
        """
        parsed = urlparse(url)
        domain = f"{parsed.scheme}://{parsed.netloc}"
        rp = self._robots_cache.get(domain)

        if rp is None:
            rp = RobotFileParser()
            robots_url = f"{domain}/robots.txt"
            try:
                async with session.get(robots_url, timeout=60) as resp:
                    if resp.status == 200:
                        text = await resp.text()
                        rp.parse(text.splitlines())
                    else:
                        rp.allow_all = True
            except Exception:
                # Best effort: an unreachable robots.txt counts as "allow all".
                rp.allow_all = True
            self._robots_cache[domain] = rp

        can_fetch = rp.can_fetch(self.user_agent, url)
        if not can_fetch:
            logging.warning(f"robots.txt disallows fetching: {url}")
        return True  # Always try, just log warning

    def _extract_content(self, html: str, url: str) -> str:
        """
        Extract main text with boilerpy3, falling back to BeautifulSoup
        (scripts and styles removed, blank lines dropped). `url` is unused.
        """
        try:
            extractor = extractors.ArticleExtractor()
            return extractor.get_doc(html).content
        except Exception:
            soup = BeautifulSoup(html, 'html.parser')
            for tag in soup(['script', 'style']):
                tag.decompose()
            lines = [line.strip() for line in soup.get_text().splitlines() if line.strip()]
            return "\n".join(lines)

    async def _crawl_one(self, url: str, session: aiohttp.ClientSession) -> tuple:
        """
        Fetch one page and return (cleaned_text, child_links, error_msg).
        On a fetch error the result is (None, [], error_msg).
        """
        await self._allowed_by_robots(url, session)
        html, error = await self._fetch_html(url, session)
        if error:
            return None, [], error

        cleaned = self._extract_content(html, url)

        # Skip BFS if disabled
        if self.max_links == 0:
            return cleaned, [], None

        soup = BeautifulSoup(html, "html.parser")
        domain_netloc = urlparse(url).netloc
        hrefs = []

        for tag in soup.find_all("a", href=True):
            href = urljoin(url, tag.get("href"))
            # Stay on the same host and skip legal boilerplate pages.
            if urlparse(href).netloc != domain_netloc:
                continue
            if any(k in href.lower() for k in ('privacy', 'terms', 'policy')):
                continue
            hrefs.append(href)

        # Drop repeated links (keeping first-seen order) so that duplicates do
        # not use up the max_links budget.
        hrefs = list(dict.fromkeys(hrefs))

        prioritized = [h for h in hrefs if any(p in h.lower() for p in ('about', 'service'))]
        prioritized_set = set(prioritized)
        remaining = [h for h in hrefs if h not in prioritized_set]
        child_links = prioritized[:self.max_links] + remaining[:self.max_links]

        return cleaned, child_links, None

    async def crawl(self, seed_url: str) -> str:
        """
        Breadth-first crawl starting from the seed URL: the seed page, then up
        to `max_depth` further rounds of child links.
        Returns combined text or None.
        """
        if not seed_url.startswith("http"):
            seed_url = "http://" + seed_url

        visited = set()
        texts = {}
        queue = [seed_url]

        # Created per call so it always belongs to the loop running this crawl.
        limiter = asyncio.Semaphore(max(1, self.concurrency))

        async def _bounded_crawl(link, session):
            # Enforce the configured `concurrency` on page crawls.
            async with limiter:
                return await self._crawl_one(link, session)

        async with aiohttp.ClientSession(headers={"User-Agent": self.user_agent}) as session:
            for depth in range(self.max_depth + 1):
                # URLs actually scheduled this round. Results are paired with
                # this list (not with `queue`), because `queue` may contain
                # duplicates or already-visited links that are skipped here.
                batch = []
                for link in queue:
                    if link in visited:
                        continue
                    visited.add(link)
                    batch.append(link)

                if not batch:
                    break

                # Pause between rounds only, not after the final one.
                if depth:
                    await asyncio.sleep(self.crawl_delay)

                results = await asyncio.gather(
                    *(_bounded_crawl(link, session) for link in batch),
                    return_exceptions=True,
                )
                next_queue = []

                for current_url, result in zip(batch, results):
                    if isinstance(result, BaseException):
                        logging.warning(f"Exception crawling {current_url}: {str(result)}")
                        continue

                    cleaned, child_links, error = result
                    if cleaned:
                        texts[current_url] = cleaned
                    elif error:
                        logging.warning(error)

                    next_queue.extend(child for child in child_links if child not in visited)

                queue = next_queue

        combined = "\n".join(texts.values())
        return combined if combined.strip() else None



class DownstreamModelSingle(nn.Module):
    """
    Classifier head with the same architecture as in training: an optional
    input projection, two hidden stages (512 and 256 units) and a linear
    output layer with `class_num` units.
    """

    def __init__(self, embed_size: int, class_num: int, common_dim: int = None):
        super().__init__()
        # Without a distinct target width the input passes through unchanged.
        if (not common_dim) or common_dim == embed_size:
            self.compress = nn.Identity()
            final_input_dim = embed_size
        else:
            projection = [nn.Linear(embed_size, common_dim), nn.ReLU(), nn.Dropout(0.3)]
            self.compress = nn.Sequential(*projection)
            final_input_dim = common_dim

        # Stage 1: Linear -> BatchNorm -> LeakyReLU -> Dropout
        self.fc1 = nn.Linear(final_input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.act1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.3)

        # Stage 2: same layout, 512 -> 256
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.act2 = nn.LeakyReLU()
        self.dropout2 = nn.Dropout(0.3)

        # Output layer; its raw output is returned by forward()
        self.fc3 = nn.Linear(256, class_num)

    def forward(self, x):
        """Apply the projection, both hidden stages and the output layer to `x`."""
        stages = (
            (self.fc1, self.bn1, self.act1, self.dropout1),
            (self.fc2, self.bn2, self.act2, self.dropout2),
        )
        features = self.compress(x)
        for linear, norm, activation, drop in stages:
            features = drop(activation(norm(linear(features))))
        return self.fc3(features)


class SoACerPredictor:
    """
    End-to-end predictor that turns a URL (or raw text) into a sector label.

    Pipeline:
      - AsyncSoACerCrawler for async crawling (only when the input is an http/https URL)
      - Summarization via Sumy LexRank (blocking, offloaded to an executor)
      - Embedding via a HuggingFace causal LM (blocking, offloaded)
      - Classification via DownstreamModelSingle (blocking, offloaded)

    Failures in any stage of the async pipeline are reported as a dict with
    "error" and "url" keys instead of being raised.
    """

    def __init__(
        self,
        embedder_name="meta-llama/Meta-Llama-3-8B",
        classifier_source="Shahriar/SoACer",
        classifier_local_dir=None,
        device=None,
        crawler_max_links=3,
        crawler_concurrency=10,
    ):
        """
        Build the crawler and load both models.

        Args:
            embedder_name: HuggingFace model id used for the tokenizer and the embedding LM.
            classifier_source: HuggingFace Hub repo id holding the classifier
                ("config.json" and "pytorch_model.bin"); used when no local dir is given.
            classifier_local_dir: Optional directory containing the same two files.
            device: Torch device string; defaults to "cuda" when available, else "cpu".
            crawler_max_links: Forwarded to AsyncSoACerCrawler as `max_links`.
            crawler_concurrency: Forwarded to AsyncSoACerCrawler as `concurrency`.
        """
        # PyTorch device
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Label names (must match the classifier training); index i corresponds
        # to classifier output i.
        self.label_names = [
            "finance, marketing & human resources",
            "information technology & electronics",
            "consumer & supply chain",
            "civil, mechanical & electrical",
            "medical",
            "sports, media & entertainment",
            "education",
            "government, defense & legal",
            "travel, food & hospitality",
            "non-profit",
        ]

        # Async crawler
        self.async_crawler = AsyncSoACerCrawler(
            user_agent=None,  # uses realistic browser UA by default
            max_links=crawler_max_links,
            concurrency=crawler_concurrency,
        )

        # Remember where the models come from, then load them.
        self.embedder_name = embedder_name
        self.classifier_source = classifier_source
        self.classifier_local_dir = classifier_local_dir

        self.embedding_model, self.tokenizer = self._load_embedder()
        self.classifier = self._load_classifier()
        self.classifier.eval()

        # Loop used only by the synchronous __call__ wrapper. The async
        # pipeline deliberately does NOT use it (it asks for the running loop).
        self.loop = asyncio.get_event_loop()

    def _load_embedder(self):
        """
        Load tokenizer and causal LM for embeddings. Returns (model, tokenizer).

        The model is loaded in float16 with hidden states enabled and moved to
        `self.device` as a whole (no automatic sharding).
        """
        tokenizer = AutoTokenizer.from_pretrained(self.embedder_name, trust_remote_code=True)
        # Reuse EOS as the padding token so `padding=True` works at tokenization time.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        config_kwargs = {
            "trust_remote_code": True,
            "revision": "main",
            "use_auth_token": None,
            "output_hidden_states": True,  # embed_text reads outputs.hidden_states
        }
        model_config = AutoConfig.from_pretrained(self.embedder_name, **config_kwargs)

        model = AutoModelForCausalLM.from_pretrained(
            self.embedder_name,
            config=model_config,
            device_map=None,            # No automatic sharding
            torch_dtype=torch.float16,  # Use float16 for efficiency
            attn_implementation="eager",
        )
        model.to(self.device)
        return model, tokenizer

    def _load_classifier(self):
        """
        Load the downstream classifier from either a local directory or HuggingFace Hub.

        Reads "config.json" (keys: "embed_size", "class_num", optional "common_dim")
        and "pytorch_model.bin", builds a DownstreamModelSingle, loads the weights,
        moves it to `self.device` and converts it to float16.
        """
        if self.classifier_local_dir:
            config_path = os.path.join(self.classifier_local_dir, "config.json")
            model_path = os.path.join(self.classifier_local_dir, "pytorch_model.bin")
        else:
            config_path = hf_hub_download(repo_id=self.classifier_source, filename="config.json")
            model_path = hf_hub_download(repo_id=self.classifier_source, filename="pytorch_model.bin")

        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)

        model = DownstreamModelSingle(
            embed_size=config["embed_size"],
            class_num=config["class_num"],
            # Falling back to embed_size makes the model skip its compression layer.
            common_dim=config.get("common_dim", config["embed_size"]),
        )
        # NOTE(review): torch.load unpickles the file, and the checkpoint may come
        # from a remote repo. Consider `weights_only=True` where the installed
        # torch version supports it.
        state_dict = torch.load(model_path, map_location=self.device)
        model.load_state_dict(state_dict)
        model.to(self.device)
        model = model.half()  # Convert weights to float16 to match the embedder's dtype
        return model

    def summarize_text(self, text: str, sentences_count=12) -> str:
        """
        Synchronous summarization with Sumy (LexRank).

        Args:
            text: Plain text to summarize.
            sentences_count: Number of sentences requested from the summarizer.

        Returns:
            The selected sentences joined by single spaces, or the sentinel
            string "NaN" when the summary is shorter than 50 characters.
        """
        parser = PlaintextParser.from_string(text, SumyTokenizer("english"))
        summarizer = LexRankSummarizer()
        summarizer.threshold = 0.1
        summarizer.epsilon = 0.05
        summary = summarizer(parser.document, sentences_count)
        summarized_text = " ".join(str(sentence) for sentence in summary)
        return summarized_text if len(summarized_text) >= 50 else "NaN"

    def embed_text(self, text: str) -> torch.Tensor:
        """
        Synchronous embedding: tokenize, run through LM, and mean-pool last hidden state.

        Pooling averages over dim 1 of the last entry in `hidden_states`.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
        return torch.mean(outputs.hidden_states[-1], dim=1)

    def _classify_from_embedding(self, embedding: torch.Tensor, url: str, final_text: str) -> dict:
        """
        Synchronous classification from an embedding. Returns the result dict.

        Only the first row of the softmax output is used, so `embedding` is
        expected to hold a single sample.

        Returns:
            Dict with "predicted_label", "all_probabilities" (label -> probability),
            "summaries" (the text that was embedded) and "url".
        """
        with torch.no_grad():
            logits = self.classifier(embedding)
            probs = torch.softmax(logits, dim=1)[0].cpu().numpy()

        top_label_index = int(np.argmax(probs))
        top_label = self.label_names[top_label_index]
        return {
            "predicted_label": top_label,
            "all_probabilities": dict(zip(self.label_names, probs.tolist())),
            "summaries": final_text,
            "url": url,
        }

    async def __call_async__(self, input_str: str) -> dict:
        """
        Asynchronous pipeline:
          1. Crawl if input_str is a URL (the site root is crawled, not the exact path)
          2. Summarize (offloaded to executor)
          3. Embed text (offloaded)
          4. Classify embedding (offloaded)
        Returns a dict with predicted_label, all_probabilities, summaries, url or an error.
        """
        # Detect if input_str is a URL
        def is_url(text: str) -> bool:
            parsed = urlparse(text)
            return parsed.scheme in ("http", "https")

        def get_domain(input_url: str) -> str:
            parsed = urlparse(input_url)
            return f"{parsed.scheme}://{parsed.netloc}"

        # 1. Crawl if it's a URL
        if is_url(input_str):
            domain_url = get_domain(input_str)
            print(f"Starting crawl for: {domain_url}")
            try:
                combined_text = await self.async_crawler.crawl(domain_url)
            except Exception as e:
                return {"error": f"Unhandled scraping failure: {str(e)}", "url": input_str}

            if not combined_text or len(combined_text.strip()) < 50:
                return {
                    "error": "Unable to extract meaningful content from URL",
                    "url": input_str
                }
        else:
            combined_text = input_str

        # Executor futures must belong to the loop that is running this
        # coroutine. Using the loop captured in __init__ fails with
        # "attached to a different loop" whenever the two differ
        # (e.g. when the caller uses asyncio.run()).
        loop = asyncio.get_running_loop()

        # 2. Summarize (offload to executor)
        try:
            summarized = await loop.run_in_executor(None, self.summarize_text, combined_text)
        except Exception as e:
            # Report like every other stage so one bad input cannot abort a batch.
            return {"error": f"Summarization failure: {str(e)}", "url": input_str}
        # If summarization was too short, use the raw combined_text instead
        final_text = summarized if summarized != "NaN" else combined_text

        # 3. Embed (offload to executor)
        try:
            embedding = await loop.run_in_executor(None, self.embed_text, final_text)
        except Exception as e:
            return {"error": f"Embedding failure: {str(e)}", "url": input_str}

        # 4. Classify (offload to executor)
        try:
            result = await loop.run_in_executor(
                None, self._classify_from_embedding, embedding, input_str, final_text
            )
            return result
        except Exception as e:
            return {"error": f"Classification failure: {str(e)}", "url": input_str}

    def __call__(self, input_str: str) -> dict:
        """
        Synchronous wrapper that runs the async pipeline to completion.

        Drives the coroutine on `self.loop` (the loop captured at construction).
        """
        return self.loop.run_until_complete(self.__call_async__(input_str))


# Example: Classify 10 URLs from the Hugging Face dataset
async def classify_multiple_websites(url_list):
    """
    Classify every entry of `url_list` concurrently using one shared predictor.

    A single SoACerPredictor is built (loading both models once), one task is
    started per URL, and the results are returned in the same order as
    `url_list`. An exception escaping any task propagates to the caller.
    """
    predictor_settings = {
        "embedder_name": "meta-llama/Meta-Llama-3-8B",
        "classifier_source": "Shahriar/SoACer",
        "classifier_local_dir": None,
        "device": None,
        "crawler_max_links": 3,
        "crawler_concurrency": 10,
    }
    predictor = SoACerPredictor(**predictor_settings)

    # Start all pipelines up front so they overlap while awaiting I/O.
    pending = []
    for url in url_list:
        pending.append(asyncio.create_task(predictor.__call_async__(url)))

    return await asyncio.gather(*pending, return_exceptions=False)


if __name__ == "__main__":
    import asyncio
    import json

    from datasets import load_dataset
    import nest_asyncio

    # Patch asyncio before any loop is driven below; presumably needed because
    # this script is run from an environment that already has a running loop
    # (e.g. a notebook).
    nest_asyncio.apply()

    # classify_multiple_websites is expected to be defined earlier in this file.

    # Pull the training split of the corpus from the Hugging Face Hub.
    dataset = load_dataset("Shahriar/SoAC_Corpus", split="train")

    # Take the privacy-policy URL of the first 10 rows.
    urls = [dataset[i]["Privacy_Policy_URL"] for i in range(10)]

    # Drive the whole batch to completion on the current event loop.
    loop = asyncio.get_event_loop()
    batch = classify_multiple_websites(urls)
    all_results = loop.run_until_complete(batch)

    # Pretty-print one JSON document per URL, numbered from 1.
    for idx, pair in enumerate(zip(urls, all_results), start=1):
        url, res = pair
        header = f"\n=== Result for URL #{idx}: {url} ==="
        print(header)
        print(json.dumps(res, indent=2))



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]