In [None]:
!pip install -q transformers datasets evaluate rouge-score accelerate bitsandbytes --upgrade
!pip install -q sentencepiece nltk

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.[0m[31m
[0m

In [None]:
import os, time, math, traceback
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    pipeline
)
import evaluate, nltk, pandas as pd
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device, " | torch.cuda.device_count():", torch.cuda.device_count())


Device: cuda  | torch.cuda.device_count(): 1


In [None]:
# Small dataset subset to keep memory/latency low
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:30]")
texts = dataset["article"]
refs = dataset["highlights"]

In [None]:
usa_models = {
    "distilBART (USA)" : "sshleifer/distilbart-cnn-12-6",
    "T5-small (USA)"   : "t5-small",
    "Pegasus (USA)"    : "google/pegasus-cnn_dailymail"
}
china_models = {
    # These are commonly used public Chinese models. Some are causal/chat style.
    "Qwen-1.2 (China)"   : "Qwen/Qwen-1.2-7b",        # causal-style; may be heavy
    "ChatGLM2 (China)"   : "THUDM/chatglm2-6b",       # chat/causal model
    "BLOOMZ-cn-small"    : "IDEA-CCNL/BELLE-1.5M"     # fallback small model (toy)
}
model_dict = {**usa_models, **china_models}

In [None]:
# Metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Utility: safely load tokenizer + model with fallback to causal if seq2seq fails
def safe_load_model(model_id):
    """
    Returns: (tokenizer, model, model_type) where model_type is "seq2seq" or "causal"
    or raises an Exception if both attempts fail.
    """
    # load config to inspect model_type if possible
    try:
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
        # Some configs provide model_type or architectures info
        mtype = getattr(config, "model_type", None)
    except Exception:
        config = None
        mtype = None

    # Always load tokenizer first
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Try seq2seq first (works for t5, bart, pegasus...)
    seq_err = None
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True
        )
        return tok, model, "seq2seq"
    except Exception as e_seq:
        seq_err = e_seq
        # fallback to causal
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            trust_remote_code=True
        )
        return tok, model, "causal"
    except Exception as e_causal:
        # attach both exceptions for debug
        raise RuntimeError(f"Failed to load as seq2seq ({seq_err}) and as causal ({e_causal})")

# Summarization wrapper: handles seq2seq pipelines and causal pipelines with a prompt
def get_summarizer_pipeline(tokenizer, model, model_type):
    if model_type == "seq2seq":
        # Let accelerate handle device placement (do not pass device arg)
        return pipeline("summarization", model=model, tokenizer=tokenizer)
    else:
        # causal: use text-generation pipeline with summarization prompt
        return pipeline("text-generation", model=model, tokenizer=tokenizer)

# Single-model evaluation
def evaluate_model(name, model_id, max_samples=10):
    print("\n=== Evaluating", name, "(", model_id, ") ===")
    try:
        tok, model, mtype = safe_load_model(model_id)
        print("Loaded as", mtype, "| params (dtype):", next(model.parameters()).dtype)
    except Exception as e:
        print("⚠️ Failed to load model:", e)
        traceback.print_exc()
        return None

    summarizer = get_summarizer_pipeline(tok, model, mtype)

    preds, refs_sub = [], []
    start = time.time()
    samples = min(max_samples, len(texts))
    for i in range(samples):
        article = texts[i]
        try:
            if mtype == "seq2seq":
                # use only max_new_tokens to avoid warnings
                out = summarizer(article, max_new_tokens=120, do_sample=False)
                summary = out[0]["summary_text"]
            else:
                # causal model: create a summarization prompt
                prompt = f"Summarize the following article in 3-4 sentences:\n\n{article}\n\nSummary:"
                out = summarizer(prompt, max_new_tokens=120, do_sample=False, do_sample_top_k=0)
                # pipeline returns list of dicts with 'generated_text'
                # for some huggingface causal pipelines key is 'generated_text' or 'text' or 'summary_text'
                if isinstance(out, list) and isinstance(out[0], dict):
                    summary = out[0].get("generated_text") or out[0].get("text") or out[0].get("summary_text") or ""
                    # remove the prompt prefix if included
                    if summary.startswith(prompt):
                        summary = summary[len(prompt):].strip()
                else:
                    summary = str(out)
            preds.append(summary)
            refs_sub.append(refs[i])
        except Exception as ex:
            print(f" Generation error on sample {i}: {ex}")
            # continue after logging
    elapsed = time.time() - start
    avg_latency = elapsed / max(1, len(preds))

    # compute metrics (guard if no preds produced)
    if len(preds) == 0:
        print("No predictions produced for", name)
        return None
    try:
        rouge_scores = rouge.compute(predictions=preds, references=refs_sub)
    except Exception as e:
        print("ROUGE compute error:", e)
        rouge_scores = {"rouge1": None, "rouge2": None, "rougeL": None}
    try:
        # BLEU expects tokenized refs/preds; evaluate library will attempt tokenization
        bleu_score = bleu.compute(predictions=preds, references=refs_sub)
    except Exception as e:
        print("BLEU compute error:", e)
        bleu_score = {"bleu": None}

    # free memory
    try:
        del model
        torch.cuda.empty_cache()
    except Exception:
        pass

    return {
        "Model": name,
        "HF_id": model_id,
        "Model_type": mtype,
        "Samples": len(preds),
        "ROUGE-1": rouge_scores.get("rouge1"),
        "ROUGE-L": rouge_scores.get("rougeL"),
        "BLEU": bleu_score.get("bleu"),
        "Avg Latency (s/sample)": round(avg_latency, 3)
    }


In [None]:
results = []
# Try to be conservative on T4: reduce sample count to keep memory/time reasonable
MAX_SAMPLES_PER_MODEL = 8

for name, mid in model_dict.items():
    try:
        res = evaluate_model(name, mid, max_samples=MAX_SAMPLES_PER_MODEL)
        if res:
            results.append(res)
    except Exception as e:
        print("Top-level error for", name, ":", e)
        traceback.print_exc()


=== Evaluating distilBART (USA) ( sshleifer/distilbart-cnn-12-6 ) ===


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Device set to use cuda:0


Loaded as seq2seq | params (dtype): torch.float16

=== Evaluating T5-small (USA) ( t5-small ) ===


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (789 > 512). Running this sequence through the model will result in indexing errors


Loaded as seq2seq | params (dtype): torch.float16

=== Evaluating Pegasus (USA) ( google/pegasus-cnn_dailymail ) ===


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0


Loaded as seq2seq | params (dtype): torch.float16

=== Evaluating Qwen-1.2 (China) ( Qwen/Qwen-1.2-7b ) ===
⚠️ Failed to load model: Qwen/Qwen-1.2-7b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 402, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/Qwen/Qwen-1.2-7b/resolve/main/tokenizer_config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files
    hf_hub_download(
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1007, in hf_hub_download
    retu


=== Evaluating ChatGLM2 (China) ( THUDM/chatglm2-6b ) ===


config.json: 0.00B [00:00, ?B/s]

configuration_chatglm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- configuration_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

tokenization_chatglm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

modeling_chatglm.py: 0.00B [00:00, ?B/s]

quantization.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- quantization.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm2-6b:
- modeling_chatglm.py
- quantization.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

pytorch_model-00001-of-00007.bin:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

pytorch_model-00007-of-00007.bin:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

pytorch_model-00005-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

pytorch_model-00003-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00006-of-00007.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00002-of-00007.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

pytorch_model-00004-of-00007.bin:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

⚠️ Failed to load model: Failed to load as seq2seq (The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.) and as causal (The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.)

=== Evaluating BLOOMZ-cn-small ( IDEA-CCNL/BELLE-1.5M ) ===
⚠️ Failed to load model: IDEA-CCNL/BELLE-1.5M is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`


Traceback (most recent call last):
  File "/tmp/ipython-input-3785478630.py", line 36, in safe_load_model
    model = AutoModelForCausalLM.from_pretrained(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 597, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 277, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5048, in from_pretrained
    ) = cls._load_pretrained_model(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 5384, in _load_pretrained_model
    raise ValueError(
ValueError: The current `device_map` had weights offloaded to the disk. Please provide an `offload_fol

In [None]:
df = pd.DataFrame(results)
if not df.empty:
    display(df)
    # add origin column and aggregate
    df["Origin"] = df["Model"].apply(lambda x: "USA" if "USA" in x or "distilBART" in x or "T5" in x or "Pegasus" in x else "China")
    summary = df.groupby("Origin")[["ROUGE-1","ROUGE-L","BLEU","Avg Latency (s/sample)"]].mean().round(4)
    print("\n=== Aggregated USA vs China ===")
    display(summary)
    df.to_csv("usa_vs_china_summarization_results.csv", index=False)
    print("Saved CSV: usa_vs_china_summarization_results.csv")
else:
    print("No successful model runs. Check logs above.")

Unnamed: 0,Model,HF_id,Model_type,Samples,ROUGE-1,ROUGE-L,BLEU,Avg Latency (s/sample)
0,distilBART (USA),sshleifer/distilbart-cnn-12-6,seq2seq,8,0.373442,0.276798,0.118587,1.14
1,T5-small (USA),t5-small,seq2seq,8,0.378188,0.254719,0.109903,0.936
2,Pegasus (USA),google/pegasus-cnn_dailymail,seq2seq,8,0.046875,0.03125,6.5e-05,0.229



=== Aggregated USA vs China ===


Unnamed: 0_level_0,ROUGE-1,ROUGE-L,BLEU,Avg Latency (s/sample)
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USA,0.2662,0.1876,0.0762,0.7683


Saved CSV: usa_vs_china_summarization_results.csv
