## 1. Summarize Text with Hugging Face BART

In [None]:

from transformers import pipeline

# Build a summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

text = "Long article text goes here..."
# truncation=True clips inputs that exceed the model's maximum input length
# instead of failing on over-long articles; do_sample=False keeps decoding
# deterministic. max_length/min_length bound the generated summary length.
summary = summarizer(text, max_length=130, min_length=30, do_sample=False, truncation=True)
# The pipeline returns one dict per input; the text lives under 'summary_text'.
print(summary[0]['summary_text'])
     

## 2. Fine-tune BART on CNN/DailyMail

In [None]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Load only the first 1% of the training split of CNN/DailyMail (config "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
# Tokenizer matching the facebook/bart-base checkpoint; preprocess() below uses it.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

def preprocess(examples):
    """Tokenize articles and highlights into model inputs and training labels.

    Args:
        examples: Mapping with "article" and "highlights" entries. With
            ``dataset.map(..., batched=True)`` each entry is a list of
            strings; a single example (plain strings) is also accepted.

    Returns:
        The tokenized article encoding (padded/truncated to 512 tokens) with
        an added "labels" entry holding the highlight token ids
        (padded/truncated to 128 tokens), where every padding position is
        replaced by -100.
    """
    inputs = tokenizer(examples["article"], truncation=True, padding="max_length", max_length=512)
    targets = tokenizer(examples["highlights"], truncation=True, padding="max_length", max_length=128)

    pad_id = tokenizer.pad_token_id

    def mask_padding(ids):
        # -100 is the label value the loss ignores, so the model is not
        # trained to predict the padding that follows each summary.
        return [token_id if token_id != pad_id else -100 for token_id in ids]

    label_ids = targets["input_ids"]
    if isinstance(examples["highlights"], str):
        # Single (non-batched) example: input_ids is one flat list of ids.
        inputs["labels"] = mask_padding(label_ids)
    else:
        inputs["labels"] = [mask_padding(ids) for ids in label_ids]
    return inputs

# Tokenize the whole dataset, feeding preprocess() batches of examples.
tokenized_dataset = dataset.map(preprocess, batched=True)

# Sequence-to-sequence model to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

# One epoch, batches of 4 per device, a log line every 10 steps;
# outputs are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=10,
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

trainer.train()
     

## 3. Extractive vs. Abstractive Summarization

In [None]:

# Extractive with spaCy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from heapq import nlargest

text = "Long article text here..."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Frequency of every alphabetic, non-stop-word token (lower-cased).
word_freq = {}
for word in doc:
    key = word.text.lower()
    if word.is_alpha and key not in STOP_WORDS:
        word_freq[key] = word_freq.get(key, 0) + 1

# Score each sentence by the summed frequency of its counted words.
# Sentences containing no counted word are left out entirely, so they can
# never be selected.
sentence_scores = {}
for sent in doc.sents:
    score = sum(word_freq.get(word.text.lower(), 0) for word in sent)
    if score:
        sentence_scores[sent] = score

# Pick the three highest-scoring sentences, then restore document order:
# nlargest returns them ranked by score, which would scramble the narrative
# when the sentences are joined back together.
top_sentences = nlargest(3, sentence_scores, key=sentence_scores.get)
summary_sentences = sorted(top_sentences, key=lambda sent: sent.start)
summary = " ".join([sent.text for sent in summary_sentences])
print(summary)
     

## 4. Summarize Long Documents

In [None]:

def split_text(text, chunk_size=1024):
    """Yield consecutive chunks of ``text`` of at most ``chunk_size`` words.

    Words are the whitespace-separated pieces returned by ``str.split()``;
    each chunk is those words re-joined with single spaces, so original
    whitespace (including newlines) is not preserved. Empty or
    whitespace-only text yields nothing.

    Args:
        text: The document to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Yields:
        One string per chunk, in document order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop the whole
    # document; zero would raise an unrelated-looking range() error.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    for start in range(0, len(words), chunk_size):
        yield " ".join(words[start:start + chunk_size])

# Break the document into 400-word pieces before summarizing.
chunks = list(split_text("Long document text", chunk_size=400))

from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize every chunk on its own, keeping the partial summaries in order.
summary_parts = []
for chunk in chunks:
    chunk_result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
    summary_parts.append(chunk_result[0]['summary_text'])

# Stitch the per-chunk summaries into one text.
full_summary = " ".join(summary_parts)
print(full_summary)
     

## 5. ROUGE Evaluation

In [None]:

from datasets import load_metric

# Load the ROUGE metric by name.
# NOTE(review): `load_metric` is reportedly deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before relying on this cell.
rouge = load_metric("rouge")
# Toy example: one generated summary and one reference summary.
predictions = ["Bart summarized this well."]
references = ["The summary should cover key points clearly."]
# Score the predictions against the references and show the result.
results = rouge.compute(predictions=predictions, references=references)
print(results)
     

## 6. Prompt-based Summarization (Chat Models)

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

# Instruction, article, then a "Summary:" cue for the model to continue from.
prompt = "Summarize this article:\n" + "Long article..." + "\nSummary:"
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs.input_ids

# Pass the attention mask along with the ids; generate up to 150 new tokens.
output_ids = model.generate(**inputs, max_new_tokens=150)
# A causal LM returns the prompt followed by its continuation. Drop the
# prompt tokens so only the generated summary is decoded, rather than echoing
# the instruction and the whole article back.
generated_ids = output_ids[0][input_ids.shape[-1]:]
summary = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(summary)
     

## 7. Batch Summarization from CSV

In [None]:

import pandas as pd
from transformers import pipeline

df = pd.read_csv("articles.csv")  # column: content
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Empty CSV cells are read as NaN (a float), which the summarizer cannot
# process; normalise the column to strings so every row is safe to handle.
contents = df["content"].fillna("").astype(str)
# Blank rows get an empty summary instead of being sent to the model.
# truncation=True clips articles longer than the model's input limit rather
# than failing part-way through the file.
df["summary"] = contents.apply(
    lambda x: summarizer(x, max_length=130, min_length=30, do_sample=False, truncation=True)[0]['summary_text']
    if x.strip()
    else ""
)
df.to_csv("summaries.csv", index=False)
     

## 8. REST API with FastAPI

In [None]:

# Save this as app.py
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

# ASGI application object; the route below is registered on it.
app = FastAPI()
# Created once at import time so every request reuses the same pipeline.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Request body for POST /summarize.
class TextRequest(BaseModel):
    # The text to be summarized.
    text: str

@app.post("/summarize")
def summarize(req: TextRequest):
    """Summarize the text in the request body.

    Args:
        req: Request body carrying the text to summarize in ``req.text``.

    Returns:
        A dict with a single "summary" key holding the generated summary.
    """
    # Request text has no length limit; truncation=True clips it to the
    # model's maximum input length so over-long input does not become a
    # server error.
    result = summarizer(req.text, max_length=130, min_length=30, do_sample=False, truncation=True)
    return {"summary": result[0]['summary_text']}
     

## 9. Quantized Summarization (4-bit LLM)

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# bitsandbytes quantizes a regular transformers-format checkpoint while it is
# being loaded. GGML repositories (such as TheBloke/LLaMA-2-7B-GGML) hold
# llama.cpp-format files that AutoModelForCausalLM cannot read, so point at
# the transformers-format weights instead.
# NOTE(review): this repository is access-gated on the Hub — confirm the
# environment is authenticated.
model_id = "meta-llama/Llama-2-7b-hf"

# 4-bit weights with double quantization enabled.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)
# device_map="auto" lets the library decide where to place the model.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
     

## 10. Multilingual Summarization (mBART)

In [None]:

from transformers import MBartTokenizer, MBartForConditionalGeneration

# NOTE(review): mbart-large-cc25 appears to be a general pretrained
# checkpoint rather than one fine-tuned for summarization — confirm, and
# fine-tune or swap the checkpoint if summary quality matters.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")

text = "Texte en français ici..."  # French text
# Tell the tokenizer which language code to attach to the source text.
tokenizer.src_lang = "fr_XX"

inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
input_ids = inputs.input_ids
# Start decoding from the French language-code token so the output language
# is pinned to French; without it the generated language is not controlled.
summary_ids = model.generate(
    input_ids,
    attention_mask=inputs.attention_mask,
    max_length=100,
    decoder_start_token_id=tokenizer.convert_tokens_to_ids("fr_XX"),
)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)
     