# Run with Unsloth

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
import pandas as pd
import time
import os

In [None]:

# ---- Data-preparation settings ----
dataset_name = "csebuetnlp/xlsum"   # dataset identifier passed to load_dataset
language = "arabic"                 # dataset configuration to load
number_of_summarize = 5000          # how many articles to keep
min_text_length = 250               # minimum article length, in characters

# Load the training split, keep articles of at least `min_text_length`
# characters, take the `number_of_summarize` shortest ones, and keep only the
# columns that are not listed below.
train_split = load_dataset(dataset_name, language, split="train")
columns_to_discard = ["text_length", "title", "summary", "url", "id"]

df_original = (
    pd.DataFrame(train_split)
    .assign(text_length=lambda frame: frame["text"].str.len())
    .loc[lambda frame: frame["text_length"] >= min_text_length]
    .sort_values("text_length")
    .head(number_of_summarize)
    .drop(columns=columns_to_discard)
    .reset_index(drop=True)
)

# Persist the selection; the summarisation step reads this file back.
df_original.to_csv("df_original.csv", index=False)

We used the tutorials available at this link: https://github.com/unslothai/unsloth

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture hides this cell's pip output.
import os
# Colab is detected by searching for "COLAB_" in the concatenated names of all
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: plain install, letting pip resolve dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the named packages without their dependencies;
    # presumably this keeps Colab's preinstalled packages untouched —
    # TODO confirm against Unsloth's installation notes.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    # These packages are installed with normal dependency resolution.
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

model_name = "Qwen/Qwen2.5-14B-Instruct"
# Also used by generate_summaries as the tokenizer truncation length.
max_seq_length = 500  # Balanced for speed and memory on T4

# Load model with Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=torch.float16,  # T4 excels with FP16 (not BF16)
    load_in_4bit=True,    # 4-bit quantised weights via bitsandbytes (this is not AWQ)
    device_map="auto"     # Maps to T4 GPU
)

# Enable inference mode. The call patches `model` in place, so its return
# value is deliberately not assigned back: that keeps `model` valid even with
# Unsloth releases where for_inference() does not return the model.
FastLanguageModel.for_inference(model)

In [None]:
def generate_summaries(csv_path, prompt, batch_size=4, start_index=0, save_interval=100, output_csv="summaries.csv"):
    """Summarise the ``text`` column of a CSV in batches, appending to ``output_csv``.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``max_seq_length``
    created by the model-loading cell, and moves the inputs to ``"cuda"``.

    Args:
        csv_path: Path of a CSV file that has a ``text`` column.
        prompt: Instruction placed before every article.
        batch_size: Number of rows sent to ``model.generate`` at once.
        start_index: First row to process. If ``output_csv`` already holds
            more rows than this, processing resumes after those rows instead.
        save_interval: A checkpoint message is printed whenever the number of
            processed rows is a multiple of this value (results are appended
            to disk after every batch regardless).
        output_csv: Destination CSV with the columns ``text`` and ``summary``.

    Returns:
        None. Results are written to ``output_csv``.
    """
    df = pd.read_csv(csv_path)
    total_rows = len(df)
    print(f"Loaded dataset with {total_rows} rows.")

    # A zero-byte output file cannot be parsed by pandas, so it is treated
    # like a missing file and (re)initialised with the header row.
    if os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
        processed_rows = len(pd.read_csv(output_csv))
        if processed_rows > start_index:
            start_index = processed_rows
            print(f"Resuming from {start_index} rows already processed.")
    else:
        pd.DataFrame(columns=["text", "summary"]).to_csv(output_csv, index=False)

    # Batched generation with a decoder-only model needs left padding so that
    # the new tokens directly follow each prompt; set it explicitly instead of
    # relying on whatever the loader configured.
    tokenizer.padding_side = "left"

    for i in range(start_index, total_rows, batch_size):
        batch_end = min(i + batch_size, total_rows)
        batch_texts = df.iloc[i:batch_end]["text"].tolist()

        input_texts = [f"{prompt}\nالنص {text} الملخص " for text in batch_texts]

        inputs = tokenizer(
            input_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_seq_length,
        ).to("cuda")

        # With padding=True every row has the same length, so the padded width
        # is the longest prompt of the batch.
        prompt_length = inputs["input_ids"].shape[1]
        # Allow a summary of up to half the prompt length (at least one token).
        max_new_tokens_gen = max(1, prompt_length // 2)

        start_time = time.time()
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens_gen,
            # min_length would also count the prompt tokens and therefore never
            # constrain anything; min_new_tokens applies to the summary only.
            min_new_tokens=min(5, max_new_tokens_gen),
            use_cache=True,
            num_beams=1,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id
        )
        batch_time = time.time() - start_time

        # generate() returns prompt + continuation; drop the prompt tokens so
        # that only the generated summary is decoded and stored.
        generated_tokens = outputs[:, prompt_length:]
        summaries = [
            summary.strip()
            for summary in tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        ]

        batch_results = pd.DataFrame({
            "text": batch_texts,
            "summary": summaries
        })

        # Append after every batch so an interrupted run can be resumed.
        batch_results.to_csv(output_csv, mode='a', header=False, index=False)
        # The final batch may be shorter than batch_size, so divide by its real size.
        print(f"Processed rows {i}–{batch_end}/{total_rows} "
              f"({batch_time:.2f} s, ~{batch_time/len(batch_texts):.2f} s/row)")

        if (batch_end % save_interval == 0) or (batch_end == total_rows):
            print(f"Checkpoint saved at row {batch_end} to {output_csv}")

    print(f"Done! All summaries saved to {output_csv}")



In [None]:

# Input file written by the data-preparation cell.
csv_path = "df_original.csv"
# Arabic instruction: "Write a brief, short summary of the following text
# without adding extra information or metadata:".
prompt = " اكتب ملخصًا موجزًا وقصيرًا للنص التالي دون إضافة معلومات إضافية أو بيانات وصفية: "

# Options for this run, gathered in one place so they are easy to tweak.
summary_run_options = dict(
    batch_size=20,
    start_index=0,
    save_interval=100,
    output_csv="summaries.csv",
)

generate_summaries(csv_path=csv_path, prompt=prompt, **summary_run_options)
