In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw corpora and normalise each one to the same two columns:
# 'text' (the judgement) and 'labels' (the reference summary).

# ds1, train split: drop the bookkeeping column, then rename to the shared schema.
ds1_train = (
    load_dataset("joelniklaus/legal_case_document_summarization", split='train')
    .remove_columns(['dataset_name'])
    .rename_column('judgement', 'text')
    .rename_column('summary', 'labels')
)
print(ds1_train)

# ds1, test split: identical treatment.
ds1_test = (
    load_dataset("joelniklaus/legal_case_document_summarization", split='test')
    .remove_columns(['dataset_name'])
    .rename_column('judgement', 'text')
    .rename_column('summary', 'labels')
)

# NOTE: This dataset only has 50 rows. It may not be a dataset we want to use.
# NOTE: THIS DATA IS NOT PLAYING NICELY WITH CONCATENATION
# Although the summaries appear to be good
# ds2 exposes a single 'data' split; keep 'summary_a1' as the reference
# summary and discard the alternative 'summary_a2'.
ds2 = (
    load_dataset("manasvikalyan/legal-documents-summary")['data']
    .remove_columns(['summary_a2'])
    .rename_column('summary_a1', 'labels')
    .rename_column('judgement', 'text')
)
print(ds2)

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['text', 'labels'],
    num_rows: 7773
})


Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['text', 'labels'],
    num_rows: 50
})


In [3]:
# ds9: AjayMukundS/Legal_Text_Summarization-llama2
# Fetch the train split first, then the test split, from the same repository.
ds9_train, ds9_test = (
    load_dataset("AjayMukundS/Legal_Text_Summarization-llama2", split=split_name)
    for split_name in ('train', 'test')
)
print(ds9_train)

Dataset({
    features: ['judgement', 'dataset_name', 'summary', 'text'],
    num_rows: 7773
})


In [4]:
from transformers import BartTokenizer

In [5]:
# Instantiate the BART tokenizer from the DistilBART checkpoint
tokenizer_checkpoint = 'sshleifer/distilbart-cnn-12-6'
tokenizer = BartTokenizer.from_pretrained(tokenizer_checkpoint)

In [6]:
# Tokenization function for text and summaries
def tokenize_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: batch mapping with a 'text' list (source documents) and a
            'labels' list (reference summary strings), i.e. the shape passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the documents (truncated and padded to 512
        positions) with an added 'labels' entry: the summary token ids,
        truncated and padded to 150 positions, where every padding position
        is replaced by -100.
    """
    # Tokenize the input text
    inputs = tokenizer(examples['text'], max_length=512, truncation=True, padding='max_length')

    # Tokenize the output summaries. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['labels'], max_length=150, truncation=True, padding='max_length')

    # Padding positions must not contribute to the loss: -100 is the index the
    # cross-entropy loss ignores, whereas the raw pad id would be trained on.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]

    return inputs

In [7]:
# Example tokenizer for ds9
def tokenize_batch(batch):
    """Tokenize a ds9 batch into model inputs and loss-ready labels.

    Args:
        batch: batch mapping with a 'text' list (model input) and a 'summary'
            list (reference summaries), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for 'text' (truncated and padded to 1024
        positions) with an added 'labels' entry: the summary token ids,
        truncated and padded to 256 positions, with padding replaced by -100.
    """
    # NOTE(review): the tokenized 'text' sample printed later in this notebook
    # starts with two BOS ids, which suggests 'text' is a pre-formatted prompt
    # rather than the raw judgement. Confirm it does not already embed the
    # summary; if it does, batch['judgement'] is the right input column.
    inputs = tokenizer(batch['text'], padding="max_length", truncation=True, max_length=1024)
    targets = tokenizer(text_target=batch['summary'], padding="max_length", truncation=True, max_length=256)

    # Mask padding in the labels with -100 so the loss ignores those positions
    # instead of training the model to emit pad tokens.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

In [8]:
# Tokenize the datasets for DistilBART

# ds1 ships with explicit train/test splits, so tokenize each one directly.
# The test split is needed later when the combined testing set is built.
ds1_train_tokenized = ds1_train.map(tokenize_function, batched=True)
ds1_test_tokenized = ds1_test.map(tokenize_function, batched=True)

# ds2 has a single split: tokenize it, then hold out 20% for testing.
# A fixed seed keeps the split identical across kernel restarts.
ds2_tokenized = ds2.map(tokenize_function, batched=True).train_test_split(test_size=0.2, seed=42)
ds2_train_tokenized = ds2_tokenized['train']
ds2_test_tokenized = ds2_tokenized['test']

In [9]:
# Run the ds9 tokenizer over the train split, then the test split, in batches.
ds9_train_tokenized, ds9_test_tokenized = (
    ds9_split.map(tokenize_batch, batched=True)
    for ds9_split in (ds9_train, ds9_test)
)

In [33]:
# Take small leading subsets of the tokenized ds9 splits to smoke-test the
# training loop: the first 30 training rows and the first 10 test rows.
TRAIN_SAMPLE_SIZE = 30
TEST_SAMPLE_SIZE = 10

# Clamp to the split length so the cell still runs on a shorter dataset.
train_sample = ds9_train_tokenized.select(range(min(TRAIN_SAMPLE_SIZE, len(ds9_train_tokenized))))
test_sample = ds9_test_tokenized.select(range(min(TEST_SAMPLE_SIZE, len(ds9_test_tokenized))))

print(train_sample)
print(test_sample)


Dataset({
    features: ['judgement', 'dataset_name', 'summary', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 30
})
Dataset({
    features: ['judgement', 'dataset_name', 'summary', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})


In [10]:
from datasets import concatenate_datasets

In [None]:
# Stack the per-source tokenized splits into one training set and one testing set.
# NOTE(review): all four *_tokenized inputs must already exist when this cell runs.
training_parts = [ds1_train_tokenized, ds2_train_tokenized]
combined_training_tokenized_dataset = concatenate_datasets(training_parts)

testing_parts = [ds1_test_tokenized, ds2_test_tokenized]
combined_testing_tokenized_dataset = concatenate_datasets(testing_parts)

In [None]:
# Make both combined datasets return PyTorch tensors, restricted to the
# columns the model consumes.
torch_columns = ['input_ids', 'attention_mask', 'labels']
for combined_split in (combined_training_tokenized_dataset, combined_testing_tokenized_dataset):
    combined_split.set_format(type='torch', columns=torch_columns)

In [11]:
from transformers import BartForConditionalGeneration

In [12]:
# Instantiate DistilBART with its conditional-generation (seq2seq LM) head
model_checkpoint = 'sshleifer/distilbart-cnn-12-6'
model = BartForConditionalGeneration.from_pretrained(model_checkpoint)

In [13]:
# Collator that pads each batch to its longest sequence rather than to a fixed length.
# NOTE: the tokenization functions in this notebook already pad every example
# with padding='max_length', so batches arrive fixed-length and this dynamic
# padding has nothing left to do unless that tokenizer padding is removed.
from transformers import DataCollatorForSeq2Seq

# Passing the model lets the collator use it when assembling seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [34]:
from transformers import TrainingArguments, Trainer, logging

In [35]:
import torch
from accelerate import Accelerator

# Let Accelerate choose the compute device for this machine and report it.
accelerator = Accelerator()
# Device object selected by Accelerate (the recorded run printed "mps").
device = accelerator.device
print(f"Accelerator is using device: {device}")

Accelerator is using device: mps


In [36]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hyper-parameters for the seq2seq fine-tuning run, gathered in one mapping
# so they can be inspected or tweaked before the arguments object is built.
# NOTE(review): the 30-example smoke test only runs 24 optimisation steps, so
# the 500-step logging/eval cadence never fires during it.
training_config = dict(
    # Where checkpoints and logs are written
    output_dir="./results_testing_with_ds9",
    logging_dir="./logs",
    # Batch sizes and run length
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    # Step-based cadence for logging, evaluation and checkpointing
    evaluation_strategy="steps",
    logging_steps=500,
    eval_steps=500,
    save_steps=1000,
    # Evaluate by generating summaries
    predict_with_generate=True,
)
training_args = Seq2SeqTrainingArguments(**training_config)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [37]:
# Build the seq2seq trainer from the model, its arguments and the batching helpers
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Smoke-test subsets; change these to real data when done testing with samples
    train_dataset=train_sample,
    eval_dataset=test_sample,
)

In [38]:
# Raise transformers' log level to INFO so progress messages appear while training
logging.set_verbosity_info()

# Run the fine-tuning loop and display the resulting TrainOutput
train_output = trainer.train()
train_output

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: dataset_name, text, summary, judgement. If dataset_name, text, summary, judgement are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 30
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 24
  Number of trainable parameters = 305,510,400


Step,Training Loss,Validation Loss


Saving model checkpoint to ./results_testing_with_ds9/checkpoint-24
Configuration saved in ./results_testing_with_ds9/checkpoint-24/config.json
Configuration saved in ./results_testing_with_ds9/checkpoint-24/generation_config.json
Model weights saved in ./results_testing_with_ds9/checkpoint-24/model.safetensors
tokenizer config file saved in ./results_testing_with_ds9/checkpoint-24/tokenizer_config.json
Special tokens file saved in ./results_testing_with_ds9/checkpoint-24/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=24, training_loss=0.0, metrics={'train_runtime': 412.5234, 'train_samples_per_second': 0.218, 'train_steps_per_second': 0.058, 'total_flos': 139312087695360.0, 'train_loss': 0.0, 'epoch': 3.0})

In [19]:
# Evaluate the model on the trainer's eval_dataset and print the metrics dict.
# NOTE(review): the recorded run printed eval_loss = nan, and the training cell
# reported training_loss = 0.0; treat the resulting weights as suspect until
# the cause is understood.
eval_results = trainer.evaluate()
print(eval_results)

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: dataset_name, text, summary, judgement. If dataset_name, text, summary, judgement are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 10
  Batch size = 4


{'eval_loss': nan, 'eval_runtime': 6.8282, 'eval_samples_per_second': 1.465, 'eval_steps_per_second': 0.439, 'epoch': 3.0}


In [None]:
import random

# Draw one row index uniformly at random from the tokenized ds9 test split
random_index = random.randrange(len(ds9_test_tokenized))

# Pull out that row's 'text' field and show it
random_sample_text = ds9_test_tokenized[random_index]['text']
print(random_sample_text)

In [32]:
# Example of generating a summary

# Use the best available accelerator instead of hard-coding "mps", so the cell
# also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    generation_device = torch.device("mps")
elif torch.cuda.is_available():
    generation_device = torch.device("cuda")
else:
    generation_device = torch.device("cpu")

model.to(generation_device)
model.eval()  # inference: make sure dropout is disabled

# Tokenize the text
# Currently passing a random_sample_text from ds9_test_tokenized
inputs = tokenizer(random_sample_text, return_tensors="pt", max_length=1024, truncation=True)

# Move input tensors to the same device as the model
inputs = {key: value.to(generation_device) for key, value in inputs.items()}

# Generate the summary; the attention mask is passed explicitly so generation
# does not have to infer which positions are real tokens.
summary_ids = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=142,           # Based on the config
    min_length=56,            # Based on the config
    num_beams=4,              # Config default
    length_penalty=2.0,        # From the config
    no_repeat_ngram_size=3,    # Config default
    early_stopping=True        # From config to stop generation early
)


# Decode the summary and move to CPU before printing
summary = tokenizer.decode(summary_ids[0].cpu(), skip_special_tokens=True)
print(summary)





In [29]:
# Inspect the raw token ids returned by model.generate()
print(summary_ids)

tensor([[2, 0, 0, 0, 0, 0, 2]], device='mps:0')


In [30]:
# Inspect the tokenized input ids that were passed to model.generate()
print(inputs['input_ids'])

tensor([[    0,     0, 10975,  ..., 10127, 13161,     2]], device='mps:0')


In [31]:
# Dump the model configuration (special-token ids, layer sizes, etc.)
print(model.config)

BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": null,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LAB

In [None]:
# Write the model and the tokenizer into the same directory so they can be
# reloaded together from a single path.
export_dir = './trained_HX100_model_1'

# Model first ...
model.save_pretrained(export_dir)

# ... then the tokenizer
tokenizer.save_pretrained(export_dir)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('./trained_HX100_model_1')
tokenizer = AutoTokenizer.from_pretrained('./trained_HX100_model_1')

# Move the model to the best available accelerator: CUDA first, then Apple
# MPS (the device the rest of this notebook ran on), otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
import random

# Select up to 5 random indices from the combined_testing_tokenized_dataset
# (fewer if the dataset is smaller, so random.sample cannot raise).
num_examples = min(5, len(combined_testing_tokenized_dataset))
random_indices = random.sample(range(len(combined_testing_tokenized_dataset)), num_examples)

for idx in random_indices:
    # Fetch just this row instead of materialising the whole column each time
    example = combined_testing_tokenized_dataset[idx]

    # Add a batch dimension and move to the device (GPU or CPU). as_tensor
    # accepts both plain lists and tensors without the copy warning that
    # torch.tensor() emits for tensor input.
    example_input_ids = torch.as_tensor(example['input_ids']).unsqueeze(0).to(device)
    example_attention_mask = torch.as_tensor(example['attention_mask']).unsqueeze(0).to(device)

    # Generate the summary using the model; the mask keeps padding out of attention
    summary_ids = model.generate(
        example_input_ids,
        attention_mask=example_attention_mask,
        num_beams=4,
        max_length=200,
        early_stopping=True,
    )

    # Decode the generated summary
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Decode the reference summary from the labels (not from the input
    # document). Negative ids such as the -100 loss mask are not vocabulary
    # entries, so drop them before decoding.
    reference_ids = [int(token_id) for token_id in example['labels'] if int(token_id) >= 0]
    original_summary = tokenizer.decode(reference_ids, skip_special_tokens=True)

    # Print both summaries for comparison
    print("------------------------------------------------------------------------------------------")
    print(f"Example {idx + 1}:")
    print("Original Summary:", original_summary)
    print("*****************************************************************************************")
    print("Generated Summary:", generated_summary)
    print("\n")
