# Text2Text Generation - mT5

In [6]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import re
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, TrainingArguments, Trainer
import nltk
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from rouge_score import rouge_scorer

## Data Preprocessing

Legal_Clauses

In [7]:
# Load the dataset package
dataset = load_dataset("shay681/Legal_Clauses")

# Convert the datasets to Dataframe (3 Min)
filtered_Hugging_Face_df = pd.DataFrame.from_dict(dataset['train'])

Downloading readme: 100%|██████████| 30.0/30.0 [00:00<00:00, 30.0kB/s]
Downloading data: 100%|██████████| 802M/802M [02:17<00:00, 5.84MB/s] 
Generating train split: 184933 examples [00:33, 5455.70 examples/s] 


In [2]:
# Load the data with extracted Legal_Clauses locally (15 Sec)
# filtered_Hugging_Face_df = pd.read_parquet('Hugging_Face_df_Legal_Clauses.parquet', engine='pyarrow')

Precedents

In [9]:
# Load the dataset package
dataset = load_dataset("shay681/Precedents")

# Convert the datasets to Dataframe (3 Min)
filtered_Hugging_Face_df = pd.DataFrame.from_dict(dataset['train'])

Downloading data: 100%|██████████| 1.07G/1.07G [03:17<00:00, 5.42MB/s]
Generating train split: 591506 examples [00:10, 55673.11 examples/s] 


In [4]:
# Load the data with extracted Precedents locally (15 Sec)
# filtered_Hugging_Face_df = pd.read_parquet('Hugging_Face_df_Precedents.parquet', engine='pyarrow')

In [10]:
filtered_Hugging_Face_df

Unnamed: 0,Id,text,Precedents_Found,__index_level_0__
0,1,בבית המשפט העליון ...,"[בג""ץ 5856/03]",0
1,159588,בבית המשפט העל...,"[בג""ץ 5856/03]",1
2,160618,בבית המשפט העליו...,"[בג""ץ 5856/03]",2
3,168038,בבית המשפט העל...,"[בג""ץ 5856/03]",3
4,168411,בבית המשפט העליו...,"[בג""ץ 5856/03]",4
...,...,...,...,...
591501,743169,בבית המשפט העליו...,"[בג""ץ 6297/22]",751175
591502,743154,בבית המשפט העליו...,"[בש""א 6311/22]",751179
591503,743155,בבית המשפט העליו...,"[בש""א 6312/22]",751180
591504,743178,בבית המשפט העליון ...,"[בש""א 6313/22, בש""א 6039/22]",751181


In [4]:
# Function to create masked_text
def create_masked_text(row):
    text = row['text']
    # legal_clauses = row['Legal_Clauses_Found'] # for precedents replace to 'Precedents_found'
    legal_clauses = row['Precedents_Found'] 
    masked_text = text
    
    for clause in legal_clauses:
        if clause in text:
            # Replace each clause with the <LEGAL_CLAUSE> placeholder
            masked_text = masked_text.replace(clause, '<LEGAL_CLAUSE>')
    
    return masked_text

In [None]:
# Apply the function to create a new column 'masked_text'
filtered_Hugging_Face_df['masked_text'] = filtered_Hugging_Face_df.apply(create_masked_text, axis=1)
filtered_Hugging_Face_df.head()

## Fine-Tuning the Model

In [None]:
# Load tokenizer and model
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Add a special token for legal clause generation
special_tokens_dict = {'additional_special_tokens': ['<LEGAL_CLAUSE>']}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

In [15]:
# Preprocessing function
def preprocess_function(examples):
    inputs = examples["masked_text"]
    targets = examples["text"]
    
    # Tokenize the inputs (masked text with special token)
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")

    # Tokenize the targets (original text)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]  # Set the original text as labels
    return model_inputs

In [None]:
# Convert the dataframe to a Hugging Face dataset
dataset = Dataset.from_pandas(filtered_Hugging_Face_df)

# Apply the preprocessing function
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [16]:
# Split dataset into training and evaluation
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [20]:
training_args = TrainingArguments(
    output_dir="./Text2Text_Results",
    evaluation_strategy="epoch",
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,   # Reduce evaluation batch size as well
    num_train_epochs=5,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision to reduce memory usage
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [None]:
try: 
    trainer.train() # (37 Hours)
except Exception as e:
    print(f"An error occurred during training: {e}")

In [None]:
# Continue training from last checkpoint
trainer.train(resume_from_checkpoint="./Text2Text_Results/checkpoint-184400")

In [25]:
# Save The Model
trainer.save_model("Text2Text_finetuned_model")

## Inference

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("cuda")
else:
    device = torch.device("cpu")
    print("cpu")

cuda


In [None]:
# Load the Legal_Clauses fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("shay681/Text2Text_Legal_Clauses_finetuned_model")
model.to(device)  # Move model to GPU
tokenizer = MT5Tokenizer.from_pretrained("shay681/Text2Text_Legal_Clauses_finetuned_model")

# -------------------------------------------------------------------------------------------------------- 

# Load the Precedents fine-tuned model and tokenizer
model = MT5ForConditionalGeneration.from_pretrained("shay681/Text2Text_Precedents_finetuned_model")
model.to(device)  # Move model to GPU
tokenizer = MT5Tokenizer.from_pretrained("shay681/Text2Text_Precedents_finetuned_model")

In [93]:
# train_test_split df
train_df, eval_df = train_test_split(filtered_Hugging_Face_df, test_size=0.2, random_state=42)

In [None]:
sample_df = eval_df[eval_df['text'].str.len() < 5000]

In [None]:
def constrained_decoding_for_multiple_clauses(model, tokenizer, input_text, allowed_clauses, max_length=50):
    # Split the text by the <LEGAL_CLAUSE> placeholders
    parts = input_text.split('<LEGAL_CLAUSE>')
    
    # Initialize the final generated text
    generated_text = parts[0]
    
    for i in range(1, len(parts)):
        # Prepare the input text with the current generated text and the next placeholder
        current_input = generated_text + "<extra_id_0>" + parts[i]

        # Tokenize the input text
        input_ids = tokenizer.encode(current_input, return_tensors="pt").to(device) 
        
        # Use no_grad for inference
        with torch.no_grad():
            # Generate output from the model with increased diversity
            generated_ids = model.generate(
                input_ids,
                max_length=max_length,
                num_return_sequences=1,
                temperature=1.0,  # Increase temperature to introduce more randomness
                top_k=30,  # Consider the top 30 tokens to introduce more variability
                top_p=0.95,  # Use nucleus sampling for more diverse outputs
                do_sample=True  # Enable sampling to avoid repetitive outputs
            )
        
        # Decode the generated output
        generated_clause_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        
        # Tokenize the generated text
        generated_tokens = tokenizer.encode(generated_clause_text, return_tensors="pt").squeeze(0)

        best_match = None
        best_score = float("-inf")  # Use negative infinity for maximization

        for clause in allowed_clauses:
            # Tokenize each clause
            clause_tokens = tokenizer.encode(clause, return_tensors="pt").squeeze(0)
            
            # Make sure the generated tokens and clause tokens are the same size
            min_length = min(generated_tokens.size(0), clause_tokens.size(0))
            generated_tokens_truncated = generated_tokens[:min_length]
            clause_tokens_truncated = clause_tokens[:min_length]

            # Compute cosine similarity
            score = torch.nn.functional.cosine_similarity(
                generated_tokens_truncated.float(),
                clause_tokens_truncated.float(),
                dim=0
            ).item()

            if score > best_score:
                best_score = score
                best_match = clause

        # Replace <LEGAL_CLAUSE> with the best match clause
        generated_text += best_match + parts[i]

    return generated_text

In [None]:
# Function for constrained decoding across multiple rows in the dataframe
def generate_clauses_for_dataframe(df, model, tokenizer, allowed_clauses_col, input_col):
    # Initialize an empty list to store generated clauses
    generated_clauses = []
    
    # Loop through each row in the dataframe with progress tracking
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Extract the input text and allowed clauses
        input_text = row[input_col]
        allowed_clauses = row[allowed_clauses_col]

        # Perform constrained decoding
        generated_text = constrained_decoding_for_multiple_clauses(
            model=model,
            tokenizer=tokenizer,
            input_text=input_text,
            allowed_clauses=allowed_clauses
        )

        # Append the generated text to the list
        generated_clauses.append(generated_text)

    # Return the list of generated clauses
    return generated_clauses

# Use the function the test_df
test_df = sample_df

# Generate clauses for each row and add a new column to the DataFrame
test_df['generated_clauses'] = generate_clauses_for_dataframe(
    test_df, model, tokenizer, allowed_clauses_col='Legal_Clauses_Found', input_col='masked_text'
)
# For Precendents allowed_clauses_col='Precedents_Found'

In [None]:
# Output the DataFrame with generated clauses for review
test_df[['masked_text', 'generated_clauses']]

Unnamed: 0,masked_text,generated_clauses
651063,בבית המשפט העליון ...,בבית המשפט העליון ...
639497,בבית המשפט העליון ...,בבית המשפט העליון ...
650671,בבית המשפט העליו...,בבית המשפט העליו...
100714,בבית המשפט העליון בשבתו כבית משפט ...,בבית המשפט העליון בשבתו כבית משפט ...
666840,בבית המשפט העליו...,בבית המשפט העליו...
...,...,...
661998,בבית המשפט העליו...,בבית המשפט העליו...
516376,בבית המשפט העליו...,בבית המשפט העליו...
185274,בבית המשפט העליון בירושלי...,בבית המשפט העליון בירושלי...
616858,בבית המשפט העליו...,בבית המשפט העליו...


In [None]:
# Save Legal_Clauses df to a parquet file
test_df.to_parquet('Inference_Legal_Clauses_allowed_clauses.parquet', engine='pyarrow', compression='snappy')

# OR

# Save Precedents df to a parquet file
test_df.to_parquet('Inference_Precedents_allowed_clauses.parquet', engine='pyarrow', compression='snappy')

## Evaluation

In [11]:
# Load the dataset package
dataset = load_dataset("shay681/Inference_Legal_Clauses")

# Convert the datasets to Dataframe
Legal_Clauses_eval_dataset = pd.DataFrame.from_dict(dataset['train'])

# Load locally
# Legal_Clauses_eval_dataset = pd.read_parquet('Inference_Legal_Clauses_allowed_clauses.parquet', engine='pyarrow')

Downloading data: 100%|██████████| 89.0M/89.0M [00:16<00:00, 5.46MB/s]
Generating train split: 26677 examples [00:00, 39269.38 examples/s]


In [12]:
# Load the dataset package
dataset = load_dataset("shay681/Inference_Precedents")

# Convert the datasets to Dataframe
Precedents_eval_dataset = pd.DataFrame.from_dict(dataset['train'])

# Load locally
# Precedents_eval_dataset = pd.read_parquet('Inference_Precedents_allowed_clauses.parquet', engine='pyarrow')

Downloading data: 100%|██████████| 253M/253M [00:41<00:00, 6.05MB/s] 
Generating train split: 104226 examples [00:01, 59713.20 examples/s]


In [6]:
def normalize_text(text):
    # Remove punctuation and unnecessary characters, normalize spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove all punctuation (non-alphanumeric)
    text = re.sub(r'\s+', ' ', text)  # Normalize multiple spaces to a single space
    text = text.strip()  # Trim leading/trailing spaces
    return text

In [7]:
def calculate_accuracy(random_sample, doc_type):
    total_clauses = 0
    correct_clauses = 0

    # Iterate through each row in the DataFrame
    for _, row in random_sample.iterrows():
        found_clauses = row[doc_type + '_Found']  # This is a list of clauses
        generated_clause = row['generated_clauses']  # This is a string of generated text

        # Count the total clauses
        total_clauses += len(found_clauses)

        # Normalize generated_clause
        normalized_generated_clause = normalize_text(generated_clause)

        # Check if found_clauses are in the normalized_generated_clause
        for clause in found_clauses:
            if normalize_text(clause) in normalized_generated_clause:
                correct_clauses += 1

    # Calculate accuracy
    accuracy = correct_clauses / total_clauses if total_clauses > 0 else 0
    return accuracy

In [8]:
# Calculate Legal_Clauses accuracy
accuracy = calculate_accuracy(Legal_Clauses_eval_dataset, 'Legal_Clauses')
print(f'Legal_Clauses Accuracy: {accuracy * 100:.2f}%')

Legal_Clauses Accuracy: 87.88%


In [9]:
# Calculate Precedents accuracy
accuracy = calculate_accuracy(Precedents_eval_dataset, 'Precedents')
print(f'Precedents Accuracy: {accuracy * 100:.2f}%')

Precedents Accuracy: 7.51%


### Bleu Score

In [10]:
def extract_matching_clauses(generated_clause, found_clauses):
    # Normalize the generated clause
    normalized_generated_clause = normalize_text(generated_clause)
    
    # Initialize a list to hold matched clauses
    matching_clauses = []

    # Iterate through each found clause
    for clause in found_clauses:
        normalized_clause = normalize_text(clause)
        # Check if the normalized clause is in the normalized generated clause
        if normalized_clause in normalized_generated_clause:
            matching_clauses.append(normalized_clause)

    return matching_clauses

def calculate_bleu_score(found_clauses, generated_clause):
    # Extract matching clauses from generated_clause
    matching_clauses = extract_matching_clauses(generated_clause, found_clauses)

    # Initialize a list to store BLEU scores for each matched clause
    bleu_scores = []

    # Iterate through each matching clause
    for matching_clause in matching_clauses:
        reference_tokens = normalize_text(matching_clause).split()  # Tokenize the matched clause
        for clause in found_clauses:
            generated_tokens = (normalize_text(clause).split())  # Tokenize the generated clause
        
        # Calculate BLEU score for the matched clause against the found clause
        bleu_score = sentence_bleu([reference_tokens], generated_tokens)
        bleu_scores.append(bleu_score)
    
    # Return the average BLEU score across all matched clauses
    if bleu_scores:
        return sum(bleu_scores) / len(bleu_scores)
    return 0

In [11]:
# Apply the bleu_score function on Legal Clauses
average_bleu_score = 0
total_rows = len(Legal_Clauses_eval_dataset)

for _, row in Legal_Clauses_eval_dataset.iterrows():
    found_clauses = row['Legal_Clauses_Found']  # List of legal clauses
    generated_clause = row['generated_clauses']  # String of generated text

    # Calculate BLEU score for each row
    average_bleu_score += calculate_bleu_score(found_clauses, generated_clause)

# Final average BLEU score
average_bleu_score /= total_rows
print(f'Average BLEU Score: {average_bleu_score:.3f}')

Average BLEU Score: 0.888085759821180


In [12]:
# Apply the bleu_score function on Precedents
average_bleu_score = 0
total_rows = len(Precedents_eval_dataset)

for _, row in Precedents_eval_dataset.iterrows():
    found_clauses = row['Precedents_Found']  # List of legal clauses
    generated_clause = row['generated_clauses']  # String of generated text

    # Calculate BLEU score for each row
    average_bleu_score += calculate_bleu_score(found_clauses, generated_clause)

# Final average BLEU score
average_bleu_score /= total_rows
print(f'Average BLEU Score: {average_bleu_score:.3f}')

Average BLEU Score: 0.000000000000000


### Recall and F1 Score

In [13]:
def calculate_recall_f1(found_clauses, generated_clause):
    # Extract matching clauses from generated_clause
    matching_clauses = extract_matching_clauses(generated_clause, found_clauses)

    # True Positives: correctly predicted clauses
    true_positives = len(matching_clauses)

    # False Negatives: found_clauses that were not in generated_clause
    false_negatives = len([clause for clause in found_clauses if clause not in matching_clauses])

    # Calculate Recall
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

    # Calculate Precision
    precision = true_positives / len(matching_clauses) if len(matching_clauses) > 0 else 0

    # Calculate F1 Score
    f1 = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) > 0 else 0

    return recall, f1

In [14]:
def total_recall_f1(dataset, docType):
    # Iterate through each row in the DataFrame and calculate the average Recall and F1 Score
    total_recall = 0
    total_f1 = 0
    total_rows = len(dataset)

    for _, row in dataset.iterrows():
        found_clauses = row[docType]  # List of legal clauses
        generated_clause = row['generated_clauses']  # String of generated text

        # Calculate Recall and F1 score for each row
        recall, f1 = calculate_recall_f1(found_clauses, generated_clause)
        total_recall += recall
        total_f1 += f1

    return total_recall, total_f1, total_rows

In [15]:
# Calculate Recall and F1-Score
Legal_Clauses_total_recall, Legal_Clauses_total_f1, Legal_Clauses_total_rows = total_recall_f1(Legal_Clauses_eval_dataset, 'Legal_Clauses_Found')
Precedents_total_recall, Precedents_total_f1, Precedents_total_rows = total_recall_f1(Precedents_eval_dataset, 'Precedents_Found')

In [16]:
# Final average Recall and F1 Score for Legal_Clauses
Legal_Clauses_average_recall = Legal_Clauses_total_recall / Legal_Clauses_total_rows
Legal_Clauses_average_f1_score = Legal_Clauses_total_f1 / Legal_Clauses_total_rows

print(f'Average Recall of Legal_Clauses: {Legal_Clauses_average_recall:.3f}')
print(f'Average F1 Score of Legal_Clauses: {Legal_Clauses_average_f1_score:.3f}')

Average Recall of Legal_Clauses: 0.478951767009621
Average F1 Score of Legal_Clauses: 0.644487282073813


In [17]:
# Final average Recall and F1 Score for Precedents
Precedents_average_recall = Precedents_total_recall / Precedents_total_rows
Precedents_average_f1_score = Precedents_total_f1 / Precedents_total_rows

print(f'Average Recall of Precedents: {Precedents_average_recall:.3f}')
print(f'Average F1 Score of Precedents: {Precedents_average_f1_score:.3f}')

Average Recall of Precedents: 0.017929827979265
Average F1 Score of Precedents: 0.024801429867562


### ROUGE Score

In [12]:
# Function to calculate ROUGE score for a single row
def calculate_rouge_scores(row, doc_type):
    found_clauses = row[doc_type]  # This is a list of clauses
    generated_clause = row['generated_clauses']  # This is a string of generated text

    # Initialize the rouge scorer for ROUGE-1, ROUGE-2, and ROUGE-L
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Initialize accumulators for averaging scores across multiple found clauses
    rouge1_f, rouge1_p, rouge1_r = 0, 0, 0
    rouge2_f, rouge2_p, rouge2_r = 0, 0, 0
    rougeL_f, rougeL_p, rougeL_r = 0, 0, 0
    
    # Track the number of valid clauses
    num_clauses = len(found_clauses)

    for clause in found_clauses:
        # Calculate the ROUGE scores for each clause in found_clauses
        scores = scorer.score(clause, generated_clause)
        
        # Accumulate the scores for averaging later
        rouge1_f += scores['rouge1'].fmeasure
        rouge1_p += scores['rouge1'].precision
        rouge1_r += scores['rouge1'].recall

        rouge2_f += scores['rouge2'].fmeasure
        rouge2_p += scores['rouge2'].precision
        rouge2_r += scores['rouge2'].recall

        rougeL_f += scores['rougeL'].fmeasure
        rougeL_p += scores['rougeL'].precision
        rougeL_r += scores['rougeL'].recall

    # Avoid division by zero in case there are no clauses
    if num_clauses > 0:
        return {
            'rouge1_fmeasure': rouge1_f / num_clauses,
            'rouge1_precision': rouge1_p / num_clauses,
            'rouge1_recall': rouge1_r / num_clauses,
            'rouge2_fmeasure': rouge2_f / num_clauses,
            'rouge2_precision': rouge2_p / num_clauses,
            'rouge2_recall': rouge2_r / num_clauses,
            'rougeL_fmeasure': rougeL_f / num_clauses,
            'rougeL_precision': rougeL_p / num_clauses,
            'rougeL_recall': rougeL_r / num_clauses
        }
    else:
        return {
            'rouge1_fmeasure': 0, 'rouge1_precision': 0, 'rouge1_recall': 0,
            'rouge2_fmeasure': 0, 'rouge2_precision': 0, 'rouge2_recall': 0,
            'rougeL_fmeasure': 0, 'rougeL_precision': 0, 'rougeL_recall': 0
        }

In [20]:
# Apply the function to calculate ROUGE scores for each row, passing 'doc_type' as an argument
Legal_Clauses_rouge_scores = Legal_Clauses_eval_dataset.apply(lambda row: calculate_rouge_scores(row, 'Legal_Clauses_Found'), axis=1)

# Convert the resulting list of dictionaries into a DataFrame
Legal_Clauses_rouge_scores_df = pd.DataFrame(Legal_Clauses_rouge_scores.tolist())

# Calculate the average ROUGE scores across all rows
Legal_Clauses_average_rouge_scores = Legal_Clauses_rouge_scores_df.mean()

# Print the Legal_Clauses average scores
print("Legal_Clauses - Average ROUGE Scores:")
print(f"ROUGE-1 F1: {Legal_Clauses_average_rouge_scores['rouge1_fmeasure']:.4f}")
print(f"ROUGE-1 Precision: {Legal_Clauses_average_rouge_scores['rouge1_precision']:.4f}")
print(f"ROUGE-1 Recall: {Legal_Clauses_average_rouge_scores['rouge1_recall']:.4f}")
print(f"ROUGE-2 F1: {Legal_Clauses_average_rouge_scores['rouge2_fmeasure']:.4f}")
print(f"ROUGE-2 Precision: {Legal_Clauses_average_rouge_scores['rouge2_precision']:.4f}")
print(f"ROUGE-2 Recall: {Legal_Clauses_average_rouge_scores['rouge2_recall']:.4f}")
print(f"ROUGE-L F1: {Legal_Clauses_average_rouge_scores['rougeL_fmeasure']:.4f}")
print(f"ROUGE-L Precision: {Legal_Clauses_average_rouge_scores['rougeL_precision']:.4f}")
print(f"ROUGE-L Recall: {Legal_Clauses_average_rouge_scores['rougeL_recall']:.4f}")

Legal_Clauses - Average ROUGE Scores:
ROUGE-1 F1: 0.0687
ROUGE-1 Precision: 0.0360
ROUGE-1 Recall: 0.9480
ROUGE-2 F1: 0.0000
ROUGE-2 Precision: 0.0000
ROUGE-2 Recall: 0.0000
ROUGE-L F1: 0.0687
ROUGE-L Precision: 0.0360
ROUGE-L Recall: 0.9480


In [None]:
# Apply the function to calculate ROUGE scores for each row, passing 'doc_type' as an argument
Precedents_rouge_scores = Precedents_eval_dataset.apply(lambda row: calculate_rouge_scores(row, 'Precedents_Found'), axis=1)

# Convert the resulting list of dictionaries into a DataFrame
Precedents_rouge_scores_df = pd.DataFrame(Precedents_rouge_scores.tolist())

# Calculate the average ROUGE scores across all rows
Precedents_average_rouge_scores = Precedents_rouge_scores_df.mean()

# Print the Precedents average scores
print("Precedents - Average ROUGE Scores:")
print(f"ROUGE-1 F1: {Precedents_average_rouge_scores['rouge1_fmeasure']:.4f}")
print(f"ROUGE-1 Precision: {Precedents_average_rouge_scores['rouge1_precision']:.4f}")
print(f"ROUGE-1 Recall: {Precedents_average_rouge_scores['rouge1_recall']:.4f}")
print(f"ROUGE-2 F1: {Precedents_average_rouge_scores['rouge2_fmeasure']:.4f}")
print(f"ROUGE-2 Precision: {Precedents_average_rouge_scores['rouge2_precision']:.4f}")
print(f"ROUGE-2 Recall: {Precedents_average_rouge_scores['rouge2_recall']:.4f}")
print(f"ROUGE-L F1: {Precedents_average_rouge_scores['rougeL_fmeasure']:.4f}")
print(f"ROUGE-L Precision: {Precedents_average_rouge_scores['rougeL_precision']:.4f}")
print(f"ROUGE-L Recall: {Precedents_average_rouge_scores['rougeL_recall']:.4f}")

Precedents - Average ROUGE Scores:
ROUGE-1 F1: 0.1599
ROUGE-1 Precision: 0.0887
ROUGE-1 Recall: 0.9666
ROUGE-2 F1: 0.0878
ROUGE-2 Precision: 0.0466
ROUGE-2 Recall: 0.9448
ROUGE-L F1: 0.1599
ROUGE-L Precision: 0.0887
ROUGE-L Recall: 0.9665


### Semantic Similarity

In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to calculate semantic similarity between Legal_Clauses_Found and generated_clauses
def calculate_semantic_similarity(row, doc_type):
    found_clauses = row[doc_type]  # List of clauses
    generated_clause = row['generated_clauses']  # String of generated text
    
    # Encode the found clauses and generated clauses into embeddings
    found_clauses_embedding = model.encode(found_clauses, convert_to_tensor=True).cpu()
    generated_clause_embedding = model.encode([generated_clause], convert_to_tensor=True).cpu()
    
    # Calculate cosine similarity between the found clauses and generated clause embeddings
    similarities = cosine_similarity(found_clauses_embedding, generated_clause_embedding)
    
    # Average the similarities
    avg_similarity = np.mean(similarities)
    
    return avg_similarity

In [None]:
# Apply the function to the DataFrame (13 Min)
Legal_Clauses_eval_dataset['semantic_similarity'] = Legal_Clauses_eval_dataset.apply(lambda row: calculate_semantic_similarity(row, 'Legal_Clauses_Found'), axis=1)

# Calculate the average semantic similarity
average_semantic_similarity = Legal_Clauses_eval_dataset['semantic_similarity'].mean()

print(f'Average Semantic Similarity: {average_semantic_similarity:.4f}')

Average Semantic Similarity: 0.8072


In [None]:
# Apply the function to the DataFrame (13 Min)
Precedents_eval_dataset['semantic_similarity'] = Precedents_eval_dataset.apply(lambda row: calculate_semantic_similarity(row, 'Precedents_Found'), axis=1)

# Calculate the average semantic similarity
average_semantic_similarity = Precedents_eval_dataset['semantic_similarity'].mean()

print(f'Average Semantic Similarity: {average_semantic_similarity:.4f}')

Average Semantic Similarity: 0.4095
