In [29]:
pip install pandas evaluate nltk



In [30]:
pip install rouge_score



In [31]:
import numpy as np
import random

In [32]:
import pandas as pd
import re

df_predictions = pd.read_csv("test_predictions.csv")

def extract_translation(text):
    if isinstance(text, str):
        match = re.search(r'\n(.*)', text, re.DOTALL)
        return match.group(1).strip() if match else text
    return text

df_predictions["predictions"] = df_predictions["predictions"].apply(extract_translation)

In [33]:
import pandas as pd
import re

# df_predictions = pd.read_csv("test_predictions.csv")

def extract_translation(text):
    if isinstance(text, str):
        match = re.search(r'@xcite(.*)', text, re.DOTALL)
        return match.group(1).strip() if match else text
    return text

df_predictions["predictions"] = df_predictions["predictions"].apply(extract_translation)


In [34]:
df_predictions["targets"][3]

'The paper proposes a method called DEXPERTS for controlled text generation by combining a pretrained language model with expert and anti expert language models in a product of experts The approach is applied to language detoxification and sentiment controlled generation and outperforms existing controllable generation methods The method is effective with small expert and anti expert language models and highlights the promise of tuning language models for efficient decoding time steering towards safe and user friendly generations'

In [35]:
df_predictions["predictions"][3]

') . We propose DEXPERTS a decoding time method for controlled text generation that combines a pretrained language model with expert LMs and or anti expert LMs in a product of experts . Intuitively under the ensemble tokens only get high probabil ity if they are considered likely by the ex perts and unlikely by the anti experts . We ap ply DEXPERTS to language detoxification and sentiment controlled generation where we outperform existing controllable generation methods on both automatic and human evalua tions . Moreover because DEXPERTS operates only on the output of the pretrained LM it is effective with anti experts of smaller size in cluding when operating on GPT 3 . Our work highlights the promise of tuning small LMs on text with un desirable attributes for efficient decoding time steering .'

In [36]:
class VNSummarizer:
    def __init__(self, documents, reference_summaries, max_summary_length=150):
        self.documents = documents
        self.reference_summaries = reference_summaries
        self.max_summary_length = max_summary_length
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def calculate_rouge(self, summary):
        rouge1_scores = []
        rouge2_scores = []
        rougeL_scores = []

        for ref in self.reference_summaries:
            score = self.rouge_scorer.score(summary, ref)
            rouge1_scores.append(score['rouge1'].fmeasure)
            rouge2_scores.append(score['rouge2'].fmeasure)
            rougeL_scores.append(score['rougeL'].fmeasure)

        avg_rouge1 = np.mean(rouge1_scores)
        avg_rouge2 = np.mean(rouge2_scores)
        avg_rougeL = np.mean(rougeL_scores)

        combined_score = (avg_rouge1 + avg_rouge2 + avg_rougeL) / 3

        return combined_score

    def shake(self, current_solution, k):
        neighbor = current_solution.copy()
        available_sentences = [s for s in range(len(self.documents)) if s not in neighbor]

        if len(available_sentences) < k or len(neighbor) < k:
            return neighbor

        indices_to_replace = random.sample(range(len(neighbor)), k)
        new_sentences = random.sample(available_sentences, k)

        for i, idx in enumerate(indices_to_replace):
            neighbor[idx] = new_sentences[i]

        return neighbor

    def local_search(self, solution):
        improved = True
        current_solution = solution.copy()
        current_score = self.calculate_rouge(self.generate_summary(current_solution))

        while improved:
            improved = False
            best_neighbor = current_solution
            best_score = current_score

            for i in range(len(current_solution)):
                for j in range(len(self.documents)):
                    if j not in current_solution:
                        neighbor = current_solution.copy()
                        neighbor[i] = j
                        summary = self.generate_summary(neighbor)
                        score = self.calculate_rouge(summary)

                        if score > best_score:
                            best_score = score
                            best_neighbor = neighbor.copy()
                            improved = True

            if improved:
                current_solution = best_neighbor
                current_score = best_score

        return current_solution, current_score

    def generate_summary(self, indices):
        selected_sentences = [self.documents[i] for i in indices]
        return " ".join(selected_sentences)

    def summarize(self, max_iterations=100):
        current_solution = random.sample(range(len(self.documents)),
                                        min(5, len(self.documents)))
        current_summary = self.generate_summary(current_solution)
        current_score = self.calculate_rouge(current_summary)

        k_max = min(5, len(self.documents) // 2)

        for _ in range(max_iterations):
            k = 1
            while k <= k_max:
                neighbor = self.shake(current_solution, k)

                local_solution, local_score = self.local_search(neighbor)

                if local_score > current_score:
                    current_solution = local_solution
                    current_score = local_score
                    k = 1
                else:
                    k += 1

        final_summary = self.generate_summary(current_solution)

        final_scores = {}
        for ref in self.reference_summaries:
            score = self.rouge_scorer.score(final_summary, ref)
            for metric in ['rouge1', 'rouge2', 'rougeL']:
                if metric not in final_scores:
                    final_scores[metric] = []
                final_scores[metric].append(score[metric].fmeasure)

        avg_scores = {metric: np.mean(scores) for metric, scores in final_scores.items()}

        return final_summary, current_score, avg_scores


In [37]:
def calculate_corpus_bleu(references, candidates):
    from nltk.translate.bleu_score import corpus_bleu

    bleu1 = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
    bleu2 = corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0))
    bleu3 = corpus_bleu(references, candidates, weights=(0.33, 0.33, 0.33, 0))
    bleu4 = corpus_bleu(references, candidates, weights=(0.25, 0.25, 0.25, 0.25))

    return {
        'bleu1': bleu1,
        'bleu2': bleu2,
        'bleu3': bleu3,
        'bleu4': bleu4
    }


In [41]:
import pandas as pd
import evaluate
import numpy as np
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

documents = [str(doc).split('. ') if not isinstance(doc, float) else [str(doc)] for doc in df_predictions['predictions']]
reference_summaries = df_predictions['targets']

vns_summarizer = VNSummarizer(documents[0], [reference_summaries[0]])
summary, combined_score, individual_scores = vns_summarizer.summarize()

print(f"Evaluation Summary:")
print(f"ROUGE-1 F1 Score: {100*individual_scores['rouge1']:.2f}")
print(f"ROUGE-2 F1 Score: {100*individual_scores['rouge2']:.2f}")
print(f"ROUGE-L F1 Score: {100*individual_scores['rougeL']:.2f}")
bleu_scores = calculate_corpus_bleu(documents, reference_summaries)
print(f"BLEU Score: {100*max(bleu_scores['bleu1'],bleu_scores['bleu2'],bleu_scores['bleu3'],bleu_scores['bleu4']):.2f}")


Evaluation Summary:
ROUGE-1 F1 Score: 48.03
ROUGE-2 F1 Score: 22.38
ROUGE-L F1 Score: 34.41
BLEU Score: 39.38
