In [9]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import networkx as nx

class Summarizer:

    def __init__(self, stop_words=None):
        self.nlp = None  # spacy.load('fr_core_news_md')
        self.stop_words = stop_words
        self.model_name = ''
        self.text_embeddings = None
        self.tokenizer = None
        self.model = None
        self.device = None

    ###### Charger le modèle ######

    def init_model(self, model='bert', device=None, log=False):
        # GPU ou CPU ?
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        try:
            # Modèle BERT
            if model == 'bert':
                model_name = 'bert-base-uncased'
                self.tokenizer = BertTokenizer.from_pretrained(model_name)
                self.model = BertModel.from_pretrained(model_name)
                self.model_name = model_name
            # Modèle Flaubert
            elif model == 'flaubert':
                model_name = 'flaubert/flaubert_large_cased'
                flaubert = FlaubertModel.from_pretrained(model_name)
                tokenizer = FlaubertTokenizer.from_pretrained(model_name, do_lowercase=False)
                self.model = flaubert
                self.tokenizer = tokenizer
                self.model_name = model_name
            # Modèle Camembert
            elif model == 'camembert':
                model_name = 'camembert'
                self.model = torch.hub.load('pytorch/fairseq', 'camembert')
                self.model_name = model_name
        except Exception as e:
            print(f'Erreur lors du chargement du modèle {model}.')
            print(e)
            return

        # Pour l'inference
        self.model.to(self.device)
        self.model.eval()

        # Logs
        if log:
            self.init_log(self.model_name, self.device)

    def init_log(self, model_name, device):
        print(f'Summarizer: \'{model_name}\' chargé sur le {device}.')

    def to(self, device):
        """
          Déplacer le modele vers un `device` ?
        """
        self.device = device
        self.model.to(device)

    ###### Méthodes de sélection des phrases ######

    def reference_selection(self, reference_embeddings, embeddings, threshold):
        """
        Filtre les embeddings en fonction de leur similarité avec un embedding de référence.
        Retourner les embeddings sélectionnés dont le score de similarité est supérieur au seuil `threshold`.
        """
        selected_indices = []
        for reference_embedding in reference_embeddings:
            similarities = np.array(self.get_similarities(reference_embedding, embeddings))
            filtered_indices = np.where(similarities > threshold)[0]
            selected_indices.extend(filtered_indices.tolist())
        return sorted(list(set(selected_indices)))

    ###### Méthodes de sélection des mots-clés #####

    def word_embedding(self, word):
        """
        Returne le embedding de `word`
        """
        if self.model_name == 'camembert':
            token = self.model.encode(word).to(self.device)
            with torch.no_grad():
                encoded_layers = self.model.extract_features(token, return_all_hiddens=False)
                embedded_word = encoded_layers[0][0].cpu().numpy()
            return embedded_word
        else:
            token_ids = torch.tensor([self.tokenizer.encode(word, add_special_tokens=False)]).to(self.device)
            with torch.no_grad():
                last_layers = self.model(token_ids)
            token_embedding = torch.stack(last_layers, dim=0)[0]
            word_embedding = torch.mean(token_embedding, dim=1)
            embedded_word = word_embedding.cpu().numpy()
            return embedded_word

    def remove_stop_words(self, sentence):
        """
        Supprimez les stop-words de `sentence`.
        """
        split = [word for word in sentence.split(' ') if len(word) > 2]
        sentence = ' '.join(split)
        sentence = self.nlp(sentence)
        tokens = [token.text for token in sentence]
        clean_sentence = tokens
        if self.stop_words is not None:
            clean_sentence = [word for word in tokens if not word in self.stop_words]
        clean_sentence[:] = [item for item in clean_sentence if item != ' ']
        return clean_sentence

    def content_words_embedding(self, text):
        """
        Returne embedding de `text`.
        """
        text_content_words = [self.remove_stop_words(sentence) for sentence in text]
        content_words_embedding = []
        for words in text_content_words:
            content_words_embedding.append([self.word_embedding(word) for word in words])
        return content_words_embedding

    def keyword_similarity(self, content_words_embedding, keyword_embedding):
        keyword_similarities = []
        for words_embedding in content_words_embedding:
            if len(words_embedding) != 0:
                sim = [1 - cosine_similarity([keyword_embedding], [w])[0][0] for w in words_embedding]
            else:
                sim = [0.]
            keyword_similarities.append(sim)
        return keyword_similarities

    def keyword_selection(self, content_words_embedding, keywords_embeddings, method='max', threshold=0.6):
        """
        Renvoie les indices de texte sélectionnés en fonction de la similarité maximale/moyenne avec les mots-clés.
        """
        kw_similarities = [self.keyword_similarity(content_words_embedding, kw) for kw in keywords_embeddings]
        top_indices = []
        for kw_similarity in kw_similarities:
            top_sim = []
            if method == 'max':
                max_sim_sentence = [max(sentence) for sentence in kw_similarity]
                max_sim_sentence = np.array(max_sim_sentence)
                top_sim = np.where(max_sim_sentence >= threshold)[0]
            else:
                mean_sim_sentence = [np.mean(sentence, axis=0) for sentence in kw_similarity]
                mean_sim_sentence = np.array(mean_sim_sentence)
                top_sim = np.where(mean_sim_sentence >= threshold)[0].tolist()
            top_indices.extend(top_sim)
        return list(set(top_indices))

    ###### FIT ######

    def fit(self, text, reference_sentences=None, reference_threshold=0.6, keywords=None, keywords_method='max', keywords_threshold=0.6, log=True):
        # Embedding de tout le texte
        try:
            if not isinstance(text, pd.core.series.Series):
                text = pd.Series(text)
        except:
            print('Erreur d\'entrée de données : le texte devrait être un tableau numpy ndarray ou un tableau pandas..')
            return

        self.text = text.to_numpy()
        if self.model_name == 'camembert':
            self.text_embeddings = self.camembert_text_embedding(self.text)
        else:
            self.text_embeddings = self.bert_text_embedding(self.text)

        # La phrase de référence
        if reference_sentences is not None:
            if self.model_name == 'camembert':
                reference_embeddings = self.camembert_text_embedding(reference_sentences)
            else:
                reference_embeddings = self.bert_text_embedding(reference_sentences)

            selected_indices = self.reference_selection(reference_embeddings, self.text_embeddings, reference_threshold)
            self.text = self.text[selected_indices]
            self.text_embeddings = self.text_embeddings[selected_indices]

        # Les mots-clés
        if keywords is not None:
            keywords_embeddings = [self.word_embedding(keyword) for keyword in keywords]
            content_words_embedding = self.content_words_embedding(self.text)
            selected_indices = self.keyword_selection(content_words_embedding, keywords_embeddings, method=keywords_method, threshold=keywords_threshold)
            self.text = self.text[selected_indices]
            self.text_embeddings = self.text_embeddings[selected_indices]

        # Log Info
        if log:
            print(f'Summarizer fit: OK {self.text_embeddings.shape[0]} embeddings de dimension {self.text_embeddings.shape[1]}.')

    def bert_embedding(self, sentence):
        """Return le embedding calculé avec le modèle BERT """
        tokens = self.tokenizer.encode_plus(sentence, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        input_ids = tokens['input_ids'].to(self.device)
        attention_mask = tokens['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
            last_hidden_states = outputs.last_hidden_state
            sentence_embedding = torch.mean(last_hidden_states, dim=1)

        return sentence_embedding.cpu().numpy()

    def bert_text_embedding(self, text):
        """
        Retourne les embeddings
        Argument
        ----------
          text: numpy array de string | Toutes les phrases du texte
        """
        embedded_sentences = list(map(self.bert_embedding, text))
        return np.vstack(embedded_sentences)

    def camembert_embedding(self, sentence):
        """Return le embedding calculé avec le modèle camemBERT """
        tokens = self.model.encode(sentence).to(self.device)
        with torch.no_grad():
            encoded_layers = self.model.extract_features(tokens, return_all_hiddens=True)
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        sum_vec = torch.sum(token_embeddings[-4:], dim=0)
        sentence_embedding = torch.mean(sum_vec, dim=0)
        return sentence_embedding.cpu().numpy()

    def camembert_text_embedding(self, text):
        """
        Retourne les embeddings
        Argument
        ----------
          text: numpy array de string | Toutes les phrases du texte
        """
        embedded_sentences = list(map(self.camembert_embedding, text))
        return np.vstack(embedded_sentences)

    def flaubert_text_embedding(self, text):
        """
        Retourne les embeddings
        Argument
        ----------
          text: numpy array de string | Toutes les phrases du texte
        """
        input_ids = [self.tokenizer.encode(sentence) for sentence in text]
        padded = np.array([i + [0]*(300-len(i)) for i in input_ids])
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids_tensor = torch.tensor(padded).to(self.device)
        masks_tensor = torch.tensor(attention_mask).to(self.device)
        with torch.no_grad():
            encoded_layers = self.model(input_ids_tensor, masks_tensor)
        token_embeddings = torch.stack(encoded_layers.to_tuple(), dim=0)[0]
        sentence_embedding = torch.mean(token_embeddings, dim=1)
        embedded_sentences = sentence_embedding.cpu().numpy()
        return embedded_sentences

    ###### Résumé ######

    def get_similarities(self, reference_embedding, embeddings):
        """
        Retourne les scores de similarité entre la référence et les embeddings.
        """
        similarities = []
        for i in range(len(embeddings)):
            sim = 1 - cosine_similarity([reference_embedding], [embeddings[i]])[0][0]
            similarities.append(sim)
        return similarities

    def top_similarities(self, reference_embedding, embeddings, nb_top):
        """
        Retourne les `nb_top` indices  des embeddings les plus proches du `reference_embedding`.
        Argument
        ---------
          reference_embedding: np.array | embedding de reference pour le calcul de distance.
          embeddings: np.ndarray | embeddings à trier selon la distance.
          nb_top: int
        """
        similarities = self.get_similarities(reference_embedding, embeddings)
        top_indices = np.array(similarities).argsort()[::-1][:nb_top]
        return top_indices

    def mean_similarity_summary(self, nb_sentences=10, return_indices=False):
        """
        Arguments
        ----------
          nb_sentences: int | la longueur du résumé.
          return_indices: bool | renvoie les indices de phrases s'ils ont la valeur True.
        """
        mean_sentence_embedding = np.mean(self.text_embeddings, axis=0)
        top_indices = self.top_similarities(mean_sentence_embedding, self.text_embeddings, nb_sentences)
        summary = self.text[sorted(top_indices)]
        if return_indices:
            return summary, top_indices
        return summary

    def text_visualization(self, cluster_labels=None, plot_lib='pyplot'):
        mapped_embeddings = TSNE(n_components=2, metric='cosine', init='pca', perplexity=3).fit_transform(self.text_embeddings)
        if plot_lib == 'pyplot':
            plt.figure(figsize=(50,50))
            x = mapped_embeddings[:, 0]
            y = mapped_embeddings[:, 1]
            plt.scatter(x, y, c=cluster_labels)
            for i, txt in enumerate(self.text):
                plt.annotate(txt[:10], (x[i], y[i]))
            return
        print('Mauvais paramètre plot_lib. Utiliser \'pyplot\' ou ....')

    def cluster_embeddings(self, nb_clusters, nb_top):
        Kmean_cluster = KMeans(n_clusters=nb_clusters).fit(self.text_embeddings)
        centroid = Kmean_cluster.cluster_centers_
        embeddings_cluster_labels = Kmean_cluster.labels_
        top_clustered_indices = {}
        for cluster_label in range(centroid.shape[0]):
            indices = np.where(embeddings_cluster_labels == cluster_label)[0]
            cluster_embeddings = self.text_embeddings[indices]
            top = self.top_similarities(centroid[cluster_label], cluster_embeddings, nb_top)
            top_clustered_indices[cluster_label] = indices[top].tolist()
        return embeddings_cluster_labels, top_clustered_indices

    def clustering_summary(self, nb_clusters, nb_top, return_clusters=False):
        cluster_results = self.cluster_embeddings(nb_clusters, nb_top)
        _, top_clustered_indices = cluster_results
        summary_indices = []
        for indices in top_clustered_indices.values():
            summary_indices += indices
        summary = self.text[sorted(summary_indices)]
        if return_clusters:
            return summary, cluster_results
        return summary

    def graph_summary(self, nb_sentences=10, return_indices=False):
        nb_embeddings = self.text_embeddings.shape[0]
        sim_matrix = np.zeros([nb_embeddings, nb_embeddings])
        for i in range(nb_embeddings):
            for j in range(nb_embeddings):
                if i != j:
                    sim_matrix[i][j] = 1 - cosine_similarity([self.text_embeddings[i]], [self.text_embeddings[j]])[0][0]
        nx_graph = nx.from_numpy_array(sim_matrix)
        scores = nx.pagerank(nx_graph)
        summary_indices = np.argsort([scores[i] for i in range(nb_embeddings)])[::-1]
        summary = self.text[sorted(summary_indices[:nb_sentences])]
        if return_indices:
            return summary, summary_indices
        else:
            return summary

def summarize(model, method, text, nb_sentences, viz=False):
    summarizer = Summarizer()
    summarizer.init_model(model, log=True)
    summarizer.fit(text)
    if method == 'mean':
        summary = summarizer.mean_similarity_summary(nb_sentences=nb_sentences)
    elif method == 'clustering':
        summary, cluster_results = summarizer.clustering_summary(nb_clusters=nb_sentences, nb_top=2, return_clusters=True)
        labels, cluster_indices = cluster_results
        if viz:
            summarizer.text_visualization(cluster_labels=labels)
    elif method == 'graph':
        summary = summarizer.graph_summary(nb_sentences=nb_sentences)
    return summary


In [10]:
from datasets import load_dataset

# Charger le jeu de données de test
test_dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')


In [11]:
articles = test_dataset['article']
references = test_dataset['highlights']


In [12]:
# Exemple de résumé pour un article
article = articles[0]
reference = references[0]

# Générer le résumé
summary = summarize(model='bert', method='mean', text=[article], nb_sentences=3)
print("Résumé généré :", summary)
print("Résumé de référence :", reference)


Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Résumé généré : ['(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Pale

In [14]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24972 sha256=7f9ad3dc6105dbedcb74c77e7fc73eb48b1d36e5655dc268bad986e27e7a99ce
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\85\9d\af\01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [19]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [20]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Initialiser les métriques
rouge = Rouge()

# Fonction pour évaluer les résumés
def evaluate_summaries(summaries, references):
    bleu_scores = []
    rouge_scores = []
    for i, summary in enumerate(summaries):
        reference = references[i]
        bleu_score = sentence_bleu([reference.split()], summary.split())
        bleu_scores.append(bleu_score)
        rouge_score = rouge.get_scores(summary, reference)[0]
        rouge_scores.append(rouge_score)
    return bleu_scores, rouge_scores

# Générer des résumés pour tous les articles
summaries = [summarize(model='bert', method='mean', text=[article], nb_sentences=3) for article in articles]

# Évaluer les résumés
bleu_scores, rouge_scores = evaluate_summaries(summaries, references)

# Afficher les scores moyens
print(f"BLEU Score moyen: {np.mean(bleu_scores)}")
print(f"ROUGE Score moyen: {np.mean([score['rouge-1']['f'] for score in rouge_scores])}")


Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimension 768.
Summarizer: 'bert-base-uncased' chargé sur le cpu.
Summarizer fit: OK 1 embeddings de dimen

AttributeError: 'numpy.ndarray' object has no attribute 'split'

Modele BART

In [21]:
from datasets import load_dataset

# Charger le jeu de données de test
test_dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')

articles = test_dataset['article']
references = test_dataset['highlights']


In [22]:
from transformers import BartTokenizer, BartForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch

# Choisir le modèle (BART ou T5)
model_name = 'facebook/bart-large-cnn'  # ou 't5-base' pour T5

if 'bart' in model_name:
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
elif 't5' in model_name:
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

# Déplacer le modèle vers le GPU si disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [23]:
def tokenize_data(articles, references, tokenizer, max_length=512, target_max_length=150):
    inputs = tokenizer(articles, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
    targets = tokenizer(references, max_length=target_max_length, truncation=True, padding='max_length', return_tensors='pt')
    return inputs, targets

# Tokeniser les données
inputs, targets = tokenize_data(articles, references, tokenizer)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Définir l'optimiseur
optimizer = AdamW(model.parameters(), lr=5e-5)

# Définir le scheduler
num_epochs = 3
num_training_steps = len(inputs['input_ids']) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


In [None]:
from tqdm import tqdm

# Fonction d'entraînement
def train_model(model, inputs, targets, optimizer, scheduler, num_epochs, device):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for i in tqdm(range(len(inputs['input_ids'])), desc=f'Epoch {epoch+1}/{num_epochs}'):
            input_ids = inputs['input_ids'][i].unsqueeze(0).to(device)
            attention_mask = inputs['attention_mask'][i].unsqueeze(0).to(device)
            labels = targets['input_ids'][i].unsqueeze(0).to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(inputs["input_ids"])}')

# Entraîner le modèle
train_model(model, inputs, targets, optimizer, scheduler, num_epochs, device)


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Initialiser les métriques
rouge = Rouge()

# Fonction pour évaluer les résumés
def evaluate_summaries(model, tokenizer, articles, references, device, max_length=150):
    model.eval()
    bleu_scores = []
    rouge_scores = []
    with torch.no_grad():
        for i in tqdm(range(len(articles)), desc='Evaluating'):
            input_ids = tokenizer(articles[i], return_tensors='pt', max_length=512, truncation=True, padding='max_length').input_ids.to(device)
            summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
            summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
            reference = references[i]

            bleu_score = sentence_bleu([reference.split()], summary.split())
            bleu_scores.append(bleu_score)
            rouge_score = rouge.get_scores(summary, reference)[0]
            rouge_scores.append(rouge_score)

    return bleu_scores, rouge_scores

# Évaluer les résumés
bleu_scores, rouge_scores = evaluate_summaries(model, tokenizer, articles, references, device)

# Afficher les scores moyens
print(f"BLEU Score moyen: {np.mean(bleu_scores)}")
print(f"ROUGE Score moyen: {np.mean([score['rouge-1']['f'] for score in rouge_scores])}")


Génération de résumés avec BERT (approche extractive)

In [None]:
from datasets import load_dataset
import numpy as np
from tqdm import tqdm

# Charger le jeu de données de test
test_dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')

# Extrait les articles et les résumés de référence
articles = test_dataset['article']
references = test_dataset['highlights']

# Générer des résumés extractifs avec BERT
def generate_extractive_summaries(articles, nb_sentences=3):
    summaries = []
    for article in tqdm(articles, desc='Generating Extractive Summaries'):
        summary = summarize(model='bert', method='mean', text=[article], nb_sentences=nb_sentences)
        summaries.append(summary[0])  # Prendre le premier résumé généré
    return summaries

# Générer des résumés extractifs
extractive_summaries = generate_extractive_summaries(articles)


Génération de résumés avec BART (approche abstraite)

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Charger le modèle et le tokenizer BART
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Déplacer le modèle vers le GPU si disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Générer des résumés abstraits avec BART
def generate_abstractive_summaries(articles, tokenizer, model, device, max_length=150):
    summaries = []
    model.eval()
    with torch.no_grad():
        for article in tqdm(articles, desc='Generating Abstractive Summaries'):
            input_ids = tokenizer(article, return_tensors='pt', max_length=512, truncation=True, padding='max_length').input_ids.to(device)
            summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
            summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
            summaries.append(summary)
    return summaries

# Générer des résumés abstraits
abstractive_summaries = generate_abstractive_summaries(articles, tokenizer, model, device)


 Évaluation des résumés

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Initialiser les métriques
rouge = Rouge()

# Fonction pour évaluer les résumés
def evaluate_summaries(summaries, references):
    bleu_scores = []
    rouge_scores = []
    for i in range(len(summaries)):
        summary = summaries[i]
        reference = references[i]
        bleu_score = sentence_bleu([reference.split()], summary.split())
        bleu_scores.append(bleu_score)
        rouge_score = rouge.get_scores(summary, reference)[0]
        rouge_scores.append(rouge_score)
    return bleu_scores, rouge_scores

# Évaluer les résumés extractifs
bleu_scores_extractive, rouge_scores_extractive = evaluate_summaries(extractive_summaries, references)

# Évaluer les résumés abstraits
bleu_scores_abstractive, rouge_scores_abstractive = evaluate_summaries(abstractive_summaries, references)

# Afficher les scores moyens
print(f"BLEU Score moyen (Extractif): {np.mean(bleu_scores_extractive)}")
print(f"ROUGE Score moyen (Extractif): {np.mean([score['rouge-1']['f'] for score in rouge_scores_extractive])}")

print(f"BLEU Score moyen (Abstrait): {np.mean(bleu_scores_abstractive)}")
print(f"ROUGE Score moyen (Abstrait): {np.mean([score['rouge-1']['f'] for score in rouge_scores_abstractive])}")


Comparaison des résultats

In [None]:
# Comparaison des scores
print("Comparaison des scores BLEU et ROUGE :")
print(f"BLEU Score moyen (Extractif): {np.mean(bleu_scores_extractive)}")
print(f"ROUGE Score moyen (Extractif): {np.mean([score['rouge-1']['f'] for score in rouge_scores_extractive])}")

print(f"BLEU Score moyen (Abstrait): {np.mean(bleu_scores_abstractive)}")
print(f"ROUGE Score moyen (Abstrait): {np.mean([score['rouge-1']['f'] for score in rouge_scores_abstractive])}")

# Analyse des résultats
if np.mean(bleu_scores_extractive) > np.mean(bleu_scores_abstractive):
    print("L'approche extractive (BERT) a un meilleur score BLEU.")
else:
    print("L'approche abstraite (BART) a un meilleur score BLEU.")

if np.mean([score['rouge-1']['f'] for score in rouge_scores_extractive]) > np.mean([score['rouge-1']['f'] for score in rouge_scores_abstractive]):
    print("L'approche extractive (BERT) a un meilleur score ROUGE.")
else:
    print("L'approche abstraite (BART) a un meilleur score ROUGE.")
