***AI Hackathon-AIRBUS HELICOPTERS 2024***

***
<center>

***Text Summarization using T5 Transformer***

</center>

***


**Team Name:** ***BrainWave***


***Members:*** 
- Jean-baptiste GOMEZ
- Hicham EL MAKAOUI 
- Nelly AGOSSOU
- Oussama RHITI
- Ulrich SEGODO





## Installation of requirements and dependencies

In [None]:
# Install (or upgrade, for the two `-U` lines) the third-party packages used
# by this notebook. These are IPython shell commands, not Python statements.
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install rouge_score
!pip install evaluate
!pip install pyspellchecker
!pip install tensorflow

Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.37.2
Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (11

## Imports

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# NLP
import string, re, nltk
nltk.download('averaged_perceptron_tagger')
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import torch
import evaluate
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
import glob
import json


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Read the provided training JSON, transpose it so that its fields
# (e.g. original_text, reference_summary, uid) become columns, and
# renumber the rows from 0.
airbus_data = (
    pd.read_json('/content/airbus_helicopters_train_set.json', encoding="utf-8")
    .transpose()
    .reset_index(drop=True)
)


## Prepare Dataset

In [None]:
# RegexpTokenizer: tokens are runs of word characters and apostrophes, so a
# contraction such as "don't" stays in one piece.
# The pattern is a raw string because "\w" inside a normal string literal is
# an invalid escape sequence.
regexp = RegexpTokenizer(r"[\w']+")

# Converting to lowercase
def convert_to_lowercase(text):
    """Return a copy of ``text`` with all cased characters lower-cased."""
    lowered = text.lower()
    return lowered

# Removing leading and trailing whitespace
def remove_whitespace(text):
    """Return ``text`` without its leading and trailing whitespace."""
    trimmed = text.strip()
    return trimmed

# Removing punctuations
def remove_punctuation(text):
    """Delete ASCII punctuation from ``text``, keeping apostrophes.

    The apostrophe is spared so that contractions stay intact.
    """
    # Translation table mapping every punctuation character except "'" to None.
    deletions = {ord(char): None for char in string.punctuation if char != "'"}
    return text.translate(deletions)

# Removing HTML tags
def remove_html(text):
    """Strip every ``<...>`` span (shortest match) from ``text``."""
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)

# Removing emojis
# Characters treated as emoji/pictographs by remove_emoji(); compiled once.
# The previous pattern contained the single range U+24C2..U+1F251, which also
# covers CJK ideographs, Hangul, ligatures and many other ordinary characters,
# so non-emoji text was silently deleted. It is replaced by the symbol blocks
# inside that range.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "\U00002702-\U000027B0"  # dingbats
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled M
    "\U00003030\U0000303D"   # wavy dash, part alternation mark
    "\U00003297\U00003299"   # circled ideographs
    "\U0000FE0F"             # emoji variation selector
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the symbol blocks listed in ``_EMOJI_PATTERN`` are deleted; letters
    of any script are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)

# Removing URLs (http://, https:// and www. links)
# Raw string: "\S" and "\." are invalid escapes in a normal string literal.
_URL_PATTERN = re.compile(r"(https?://\S+|www\.\S+)")


def remove_http(text):
    """Return ``text`` with URLs removed.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` up
    to the next whitespace character. A bare "http" is not matched.
    """
    return _URL_PATTERN.sub("", text)

# Acronym table, downloaded as a pandas Series indexed by acronym.
acronyms_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_acronyms.json'
acronyms_dict = pd.read_json(acronyms_url, typ='series')
# The acronyms themselves (the index of the Series) as a plain list.
acronyms_list = acronyms_dict.index.tolist()

# Function to convert acronyms in a text
def convert_acronyms(text):
    """Expand the known acronyms found in ``text``.

    The text is split with the module-level ``regexp`` tokenizer. A token
    that is a key of ``acronyms_dict`` is replaced by the words of its
    expansion; any other token is kept unchanged.

    Returns:
        The resulting words joined by single spaces.
    """
    words = []
    for word in regexp.tokenize(text):
        # Membership test on the Series index instead of scanning
        # acronyms_list for every token.
        if word in acronyms_dict.index:
            words.extend(acronyms_dict[word].split())
        else:
            words.extend(word.split())

    return " ".join(words)

# Contraction table, downloaded as a pandas Series indexed by contraction.
contractions_url = 'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/JSON/english_contractions.json'
contractions_dict = pd.read_json(contractions_url, typ='series')

# The contractions themselves (the index of the Series) as a plain list.
contractions_list = contractions_dict.index.tolist()

# Function to convert contractions in a text
def convert_contractions(text):
    """Expand the known contractions found in ``text``.

    The text is split with the module-level ``regexp`` tokenizer. A token
    that is a key of ``contractions_dict`` is replaced by the words of its
    expansion; any other token is kept unchanged.

    Returns:
        The resulting words joined by single spaces.
    """
    words = []
    for word in regexp.tokenize(text):
        # Membership test on the Series index instead of scanning
        # contractions_list for every token.
        if word in contractions_dict.index:
            words.extend(contractions_dict[word].split())
        else:
            words.extend(word.split())

    return " ".join(words)

# pyspellchecker: one shared SpellChecker instance, created with its default
# arguments and used by the pyspellchecker() function.
spell = SpellChecker()

def pyspellchecker(text):
    """Spell-correct ``text`` word by word with the shared ``spell`` checker.

    The text is split with the module-level ``regexp`` tokenizer. Words the
    checker reports as unknown are replaced by its suggested correction; all
    other words are kept unchanged.

    Returns:
        The resulting words joined by single spaces.
    """
    word_list = regexp.tokenize(text)
    # Computed once for the whole text rather than once per word.
    unknown_words = spell.unknown(word_list)
    word_list_corrected = []
    for word in word_list:
        if word in unknown_words:
            word_corrected = spell.correction(word)
            # Keep the original word when the checker offers no correction.
            word_list_corrected.append(word if word_corrected is None else word_corrected)
        else:
            word_list_corrected.append(word)
    return " ".join(word_list_corrected)

# Additional stopwords: every single lowercase ASCII letter, plus a few
# accented character sequences listed explicitly.

alphabets = list(string.ascii_lowercase)

others = ["ã", "å", "ì", "û", "ûªm", "ûó", "ûò", "ìñ", "ûªre", "ûªve", "ûª", "ûªs", "ûówe"]
additional_stops = [*alphabets, *others]

def remove_additional_stopwords(text):
    """Drop every token of ``text`` that appears in ``additional_stops``.

    Tokens come from the module-level ``regexp`` tokenizer; the remaining
    ones are joined by single spaces.
    """
    kept = []
    for token in regexp.tokenize(text):
        if token in additional_stops:
            continue
        kept.append(token)
    return " ".join(kept)


In [None]:
def text_normalizer(text):
    """Normalize one raw document before it is tokenized for the model.

    Steps, in order: lower-case, trim, flatten to one line, drop
    square-bracketed spans, drop URLs, drop HTML tags, drop punctuation
    (apostrophes kept), drop emoji, expand acronyms and contractions, and
    remove the additional stopwords.

    Args:
        text: the raw document.

    Returns:
        The normalized text as a single line of space-separated tokens.
    """
    text = convert_to_lowercase(text)
    text = remove_whitespace(text)
    # Line breaks become spaces; deleting them outright would glue the last
    # word of a line to the first word of the next one.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)  # removing square-bracketed spans
    text = remove_http(text)
    # HTML tags must be removed before punctuation: remove_punctuation()
    # strips the "<" and ">" that remove_html() needs in order to find a tag.
    text = remove_html(text)
    text = remove_punctuation(text)
    text = remove_emoji(text)
    text = convert_acronyms(text)
    text = convert_contractions(text)
    text = remove_additional_stopwords(text)
    return text

In [None]:
# Build the training frame: normalized source text next to the unchanged
# reference summaries and uids.
data = pd.DataFrame({
    'original_text': airbus_data['original_text'].apply(text_normalizer),
    'reference_summary': airbus_data['reference_summary'],
    'uid': airbus_data['uid'],
})

airbus_train = data

In [None]:
# Split data: 20% of the rows are held out for validation. Rows are shuffled
# with a fixed seed (random_state=42) so the split is reproducible.
dataset_train, dataset_valid = train_test_split(airbus_train, test_size=0.2, shuffle=True, random_state=42)


## Configurations

In [None]:
# Checkpoint name loaded for both the tokenizer and the model.
MODEL = 't5-base'
# Per-device batch size, used for training and for evaluation.
BATCH_SIZE = 4
# Number of worker processes used by Dataset.map() during tokenization.
NUM_PROCS = 4
# Number of training epochs.
EPOCHS = 10
# Directory that receives checkpoints, TensorBoard logs and the saved tokenizer.
OUT_DIR = 'results_t5base'
# Maximum context length to consider while preparing dataset.
# preprocess_function() applies it to the source texts and to the summaries.
MAX_LENGTH = 512

## Tokenization

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = T5Tokenizer.from_pretrained(MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Convert the pandas DataFrames into Hugging Face datasets
hf_dataset_train = Dataset.from_pandas(dataset_train)
hf_dataset_valid = Dataset.from_pandas(dataset_valid)



In [None]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        examples: a batch as passed by ``Dataset.map(batched=True)``, with
            the columns ``original_text`` and ``reference_summary``.

    Returns:
        The tokenizer output for the source texts (each prefixed with
        "summarize: "), plus a ``labels`` entry holding the token ids of the
        reference summaries. Both are truncated and padded to ``MAX_LENGTH``.
    """
    inputs = [f"summarize: {article}" for article in examples['original_text']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Tokenize the targets through `text_target`, which replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=list(examples['reference_summary']),
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # NOTE(review): the label padding keeps the pad token id, so padded
    # positions count towards the training loss. Masking them with -100 is
    # the usual practice, but compute_metrics() would then also have to
    # ignore the predictions at those positions -- change both together.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits with identical settings: batched calls to
# preprocess_function, spread over NUM_PROCS worker processes.
tokenized_train, tokenized_valid = [
    split.map(preprocess_function, batched=True, num_proc=NUM_PROCS)
    for split in (hf_dataset_train, hf_dataset_valid)
]

Map (num_proc=4):   0%|          | 0/330 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/83 [00:00<?, ? examples/s]



## Model

In [None]:
# Load the pretrained checkpoint and move it to the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained(MODEL)
model.to(device)

# Report the overall parameter count and how many parameters require gradients.
total_params = 0
total_trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        total_trainable_params += param.numel()
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

222,903,552 total parameters.
222,903,552 training parameters.


## ROUGE Metric

In [None]:
# Load the ROUGE metric through the `evaluate` library; compute_metrics() uses it.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean prediction length for an evaluation run.

    Args:
        eval_pred: object whose ``predictions[0]`` holds the predicted token
            ids (as returned by ``preprocess_logits_for_metrics``) and whose
            ``label_ids`` holds the reference token ids.

    Returns:
        A dict with ``rouge1``, ``rouge2``, ``rougeL`` and ``gen_len`` (mean
        number of non-pad predicted tokens), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids
    pad_token_id = tokenizer.pad_token_id

    # -100 is an ignore/padding marker, not a vocabulary id, so it must be
    # replaced before decoding. The original did this for the labels only;
    # predictions are now treated the same way.
    predictions = np.where(predictions != -100, predictions, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    # Length of each prediction, counted in non-pad tokens.
    prediction_lens = [np.count_nonzero(pred != pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce the model output to predicted token ids before it is stored.

    Only the arg-max over the last dimension of ``logits[0]`` is kept, as a
    workaround to avoid storing too many tensors that are not needed.
    ``labels`` is returned unchanged.
    """
    token_scores = logits[0]
    pred_ids = token_scores.argmax(dim=-1)
    return pred_ids, labels

## Training

In [None]:
# Training configuration. Checkpoints and TensorBoard logs both go to OUT_DIR.
# Evaluation runs every 200 steps, while a checkpoint is written once per
# epoch and only the two most recent checkpoints are kept.
# NOTE(review): `evaluation_strategy` may be named differently in other
# transformers releases -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # NOTE(review): the recorded run ended at step 830, so these 500 warmup
    # steps cover more than half of the training -- confirm this is intended.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=2
)

# preprocess_logits_for_metrics turns the logits into token ids, and
# compute_metrics then derives the ROUGE scores from those ids.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

# Run the fine-tuning; the return value of train() is kept in `history`.
history = trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.131,0.13261,0.7098,0.4854,0.683,36.4337
400,0.0741,0.090479,0.7409,0.5392,0.7193,37.4096
600,0.0487,0.092662,0.7591,0.5665,0.7399,37.4337
800,0.0377,0.096615,0.7621,0.5665,0.7418,37.4337


In [None]:
# Save the tokenizer files into OUT_DIR; the inference section reloads the
# tokenizer from that directory.
tokenizer.save_pretrained(OUT_DIR)

('results_t5base/tokenizer_config.json',
 'results_t5base/special_tokens_map.json',
 'results_t5base/spiece.model',
 'results_t5base/added_tokens.json')

In [None]:
# Recursively pack the output directory into a zip archive named after it
# (IPython shell command; {OUT_DIR} is filled in from the Python variable).
!zip -r {OUT_DIR} {OUT_DIR}

  adding: results_t5base/ (stored 0%)
  adding: results_t5base/checkpoint-830/ (stored 0%)
  adding: results_t5base/checkpoint-830/config.json (deflated 63%)
  adding: results_t5base/checkpoint-830/rng_state.pth (deflated 25%)
  adding: results_t5base/checkpoint-830/optimizer.pt (deflated 7%)
  adding: results_t5base/checkpoint-830/model.safetensors (deflated 9%)
  adding: results_t5base/checkpoint-830/scheduler.pt (deflated 57%)
  adding: results_t5base/checkpoint-830/generation_config.json (deflated 27%)
  adding: results_t5base/checkpoint-830/trainer_state.json (deflated 84%)
  adding: results_t5base/checkpoint-830/training_args.bin (deflated 51%)
  adding: results_t5base/special_tokens_map.json (deflated 85%)
  adding: results_t5base/events.out.tfevents.1707919292.4a799e00c952.1209.0 (deflated 66%)
  adding: results_t5base/added_tokens.json (deflated 83%)
  adding: results_t5base/checkpoint-747/ (stored 0%)
  adding: results_t5base/checkpoint-747/config.json (deflated 63%)
  adding

## Inference

In [None]:
# Load the fine-tuned model from the most recent Trainer checkpoint, i.e. the
# "checkpoint-<step>" directory in OUT_DIR with the highest step number.
# A hard-coded step (830 in the recorded run) would break as soon as the
# dataset size, the batch size or the number of epochs changes.
checkpoint_dirs = [
    path for path in glob.glob(f"{OUT_DIR}/checkpoint-*")
    if path.rsplit("-", 1)[-1].isdigit()
]
if not checkpoint_dirs:
    raise FileNotFoundError(f"No checkpoint-<step> directory found in {OUT_DIR!r}")
model_path = max(checkpoint_dirs, key=lambda path: int(path.rsplit("-", 1)[-1]))
model = T5ForConditionalGeneration.from_pretrained(model_path)
# The tokenizer was saved directly into OUT_DIR, not into the checkpoint.
tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5, max_summary_length=150):
    """Generate a summary of ``text`` with beam search.

    Args:
        text: the (already normalized) document to summarize.
        model: model exposing ``generate()``.
        tokenizer: tokenizer exposing ``encode()`` and ``decode()``.
        max_length: maximum number of input tokens; longer inputs are truncated.
        num_beams: number of beams used for generation.
        max_summary_length: maximum number of tokens in the generated summary
            (150, the value that used to be hard-coded, by default).

    Returns:
        The decoded summary, without special tokens.
    """
    # Preprocess the text
    inputs = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=max_length, truncation=True)
    # The input ids must be on the same device as the model, otherwise
    # generate() fails when the model has been moved to a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)
    # Generate the summary
    summary_ids = model.generate(inputs, max_length=max_summary_length, num_beams=num_beams, early_stopping=True)
    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate summaries on the test set

In [None]:
# Read the JSON test set, transpose it so that its fields become columns,
# and renumber the rows from 0.
test_set = (
    pd.read_json('/content/test_set.json', encoding="utf-8")
    .transpose()
    .reset_index(drop=True)
)
test_set.head(3)

Unnamed: 0,original_text,uid
0,Customer’s Orders shall be confirmed by the Se...,test_sum01
1,Copying and/or reproducing and/or communicatio...,test_sum010
2,In no event shall the Parties be liable for an...,test_sum011


In [None]:
# Apply the same normalization as for training and keep the uid of each row.
clean_test_set = pd.DataFrame({
    'original_text': test_set['original_text'].apply(text_normalizer),
    'uid': test_set['uid'],
})

clean_test_set.head(3)

Unnamed: 0,original_text,uid
0,customer orders shall be confirmed by the sell...,test_sum01
1,copying andor reproducing andor communication ...,test_sum010
2,in no event shall the parties be liable for an...,test_sum011


In [None]:
# Generate one summary per normalized test document. `uid` and
# `generated_summaries` stay aligned because both follow the row order of
# clean_test_set. No module-level variable named `id` is created any more,
# so the builtin id() is not shadowed in later cells.
uid = clean_test_set['uid'].tolist()
generated_summaries = [
    summarize_text(original_text, model, tokenizer)
    for original_text in clean_test_set['original_text']
]


In [None]:
def clean_text(text):
    """Remove ASCII apostrophes from ``text`` and normalize its spacing.

    Runs of whitespace collapse to a single space, and leading and trailing
    whitespace is dropped.
    """
    words = text.replace("'", "").split()
    return " ".join(words)

# Collect the cleaned summaries and their uids in a single frame.
df_test = pd.DataFrame(
    {
        'generated_summary': [clean_text(summary) for summary in generated_summaries],
        'uid': uid,
    }
)
df_test.head(6)

Unnamed: 0,generated_summary,uid
0,Customer’s orders shall be confirmed by the Se...,test_sum01
1,Copying and/or reproducing and/or communicatio...,test_sum010
2,In no event shall the Parties be liable for an...,test_sum011
3,Partie may terminate all or part of the Contra...,test_sum012
4,AH owns all foreground intellectual property a...,test_sum013
5,The content and time schedule of the training ...,test_sum014


In [None]:
# Map every uid to its record, which holds the generated summary and the uid.
data_json = {
    key: {"generated_summary": summary, "uid": key}
    for summary, key in zip(df_test["generated_summary"], df_test["uid"])
}

# Serialize to JSON text, keeping non-ASCII characters as they are.
json_data = json.dumps(data_json, ensure_ascii=False, indent=4)

# Write the JSON text to the output file, encoded as UTF-8.
with open("generated_set.json", "w", encoding="utf-8") as json_file:
    json_file.write(json_data)
