In [None]:
!pip install datasets
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Load the SQuAD-style JSON file
with open('squad_arabe_enrichi_cleaned.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


def _iter_qa_rows(squad):
    """Yield one (question, answer, context) tuple per QA entry.

    Walks data -> paragraphs -> qas; every question is paired with the
    context of the paragraph it belongs to.
    """
    for article in squad['data']:
        for para in article['paragraphs']:
            passage = para['context']
            for qa in para['qas']:
                # All answer texts of a question are merged into a single
                # space-separated string (empty string when there are none).
                # NOTE(review): confirm that joining several answers is intended.
                yield qa['question'], " ".join(ans['text'] for ans in qa['answers']), passage


# Flatten the nested structure into three parallel lists
qa_rows = list(_iter_qa_rows(data))
questions = [row[0] for row in qa_rows]
answers = [row[1] for row in qa_rows]
contexts = [row[2] for row in qa_rows]  # paragraph context for the question-answering task

# Build a Hugging Face compatible dataset
dataset = Dataset.from_dict({
    'question': questions,
    'answer': answers,
    'context': contexts
})

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Load the SambaLingo-Arabic-Chat model and its tokenizer
model_name = "sambanovasystems/SambaLingo-Arabic-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# The preprocessing step tokenizes with padding=True, which requires a pad
# token. Fall back to the EOS token when the tokenizer does not define one
# (this is a no-op when a pad token already exists).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Tokenize the data
def preprocess_function(examples, tok=None, max_length=512):
    """Tokenize a batch of QA examples for causal-LM fine-tuning.

    Each example becomes one training sequence made of the context, the
    question and the answer (followed by the tokenizer's EOS token when it
    defines one). A ``labels`` column is added so that a loss can be
    computed during training; padded positions are set to -100 so they are
    ignored by the loss.

    Args:
        examples: Batch mapping with 'question', 'context' and 'answer'
            columns, each a list of strings of the same length.
        tok: Tokenizer callable to use. Defaults to the module-level
            ``tokenizer``.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        The tokenizer output (with 'input_ids' and 'attention_mask')
        extended with a 'labels' entry of the same shape as 'input_ids'.
    """
    tok = tokenizer if tok is None else tok
    eos = getattr(tok, "eos_token", None) or ""

    # The answer goes last so the model learns to produce it after having
    # seen the context and the question.
    # NOTE(review): with truncation from the end, a context longer than
    # max_length can cut the answer off -- confirm context lengths.
    texts = [
        f"{context}\n\n{question}\n\n{answer}{eos}"
        for context, question, answer in zip(
            examples['context'], examples['question'], examples['answer']
        )
    ]

    encoded = tok(texts, truncation=True, padding=True, max_length=max_length)

    # Labels mirror input_ids; the attention mask (rather than the pad id) is
    # used to find padding so a real EOS token is never masked out.
    encoded['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/986k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/780 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

In [None]:
# Apply tokenization
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Hold out part of the data so that evaluation is not run on the very
# examples the model was trained on.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Define the training arguments.
# The recorded run ended with a CUDA out-of-memory error on a 14.75 GiB GPU,
# so per-device batches are reduced to 1; gradient accumulation keeps the
# effective training batch size at 16, and gradient checkpointing trades
# compute for activation memory.
# NOTE(review): this may still not be enough to fully fine-tune this
# checkpoint on that GPU -- a larger GPU or a parameter-efficient method may
# be required.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    # NOTE(review): recent transformers releases name this `eval_strategy`;
    # switch if this keyword is rejected.
    evaluation_strategy="epoch",     # Evaluate once per epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=1,   # Training batch size per device
    gradient_accumulation_steps=16,  # 1 x 16 = effective batch size of 16
    per_device_eval_batch_size=1,    # Evaluation batch size per device
    gradient_checkpointing=True,     # Recompute activations to save memory
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay
    report_to="none",                # No interactive wandb login prompt
)

# Define the Trainer
trainer = Trainer(
    model=model,                          # The model to fine-tune
    args=training_args,                   # The training arguments
    train_dataset=split_datasets['train'],  # Training split
    eval_dataset=split_datasets['test']     # Held-out evaluation split
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model together with its tokenizer so the output
# directory can be reloaded on its own.
trainer.save_model('./fine_tuned_sambalingo_arabic_chat_model')
tokenizer.save_pretrained('./fine_tuned_sambalingo_arabic_chat_model')

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


OutOfMemoryError: CUDA out of memory. Tried to allocate 22.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 21.06 MiB is free. Process 9043 has 14.72 GiB memory in use. Of the allocated memory 14.42 GiB is allocated by PyTorch, and 186.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)