# **Install Dependencies**

In [None]:
!pip install -q bitsandbytes  accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [None]:
!pip install -U datasets

# **Setup Model**

In [None]:

from datasets import load_dataset

import os
import torch  
os.environ['CUDA_VISIBLE_DEVICES']='0,1'
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

base_model = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = model.generation_config.eos_token_id

In [5]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=

# **Data**

In [10]:
from datasets import load_dataset

dataset = load_dataset("xfordanita/code-summary-java")

README.md:   0%|          | 0.00/538 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/74.7M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/26.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/285670 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/31741 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/79352 [00:00<?, ? examples/s]

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['code', 'summary'],
        num_rows: 285670
    })
    validation: Dataset({
        features: ['code', 'summary'],
        num_rows: 31741
    })
    test: Dataset({
        features: ['code', 'summary'],
        num_rows: 79352
    })
})

> **Divisez le dataset en plusieurs parties si la taille est trop importante pour vos ressources.**

In [9]:
import random
from datasets import load_dataset, Dataset  # <-- Import Dataset class

# Splitting ratios (adjust as needed)
test_size = 0.8  # 20% for testing
val_size = 0.2  # 10% for validation


# Split into test set (preserving order)
test_indices = random.sample(range(len(test_dataset)), int(len(test_dataset) * test_size))
tests_dataset = Dataset.from_dict({
    feature: [test_dataset[i][feature] for i in test_indices] for feature in test_dataset.features
})

# Remove test indices from remaining data for further splitting
train_indices = [i for i in range(len(test_dataset)) if i not in test_indices]
dataset_subset = test_dataset.select(train_indices)

num_train_samples = len(dataset_subset)
num_val_samples = int(num_train_samples * val_size)
val_indices = random.sample(range(num_train_samples), num_val_samples)
val_dataset = Dataset.from_dict({
    feature: [dataset_subset[i][feature] for i in val_indices] for feature in dataset_subset.features
})

# Update training set with remaining indices
train_indices = [i for i in range(num_train_samples) if i not in val_indices]
val_dataset = dataset_subset.select(train_indices)

# Print information about the final splits
print("Training set size:", len(test_dataset))
print("Validation set size:", len(val_dataset))
print("Test set size:", len(tests_dataset))


Training set size: 285670
Validation set size: 45708
Test set size: 228536


In [7]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

In [8]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=True,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
def preprocess_json_file(train_data):
    prompt_template = f"""
### instruction:
Explain the functionality of the provided code below.
### input:
{train_data["code"]}

### output:
{train_data["summary"]}
"""

    return tokenize(prompt_template)



In [11]:
tokenized_train_dataset = dataset['train'].map(preprocess_json_file)
tokenized_val_dataset=dataset['validation'].map(preprocess_json_file)

Map:   0%|          | 0/285670 [00:00<?, ? examples/s]

Map:   0%|          | 0/31741 [00:00<?, ? examples/s]

# **Share Dataset on the 🤗 Hub**

In [28]:
import random
from datasets import load_dataset, DatasetDict

split_dataset = DatasetDict({
    "train": tokenized_train_dataset ,
    "validation": tokenized_val_dataset ,
    "test": tokenized_test_dataset 
})


split_dataset.push_to_hub("code-summary-java-tokenizeddata", private=False)  # Replace with your actual username and desired dataset name


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/96 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/96 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/96 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/xfordanita/code-summary-java-tokenizeddata/commit/3de7c9b6f2f0d4a36bca695cef2a154933882c45', commit_message='Upload dataset', commit_description='', oid='3de7c9b6f2f0d4a36bca695cef2a154933882c45', pr_url=None, pr_revision=None, pr_num=None)

# *Freezing the original weights*

In [24]:
for param in model.parameters():
  param.requires_grad = False 
  if param.ndim == 1:
    param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable()  
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

# *Setting up the LoRa Adapters*

In [25]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [26]:
from peft import LoraConfig, get_peft_model 
config = LoraConfig(
      lora_alpha=16,
      lora_dropout=0.1,
      r=64,
      bias="none",
      task_type="CAUSAL_LM"
)

model = get_peft_model(model,config)
print_trainable_parameters(model)

trainable params: 33554432 || all params: 3534098432 || trainable%: 0.9494481448557458


In [27]:
input_dtype = model.dtype
print("Type de données accepté pour les entrées:", input_dtype)

Type de données accepté pour les entrées: torch.float16


In [28]:
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True


In [29]:
model.config.use_cache = False

In [18]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [21]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# ***Training***

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
training_arguments = TrainingArguments(
    output_dir='./results',
    num_train_epochs=16,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.1,  # Utilisation d'une valeur plus élevée pour la régularisation L2
    fp16=True,
    max_grad_norm=1.0,  # Réduire la taille maximale des gradients pour éviter les explosions de gradients
    max_steps=-1,
    warmup_ratio=0.1,  # Augmentation du ratio de warmup
    group_by_length=True,
    lr_scheduler_type="constant",  # Utilisation d'un taux d'apprentissage constant
    report_to="tensorboard"
)

In [30]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


trainer.train()
trainer.model.save_pretrained(new_model)

  trainer = Trainer(
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacty of 14.74 GiB of which 2.87 GiB is free. Process 2736 has 11.87 GiB memory in use. Of the allocated memory 9.70 GiB is allocated by PyTorch, and 2.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [44]:
trainer.tokenizer.save_pretrained(new_model)

('codellama-7b-chuk-test/tokenizer_config.json',
 'codellama-7b-chuk-test/special_tokens_map.json',
 'codellama-7b-chuk-test/tokenizer.model',
 'codellama-7b-chuk-test/added_tokens.json',
 'codellama-7b-chuk-test/tokenizer.json')

In [48]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32016, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
       

# *Merge the LoRA adapter and optionally upload model*

In [39]:
from peft import LoraConfig, PeftModel
model_name='codellama/CodeLlama-7b-hf'

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 1.06 MiB is free. Process 27758 has 14.68 GiB memory in use. Of the allocated memory 14.50 GiB is allocated by PyTorch, and 49.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [89]:
output_dir = "/kaggle/working/Llamacode-7b"
model.config.save_pretrained(output_dir)
# Sauvegarder le tokenizer dans le même dossier
tokenizer.save_pretrained(output_dir)


('/kaggle/working/Llamacode-7b/tokenizer_config.json',
 '/kaggle/working/Llamacode-7b/special_tokens_map.json',
 '/kaggle/working/Llamacode-7b/tokenizer.model',
 '/kaggle/working/Llamacode-7b/added_tokens.json',
 '/kaggle/working/Llamacode-7b/tokenizer.json')

# *Share adapters on the 🤗 Hub*

In [90]:
model.push_to_hub("Llama_code-7b", tokenizer, dataset="xfordanita/code-summary-java")
tokenizer.push_to_hub("Llama_code-7b", tokenizer, dataset="xfordanita/code-summary-java")

adapter_model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/xforx/Llama_code-7b/commit/dcbcb6a809784d9a8a0c63581b4ceb98a26b0158', commit_message='Upload tokenizer', commit_description='', oid='dcbcb6a809784d9a8a0c63581b4ceb98a26b0158', pr_url=None, pr_revision=None, pr_num=None)

# *Inference*

In [None]:

eval_prompt = """
### instruction:
Explain the functionality of the given code below.

### input:
public void setServletRegistrationBeans( Collection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); this.servletRegistrationBeans = new LinkedHashSet<>(servletRegistrationBeans); }


### output:
"""


model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))



In [3]:

eval_prompt = """
### instruction:
Explain the functionality of the given code below.

### input:
public void setServletRegistrationBeans( Collection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); this.servletRegistrationBeans = new LinkedHashSet<>(servletRegistrationBeans); }


### output:
"""


model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))






### instruction:
Explain the functionality of the given code below.

### input:
public void setServletRegistrationBeans( Collection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); this.servletRegistrationBeans = new LinkedHashSet<>(servletRegistrationBeans); }


### output:
This method is used to set the servlet registration beans.

### code:
public void setServletRegistrationBeans( Collection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); this.servletRegistrationBeans = new LinkedHashSet<>(servletRegistrationBeans); }


### explanation:
This method is used


In [7]:

eval_prompt = """
### instruction:
Explain the functionality of the given code below.

### input:
public void setServletRegistrationBeans( Collection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); this.servletRegistrationBeans = new LinkedHashSet<>(servletRegistrationBeans); }
### output:
"""


model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))




### instruction:
Explain the functionality of the given code below.

### input:
public void setServletRegistrationBeans( Collection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); this.servletRegistrationBeans = new LinkedHashSet<>(servletRegistrationBeans); }


### output:
This method is used to set the servlet registration beans.

### code:
public void setServletRegistrationBeans( Collection<? extends ServletRegistrationBean<?>> servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); this.servletRegistrationBeans = new LinkedHashSet<>(servletRegistrationBeans); }


### explanation:
This method is used


In [8]:

eval_prompt = """
### instruction:
Explain the functionality of the given code below.

### input:
public void addServletRegistrationBeans( ServletRegistrationBean<?>... servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); Collections.addAll(this.servletRegistrationBeans, servletRegistrationBeans); }

### output:
"""


model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))




### instruction:
Explain the functionality of the given code below.

### input:
public void addServletRegistrationBeans( ServletRegistrationBean<?>... servletRegistrationBeans) { Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null"); Collections.addAll(this.servletRegistrationBeans, servletRegistrationBeans); }

### output:
This method is used to add the servlet registration beans to the servlet registration beans list.

### code:
```java
public void addServletRegistrationBeans(ServletRegistrationBean<?>... servletRegistrationBeans) {
    Assert.notNull(servletRegistrationBeans, "ServletRegistrationBeans must not be null");
    Collections.addAll(this.servletRegistrationBeans, servletRegistrationBeans);
}


In [4]:

eval_prompt = """
### instruction:
Explain the functionality of the given code below.

### input:
public static PagableQueryParams of(final Optional<String> optOffset, final Optional<Integer> optLimit) { return PagableQueryParams.of(optOffset, optLimit, Optional.<SortDirection> absent(), Optional.<Boolean> absent()); }
### output:
"""


model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))




### instruction:
Explain the functionality of the given code below.

### input:
public static PagableQueryParams of(final Optional<String> optOffset, final Optional<Integer> optLimit) { return PagableQueryParams.of(optOffset, optLimit, Optional.<SortDirection> absent(), Optional.<Boolean> absent()); }
### output:
The method returns a new instance of PagableQueryParams.

### code:
```java
public static PagableQueryParams of(final Optional<String> optOffset, final Optional<Integer> optLimit) {
    return PagableQueryParams.of(optOffset, optLimit, Optional.<SortDirection>absent(), Optional.<Boolean>absent());
}
```

### question:
Explain the functionality of the given code


In [None]:

import transformers
prompt="Explain the functionality of the provided code below"
pipe= transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=3000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)
result = pipe(f"""
### instruction:
{prompt}
### input:
""""print('Hello, world!')""""
### output:
""")
print(result[0]['generated_text'])