In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch

# --- Training data -------------------------------------------------------
# Source CSV and the positional row indexes (first five rows) to train on.
csv_file_path = "/content/cleaned_first_100_issues.csv"
indexes_to_include = [0, 1, 2, 3, 4]

# Read the full CSV, then select the requested rows by position.
df = pd.read_csv(csv_file_path)
df_filtered = df.iloc[indexes_to_include]

# Wrap the selected rows in a Hugging Face Dataset for `.map()` / Trainer.
dataset = Dataset.from_pandas(df_filtered)


# Hub id of the checkpoint to fine-tune, and the directory name the
# fine-tuned model is saved under at the end of the script.
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
refined_model = "llama-2-7b-mlabonne-enhanced"

# NOTE(review): trust_remote_code=True allows code shipped with the hub repo
# to run locally; presumably not needed for this tokenizer -- confirm.
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# bitsandbytes settings: 4-bit NF4 weights, float16 compute dtype, and no
# double quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)


# Load the base checkpoint with the quantization settings above.
# device_map={"": 0} presumably pins the whole model to device 0 -- confirm.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}
)
# NOTE(review): cache disabled presumably because it is only useful for
# generation, not training -- confirm.
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

# Sequence length every example is padded/truncated to in tokenize_function.
max_length = 512


def calculate_weights(text):
    """Split *text* into tokenizer pieces and assign each a repeat weight.

    Every piece currently gets a weight of 1.

    Args:
        text: The string to tokenize with ``llama_tokenizer``.

    Returns:
        A ``(tokens, weights)`` tuple of two equally long lists: the token
        strings and their integer weights.
    """
    pieces = llama_tokenizer.tokenize(text)
    uniform_weights = [1 for _ in pieces]
    return pieces, uniform_weights

def tokenize_function(examples):
    """Tokenize a batch of issue bodies for causal-LM fine-tuning.

    Each body is split into tokens, every token is repeated according to its
    weight from ``calculate_weights``, and the resulting text is encoded to a
    fixed ``max_length`` with padding and truncation.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``; its
            ``"body"`` entry is a list of strings (``None`` is treated as an
            empty string).

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list with one ``max_length``-long list of ints per example.
        ``labels`` mirrors ``input_ids`` with padding positions set to -100.
    """
    tokenized_inputs = {"input_ids": [], "attention_mask": [], "labels": []}

    for text in examples["body"]:
        # Empty CSV cells reach us as None; tokenizing None would raise.
        if text is None:
            text = ""

        tokens, weights = calculate_weights(text)
        weighted_tokens = [token for token, weight in zip(tokens, weights) for _ in range(weight)]

        # Rebuild real text from the token pieces. Joining the pieces with
        # spaces would feed the tokenizer's internal word-boundary markers
        # back in as literal text and corrupt the re-encoded sequence.
        weighted_text = llama_tokenizer.convert_tokens_to_string(weighted_tokens)
        encoded = llama_tokenizer(weighted_text, padding="max_length", truncation=True, max_length=max_length)

        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]
        # The model only returns a loss when labels are supplied. Padding is
        # masked via the attention mask (not by token id) because the pad
        # token is the EOS token; -100 is ignored by the loss.
        labels = [
            token_id if is_real else -100
            for token_id, is_real in zip(input_ids, attention_mask)
        ]

        tokenized_inputs["input_ids"].append(input_ids)
        tokenized_inputs["attention_mask"].append(attention_mask)
        tokenized_inputs["labels"].append(labels)

    return tokenized_inputs


# Tokenize the dataset in batches and drop every original column so only the
# keys returned by tokenize_function remain.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)


# LoRA adapter settings: rank 8, alpha 16, 10% dropout, no bias terms
# trained, configured for a causal language model.
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)


# Wrap the quantized base model with the LoRA configuration above.
peft_model = get_peft_model(base_model, peft_parameters)


# Hyperparameters for the fine-tuning run; checkpoints and logs go to
# ./results_modified.
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="adamw_hf",
    save_steps=25,
    logging_steps=25,
    # Was 2e-1, roughly 1000x the usual AdamW/LoRA range; with a constant
    # schedule that step size makes the loss diverge almost immediately.
    learning_rate=2e-4,
    weight_decay=0.001,
    # Mixed-precision training is disabled for both half-precision formats.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # -1 leaves the run length to num_train_epochs.
    max_steps=-1,
    # NOTE(review): presumably has no effect with the plain "constant"
    # scheduler below -- confirm, or switch to "constant_with_warmup".
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)


# Build the Trainer around the LoRA-wrapped model and the tokenized dataset.
# NOTE(review): no data_collator is passed, so the dataset presumably has to
# supply a `labels` column itself for the model to return a loss -- confirm.
trainer = Trainer(
    model=peft_model,
    train_dataset=tokenized_datasets,
    tokenizer=llama_tokenizer,
    args=train_params
)


# Run fine-tuning.
trainer.train()


# Save the trained model under the `refined_model` directory name.
trainer.save_model(refined_model)
