In [2]:
import os

import pandas as pd
import torch
from datasets import load_dataset, Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from trl import SFTTrainer, SFTConfig

In [3]:
# Padding token string; Finetune.loadModel registers it on the tokenizer as
# the "pad_token" special token.
PAD_TOKEN = "<|pad|>"

In [4]:
class Finetune():
    """LoRA fine-tuning pipeline for a 4-bit quantised causal language model.

    Constructing an instance immediately loads the tokenizer and the
    quantised base model, builds train/validation/test splits from a CSV
    file and wraps the model with LoRA adapters. Call ``finetune()`` to
    train, save the result locally and push the merged model to the Hub.

    Args:
        trainingData: Path to a CSV file with ``Question`` and ``Answer``
            columns.
        token: Hugging Face access token used for every download.
        testSplit: Fraction passed as ``test_size`` to both splits (first
            train vs. held-out, then validation vs. test within held-out).
        outputDir: Directory handed to the trainer as ``output_dir``.
        modelName: Identifier of the base model to fine-tune.
        newModelName: Local save path and Hub repository name of the result.
        seed: ``random_state`` for both splits so the written
            train/val/test files are reproducible; pass ``None`` for the
            previous unseeded behaviour.
    """

    def __init__(
            self,
            trainingData: str,
            token: str,
            testSplit: float = 0.2,
            outputDir: str = "./results",
            modelName: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
            newModelName: str = "Llama-8B-FT",
            seed: int = 42
        ):

        self.modelName = modelName
        self.newModelName = newModelName
        self.trainingData = trainingData
        self.token = token
        self.testSplit = testSplit
        self.outputDir = outputDir
        self.seed = seed

        # Order matters: loadDataset() formats rows with self.tokenizer, and
        # configLora() wraps self.model.
        self.model, self.tokenizer = self.loadModel()

        self.dataset = self.loadDataset()

        self.loraConfig = self.configLora()

    def loadModel(self):
        """Load the tokenizer and the 4-bit quantised base model.

        PAD_TOKEN is registered as the tokenizer's pad token and the model's
        embedding matrix is resized to the new vocabulary size, rounded up to
        a multiple of 8.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            self.modelName,
            token=self.token
        )
        tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
        tokenizer.padding_side = "right"

        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.modelName,
            quantization_config=quantization,
            token=self.token,
            device_map="auto"
        )
        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

        return model, tokenizer

    def loadDataset(self):
        """Build train/validation/test splits from the training CSV.

        Every row is rendered to a chat-formatted ``text`` column, the frame
        is split twice with ``testSplit`` as ``test_size``, and the three
        parts are written to ``train.json``, ``val.json`` and ``test.json``
        (JSON lines) in the current working directory before being loaded
        back with ``load_dataset``.

        Returns:
            The loaded dataset, indexable by ``"train"``, ``"validation"``
            and ``"test"``.
        """
        data = pd.read_csv(self.trainingData)
        data["text"] = data.apply(self.format_example, axis=1)

        # Seeded so that re-running the notebook reproduces the same splits.
        train, temp = train_test_split(
            data, test_size=self.testSplit, random_state=self.seed
        )
        # The held-out part is split again with the same ratio: ``testSplit``
        # of it becomes the test set, the remainder the validation set.
        val, test = train_test_split(
            temp, test_size=self.testSplit, random_state=self.seed
        )

        train.to_json("train.json", orient="records", lines=True)
        val.to_json("val.json", orient="records", lines=True)
        test.to_json("test.json", orient="records", lines=True)

        dataset = load_dataset("json", data_files={
            "train": "train.json",
            "validation": "val.json",
            "test": "test.json"
        })

        return dataset

    def configLora(self):
        """Wrap ``self.model`` with LoRA adapters and return the LoRA config.

        Side effect: ``self.model`` is replaced by the k-bit-prepared,
        PEFT-wrapped model, and the trainable-parameter report is printed.

        Returns:
            The ``LoraConfig`` that was applied.
        """
        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=[
                "self_attn.q_proj",
                "self_attn.k_proj",
                "self_attn.v_proj",
                "self_attn.o_proj",
                "mlp.gate_proj",
                "mlp.up_proj",
                "mlp.down_proj",
            ],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        self.model = prepare_model_for_kbit_training(self.model)
        self.model = get_peft_model(self.model, lora_config)

        # print_trainable_parameters() prints the report itself and returns
        # None, so it must not be interpolated into another print().
        self.model.print_trainable_parameters()

        return lora_config

    def finetune(self):
        """Train on the train split, save locally, then merge and publish.

        The trained model is saved to ``newModelName`` and
        ``saveNewModel()`` is called afterwards to push the merged model.
        """
        # NOTE(review): with lr_scheduler_type="constant" the warmup_ratio
        # below presumably has no effect ("constant_with_warmup" may be what
        # was intended) - confirm before changing.
        sftConfig = SFTConfig(
            output_dir=self.outputDir,
            dataset_text_field="text",
            max_seq_length=512,
            num_train_epochs=3,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=16,
            optim="paged_adamw_32bit",
            evaluation_strategy="steps",
            eval_steps=0.2,
            save_steps=0.2,
            logging_steps=10,
            learning_rate=2e-4,
            fp16=True,
            save_strategy="steps",
            warmup_ratio=0.1,
            save_total_limit=2,
            lr_scheduler_type="constant",
            report_to="tensorboard",
            save_safetensors=True,
            dataset_kwargs={
                "add_special_tokens": False,
                "append_concat_token": False
            }
        )

        # NOTE(review): self.model was already wrapped by get_peft_model() in
        # configLora() and peft_config is passed as well - confirm the
        # installed TRL version accepts both together.
        trainer = SFTTrainer(
            model=self.model,
            args=sftConfig,
            train_dataset=self.dataset["train"],
            eval_dataset=self.dataset["validation"],
            peft_config=self.loraConfig,
            tokenizer=self.tokenizer
        )

        trainer.train()

        trainer.save_model(self.newModelName)

        self.saveNewModel()

    def saveNewModel(self):
        """Merge the saved adapter into a fresh base model and push to the Hub.

        The tokenizer is loaded from ``newModelName``, the base model is
        reloaded in float16 from ``modelName``, its embeddings are resized to
        the tokenizer length (multiple of 8), the adapter stored under
        ``newModelName`` is attached and merged, and both model and tokenizer
        are pushed to the ``newModelName`` repository.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            self.newModelName,
            token=self.token
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.modelName,
            token=self.token,
            torch_dtype=torch.float16,
            device_map="auto"
        )

        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
        model = PeftModel.from_pretrained(model, self.newModelName)
        # merge_and_unload() returns the merged base model. Its result has to
        # be kept: pushing the PeftModel wrapper would publish the adapter
        # only, not the merged weights.
        model = model.merge_and_unload()

        model.push_to_hub(self.newModelName, max_shard_size="5GB")
        tokenizer.push_to_hub(self.newModelName)

    def format_example(self, row: dict):
        """Render one question/answer record with the tokenizer's chat template.

        Args:
            row: Mapping-like record with ``"Question"`` and ``"Answer"``
                entries (a DataFrame row when called from ``loadDataset``).

        Returns:
            Whatever ``apply_chat_template(..., tokenize=False)`` returns for
            the system/user/assistant conversation built from the row.
        """
        messages = [
            {
                "role": "system",
                "content": "IMPORT THIS"
            },
            {
                "role": "user",
                "content": row["Question"]
            },
            {
                "role": "assistant",
                "content": row["Answer"]
            }
        ]
        return self.tokenizer.apply_chat_template(messages, tokenize=False)

In [5]:
# %pip install gdown

In [6]:
# !gdown --id 1fry6rdlp1m67e6T1ODcntqc0M5tih01l

In [7]:
# ft = Finetune(
#     trainingData="Aptitude.csv",
#     token=os.environ["HF_TOKEN"]  # read the access token from the environment; never commit it
# )

# ft.finetune()

In [8]:
# Name the fine-tuned tokenizer and PEFT adapter are loaded from in the next cell.
NEW_MODEL = "Llama-8B-FT"
# Base model the adapter is attached to in the next cell.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
# Read the Hugging Face access token from the environment instead of embedding
# it in the notebook. If HF_TOKEN is unset this is None, which presumably lets
# the library fall back to a cached login - confirm for the installed version.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(
    NEW_MODEL,
    token=hf_token
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=hf_token,
    torch_dtype=torch.float16,
    device_map="cuda"
)

# Resize the embeddings to the fine-tuned tokenizer's length (rounded up to a
# multiple of 8) before attaching the adapter saved under NEW_MODEL.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
model = PeftModel.from_pretrained(model, NEW_MODEL)
# merge_and_unload() returns the merged base model; rebind `model` to it so a
# later push_to_hub() uploads the merged weights rather than the adapter wrapper.
model = model.merge_and_unload()

In [None]:
# Publish whatever `model` and `tokenizer` currently refer to into the same
# Hub repository; the model upload is sharded at 5GB.
hub_repo = "crackthejob/aptitudeLLM"
model.push_to_hub(hub_repo, max_shard_size="5GB")
tokenizer.push_to_hub(hub_repo)

In [None]:
%pip install pydrive

In [None]:
from pydrive.drive import GoogleDrive 
from pydrive.auth import GoogleAuth 
import os 
   
# Create the pydrive auth object and the Drive client; `drive` is used by the
# upload loop in a later cell.
gauth = GoogleAuth()

# NOTE(review): LocalWebserverAuth presumably needs a browser reachable from
# the machine running the kernel - confirm it works on a hosted runtime.
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

In [None]:
import os
# Show the entries of the current working directory as the cell output.
os.listdir(".")

In [None]:
# Upload every entry of `path` to Google Drive, using the entry name as the
# Drive file title.
# NOTE(review): `path` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel - define it explicitly before
# the loop. Entries that are directories are also handed to SetContentFile,
# which presumably fails for them - confirm.
for x in os.listdir(path):

    f = drive.CreateFile({'title': x})
    f.SetContentFile(os.path.join(path, x))
    f.Upload()

    # Due to a known bug in pydrive, if we
    # don't empty the variable used to
    # upload the files to Google Drive, the
    # file stays open in memory and causes a
    # memory leak, therefore preventing its
    # deletion.
    f = None

In [6]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name, working_dir='/kaggle/working/'):
    """Zip ``path`` into ``<working_dir>/<download_file_name>.zip`` and show a link.

    The process's current directory is switched to ``working_dir`` (as the
    original did), so a relative ``path`` is resolved against it and the
    displayed link is relative to it.

    Args:
        path: File or directory to archive; directories are added recursively.
            It is passed to ``zip`` verbatim, so shell wildcards are not
            expanded.
        download_file_name: Archive name without the ``.zip`` extension.
        working_dir: Directory to switch into and write the archive to.

    Returns:
        None. When zip cannot be run or exits non-zero, a message and the
        error output are printed instead of the download link.
    """
    os.chdir(working_dir)
    zip_name = os.path.join(working_dir, f"{download_file_name}.zip")
    # Argument list without a shell: spaces or shell metacharacters in either
    # name can no longer split the command or inject extra commands.
    command = ["zip", "-r", zip_name, path]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as error:
        # Without a shell, a missing zip executable raises instead of
        # producing a non-zero return code; report it the same way.
        print("Unable to run zip command!")
        print(error)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [None]:
# Zip the "results" path into results.zip and display a download link for it.
download_file("results", "results")