In [44]:
# Check whether PyTorch can see a CUDA GPU and, if so, report its name.

import torch

print(torch.cuda.is_available())
# Querying the device name fails when no CUDA device is usable, so only ask
# for it when CUDA is reported as available; otherwise say so explicitly.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected")

True
NVIDIA GeForce RTX 3070


In [45]:
# Fetch config.json for TinyLlama-1.1B-Chat-v1.0 from the Hugging Face Hub.
# Only this single file is requested here; the call's return value is the
# cell output.

from huggingface_hub import hf_hub_download

hf_hub_download(
    filename="config.json",
    repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)

'/home/reggie/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/config.json'

In [46]:
import pandas as pd
# Read the swim Q&A CSV into a DataFrame; the bare name on the last line
# renders the frame as the cell output.
# NOTE(review): the displayed "Unnamed: 0" column looks like a saved index;
# consider index_col=0 once it is confirmed nothing downstream relies on it.
raw_data = pd.read_csv(
    filepath_or_buffer='swim_dataset.csv',
)
raw_data

Unnamed: 0,Question,Answer
0,What is the importance of body alignment in fr...,Maintaining a streamlined body position reduce...
1,How does head position affect freestyle techni...,Keeping the head in line with the spine and lo...
2,What is the role of hip rotation in freestyle?,Proper hip rotation enhances stroke length and...
3,Why is a high elbow important during the catch...,A high elbow position during the catch phase a...
4,How can I improve my freestyle breathing techn...,Practice bilateral breathing and exhale fully ...
...,...,...
512,What strategies can help me increase my breast...,Perform short sprints with a focus on quick t...
513,How do I improve my breaststroke pullout?,Practice underwater pullout drills emphasizin...
514,What techniques can help me reduce drag in bre...,Focus on keeping your head in line with your ...
515,How can I develop better breaststroke turns?,Incorporate open turn drills concentrating on...


In [47]:
# define data format
def preprocess(example):
    """Build the training text for one question/answer record.

    Args:
        example: Mapping with "Question" and "Answer" entries.

    Returns:
        A dict with a single "prompt" key holding the question and the
        answer joined by the "<|user|>" / "<|assistant|>" markers.
    """
    # NOTE(review): this layout has no end-of-sequence marker and may not
    # match the model's own chat template; confirm against the tokenizer's
    # template and the inference code before changing it.
    question = example['Question']
    answer = example['Answer']
    text = f"<|user|> {question} <|assistant|> {answer}"
    return dict(prompt=text)

In [51]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

# Hub id of the checkpoint used for both the tokenizer and the model.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Single-entry device map pointing at the currently selected CUDA device.
# NOTE(review): the "" key is presumably the "whole model" entry — confirm
# against the Transformers device_map documentation.
device_map = {"": torch.cuda.current_device()}

tokenizer = AutoTokenizer.from_pretrained(model_name)

# 8-bit bitsandbytes quantization settings for loading the weights.
quant_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
    load_in_8bit=True,
)

# Load the quantized causal LM, then pass it through PEFT's k-bit training
# preparation step; `model` ends up bound to the prepared model.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
    )
)

In [52]:
# load and tokenize dataset

from datasets import load_dataset

# Load the CSV through the `datasets` CSV loader and add the "prompt" field
# produced by preprocess() to every record.
dataset = load_dataset("csv", data_files="swim_dataset.csv").map(preprocess)

def tokenize_function(examples):
    """Tokenize the "prompt" field of a batch with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 512.

    Args:
        examples: Mapping whose "prompt" entry holds the text to tokenize
            (called with batched=True in this notebook, so presumably a
            list of strings).

    Returns:
        Whatever the module-level `tokenizer` returns for those prompts.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    prompts = examples["prompt"]
    return tokenizer(prompts, **tokenizer_options)

# Tokenize in batches, then select the two tokenizer outputs as the formatted
# columns, returned as torch tensors.
tokenized = dataset.map(
    tokenize_function,
    batched=True,
)
tokenized.set_format(
    columns=["input_ids", "attention_mask"],
    type="torch",
)