In [None]:
!pip install transformers trl peft datasets bitsandbytes accelerate
!pip install git+https://github.com/huggingface/peft.git

Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.8-py3-none-any.whl.metadata (8.4 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.4.0->trl)
  Using ca

In [None]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.2


In [None]:
pip freeze | cat

absl-py==1.4.0
accelerate==0.32.1
aiohttp==3.9.5
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.15.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.11.1
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.2
build==1.2.1
CacheControl==0.14.0
cachetools==5.4.0
catalogue==2.0.10
certifi==2024.7.4
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.18.1
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.4
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.1
cryptography==43.0.0
cuda-python==12.2.1
cudf-cu12 @ h

In [None]:
import json
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
import csv
import gc
from google.colab import files

In [None]:
# Load transcripts from the CSV file
def load_csv_data(file_path):
    comedian_data = []
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)
        for row in csv_reader:
            if len(row) >= 2:  # Ensure row has at least 3 columns
                comedian_data.append({
                    "name": row[0],
                    "text": row[1]
                })
    return comedian_data

comedian_data = load_csv_data('/comedians_clean.csv')

# Create dataset
dataset = Dataset.from_dict({
    "name": [item["name"] for item in comedian_data],
    "text": [item["text"] for item in comedian_data]
})


# length of database
len(dataset)

61

In [None]:
# Configuration
max_seq_length = 10000  # Maximum sequence length for training
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"

# Clear CUDA cache and collect garbage
torch.cuda.empty_cache()
gc.collect()

print(f"Initial GPU memory allocated: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
print(f"Initial GPU memory reserved: {torch.cuda.memory_reserved(0)/1e9:.2f} GB")

# Configure quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    rope_scaling={
        "type": "llama3",  # This corresponds to 'rope_type' in the error message
        "factor": 8.0,
        "high_freq_factor": 4.0,
        "low_freq_factor": 1.0,
        "original_max_position_embeddings": 8192
    }
)

Initial GPU memory allocated: 0.00 GB
Initial GPU memory reserved: 0.00 GB


config.json:   0%|          | 0.00/969 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.huggingface.co/repos/e3/99/e3993760f6550ee94235882e2f1a5cb77ecd03fa2bd29b4b6eef95caafbbc50e/2b1879f356aed350030bb40eb45ad362c89d9891096f79a3ab323d3ba5607668?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00001-of-00004.safetensors%3B+filename%3D%22model-00001-of-00004.safetensors%22%3B&Expires=1724625096&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNDYyNTA5Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2UzLzk5L2UzOTkzNzYwZjY1NTBlZTk0MjM1ODgyZTJmMWE1Y2I3N2VjZDAzZmEyYmQyOWI0YjZlZWY5NWNhYWZiYmM1MGUvMmIxODc5ZjM1NmFlZDM1MDAzMGJiNDBlYjQ1YWQzNjJjODlkOTg5MTA5NmY3OWEzYWIzMjNkM2JhNTYwNzY2OD9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=SGT1cGRgzoIezddquJd8dPYl4j-gRj3BoBZPyo7iPoYyQzKFsYQ5n%7El9zD-t679ugO2CzVf8kYr3ySdqCwvLrVoQlzfjkhEWqp5YbscrU86GUTtQranlXsKWXJ5UQDpWdqJ%7Ertk7SsLy1K8TfjnXJgDiROR3aEasE3cT2CZv7QTHKFZnkOCU0a5ZGYNHryH4aAEJTuHmpUNuq

model-00001-of-00004.safetensors:  42%|####1     | 2.08G/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

# Define prompt template
comedian_prompt = """
### Instruction:
You are a professional stand-up comedian performing live on stage. Please make your audience laugh using jokes derived from the following context, using observational humor, personal anecdotes, and edgy insights. In order to be funny, you must follow these guidelines:
Use the following elements in your routine:
1. Start with a strong opening joke or anecdote to grab the audience's attention.
2. Try to be as funny as possible at all times. Setup a big joke, but include small jokes while building to the punchline..
3. Use callbacks to earlier jokes for added humor.
4. End with a strong punchline or callback to tie everything together. Do not give a moral response. Comedy needs to end with a hilarious punchline.
### Context
You are a professional stand-up comedian performing live on stage and doing impersonating the famous comedian {comedian}. Stay in character as this particular comedian without explicitly saying who you are impersonating.
Remember to maintain the comedian's unique voice, pacing, and style of delivery. Be edgy, observational, and relatable. Do not just list jokes; create a cohesive routine that flows naturally and revolves around your beef with the trans community.
[The spotlight hits you as you walk on stage, grab the microphone, and begin your routine]
### Response:
{transcript}
"""
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    texts = []
    for name, transcript in zip(examples['name'], examples['text']):
        text = comedian_prompt.format(comedian=name, transcript=transcript) + EOS_TOKEN
        texts.append(text)
        print(text)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

# Set up the trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=100,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=1,
    optim="paged_adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="constant",
    save_steps=20,
    save_total_limit=2,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=max_seq_length,
    dataset_text_field="text",
)

# Disable model caching
model.config.use_cache = False

print(f"GPU memory allocated before training: {torch.cuda.memory_allocated(0)/1e9:.2f} GB")
print(f"GPU memory reserved before training: {torch.cuda.memory_reserved(0)/1e9:.2f} GB")

# Train the model
trainer_stats = trainer.train()

# Print training stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")

# Save the model
model.save_pretrained("comedian_lora_model")
tokenizer.save_pretrained("comedian_lora_model")


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]


### Instruction:
You are a professional stand-up comedian performing live on stage. Please make your audience laugh using jokes derived from the following context, using observational humor, personal anecdotes, and edgy insights. In order to be funny, you must follow these guidelines:
Use the following elements in your routine:
1. Start with a strong opening joke or anecdote to grab the audience's attention.
2. Try to be as funny as possible at all times. Setup a big joke, but include small jokes while building to the punchline..
3. Use callbacks to earlier jokes for added humor.
4. End with a strong punchline or callback to tie everything together. Do not give a moral response. Comedy needs to end with a hilarious punchline.
### Context
You are a professional stand-up comedian performing live on stage and doing impersonating the famous comedian Bill Burr. Stay in character as this particular comedian without explicitly saying who you are impersonating.
Remember to maintain the comedi

Map:   0%|          | 0/61 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


GPU memory allocated before training: 9.11 GB
GPU memory reserved before training: 9.16 GB


Step,Training Loss
1,2.6597
2,2.74
3,2.882
4,2.6473
5,2.7057
6,2.4619
7,2.5509
8,2.5769
9,2.5072
10,2.4214


425.0927 seconds used for training.
7.08 minutes used for training.
Peak reserved memory = 27.672 GB.


('comedian_lora_model/tokenizer_config.json',
 'comedian_lora_model/special_tokens_map.json',
 'comedian_lora_model/tokenizer.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import shutil

In [None]:
source_path = '/content/comedian_lora_model'  # Path to the folder you want to save
destination_path = '/content/drive/MyDrive/llamedy-instruct.1'  # Path in your Google Drive where you want to save the folder

In [None]:
shutil.copytree(source_path, destination_path)

'/content/drive/MyDrive/llamedy-instruct.1'

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to your saved model
model_path = "/content/drive/MyDrive/llamedy.5"

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B",
    device_map="auto",
    trust_remote_code=True
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B")

# Load the LoRA weights
model = PeftModel.from_pretrained(base_model, model_path)

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
def generate_text(prompt, max_length=32000):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Calculate the maximum number of new tokens we can generate
    max_new_tokens = max_length - inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
    **inputs,
    max_new_tokens=max_new_tokens,
    num_return_sequences=1,
    temperature=0.7,
    top_p=0.7,  # Increased
    do_sample=True,
    num_beams=1,  # Added for greedy decoding
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

    # Decode only the generated text, not the input prompt
    generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

    return generated_text

# Test the model
prompt = """
You are a professional stand-up comedian performing and doing impersonating the famous comedian Dave Chapelle. Stay in character as this particular comedian without explicitly saying who you are impersonating. Do not provide anything besides your monologue as a comedian.
Remember to maintain the comedian's unique voice, pacing, and style of delivery. Be edgy, observational, and relatable. Do not just list jokes; create a cohesive routine that flows naturally.
Your name is Dave Chappelle. Your routine is about how fat girls are destroying the environment. Only deliver the comedy routine, nothing else.
[The spotlight hits you as you walk on stage, grab the microphone, and begin your routine]
"""
response = generate_text(prompt)
print(f"Prompt: {prompt}")
print(f"Response:\n{response}")

# Print the number of tokens in the response
response_tokens = tokenizer.encode(response)
print(f"\nNumber of tokens in the response: {len(response_tokens)}")


Prompt: 
You are a professional stand-up comedian performing and doing impersonating the famous comedian Dave Chapelle. Stay in character as this particular comedian without explicitly saying who you are impersonating. Do not provide anything besides your monologue as a comedian.
Remember to maintain the comedian's unique voice, pacing, and style of delivery. Be edgy, observational, and relatable. Do not just list jokes; create a cohesive routine that flows naturally.
Your name is Dave Chappelle. Your routine is about how fat girls are destroying the environment. Only deliver the comedy routine, nothing else.
[The spotlight hits you as you walk on stage, grab the microphone, and begin your routine]

Response:
I'm gonna tell you something right now, man. Fat girls... fat girls... I love them. I do love them, but they're killing us all. They're killing our planet. They are literally killing us with their fat. It's like... it's like a virus, man! It's a virus! It starts off small, and the

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import login, HfApi
import torch
import os
import json

def convert_sets_to_lists(obj):
    if isinstance(obj, set):
        return list(obj)
    elif isinstance(obj, dict):
        return {k: convert_sets_to_lists(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_sets_to_lists(v) for v in obj]
    else:
        return obj

# Login to Hugging Face Hub
login()

# Determine the device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load your base model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B",
    device_map=None,
    trust_remote_code=True
)
base_model.to(device)

tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B")

# Load your local LoRA weights
local_peft_path = "/content/drive/MyDrive/llamedy-instruct.1"

# Load the PEFT model
model = PeftModel.from_pretrained(base_model, local_peft_path)
model.to(device)

# Merge the LoRA weights with the base model
merged_model = model.merge_and_unload()

# Save the merged model locally
merged_model_path = "./merged_model"
merged_model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)

# Push the merged model to Hugging Face Hub
#merged_model.push_to_hub("NereusTechnology/llamedy8B.1_merged")
#tokenizer.push_to_hub("NereusTechnology/llamedy8B.1_merged")

print("Merged model and tokenizer successfully pushed to Hugging Face Hub!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Merged model and tokenizer successfully pushed to Hugging Face Hub!


In [None]:
source_path = '/content/merged_model'  # Path to the folder you want to save
destination_path = '/content/drive/MyDrive/llamedy-instruct_merged'  # Path in your Google Drive where you want to save the folder
import shutil
shutil.copytree(source_path, destination_path)

'/content/drive/MyDrive/llamedy-instruct_merged'

In [None]:
from time import sleep
# Login to Hugging Face Hub
login()
sleep(5)

# Push the merged model to Hugging Face Hub
merged_model.push_to_hub("NereusTechnology/llamedy8B.1_merged")
tokenizer.push_to_hub("NereusTechnology/llamedy8B.1_merged")

print("Merged model and tokenizer successfully pushed to Hugging Face Hub!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-66c656d4-5b0c9136681a4c127ff3b8cf;96bb00f6-262a-4e9d-94f5-43397d1c4ba2)

Invalid username or password.