**Convert df into hf dataset**

In [None]:
!pip install datasets

In [23]:
from datasets import Dataset
import pandas as pd
df = pd.read_csv('/content/anime-quotes.csv')

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,Anime,Character,Quote
0,0,(Trigun),Vash the Stampede,"When something is gained, something is lost. I..."
1,1,(Fune wo Amu),Mitsuya Majime,The vast ocean of words. Without a means to cr...
2,2,(D.Gray-man),Allen Walker,"What ever may happen to the order, we'll still..."
3,3,(Assassination Classroom 2nd Season),Korosensei,You must not hope for society to change for yo...
4,4,(Grisaia no Meikyuu),Yuuji Kazami,"If all you do is feed your dog, you'll get you..."


In [25]:
df.shape

(8612, 4)

In [26]:
df['Anime'] = df['Anime'].str[1:-1] # removing brackets from 'Anime' feature

In [27]:
(df.duplicated()==True).value_counts()
# No duplicates were present

False    8612
dtype: int64

In [28]:
df['Quote'] = df['Character'] + ' : ' + df['Quote']

In [29]:
df.drop(['Unnamed: 0', 'Character'], axis='columns', inplace=True)

In [30]:
df.head()

Unnamed: 0,Anime,Quote
0,Trigun,"Vash the Stampede : When something is gained, ..."
1,Fune wo Amu,Mitsuya Majime : The vast ocean of words. With...
2,D.Gray-man,Allen Walker : What ever may happen to the ord...
3,Assassination Classroom 2nd Season,Korosensei : You must not hope for society to ...
4,Grisaia no Meikyuu,"Yuuji Kazami : If all you do is feed your dog,..."


In [31]:
df['Anime'].value_counts()

Naruto                    508
Bleach                    423
One Piece                 213
Soul Eater                200
Kuroshitsuji              189
                         ... 
Dusk maiden of Amnesia      1
.hack//GIFT                 1
Sakura Taisen               1
Doraemon                    1
Comic Party                 1
Name: Anime, Length: 801, dtype: int64

In [32]:
dataset = Dataset.from_pandas(df) # convert dataframe into hf-dataset

In [33]:
dataset

Dataset({
    features: ['Anime', 'Quote'],
    num_rows: 8612
})

In [34]:
dataset[0]

{'Anime': 'Trigun',
 'Quote': "Vash the Stampede : When something is gained, something is lost. It's difficult to live after something like that. What is lost will never return. Important things, irreplaceable things. But what is needed to keep those things is in the firm will, packed into one bullet. Man knows... he knows that nothing will begin unless he speaks, and that nothing will change unless he moves."}

In [37]:
dataset.push_to_hub("sarthak-2002/anime-quotes")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sarthak-2002/anime-quotes/commit/a707fde13c83c9f661f12a74f8b98a2aedfd0d9d', commit_message='Upload dataset', commit_description='', oid='a707fde13c83c9f661f12a74f8b98a2aedfd0d9d', pr_url=None, pr_revision=None, pr_num=None)

**Start the process**

In [24]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [26]:
pip install -U accelerate

Collecting accelerate
  Using cached accelerate-0.27.2-py3-none-any.whl (279 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.1
    Uninstalling accelerate-0.27.1:
      Successfully uninstalled accelerate-0.27.1
Successfully installed accelerate-0.27.2


In [1]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer

from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer


In [2]:
os.environ["HF_TOKEN"] = userdata.get('HUGGINGFACE_KEY')

In [3]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             token=os.environ['HF_TOKEN'])

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [5]:
text = "Quote: Our doubts are traitors,"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: Our doubts are traitors, and make us lose the good we oft might win, by fearing to attempt.

-William Shakespeare


In [18]:
print(inputs['input_ids'][0])
print(inputs['attention_mask'][0])

tensor([     2,  14782, 235292,   5626,  39396,    708, 176411, 235269],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')


In [19]:
os.environ["WANDB_DISABLED"] = "false"

In [20]:
lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM",
)

In [40]:
data = load_dataset("sarthak-2002/anime-quotes")

In [44]:
data

DatasetDict({
    train: Dataset({
        features: ['Anime', 'Quote'],
        num_rows: 8612
    })
})

In [45]:
print(data['train']['Anime'][69])
print(data['train']['Quote'][69])

Naruto
Yashamaru : Physical wounds will definitely bleed and may look painful 
but over time they heal by themselves and if you apply medicine, 
they will heal faster. What's troublesome are wounds of the heart. Nothing is harder to heal. They're a bit different from physical injuries. You can't apply medicine for one thing and sometimes, they never heal. There's only one cure for a wound of the heart. 
It's a bit bothersome and you can only receive it from someone else. What is it? Love.


In [46]:
data = data.map(lambda samples: tokenizer(samples["Quote"]), batched=True)

Map:   0%|          | 0/8612 [00:00<?, ? examples/s]

In [47]:
data

DatasetDict({
    train: Dataset({
        features: ['Anime', 'Quote', 'input_ids', 'attention_mask'],
        num_rows: 8612
    })
})

In [51]:
def formatting_func(example):
    text = example['Quote'][0]
    return [text]

In [55]:
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=75,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    ),
    peft_config=lora_config,
    formatting_func = formatting_func
)



Map:   0%|          | 0/8612 [00:00<?, ? examples/s]



In [56]:
trainer.train()

Step,Training Loss
1,2.7621
2,1.0074
3,1.855
4,1.9072
5,0.7791
6,2.5578
7,2.5855
8,0.7439
9,1.6198
10,1.4608


TrainOutput(global_step=75, training_loss=0.5997031643986702, metrics={'train_runtime': 69.6039, 'train_samples_per_second': 17.24, 'train_steps_per_second': 1.078, 'total_flos': 369104018657280.0, 'train_loss': 0.5997031643986702, 'epoch': 50.0})

In [57]:
text = """Itachi Uchiha : Those who cannot """
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

In [60]:
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Itachi Uchiha : Those who cannot 
bear the weight of responsibility will not 
be allowed to seek it. And those who will not 
protect the innocent will not be allowed to 
do so. And I, Itachi


In [73]:
text = """Gintama : I would always choose soul over """
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Gintama : I would always choose soul over 100% safety. Because the soul, which knows the pain of loss, will advance towards the enemy, even if it has the power to return at any time. It will advance towards victory


In [75]:
text = """Monkey D. Luffy : Pirates are not bad, rather """
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Monkey D. Luffy : Pirates are not bad, rather 
they're the only thing that's good in this world. 
They're the only hope for a world filled with 
nothing but despair. So, for me, who


In [77]:
text = """Pain : Pirates are not bad, rather """
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Pain : Pirates are not bad, rather <b>they</b>'re necessary. <b>They</b>'re <b>the</b> only ones who can protect the <b>people</b> who can't protect <b>themselves</b>. <b>They


In [85]:
text = """Armin Arlelt : Pirates are not bad, rather """
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Armin Arlelt : Pirates are not bad, rather <em>bad</em> is what they look for in a <em>bad</em> man, right? :D :D :D :D :D :D :D :D :D :D :D


In [79]:
quote = 'I don\'t think '
text1 = f"""Eren Jaeger : {quote}"""
text2 = f"""Naruto Uzumaki : {quote}"""
text3 = f"""Sasuke Uchiha : {quote}"""

In [84]:
texts = [text1, text2, text3]
for text in texts:
  inputs = tokenizer(text, return_tensors="pt").to(device)
  outputs = model.generate(**inputs, max_new_tokens=30)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  print('---------------------------------------')

Eren Jaeger : I don't think 14-year-old me could understand the impact of a single bullet. But I'm sure he would understand that one shot can change the
---------------------------------------
Naruto Uzumaki : I don't think <em>that</em> will protect me, guys! It's stupid to depend on something that's fake! The only thing fake is something that doesn
---------------------------------------
Sasuke Uchiha : I don't think <em>I </em>have the <em>time </em>for <em>drama, </em>Sakura.
Sakura : I'm sorry, I-
---------------------------------------


Naruto Uzumaki : I don't think <em>that</em> will protect me, guys! It's stupid to depend on something that's fake! The only thing fake is something that doesn

In [89]:
text = """Jiraiya : Our doubts are traitors, """
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=70)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Jiraiya : Our doubts are traitors, 
       Honest mistakes, 
       Which point us in the right direction. 
       Fools who choose to see the good in things as they are. 
       Cowards who won't look for the evil that could destroy them. 
       We are the ones who stand between evil and the world. 
       We


**Pushing the model to hub**

In [64]:
trainer.push_to_hub('AnimeBot')

adapter_model.safetensors:   0%|          | 0.00/39.3M [00:00<?, ?B/s]

events.out.tfevents.1708786860.100a30908bfe.3255.0:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/sarthak-2002/outputs/commit/cc218ab2241e46d0234ce0eb6804d45a529bb928', commit_message='AnimeBot', commit_description='', oid='cc218ab2241e46d0234ce0eb6804d45a529bb928', pr_url=None, pr_revision=None, pr_num=None)

In [65]:
tokenizer.push_to_hub('AnimeBot')

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sarthak-2002/AnimeBot/commit/a917473734ee91b401dc6fadb7fc1c3505725541', commit_message='Upload tokenizer', commit_description='', oid='a917473734ee91b401dc6fadb7fc1c3505725541', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
model.push_to_hub('AnimeBot')