In [None]:
%%capture
!pip install datasets transformers[sentencepiece] sacrebleu
!pip install accelerate
!pip install bitsandbytes
!pip install peft

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import bitsandbytes as bnb

In [None]:
import os

# Switch off the Weights & Biases integration through its environment flag.
os.environ["WANDB_DISABLED"] = "true"

# Quantize the model

In [None]:
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import AutoTokenizer, BitsAndBytesConfig

# bitsandbytes quantization settings passed to `from_pretrained` below:
# 4-bit loading, "nf4" quantization type, double quantization enabled,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Hub id of the checkpoint whose weights are fine-tuned in this notebook.
MODEL_NAME = "Alperens1/turna-gec-1"

# The tokenizer is taken from the "boun-tabi-LMG/TURNA" repository, while the
# weights come from MODEL_NAME and are loaded with the quantization settings
# in `bnb_config`.
tokenizer = AutoTokenizer.from_pretrained("boun-tabi-LMG/TURNA")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME,
                                              quantization_config=bnb_config,
                                              torch_dtype=torch.bfloat16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [None]:
# Report the model's device and footprint plus free/total GPU memory,
# every size divided by 10**9 and rounded to three decimals.
free_bytes, total_bytes = torch.cuda.mem_get_info()
footprint_bytes = model.get_memory_footprint()

print("Model loaded to:", model.device)
print("Model size:", round(footprint_bytes / 10**9, 3))
print(
    "Available GPU memory:", round(free_bytes / 10**9, 3),
    "\nTotal GPU memory:", round(total_bytes / 10**9, 3),
)

Model loaded to: cuda:0
Model size: 1.397
Available GPU memory: 14.218 
Total GPU memory: 15.836


# Configure LoRA and prepare the model

In [None]:
# LoRA hyper-parameters: rank-16 adapters (alpha 32, dropout 0.01) attached to
# the modules named "q" and "v"; bias terms are left untouched.
config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q", "v"],
    bias="none",
)

# Prepare the quantized model for k-bit training, then wrap the prepared model
# with the LoRA adapters so it is ready for QLoRA training.
peft_model = get_peft_model(prepare_model_for_kbit_training(model), config)

In [None]:
# Print the trainable vs. total parameter counts of the adapter-wrapped model.
peft_model.print_trainable_parameters()

trainable params: 7,077,888 || all params: 1,148,902,400 || trainable%: 0.6161


In [None]:
# Show which device the adapter-wrapped model is on.
print("Model loaded to:",peft_model.device)

Model loaded to: cuda:0


# PREPARE AND PREPROCESS TRAINING DATA

In [None]:
# Load the training pairs from a local CSV file; the preprocessing below reads
# its "source" and "target" columns.
dataset_gen_train = load_dataset("csv", data_files="train_data_ready.csv")["train"]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    The tokenizer's EOS token is appended to every sentence before it is
    tokenized; both sides are truncated to at most 55 tokens.

    Args:
        examples: Batch mapping with "source" and "target" entries, each an
            iterable of strings.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under the "labels" key.

    NOTE(review): truncation runs after EOS has been appended to the text, so
    sentences longer than the limit presumably lose their EOS token - confirm.
    """
    eos = tokenizer.eos_token
    tokenize_options = {"max_length": 55, "truncation": True}

    def _encode(column):
        # Append EOS to every sentence of the column, then tokenize the batch.
        return tokenizer([text + eos for text in examples[column]], **tokenize_options)

    model_inputs = _encode("source")
    model_inputs["labels"] = _encode("target")["input_ids"]
    return model_inputs



# Tokenize the whole training set in batches, then drop the raw text columns so
# only the tokenized features remain.
tokenized_datasets = (
    dataset_gen_train
    .map(preprocess_function, batched=True)
    .remove_columns(["source", "target"])
)

# FINE-TUNE THE MODEL

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the Trainer; the sample
# batch printed in the next cell shows inputs padded with 0 and labels padded
# with -100.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Sanity check: collate the first two examples and inspect the padded tensors.
small_batch = [tokenized_datasets[row] for row in (0, 1)]
collated_batch = data_collator(small_batch)

for caption, field in (
    ("Input IDs:", "input_ids"),
    ("Attention Mask:", "attention_mask"),
    ("Labels:", "labels"),
):
    print(caption, collated_batch[field])

Input IDs: tensor([[  207,   290,    66,    50,  7877,     4,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [ 9202,     5, 24991,    11, 14830,  4314,  5747,    48, 23652,   425,
            10,    30,  2266,   428,    50, 22046,    62,     5,  1099,   233,
            14,    46, 30361,  7732,    10,    62,  3829,   886, 10714,    87,
         12405, 10186,   657,     4,     1]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Labels: tensor([[  207,   290,  2634,  7716,     4,     1,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 

In [None]:
from transformers import Trainer, TrainingArguments
import transformers

# Training configuration for the QLoRA run: 3 epochs, batch size 16 per device,
# learning rate 2e-4 on a cosine schedule with a 10% warm-up, bf16 enabled.
# Loss is logged and a checkpoint is written to "outputs" every 500 steps.
training_args = TrainingArguments(
        per_device_train_batch_size=16,
        num_train_epochs=3,
        learning_rate=2e-4,
        bf16 = True,
        logging_steps=500,
        save_steps=500,
        output_dir="outputs",
        optim="adamw_torch",  # alternatives noted by the author: "paged_adamw_8bit", "adamw_torch_fused"
        lr_scheduler_type="cosine",
        warmup_ratio=0.1
        # load_best_model_at_end=True is left disabled: no evaluation set is passed to the Trainer below.
        )

# Train the adapter-wrapped model on the tokenized training set only
# (no eval_dataset is provided).
trainer = Trainer(
    model=peft_model,
    train_dataset=tokenized_datasets,
    args=training_args,
    data_collator=data_collator
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,0.3187
1000,0.1132
1500,0.1017
2000,0.0926
2500,0.0932




Step,Training Loss
500,0.3187
1000,0.1132
1500,0.1017
2000,0.0926
2500,0.0932
3000,0.0899
3500,0.0861
4000,0.0884




TrainOutput(global_step=4329, training_loss=0.12009676534327109, metrics={'train_runtime': 12675.6407, 'train_samples_per_second': 5.461, 'train_steps_per_second': 0.342, 'total_flos': 2.247709885223731e+16, 'train_loss': 0.12009676534327109, 'epoch': 3.0})

In [None]:
# Save the trained adapter. The directory name must match the one passed to
# PeftModel.from_pretrained in the "Try ft model" section ("turna-ft-qlora2");
# the previous name had a stray trailing "-" and pointed at a different folder.
trainer.model.save_pretrained("turna-ft-qlora2")

# Try the fine-tuned model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig

# Same checkpoint id that was used as the starting point for fine-tuning above.
base_model_id = "Alperens1/turna-gec-1"

# Reload the checkpoint with the `bnb_config` defined earlier in the notebook
# and let `device_map="auto"` decide the placement.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model_id,  # the TURNA GEC checkpoint named above
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
)

# Tokenizer again comes from the "boun-tabi-LMG/TURNA" repository.
tokenizer = AutoTokenizer.from_pretrained("boun-tabi-LMG/TURNA", trust_remote_code=True)
# Disabled: reusing EOS as the padding token.
#tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from peft import PeftModel

# Attach the adapter stored in the local "turna-ft-qlora2" directory to the
# reloaded base model.
ft_model = PeftModel.from_pretrained(base_model, "turna-ft-qlora2")

In [None]:
# Display the device the adapter-wrapped model is on.
ft_model.device

device(type='cuda', index=0)

In [None]:
# Hand-written Turkish sentences containing spelling/grammar errors, used as
# ad-hoc test inputs. The trailing "#--" / "#++" marks are the author's own
# annotations; their exact meaning is not recorded in this notebook.
input_text = """Sendemi bizde ki oyunlardan habersizsin? Söylemedimmi ben sane?\
 Beni dinlesene! Bu sınırı hala daha aşmadıysak ben aşmaını bilirim"""
input_text2 = "Bende ki oyunlardan habersizsin." #--
input_text3 = "San ki diğeri daha güzel" #++
input_text4 = "Orada ki daha güzel" #--
input_text_ = "Sendemi bizde ki oyunlardan habersizsin?"
input_text2_ = "Sen de fazladan kalem var mı?"
input_text3_ = "Benki dünyaların fatihi!"
input_text4_ = "Yer de kilermi aradıkların?"
input_text5_ = "Aşağı da kilermi kitapını aldı?"
input_text6_ = "Aşağı da kislermi kitapsını altdı?"
input_text7 = "bekarlara, dullara, çocukları olmayan, bir çocuğuv akardeş isteyeLnlereM.."
input_text8 = "Dünyanın en ünlü terör uzmanları, biyoloji, kimya Qve nükleer fizikçiler, sosyolglr gelsin"

# The sentence fed to the models in the following cells, with the literal
# "<EOS>" marker appended.
selected_text = input_text8 + "<EOS>"

In [None]:
# Tokenize the selected sentence and move the tensors to the GPU.
model_input = tokenizer(selected_text, return_tensors="pt").to("cuda")

# Generate in eval mode without tracking gradients.
ft_model.eval()
with torch.no_grad():
    generated_ids = ft_model.generate(**model_input, max_new_tokens=256)

# Special tokens are kept so the PAD/EOS markers stay visible in the output.
print(tokenizer.decode(generated_ids[0], skip_special_tokens=False))

<PAD> Dünyanın en ünlü terör uzmanları, biyoloji, kimya ve nükleer fizikçiler, sosyologlar gelsin.<EOS>


In [None]:
# Run the same sentence through `base_model` for comparison.
# NOTE(review): PeftModel.from_pretrained presumably injected the adapter
# layers into `base_model` in place, so this may not be a true no-adapter
# baseline - confirm.
print("input:", selected_text)
input_ids = tokenizer.encode(selected_text, return_tensors='pt').to("cuda")

output_sequences = base_model.generate(
    input_ids=input_ids,
    max_length=50,
    num_return_sequences=1,
)

# Decode the single returned sequence, keeping special tokens visible.
output_text = tokenizer.decode(output_sequences[0], skip_special_tokens=False)
print("output:", output_text)

input: Dünyanın en ünlü terör uzmanları, biyoloji, kimya Qve nükleer fizikçiler, sosyolglr gelsin<EOS>
output: <PAD> Dünyanın en ünlü terör uzmanları, biyoloji, kimya ve nükleer fizikçiler, sosyologlar gelsin.<EOS>


In [None]:
# Run the same sentence through the model still held by the Trainer.
print("input:", selected_text)
input_ids = tokenizer.encode(selected_text, return_tensors='pt').to("cuda")

generation_options = {"max_length": 50, "num_return_sequences": 1}
output_sequences = trainer.model.generate(input_ids=input_ids, **generation_options)

# Decode the single returned sequence, keeping special tokens visible.
output_text = tokenizer.decode(output_sequences[0], skip_special_tokens=False)
print("output:", output_text)


input: Dünyanın en ünlü terör uzmanları, biyoloji, kimya Qve nükleer fizikçiler, sosyolglr gelsin<EOS>
output: <PAD> Dünyanın en ünlü terör uzmanları, biyoloji, kimya ve nükleer fizikçiler, sosyologlar gelsin.<EOS>


In [None]:
from transformers import Trainer, TrainingArguments

# Alternative, memory-saving training run: batch size 2 with 2 gradient
# accumulation steps, gradient checkpointing and the 8-bit bitsandbytes AdamW.
# Checkpoints are written to "./results" every 1000 steps.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    #per_device_eval_batch_size=2,

    # Save memory
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="adamw_bnb_8bit",
    #evaluation_strategy="epoch",

    # `load_best_model_at_end=True` is deliberately NOT set: it requires an
    # evaluation strategy that matches the save strategy, and this run has
    # neither an evaluation strategy nor an eval dataset, so TrainingArguments
    # would reject the configuration. Re-enable it together with
    # `evaluation_strategy` and `eval_dataset` once a validation split exists.
    save_strategy="steps",
    save_steps=1000,
)

# NOTE(review): this run trains `model`, not `peft_model` - confirm which one
# is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    #eval_dataset=tokenized_datasets["validation"]
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,1.4623
1000,0.2217
1500,0.1469
2000,0.1108
2500,0.0868
3000,0.082
3500,0.0801




Step,Training Loss
500,1.4623
1000,0.2217
1500,0.1469
2000,0.1108
2500,0.0868
3000,0.082
3500,0.0801
4000,0.0764
4500,0.0667
5000,0.0616




KeyboardInterrupt: 

# Save model and tokenizer to drive

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

version = "2.0-qlora"
path = "./model_"+version

# Save the model
trainer.model.save_pretrained(path)

# Save the tokenizer
#tokenizer.save_pretrained(path)



!cp -r model_1.0 "/content/drive/My Drive/Turkish Grammar Checker and Corrector/turna-ft-models/"

ValueError: Mountpoint must not contain a space.

In [None]:
# Write the Trainer's current model to the local "model_2.0-qlora" directory.
trainer.model.save_pretrained("model_2.0-qlora")



# Test the fine-tuned model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Load both the tokenizer and the model from the "model_1.0" folder on Drive.
# NOTE(review): this is a different folder from the "2.0-qlora" model saved
# above - confirm that version 1.0 is the one meant to be tested here.
path = "/content/drive/My Drive/Turkish Grammar Checker and Corrector/turna-ft-models/model_1.0/"
tokenizer_ft = AutoTokenizer.from_pretrained(path)
model_ft = AutoModelForSeq2SeqLM.from_pretrained(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# `device` was never defined anywhere in this notebook; define it here so the
# cell works on a fresh kernel (GPU when available, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_ft.to(device)
model_ft.device

device(type='cuda', index=0)

In [None]:
# Preview ten source sentences. `dataset_gen` is not defined in this notebook;
# the training split loaded above is `dataset_gen_train`.
dataset_gen_train["source"][20:30]

['Fakat bu şimdi imkânsız.',
 "Ege'nin bu güzel adasına iner inmez tuhaf bir duyguya kapıldım.",
 "Size yargılandığımız konuyuda mahkeme sürecini'de uzun uzun anlatmak istemiyorum.",
 'Bu konuda bana güven, örneklerini her gün görüyorum.',
 'Bir iki genç kız Ümit Yaşar demişlerdi.',
 'Yakın dönem Türk siyasi tarihindeki alçaklık döngüsü bu baharla beraber yine bizleri bekliyor...',
 'Temizlenen bölgelerin neresi olduğu ise belirtilmedi bugüne değin.',
 'Erdoğan o süreci şöyle anlatıyor: "Bir cezaevi sürecim oldu.',
 "Türkiye, Kuzey Kıbrıs Türklerinin, Güney'deki Rum Yönetimi ile egemen eşitlik temeline dayalı bir ilişki kurmasından vazgeçiyor mu, geçmiyor mu?",
 'Aylarca aradım.']

In [None]:
#compare to seq-tagger
from transformers import pipeline
pipe = pipeline("token-classification", model="GGLab/gec-tr-seq-tagger")

config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/404 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/755k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Compare the fine-tuned model with the GECTurk sequence tagger on ten
# sentences of the training split (`dataset_gen_train`; the previously used
# `dataset_gen` is not defined in this notebook).
# NOTE(review): training inputs had the EOS token appended; these inputs do
# not - confirm whether `tokenizer_ft`/`model_ft` expect it.
sample_sources = dataset_gen_train["source"][30:40]
sample_targets = dataset_gen_train["target"][30:40]

for input_text, output in zip(sample_sources, sample_targets):
  # Encode the input text on the same device as the model.
  input_ids = tokenizer_ft.encode(input_text, return_tensors="pt").to(model_ft.device)

  # Allow the correction to be up to 50 tokens longer than the tokenized input.
  output_sequences = model_ft.generate(input_ids, max_length=len(tokenizer_ft.tokenize(input_text))+50)

  # A seq2seq model returns only the generated sentence (the input is not
  # echoed back), so the whole decoded sequence is the prediction. Slicing it
  # by len(input_text) used to cut off the start of the correction.
  predicted_text = tokenizer_ft.decode(output_sequences[0], skip_special_tokens=True)

  print(f"Input text: {input_text}")
  print(f"Target text: {output}")
  print(f"TURNA: {predicted_text}")
  print(f"GECTurk seq-tagger: {pipe(input_text)}")
  print()

Input text: Vicdan sahibi herkesin göklerin kapısını zorlıyacak o durumdan çok sakınması gerekir...
Target text: Vicdan sahibi herkesin göklerin kapısını zorlayacak o durumdan çok sakınması gerekir...
TURNA: kir...... Vicdan sahibi herkesin göklerin kapısını zorlayacak o durumdan çok sakınması gerekir...... Vicdan sahibi herkesin göklerin kapısını zorlayacak o durumdan çok sakınması gerekir...... Vicdan sahibi herkesin göklerin kapısını zorlayacak o durumdan çok sakınması gerekir...... Vicdan
GECTurk seq-tagger: [{'entity': 'B-rule_12', 'score': 0.9999114, 'index': 8, 'word': 'zor', 'start': 41, 'end': 44}, {'entity': 'B-rule_12', 'score': 0.99978286, 'index': 9, 'word': '##lı', 'start': 44, 'end': 46}]

Input text: Büyük Dostoyevski "Cinler" adlı muhteşem romanında Rus anarşist teröristlerini Necayef örneğinde anlatırken bu şiddet tutkusunu, "her yeri saran örgütlenme" mistisizmini çok iyi tasvir eder...
Target text: Büyük Dostoyevski "Cinler" adlı muhteşem romanında Rus anarşist terö

In [None]:
input_text = "Sendemi bizde ki oyunlardan habersizsin?"

# Put the input ids on whatever device the model currently lives on.
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

# Allow the output to be a few tokens longer than the tokenized input.
output_sequences = model.generate(input_ids, max_length=len(tokenizer.tokenize(input_text)) + 5)

# A seq2seq model returns only the generated sentence (the input is not echoed
# back), so the whole decoded sequence is the prediction. Slicing it by
# len(input_text), as a causal-LM example would, discarded most of the output.
predicted_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

print(f"Input text: {input_text}")
print(f"Predicted correction: '{predicted_text}'")

Input text: Sendemi bizde ki oyunlardan habersizsin?
Predicted next token(s): ' da da da da ve ki oyun da ki oyun da ki oyun da ki de'


# Push to Hub

In [None]:
%%capture
!pip install bitsandbytes
!pip install accelerate


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import bitsandbytes
import torch
from transformers import BitsAndBytesConfig

# Same quantization settings as at the top of the notebook, repeated here so
# this section can be run on its own.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Tokenizer from the "boun-tabi-LMG/TURNA" repository, weights from
# "Alperens1/turna-gec-1", loaded with the quantization settings above.
tokenizer = AutoTokenizer.from_pretrained("boun-tabi-LMG/TURNA")
model = AutoModelForSeq2SeqLM.from_pretrained("Alperens1/turna-gec-1",
                                              quantization_config=bnb_config,
                                              torch_dtype=torch.bfloat16)

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.68k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [None]:
# Quick sanity check of the reloaded model on one misspelled sentence.
model.eval()
inp = "hayatda en haki ki mürşit ilimdirr fendr.<EOS>"

# Inputs must be on the same device as the model.
input_ids = tokenizer.encode(inp, return_tensors="pt").to(model.device)

# Give generation an explicit token budget; with no length argument the
# stored output of this cell was cut off mid-sentence.
with torch.no_grad():
    out = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(out[0]))



<PAD> Hayatda en haki ki mürşit ilimdirr fendr.. Hayatda en


In [None]:
from huggingface_hub import notebook_login
# Authenticate interactively through the login widget. Access tokens must
# never be written into the notebook source, not even inside a comment.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the currently loaded model to the Hub repository "turna-gec-1" under
# the logged-in account.
model.push_to_hub("turna-gec-1")

README.md:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.41G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Alperens1/turna-gec-1/commit/a88fb38de25a271ded0ec267511d96439ae08fe6', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='a88fb38de25a271ded0ec267511d96439ae08fe6', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Clone the Hub repository; GIT_LFS_SKIP_SMUDGE=1 is set so the clone does not
# download the LFS-tracked files.
!git lfs install
!GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Alperens1/turna-gec-1

Git LFS initialized.
fatal: destination path 'turna-gec-1' already exists and is not an empty directory.


In [None]:
# Change the notebook's working directory to the cloned repository.
%cd turna-gec-1

/content/turna-gec-1


In [None]:
# Confirm the current working directory before running git commands.
%pwd

'/content/turna-gec-1'

In [None]:
# Move the branch back to commit a98a122 and overwrite the remote branch with it.
# WARNING: `git push --force` rewrites the remote history; commits made after
# a98a122 are dropped from the remote branch.
!git reset a98a122fae7eca6047df195e2b1af317d2013c0f
!git push --force

Unstaged changes after reset:
M	README.md
M	config.json
M	generation_config.json
M	model.safetensors
Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
To https://huggingface.co/Alperens1/turna-gec-1
 + 2058773...a98a122 main -> main (forced update)
