In [69]:
import warnings
warnings.filterwarnings('ignore')

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


In [7]:
def fine_tune_gpt2(
    model_name,
    train_file,
    output_dir,
    *,
    block_size=128,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file and save the result.

    The model and tokenizer are loaded from ``model_name``, trained with the
    Hugging Face ``Trainer`` on ``train_file``, and then both are written to
    ``output_dir``.

    Args:
        model_name: Name or path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        train_file: Path of the text file handed to ``TextDataset``.
        output_dir: Directory used for training checkpoints and for the final
            saved model and tokenizer.
        block_size: ``block_size`` forwarded to ``TextDataset``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
        save_total_limit: Forwarded to ``TrainingArguments``.

    Returns:
        None. The trained artifacts are written to ``output_dir``.
    """
    # Load the pretrained GPT-2 model and its tokenizer.
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Load the training set from the text file.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=block_size,
    )

    # The data collator takes individual examples and arranges them into
    # batches. mlm=False disables masked-language-model masking, which is what
    # a causal model such as GPT-2 needs.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Training hyperparameters; the defaults reproduce the original settings.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
    )

    # Set up the trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Save the fine-tuned weights together with the tokenizer so that
    # from_pretrained(output_dir) can reload both later.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [5]:
import json

# Code needed to process the file format at hand. This part will differ for every task.
# The sections above are more permanent and can be reused as they are.

def preprocess_intents_json(intents_file):
    """Flatten an intents JSON file into a single dialogue-style string.

    The file is read as UTF-8 JSON with a top-level ``"intents"`` list. For
    each pattern of each intent, one ``"User: <pattern>"`` line is emitted,
    followed by one ``"Assistant: <response>"`` line for every response of
    that same intent. Each line ends with a newline.

    Args:
        intents_file: Path of the JSON file to read.

    Returns:
        All generated lines concatenated into one string (empty when there
        are no patterns).
    """
    with open(intents_file, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    def _dialogue_lines(intents):
        # Responses are only looked up once a pattern exists, so an intent
        # without patterns contributes nothing.
        for entry in intents:
            for question in entry["patterns"]:
                yield f"User: {question}\n"
                for reply in entry["responses"]:
                    yield f"Assistant: {reply}\n"

    return "".join(_dialogue_lines(payload["intents"]))


def save_preprocessed_data(preprocessed_data, output_file):
    """Write the preprocessed text to ``output_file``, replacing any existing file.

    Args:
        preprocessed_data: The string to write.
        output_file: Destination path.
    """
    # Write UTF-8 explicitly: the intents file is read as UTF-8, and relying on
    # the platform default encoding can fail or corrupt non-ASCII characters.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(preprocessed_data)


# Source intents file and the plain-text training corpus derived from it.
intents_file, output_file = "intents.json", "mental_health_data.txt"

# Flatten the intents into "User:/Assistant:" lines and write them to disk.
preprocessed_data = preprocess_intents_json(intents_file)
save_preprocessed_data(preprocessed_data, output_file)

In [70]:
# Run the fine-tuning: base "gpt2" checkpoint, the preprocessed corpus, results in "output".
fine_tune_gpt2(
    model_name="gpt2",
    train_file="mental_health_data.txt",
    output_dir="output",
)

In [14]:
#
#    https://www.linkedin.com/pulse/fine-tuning-gpt-2-large-language-model-unlocking-its-adamson-mbcs/
#
#    Among other sources, this is the site I relied on most recently. It explains or links everything as it goes; I hope you find it useful.
#
#    DATASET LINK: https://www.kaggle.com/datasets/elvis23/mental-health-conversational-data?resource=download
#

In [15]:
# The "output" folder holds what the fine-tuning step saved (configuration, weights, and so on).
# Reload the saved tokenizer and model from it so we can generate text.

tokenizer = GPT2Tokenizer.from_pretrained("output")
model = GPT2LMHeadModel.from_pretrained("output")

 

In [63]:
# The prompt follows the "User: ...\n" line format of the training corpus.
# NOTE(review): "feed" looks like a typo for "feel"; left unchanged so the
# prompt still matches the recorded outputs. Confirm before changing it.
prompt_text = "User: I feed depressed, do your best to support me.\n"

# Tokenize once; both tensors come from the same encoding.
encoded_prompt = tokenizer(prompt_text, return_tensors="pt")
input_ids = encoded_prompt.input_ids
attention_mask = encoded_prompt.attention_mask


In [68]:
sequences = 5

# Draw all samples in one batched generate() call rather than one call per sample.
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,                    # upper bound on total length (prompt + generated tokens)
    num_return_sequences=sequences,    # independent samples drawn for the prompt
    pad_token_id=tokenizer.eos_token_id,

    # Without the four sampling options below, the model produces the same
    # conversation every time and also keeps repeating sentences.
    do_sample=True,    # sample from the distribution instead of greedy decoding
    top_k=50,          # restrict sampling to the 50 most likely next tokens
    top_p=0.95,        # nucleus sampling: smallest token set with cumulative probability >= 0.95
    temperature=1.35,  # values above 1 flatten the distribution, increasing randomness
)

# Shorter samples are padded with the EOS token; skip_special_tokens drops it when decoding.
for i, generated_ids in enumerate(outputs):
    text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(f"Generated Text {i+1}: {text}")

Generated Text 1: User: I feed depressed, do your best to support me.
User: I'm so sorry but i'm stressed
Assistant: Talking about it all will bring you back to me. Tell me why you're feeling this way.
User: i feel so sorry
Assistant: Okay you tell me why. Let's talk more about it. What's behind the thoughts? What's the cause?
User: How long have you been feeling this way?
Assistant: 2 weeks
User
Generated Text 2: User: I feed depressed, do your best to support me.
Assistant: No problem at all Assistant: Great to see you back.
User: Why do i not like _____?
Assistant: There are many different types of depressed people. People with low mood and/or anxiety are considered hopeless. Some people with mild mood problems may enjoy trying to find some kind of mental support. When their mood stabilizes, they often seek recovery services. These services generally require medication and support groups
Generated Text 3: User: I feed depressed, do your best to support me.
Assistant: Tell me more.
A