# **Introduction:**

In this project, I've **fine-tuned a GPT-2 model** to generate conversational responses based on given instructions and context. Leveraging the power of natural language processing and deep learning, **the model is trained on a dataset containing input-output pairs**. To facilitate effective training, **I formatted this dataset into instructional prompts paired with corresponding inputs and outputs.** Through preprocessing the data and employing techniques such as tokenization and language modeling, the model learns to generate coherent and contextually relevant responses to various tasks and inquiries. The trained model serves as a versatile and intelligent conversational agent capable of engaging in meaningful interactions across a wide range of scenarios. This project aims to showcase the capabilities of advanced language models in facilitating natural and fluid communication, paving the way for enhanced human-computer interaction and dialogue systems.

In [None]:
# These installations will depend on your environment.

!pip install jsonlines
!pip install datasets
!pip install accelerate>=0.21.0
!pip install transformers[torch]
!pip install accelerate -U
!pip install wandb

In [None]:
# Imports related to the project

import os

import itertools # for iteration streaming datasets.
import jsonlines

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [None]:

def load_and_process_dataset():
    """
    Streams the "yahma/alpaca-cleaned" train split and renders each entry as a prompt.

    Every entry is formatted with one of two instruction templates, chosen by
    whether its "input" field is empty. The templates are assembled from
    explicit lines rather than an indented triple-quoted block, so the
    indentation of this function does not leak into the prompt text (the
    "### Instruction:" / "### Input:" / "### Response:" headers start at
    column 0 of their line).

    Returns:
    processed_data (list): List of dictionaries containing processed prompts and outputs
        (keys "input" for the rendered prompt and "output" for the reference response).
    """
    dataset = load_dataset("yahma/alpaca-cleaned", streaming=True, split='train')

    prompt_template_with_input = (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{instruction}\n"
        "\n"
        "### Input:\n"
        "{input}\n"
        "\n"
        "### Response:"
    )

    prompt_template_without_input = (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{instruction}\n"
        "\n"
        "### Response:"
    )

    processed_data = []
    for entry in dataset:
        if entry["input"]:
            processed_prompt = prompt_template_with_input.format(
                instruction=entry["instruction"], input=entry["input"]
            )
        else:
            # Entries without extra context use the shorter template.
            processed_prompt = prompt_template_without_input.format(
                instruction=entry["instruction"]
            )

        processed_data.append({"input": processed_prompt, "output": entry["output"]})

    return processed_data

def save_processed_data(processed_data, filename):
    """
    Writes the processed records to disk in JSON Lines format, one record per line.

    Args:
    processed_data (list): List of dictionaries containing processed prompts and outputs.
    filename (str): Path of the JSONL file to write.
    """
    with jsonlines.open(filename, mode='w') as jsonl_writer:
        for record in processed_data:
            jsonl_writer.write(record)

def load_processed_dataset(filename):
    """
    Reads a JSONL file back as the 'train' split of a "json" dataset.

    Args:
    filename (str): Name of the JSONL file.

    Returns:
    dataset (Dataset): Loaded dataset.
    """
    return load_dataset("json", split='train', data_files=filename)

def preprocess_text(example):
    """
    Tokenizes the text in an example (or a batch of examples).

    The records written by this notebook carry "input" (rendered prompt) and
    "output" (response) fields rather than a "text" field, so the training text
    is built as "<input>\\n<output>" followed by the tokenizer's EOS token to
    mark the end of each conversation. An existing "text" field is still used
    as-is when present.

    No padding and no tensor conversion are requested: group_texts concatenates
    the token lists and re-chunks them, so padding would only inject pad tokens
    into the training stream, and the GPT-2 tokenizer used by this notebook
    has no pad token by default.

    Relies on the module-level `tokenizer`.

    Args:
    example (dict): Single example or batch with either a "text" field or
        "input"/"output" fields.

    Returns:
    dict: Tokenized example.
    """
    if "text" in example:
        texts = example["text"]
    else:
        eos = tokenizer.eos_token or ""
        prompts, responses = example["input"], example["output"]
        if isinstance(prompts, str):
            # Non-batched call: a single prompt/response pair.
            texts = f"{prompts}\n{responses}{eos}"
        else:
            # Batched call: parallel lists of prompts and responses.
            texts = [f"{p}\n{r}{eos}" for p, r in zip(prompts, responses)]

    return tokenizer(texts, truncation=True)

def group_texts(examples, chunk_size=None):
    """
    Concatenates the sequences of a batch and splits them into fixed-size chunks.

    For every key, all lists in the batch are joined into one long list, which
    is then cut into consecutive pieces of `chunk_size` items. When the joined
    length is at least `chunk_size`, the trailing remainder is dropped; when it
    is shorter, a single short chunk is kept. A "labels" entry is added as a
    copy of the chunked "input_ids".

    Args:
    examples (dict): Dictionary mapping each column name to a list of lists
        (must include "input_ids").
    chunk_size (int, optional): Chunk length. Defaults to the module-level
        `block_size` when omitted, so `dataset.map(group_texts, batched=True)`
        keeps working unchanged.

    Returns:
    dict: Grouped examples, with an added "labels" key.

    Raises:
    ValueError: If the chunk length is smaller than 1.
    """
    if chunk_size is None:
        chunk_size = block_size
    if chunk_size < 1:
        raise ValueError(f"chunk size must be a positive integer, got {chunk_size}")

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # growing accumulator on every addition.
    concatenated_examples = {
        k: list(itertools.chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the tail that does not fill a whole chunk (unless that is all there is).
    if total_length >= chunk_size:
        total_length = (total_length // chunk_size) * chunk_size

    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

def train_model(model, train_dataset, eval_dataset, data_collator, training_args):
    """
    Builds a Trainer from the given pieces and runs training.

    Args:
    model: The model to be trained.
    train_dataset (Dataset): Training dataset.
    eval_dataset (Dataset): Evaluation dataset.
    data_collator: Data collator for language modeling.
    training_args: Training arguments.
    """
    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "data_collator": data_collator,
    }
    Trainer(**trainer_kwargs).train()




In [None]:
if __name__ == "__main__":
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

    # GPT-2 has no padding token by default; reuse EOS so that the tokenizer
    # and the data collator can pad batches without raising.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load and process dataset
    processed_data = load_and_process_dataset()
    save_processed_data(processed_data, "alpaca_processed.jsonl")

    # Load processed dataset
    dataset = load_processed_dataset("alpaca_processed.jsonl")

    # Tokenize dataset
    # `tokenizer` and `block_size` are module-level names read by
    # preprocess_text and group_texts respectively.
    block_size = 128
    tokenized_dataset = dataset.map(preprocess_text, batched=True, remove_columns=dataset.column_names)

    # Group texts
    tokenized_dataset = tokenized_dataset.map(group_texts, batched=True)

    # Split dataset (seeded so the split is reproducible, matching the training seed)
    dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset = dataset['train']
    val_dataset = dataset['test']

    # Initialize data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Define training arguments
    training_args = TrainingArguments(
        # Set the path where you want to store the model. To store it in
        # Google Drive, mount the drive first in Google Colab.
        output_dir="/content/drive/MyDrive/gpt_conversation",  # Directory to save model checkpoints and logs
        overwrite_output_dir=True,  # Overwrite the content of the output directory if it exists

        num_train_epochs=1,  # Total number of training epochs

        per_device_train_batch_size=32,  # Batch size per GPU during training
        per_device_eval_batch_size=32,  # Batch size per GPU during evaluation

        learning_rate=2.0e-5,  # Learning rate for the optimizer
        warmup_steps=1000,  # Number of steps for linear warmup
        logging_dir="./logs",  # Directory where logs will be saved
        logging_steps=100,  # Log training metrics every X steps

        # NOTE(review): newer transformers releases rename this argument to
        # `eval_strategy` -- confirm against the installed version.
        evaluation_strategy="steps",  # Evaluate model every `eval_steps`
        eval_steps=2000,  # Evaluate model every X steps
        save_strategy="steps",  # Save model every `save_steps`
        save_steps=300,  # Save model every X steps

        save_total_limit=2,  # Limit the total number of saved checkpoints
        report_to="wandb",  # Enable logging to wandb
        seed=42,  # Random seed for reproducibility
        disable_tqdm=False,  # Keep tqdm progress bars enabled
    )

    # Set wandb configuration
    os.environ["WANDB_PROJECT"] = "Gpt_conversation"
    os.environ["WANDB_LOG_MODEL"] = "true"
    os.environ["WANDB_WATCH"] = "false"

    # Train the model
    train_model(model, train_dataset, val_dataset, data_collator, training_args)
