# Downloading and importing required libraries

In [None]:
!pip install transformers datasets torch
!pip install transformers huggingface_hub



In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook source: it would be saved with
# the file and leaked when the notebook is shared. Read it from the HF_TOKEN
# environment variable when set, otherwise ask for it with a hidden prompt.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments,EarlyStoppingCallback
import torch
import json
import numpy as np

# Setting up the model for fine-tuning

In [None]:
# Pick the compute device: CUDA when PyTorch reports it is available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Base checkpoint that will be fine-tuned; tokenizer and weights come from the
# same Hub repository so their vocabularies match.
base_checkpoint = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(base_checkpoint)

# Load the pretrained weights, then place the model on the selected device.
model = T5ForConditionalGeneration.from_pretrained(base_checkpoint)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Preparing the data on which the model will be fine-tuned

In [None]:
# Load the manually created QA records. The file holds JSON, so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('/content/dataset_manual_creation.txt', 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

# Fail early with a readable message if a required field is absent, rather than
# with a KeyError raised from inside the row-wise lambda below.
missing_columns = {'question', 'context', 'answer'} - set(df.columns)
if missing_columns:
    raise KeyError(f"dataset is missing required column(s): {sorted(missing_columns)}")

# Build the seq2seq pair: the prompt combines question and context, the target
# is the reference answer.
df['input_text'] = df.apply(lambda x: f"Question: {x['question']} Context: {x['context']}", axis=1)
df['target_text'] = df['answer']
dataset = Dataset.from_pandas(df[['input_text', 'target_text']])

In [None]:
def preprocess_function(examples):
    """Tokenize prompts and targets for seq2seq fine-tuning.

    Args:
        examples: A mapping with ``'input_text'`` and ``'target_text'``. Each
            value is either a list of strings (as passed by
            ``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels`` as plain
        Python lists: one row per example for batched input, or a single row
        for a single example. Every row is padded/truncated to 512 tokens.
    """
    prompts = examples['input_text']
    is_single_example = isinstance(prompts, str)

    # No return_tensors / squeeze(): squeeze() drops the batch dimension when a
    # batch contains exactly one example, which corrupts that batch. Plain
    # lists keep their shape for any batch size.
    model_inputs = tokenizer(prompts,
                             max_length=512,
                             truncation=True,
                             padding="max_length",
                             return_attention_mask=True,
                             add_special_tokens=True,
                             )
    targets = tokenizer(examples["target_text"],
                        max_length=512,
                        truncation=True,
                        padding="max_length")

    # Replace padding in the labels with -100 so the loss ignores those
    # positions; otherwise the model is mostly trained to emit pad tokens.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        return [tok if tok != pad_id else -100 for tok in token_ids]

    if is_single_example:
        labels = _mask_padding(targets["input_ids"])
    else:
        labels = [_mask_padding(row) for row in targets["input_ids"]]

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels,
    }

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1045 [00:00<?, ? examples/s]

In [None]:
# Expose only the model inputs, as torch tensors, to the Trainer.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Hold out 10% for evaluation. The seed is fixed so that a restarted kernel
# reproduces the same split and evaluation losses stay comparable across runs.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Training

In [None]:

training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # requires the two strategies to match.
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    # State explicitly which metric drives early stopping and best-checkpoint
    # selection (lower evaluation loss is better).
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # Keep only the most recent checkpoints; the best one is always retained
    # when load_best_model_at_end is set, so the disk does not fill up.
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Stop when the evaluation loss has not improved for 2 consecutive epochs.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


trainer.train()

# Saving the model and uploading it to Hugging Face

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the model and tokenizer into the same directory. One absolute path is
# used for both so they land together even when the working directory is not
# /content; the upload step reads both from this location.
save_dir = '/content/flan_t5_finetuned'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Hub repository that receives the fine-tuned weights and tokenizer files.
model_name = "mognc/t5_7_epoch"
_local_checkpoint = '/content/flan_t5_finetuned'

# Reload both artifacts from the local save directory.
model = T5ForConditionalGeneration.from_pretrained(_local_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(_local_checkpoint)

# Upload the model first, then the tokenizer, to the same repository.
for _artifact in (model, tokenizer):
    _artifact.push_to_hub(model_name)

print(f"Model and tokenizer have been uploaded to Hugging Face Hub as '{model_name}'.")


# Loading the model from Hugging Face

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pull the published fine-tuned checkpoint; tokenizer and model come from the
# same Hub repository.
hub_repo_id = "mognc/t5_7_epoch"

tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(hub_repo_id)

# Functions to generate answers

In [None]:
# This function takes only a query and generates an answer
def generate_answer(query):
    """Generate an answer for ``query`` without any supporting context.

    Args:
        query: The question text.

    Returns:
        The decoded answer string with special tokens removed. Generation
        samples (``do_sample=True``), so repeated calls may return different
        text.
    """
    # Use the same "Question:" prefix (including capitalisation) that the
    # fine-tuning prompts use; the tokenizer is case-sensitive.
    input_text = f"Question: {query}"
    encoded = tokenizer(
        input_text,
        max_length=300,
        truncation=True,
        add_special_tokens=True,
        return_tensors='pt',
    )
    # Put the inputs on the model's device so this also works on a GPU.
    encoded = encoded.to(model.device)

    outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=512,
        num_beams=9,
        temperature=1.0,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# This function takes both a query and a context and generates an answer
def generate_answer_context(query, context, max_input_length=3000):
    """Generate an answer for ``query`` grounded in ``context``.

    Args:
        query: The question text.
        context: The supporting passage as a single string.
        max_input_length: Prompts longer than this many tokens are truncated.
            The default keeps the previous limit of 3000. Note that the
            fine-tuning prompts were truncated at 512 tokens.

    Returns:
        The decoded answer string with special tokens removed. Generation
        samples (``do_sample=True``), so repeated calls may return different
        text.
    """
    # Use the same "Question: ... Context: ..." layout (including
    # capitalisation) as the fine-tuning prompts; the tokenizer is
    # case-sensitive.
    input_text = f"Question: {query} Context: {context}"
    # No padding: a single prompt needs none, and padding to the maximum length
    # would make the encoder process thousands of masked pad tokens per call.
    encoded = tokenizer(
        input_text,
        max_length=max_input_length,
        truncation=True,
        add_special_tokens=True,
        return_tensors='pt',
    )
    # Put the inputs on the model's device so this also works on a GPU.
    encoded = encoded.to(model.device)

    outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=512,
        num_beams=9,
        early_stopping=True,
        temperature=1.0,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test

In [None]:
# Smoke test for the question-only helper: no context passage is supplied.
query = "What does Viserys suggest about how Daenerys should view her marriage to Drogo?"
answer = generate_answer(query)
print("Generated Answer:", answer)

Generated Answer: She should view her marriage as a reversal of the past, rather than a reversal of the past.


In [None]:
query = "What does Viserys suggest about how Daenerys should view her marriage to Drogo?"

# Pass ONE context passage as a string. Passing the whole df['context'] column
# would format the Series repr (index, truncated rows, dtype footer) into the
# prompt instead of a passage.
matching_contexts = df.loc[df['question'] == query, 'context']
if not matching_contexts.empty:
    # Use the passage that is paired with this exact question in the dataset.
    context = matching_contexts.iloc[0]
else:
    # The question is not in the dataset verbatim; fall back to the first
    # passage so the cell still demonstrates the call.
    context = df['context'].iloc[0]

print(generate_answer_context(query, context))

He suggests that Daenerys should view her marriage to Drogo as a victory for the Seven King.
