<a href="https://colab.research.google.com/github/pranalibose/Resume_Analyser/blob/main/RA_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step 1: Install Required Libraries

In [None]:
#!pip install transformers datasets peft accelerate

In [None]:
#!pip install rouge-score

## Step 2: Load and Preprocess the Data

In [None]:
import json
from datasets import Dataset

# Load the JSON data: a list of records, each providing the "resume_text",
# "instruction" and "feedback" keys read below.
# The encoding is given explicitly so the file is decoded the same way on any platform.
with open('/content/LangVision/resume_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to a Hugging Face Dataset with one column per record field
dataset = Dataset.from_dict({
    "resume_text": [item["resume_text"] for item in data],
    "instruction": [item["instruction"] for item in data],
    "feedback": [item["feedback"] for item in data]
})

# Split the dataset into training and evaluation sets.
# A fixed seed keeps the 90/10 partition identical across kernel restarts, so
# evaluation numbers from different runs are comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

## Step 3: Tokenize the Data

In [None]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

def preprocess_function(examples):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        examples: batch mapping with 'instruction', 'resume_text' and 'feedback'
            columns, each a list of strings (as supplied by ``Dataset.map``
            with ``batched=True``).

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens)
        with an added "labels" entry holding the feedback token ids
        (padded/truncated to 128 tokens, padding positions set to -100).
    """
    inputs = [f"Instruction: {inst}\nResume: {resume}" for inst, resume in zip(examples['instruction'], examples['resume_text'])]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize targets
    labels = tokenizer(examples['feedback'], max_length=128, truncation=True, padding="max_length")

    # Replace padding ids with -100 so the loss ignores padded label positions;
    # otherwise the model is also trained to predict the padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['resume_text', 'instruction', 'feedback', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 27
    })
    test: Dataset({
        features: ['resume_text', 'instruction', 'feedback', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})

## Step 4: Load the Base Model (LoRA Currently Disabled)

In [None]:
from transformers import T5ForConditionalGeneration
from peft import get_peft_model, LoraConfig, TaskType

# Load the base model
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# NOTE: the LoRA setup below is commented out, so `model` remains the plain
# model loaded above (no PEFT wrapper) when later cells use it.
# # Setup LoRA configuration
# lora_config = LoraConfig(
#     r=8,  # Rank of the LoRA
#     lora_alpha=16,  # Scaling factor
#     lora_dropout=0.1,  # Dropout rate
#     task_type=TaskType.SEQ_2_SEQ_LM
# )

# # Wrap the model with LoRA
# model = get_peft_model(model, lora_config)

## Step 5: Fine-Tune the Model

In [None]:
import torch
# Ask PyTorch to release its cached CUDA memory before the training cell runs.
torch.cuda.empty_cache()

In [None]:
from transformers import Trainer, TrainingArguments

# Alternative settings that were kept here as commented-out code (not in use):
#   output_dir="./flan-t5-lora-resume-feedback", evaluation_strategy="epoch",
#   learning_rate=5e-5, report_to="wandb", per_device_train_batch_size=8,
#   per_device_eval_batch_size=8, num_train_epochs=3, weight_decay=0.01,
#   save_total_limit=2, max_grad_norm=1.0, logging_dir="./logs",
#   logging_steps=10, fp16=True

training_args = TrainingArguments(
    # --- output, checkpoints and Hub ---
    output_dir="./flan-t5-resume-feedback",
    save_steps=500,
    save_total_limit=2,
    push_to_hub=False,
    hub_model_id="pranalibose/flan-t5-base",
    # --- evaluation and logging ---
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    # --- optimisation ---
    learning_rate=3e-5,  # deliberately low learning rate
    weight_decay=0.01,
    max_grad_norm=1.0,  # clip gradients at this norm
    num_train_epochs=5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
)

# Train on the "train" split and evaluate on the "test" split of the tokenized data.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Run the fine-tuning loop
trainer.train()

  trainer = Trainer(


HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67ab85fd-3cecfc576e02b26823984070;da7d900f-2092-4b97-8aef-da3742315bdd)

Invalid username or password.

In [None]:
# Save the tokenizer files to a local directory named after the Hub repo id
# ("pranalibose/flan-t5-base", the same id used as hub_model_id in the training
# arguments). Called with only a path, this writes to disk; it does not upload.
tokenizer.save_pretrained("pranalibose/flan-t5-base")

## Step 6: Evaluate the model

In [None]:
def evaluate_model(model, tokenizer, examples, max_input_length=512, max_output_length=512):
    """Generate feedback for each example and report average ROUGE-1 / ROUGE-L.

    Args:
        model: seq2seq model providing ``eval()``, ``generate()`` and ``device``.
        tokenizer: tokenizer used to encode the prompts and decode the generations.
        examples: iterable of mappings with 'instruction', 'resume_text' and
            'feedback' keys.
        max_input_length: prompts longer than this many tokens are truncated.
            Defaults to 512, the limit used when the training data is tokenized.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        dict with keys "rouge1" and "rougeL" holding the F-measures averaged
        over all examples. The same two values are also printed.

    Raises:
        ValueError: if ``examples`` contains no items.
    """
    # Materialise first so an empty input fails fast with a clear message
    # instead of a ZeroDivisionError when averaging.
    examples = list(examples)
    if not examples:
        raise ValueError("evaluate_model() needs at least one example to score.")

    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)  # Initialize Rouge scorer

    # Make sure dropout is off so generation does not depend on the mode the
    # model was left in by a previous training step.
    model.eval()

    results = []
    for example in examples:
        input_text = f"Instruction: {example['instruction']}\nResume: {example['resume_text']}"
        # Truncate like the training preprocessing and move tensors to the model's device.
        encoded = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=max_input_length,
            truncation=True,
        ).to(model.device)

        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=max_output_length,
        )
        predicted_feedback = tokenizer.decode(outputs[0], skip_special_tokens=True)
        target_feedback = example['feedback']

        results.append(scorer.score(target_feedback, predicted_feedback))

    # Calculate average Rouge scores
    avg_rouge1 = sum(score['rouge1'].fmeasure for score in results) / len(results)
    avg_rougel = sum(score['rougeL'].fmeasure for score in results) / len(results)

    print(f"Average Rouge-1: {avg_rouge1}")
    print(f"Average Rouge-L: {avg_rougel}")

    return {"rouge1": avg_rouge1, "rougeL": avg_rougel}

# Score only the held-out split. `data` holds every record, including the ones
# the model was trained on, so evaluating on it would overstate quality.
evaluate_model(model, tokenizer, dataset["test"])

Average Rouge-1: 0.13262157678558573
Average Rouge-L: 0.12092715778679111


## Step 7: Test on a Sample Resume

In [None]:
# One hand-written resume and instruction used as a smoke test for generation.
sample_resume = "Summary: Experienced software engineer with 5+ years of expertise in Python, Java, and cloud technologies. Skills: Python, Java, AWS, SQL, Docker. Experience: Software Engineer at XYZ Corp (2018-2023). Education: B.Tech in Computer Science from ABC University."
sample_instruction = "Highlight the candidate's experience with data analysis and suggest specific projects or skills that are relevant to a data scientist role."

# Encode the prompt (padded/truncated to 512 tokens) as PyTorch tensors ...
inputs = tokenizer(
    f"Instruction: {sample_instruction}\nResume: {sample_resume}",
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
# ... and place those tensors on the device the model lives on.
inputs = inputs.to(model.device)

# Generate at most 128 tokens, passing the attention mask so padding is ignored.
outputs = model.generate(
    attention_mask=inputs["attention_mask"],
    input_ids=inputs["input_ids"],
    max_length=128,
)

# Turn the first generated sequence back into text and show it.
generated_feedback = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated Feedback: {generated_feedback}")

Generated Feedback: Must have a minimum of 5 years of experience with data analysis and a minimum of 5 years of experience with cloud technologies.


## Step 8: Push the Model to the Hub

In [None]:
import os
from getpass import getpass

# Hugging Face tooling looks for the access token in the HF_TOKEN environment
# variable (the name also shown in the Colab secrets warning); HF_AUTH_TOKEN is
# not read, which leaves Hub requests unauthenticated.
# The token is prompted for instead of being written into the notebook, so it
# never ends up in a saved or shared copy. An already-set HF_TOKEN is kept.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token (write access): ")

In [None]:
# Upload the trainer's saved model to the Hugging Face Hub (requires valid
# Hub credentials; an earlier cell output in this notebook shows a 401 error).
trainer.push_to_hub()
# NOTE(review): called with only a path, save_pretrained() appears to write the
# tokenizer files to a local directory rather than upload them. Confirm whether
# a separate tokenizer push is intended here.
tokenizer.save_pretrained("pranalibose/flan-t5-base")