<a href="https://colab.research.google.com/github/shubha07m/LLM-initials/blob/main/shonku.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Using a model without tuning for Q and A

In [105]:
# Importing library
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import os
from PyPDF2 import PdfReader
import re
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer
from nltk.tokenize import sent_tokenize
from lmqg import TransformersQG
import spacy
nlp = spacy.load("en_core_web_sm")
import ast
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments, pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [70]:
# Load the model and tokenizer

model_name = "google/flan-t5-base"  # You can use 'small' if 'base' is too large
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [71]:
# Move the model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [72]:
# Creating a function for simple Q and A

def answer_question(question):
    # Construct a prompt that encourages detailed answers
    prompt = f"""You are an AI assistant answering questions about a resume. Provide a detailed answer with multiple sentences if appropriate. Include relevant context and explanations.

Question: {question}

Detailed answer:"""

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the answer with parameters encouraging longer outputs
    outputs = model.generate(
        **inputs,
        max_length=200,  # Increased max length
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        no_repeat_ngram_size=2,
        length_penalty=1.0  # Encourage slightly longer outputs
    )

    # Decode and return the answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [73]:
# Example usage
questions = [
    "What is the capital of India?",
    "Who is the CEO of Microsoft?",
    "What are my skills in NLP?"]

for question in questions:
    answer = answer_question(question)
    print(f"Q: {question}")
    print(f"A: {answer}")
    print()

Q: What is the capital of India?
A: Madhya Pradesh

Q: Who is the CEO of Microsoft?
A: Microsoft's chief executive officer is Steve Jobs.

Q: What are my skills in NLP?
A: NLP is the use of machine learning to help you understand the connections between information, processes, and information in a machine.



# Fine tuning a LLM on my profile data

In [74]:
# Installing required library
!pip install transformers lmqg PyPDF2



In [76]:
# Mounting the Google Drive

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Extraction of text data from PDF

In [77]:
# Function to read text data from PDF

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

In [78]:
# Extract text from all PDFs

pdf_path = '/content/drive/My Drive/my_resume_data'

extracted_texts = [extract_text_from_pdf(pdf_path+'/' + pdf) for pdf in os.listdir(pdf_path)]

extracted_texts[0][:15]

'Dear Employer, '

In [79]:
# Minimum text cleaning

def clean_text(text):
    # Remove special characters and extra whitespace
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Split the text into words and filter out words with more than four digits
    words = text.split()
    cleaned_words = [word for word in words if not re.search(r'\d{5,}', word)]

    # Join the cleaned words back into a single string
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

# Extracted_texts is a list of texts to be cleaned

cleaned_texts = [clean_text(text) for text in extracted_texts]
len(cleaned_texts)
cleaned_texts_string = str(cleaned_texts[0])
cleaned_texts_string[:100]

'Dear Employer In my search for new endeavors I am excited to find this opportunity with your company'

## Making sentence size chunks: different stratagy

In [80]:
# Clause-based splitting

def split_into_clauses(sentence):
    doc = nlp(sentence)
    clauses = []
    for token in doc:
        if token.dep_ == "ROOT":
            clause = ' '.join([t.text for t in token.subtree])
            clauses.append(clause)
    return clauses

long_sentence = cleaned_texts_string
clauses = split_into_clauses(long_sentence)

print(len(clauses))

for i, clause in enumerate(clauses, 1):
    print(f"Clause {i}: {clause}")

5
Clause 1: Dear Employer
Clause 2: In my search for new endeavors I am excited to find this opportunity with your company I am a recent PhD graduate May 2024 in computer systems With expertise in data science and machine learning I possess a strong foundation in computer vision NLP and MLOps I am currently located in Sunnyvale California
Clause 3: I m open to relocation and authorized to work on OPT EAD
Clause 4: My key skillsets are like below Computer Vision Developed and deployed YOLO and CNN 2D and 3D object detection segmentation and perception Applied OpenCV with Azure for real time object detection and tracking as well as for conducting in depth research in biometric privacy Utilized geospatial data visualization tools such as Google Earth Engine for large scale satellite imagery analysis and MapInfo for detailed spatial analysis Proficient in utilizing transfer learning techniques with pre trained models like ResNet VGG and MobileNet for faster convergence and improved accurac

In [81]:
# Punctuation-based splitting

def split_at_punctuation(sentence):
    return re.split(r'[,;()]', sentence)

segments = split_at_punctuation(long_sentence)
segments = [seg.strip() for seg in segments if seg.strip()]

print(len(segments))

for i, segment in enumerate(segments, 1):
    print(f"Segment {i}: {segment}")

1
Segment 1: Dear Employer In my search for new endeavors I am excited to find this opportunity with your company I am a recent PhD graduate May 2024 in computer systems With expertise in data science and machine learning I possess a strong foundation in computer vision NLP and MLOps I am currently located in Sunnyvale California Im open to relocation and authorized to work on OPT EAD My key skillsets are like below Computer Vision Developed and deployed YOLO and CNN 2D and 3D object detection segmentation and perception Applied OpenCV with Azure for real time object detection and tracking as well as for conducting in depth research in biometric privacy Utilized geospatial data visualization tools such as Google Earth Engine for large scale satellite imagery analysis and MapInfo for detailed spatial analysis Proficient in utilizing transfer learning techniques with pre trained models like ResNet VGG and MobileNet for faster convergence and improved accuracy in computer vision tasks Exp

In [82]:
# Fixed-length chunks with overlap

def split_into_chunks(sentence, chunk_size=10, overlap=3):
    words = sentence.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

chunks = split_into_chunks(cleaned_texts_string)

print(len(chunks), type(chunks[0]))

for i, chunk in enumerate(chunks[:10], 1):
    print(f"Chunk {i}: {chunk}")

64 <class 'str'>
Chunk 1: Dear Employer In my search for new endeavors I am
Chunk 2: endeavors I am excited to find this opportunity with your
Chunk 3: opportunity with your company I am a recent PhD graduate
Chunk 4: recent PhD graduate May 2024 in computer systems With expertise
Chunk 5: systems With expertise in data science and machine learning I
Chunk 6: machine learning I possess a strong foundation in computer vision
Chunk 7: in computer vision NLP and MLOps I am currently located
Chunk 8: am currently located in Sunnyvale California Im open to relocation
Chunk 9: open to relocation and authorized to work on OPT EAD
Chunk 10: on OPT EAD My key skillsets are like below Computer


## Convert text data to Q/A pair

In [83]:
# Question and answer generation (QAG) with language models (LMs) using lmqg
# Ref: https://github.com/asahi417/lm-question-generation

model = TransformersQG(language="en")
context = chunks
qa = model.generate_qa(context)

100%|██████████| 68/68 [00:00<00:00, 3236.93it/s]
100%|██████████| 67/67 [00:00<00:00, 3702.43it/s]


## Processing data to proper format

In [84]:
print(len(qa))
print(qa[3])
print(type(qa[0][0]), type(qa[0]), type(qa))

64
[('When was the recent PhD graduate in computer systems?', 'May 2024')]
<class 'tuple'> <class 'list'> <class 'list'>


In [85]:
# Convert to a list of dictionaries

processed_data = [
    {
        "question": item[0][0],
        "answer": item[0][1]
    }
    for item in raw_data
]

In [86]:
# Print the first few items to verify
print(type(processed_data), type(processed_data[0]))

for item in processed_data[:3]:
    print(item)

<class 'list'> <class 'dict'>
{'question': 'What is the purpose of a Dear Employer?', 'answer': 'In my search for new endeavors'}
{'question': 'What do you do with your endeavors?', 'answer': 'I am excited to find this opportunity'}
{'question': 'What type of graduate is I a recent graduate of?', 'answer': 'PhD graduate'}


In [87]:
# Split the data into training and validation sets

train_data, val_data = train_test_split(processed_data, test_size=0.2, random_state=42)

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(val_data)}")

# Print a few examples from each set to verify
print("\nTraining data examples:")
for item in train_data[:2]:
    print(item)

print("\nValidation data examples:")
for item in val_data[:2]:
    print(item)

Number of training examples: 51
Number of validation examples: 13

Training data examples:
{'question': 'What is Azure used for?', 'answer': 'real time object detection and tracking'}
{'question': 'What is NER?', 'answer': 'entity recognition'}

Validation data examples:
{'question': 'What is Kubernetes based for monitoring and visualization with Grafana?', 'answer': 'Bayesian optimization'}
{'question': 'What is a greater illustration of my background and background?', 'answer': 'learning pipelines'}


In [91]:
# Convert our data to the Dataset format

train_dataset = Dataset.from_dict({
    'question': [item['question'] for item in train_data],
    'answer': [item['answer'] for item in train_data],
    'context': [f"{item['question']} {item['answer']}" for item in train_data]  # Using Q+A as context
})

val_dataset = Dataset.from_dict({
    'question': [item['question'] for item in val_data],
    'answer': [item['answer'] for item in val_data],
    'context': [f"{item['question']} {item['answer']}" for item in val_data]  # Using Q+A as context
})

# Initializing the model and tokenizer

In [90]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name).to(device)

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization of data

In [93]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForQuestionAnswering.from_pretrained(model_name).to(device)

Using device: cuda


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [94]:
# Function for tokenization

def preprocess_function(examples):
    questions = examples["question"]
    contexts = examples["context"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    start_positions = []
    end_positions = []

    for i, context in enumerate(contexts):
        answer = examples['answer'][i]
        start_idx = context.find(answer)
        end_idx = start_idx + len(answer)

        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        offsets = inputs["offset_mapping"][i]

        start_token = context_start
        while start_token < context_end and offsets[start_token][0] <= start_idx:
            start_token += 1
        start_positions.append(start_token - 1)

        end_token = context_end
        while end_token > context_start and offsets[end_token][1] >= end_idx:
            end_token -= 1
        end_positions.append(end_token + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [95]:
# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

print("Dataset preparation complete.")
print(f"Number of training examples: {len(tokenized_train_dataset)}")
print(f"Number of validation examples: {len(tokenized_val_dataset)}")

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Dataset preparation complete.
Number of training examples: 51
Number of validation examples: 13


## Defining training argumen, training and evaluation

In [100]:
# Define training arguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    push_to_hub=False,
)

In [101]:
# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

In [102]:
# Start training
print("Starting model fine-tuning...")
trainer.train()

Starting model fine-tuning...


Epoch,Training Loss,Validation Loss
1,No log,2.007553
2,No log,1.173406
3,No log,0.634561
4,No log,0.470771
5,No log,0.473989
6,No log,0.490706
7,No log,0.479385
8,No log,0.455423
9,No log,0.517577
10,No log,0.552509


TrainOutput(global_step=200, training_loss=0.11631434440612792, metrics={'train_runtime': 102.5478, 'train_samples_per_second': 24.866, 'train_steps_per_second': 1.95, 'total_flos': 249874053811200.0, 'train_loss': 0.11631434440612792, 'epoch': 50.0})

In [103]:
import json


# Define the path to the "special_models" folder in your Google Drive
save_path = "/content/drive/My Drive/fine_tuned_resume_qa_model"

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the fine-tuned model
print("Saving the fine-tuned model...")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved successfully.")

Saving the fine-tuned model...
Model saved successfully.


In [104]:
# Evaluate the model
print("Evaluating the model...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the evaluation results
eval_results_path = os.path.join(save_path, "eval_results.json")
with open(eval_results_path, 'w') as f:
    json.dump(eval_results, f)
print(f"Evaluation results saved to {eval_results_path}")

print(f"All files have been saved to: {save_path}")

Evaluating the model...


Evaluation results: {'eval_loss': 0.6609388589859009, 'eval_runtime': 0.2062, 'eval_samples_per_second': 63.046, 'eval_steps_per_second': 4.85, 'epoch': 50.0}
Evaluation results saved to /content/drive/My Drive/fine_tuned_resume_qa_model/eval_results.json
All files have been saved to: /content/drive/My Drive/fine_tuned_resume_qa_model


In [108]:
# Path to your saved model
model_path = "/content/drive/My Drive/fine_tuned_resume_qa_model"

# Load the model and tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [109]:
# Function for question and answer
def answer_question(question):
    # Use a generic context that includes all possible information from your resume
    context = "This context contains all information from my resume."

    # Get the answer
    result = qa_pipeline(question=question, context=context)

    return result['answer']

In [110]:
# Interactive question-answering loop
print("Ask me questions about the resume. Type 'quit' to exit.")
while True:
    question = input("Your question: ")
    if question.lower() == 'quit':
        break
    answer = answer_question(question)
    print(f"Answer: {answer}\n")

Ask me questions about the resume. Type 'quit' to exit.
Your question: tell me about yourself?
Answer: my resume

Your question: what is your name?
Answer: my resume

Your question: what is your skill in NLP?
Answer: my

Your question: quit
