In [1]:
import pymupdf

In [2]:
def load_pdf(pdf_file_path):
    """Extract the plain text of every page in a PDF and join it into one string.

    Args:
        pdf_file_path: Path of the PDF to read; passed straight to
            ``pymupdf.open``.

    Returns:
        str: The text of each page, in page order, joined with ``"\\n"``.
    """
    # Open the path supplied by the caller (not a fixed file name). The context
    # manager closes the document even if text extraction raises.
    with pymupdf.open(pdf_file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

In [3]:
# Read the resume PDF into a single newline-joined string.
docs = load_pdf(pdf_file_path="resume.pdf")

In [4]:
# Display the extracted text for inspection.
docs

'Raj Kishor Naruka \nPython | Data | Development \nI am an aspiring Data Scientist with 1+ year experience as an Associate Business Analyst, skilled in data analysis, SQL, Python, AWS, machine learning,\nand NLP. I am eager to leverage NLP, ML, and deep learning skills to contribute to a data-driven team. \nrajnaruka0698@gmail.com \n0403905464 \nMelbourne, Australia \nwww.kaggle.com/rajnaruka0698 \nlinkedin.com/in/raj-naruka \ngithub.com/rajnaruka06 \nEDUCATION \nMaster of Data Science \nSwinburne University of Technology/ Melbourne \n02/2023 - Present,  \nBachelor of Technology (cse) \nLovely Professional University/ Punjab \n07/2016 - 06/2021,  \n76.00% \nPERSONAL PROJECTS \nPrivacy Protector (08/2023 - 10/2023) \nDeployed a Finetuned pretrained BERT on privacy policy documents on streamlit local to\nclassify as acceptable or non acceptable with an f1 score of 0.86 \nTechnologies used: Python, Transformers API from huggingface, sklearn, NLP, Streamlit \nStoryGPT - Text Generation Mod

In [5]:
## Function for chunking the text into smaller parts
def chunk_text(text, chunk_size=1000, overlap=100):
    """Split ``text`` into fixed-size character chunks that overlap their neighbours.

    Consecutive chunks start ``chunk_size - overlap`` characters apart. Chunking
    stops as soon as a chunk reaches the end of ``text``, so no trailing chunk
    is produced that only repeats the tail of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        list[str]: Chunks in document order. Empty ``text`` yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk covers the end of the text, any further chunk would
        # contain only characters already present in this one.
        if end >= len(text):
            break
    return chunks

In [6]:
# Split the text into 100-character chunks with a 10-character overlap, then display them.
chunked_docs = chunk_text(docs, chunk_size=100, overlap=10)
chunked_docs

['Raj Kishor Naruka \nPython | Data | Development \nI am an aspiring Data Scientist with 1+ year experie',
 'ar experience as an Associate Business Analyst, skilled in data analysis, SQL, Python, AWS, machine ',
 ', machine learning,\nand NLP. I am eager to leverage NLP, ML, and deep learning skills to contribute ',
 'ontribute to a data-driven team. \nrajnaruka0698@gmail.com \n0403905464 \nMelbourne, Australia \nwww.kag',
 'a \nwww.kaggle.com/rajnaruka0698 \nlinkedin.com/in/raj-naruka \ngithub.com/rajnaruka06 \nEDUCATION \nMast',
 'TION \nMaster of Data Science \nSwinburne University of Technology/ Melbourne \n02/2023 - Present,  \nBa',
 'sent,  \nBachelor of Technology (cse) \nLovely Professional University/ Punjab \n07/2016 - 06/2021,  \n7',
 '/2021,  \n76.00% \nPERSONAL PROJECTS \nPrivacy Protector (08/2023 - 10/2023) \nDeployed a Finetuned pret',
 'tuned pretrained BERT on privacy policy documents on streamlit local to\nclassify as acceptable or no',
 'able or non acceptable with 

In [7]:
from sentence_transformers import SentenceTransformer
import tqdm
import torch

  from tqdm.autonotebook import tqdm, trange


In [8]:
## Create a vector representation for each chunk

def create_embeddings(chunks, model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Encode every chunk and map each chunk's text to its embedding.

    Args:
        chunks: Collection of text chunks. It is traversed twice (once to
            encode, once to build the mapping).
        model_name: Name passed to ``SentenceTransformer`` to load the encoder.

    Returns:
        dict: ``{chunk_text: embedding}`` where each embedding is the result of
        ``model.encode(chunk, convert_to_tensor=True)``. Repeated chunk texts
        collapse into a single key.
    """
    encoder = SentenceTransformer(model_name)

    # Encode one chunk at a time so the progress bar advances per chunk.
    vectors = [
        encoder.encode(piece, convert_to_tensor=True)
        for piece in tqdm.tqdm(chunks, desc = "Creating embeddings")
    ]

    # Pair every chunk with the vector computed for it.
    return dict(zip(chunks, vectors))

In [9]:
# Embed every chunk, then save the text -> embedding dict to "embeddings.pt".
embeddings = create_embeddings(chunked_docs)
torch.save(embeddings, "embeddings.pt")

No sentence-transformers model found with name TinyLlama/TinyLlama-1.1B-Chat-v1.0. Creating a new one with mean pooling.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Creating embeddings: 100%|██████████| 37/37 [00:01<00:00, 29.27it/s]


In [10]:
## Retrieve top k similar chunks

def retrieve_topk_similar(query, embeddings, k=5, model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Return the ``k`` chunk texts whose embeddings are most similar to ``query``.

    Args:
        query: Text to search for; encoded with the model named ``model_name``.
        embeddings: Mapping of chunk text to its embedding tensor.
        k: Maximum number of texts to return.
        model_name: Name passed to ``SentenceTransformer`` to load the encoder.

    Returns:
        list[str]: Chunk texts ordered from highest to lowest cosine similarity.
    """
    encoder = SentenceTransformer(model_name)
    # Add the leading batch dimension once; every comparison reuses it.
    query_row = encoder.encode(query, convert_to_tensor=True).unsqueeze(0)

    scored = [
        (text, torch.nn.functional.cosine_similarity(query_row, vector.unsqueeze(0)).item())
        for text, vector in embeddings.items()
    ]
    # The sort is stable, so equal scores keep the mapping's iteration order.
    scored.sort(key = lambda pair: pair[1], reverse = True)
    return [text for text, _ in scored[:k]]

In [11]:
# Retrieve the chunks most similar to a sample question (default k=5) and display them.
query = "What are the skills of the candidate?"
topk_similar = retrieve_topk_similar(query, embeddings)
topk_similar

No sentence-transformers model found with name TinyLlama/TinyLlama-1.1B-Chat-v1.0. Creating a new one with mean pooling.


['ar experience as an Associate Business Analyst, skilled in data analysis, SQL, Python, AWS, machine ',
 'sting and development. \nDeveloped machine learning models for tasks including clustering and custome',
 'to design\nand implementation. \nDeveloped SQL scripts for generating Power BI reports, ensuring data ',
 ', machine learning,\nand NLP. I am eager to leverage NLP, ML, and deep learning skills to contribute ',
 'imulate data environments and eﬃciently populate\nthe MySQL database with dummy data for testing and ']

In [12]:
## Create context to pass to the LLM model

def create_context(query, topk_similar):
    """Join the retrieved chunks into one labelled context string.

    Args:
        query: Not used by this function.
        topk_similar: Iterable of chunk texts, in the order they should appear.

    Returns:
        str: ``"Document1: <text> Document2: <text> ..."`` with entries numbered
        from 1 and separated by a single space; ``""`` when there are no chunks.
    """
    labelled = []
    for position, chunk in enumerate(topk_similar, start=1):
        labelled.append(f"Document{position}: {chunk}")
    return " ".join(labelled)

In [13]:
# Build the labelled context string from the retrieved chunks and display it.
context = create_context(query, topk_similar)
context

'Document1: ar experience as an Associate Business Analyst, skilled in data analysis, SQL, Python, AWS, machine  Document2: sting and development. \nDeveloped machine learning models for tasks including clustering and custome Document3: to design\nand implementation. \nDeveloped SQL scripts for generating Power BI reports, ensuring data  Document4: , machine learning,\nand NLP. I am eager to leverage NLP, ML, and deep learning skills to contribute  Document5: imulate data environments and eﬃciently populate\nthe MySQL database with dummy data for testing and '

In [14]:
## Prompt the LLM model

# Template filled in by answer_question via str.format; it expects the named
# fields "context" and "query". The template itself contains no "answer:"
# marker, although the final cell splits the generated text on that string.
prompt = """Please answer the following question based on the given context.
context: {context}
question: {query}
"""

In [15]:
## Use LLM to answer the question

from transformers import pipeline

def answer_question(query, context, model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """Run a text-generation pipeline on the module-level ``prompt`` template.

    Args:
        query: Question inserted into the template's ``query`` field.
        context: Text inserted into the template's ``context`` field.
        model_name: Model identifier passed to ``pipeline``.

    Returns:
        The ``"generated_text"`` entry of the first result returned by the
        pipeline.
    """
    generator = pipeline("text-generation", model=model_name)
    message = prompt.format(context = context, query = query)
    outputs = generator(message, max_length=1000)
    return outputs[0]["generated_text"]


In [16]:
# Ask the sample question and keep only the text that follows the "answer:" marker.
query = "What are the skills of the candidate?"
# Keep the raw generation under its own name so re-running this cell is idempotent.
raw_answer = answer_question(query, context)
# The model is not guaranteed to emit "answer:"; fall back to the full
# generated text instead of raising IndexError when the marker is missing.
answer_parts = raw_answer.split("answer:")
ans = (answer_parts[1] if len(answer_parts) > 1 else raw_answer).strip()
ans

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'The candidate has experience as an Associate Business Analyst, skilled in data analysis, SQL, Python, AWS, machine learning, and NLP. They have developed machine learning models for tasks including clustering and customization, developed SQL scripts for generating Power BI reports, ensuring data quality, and leveraging NLP, ML, and deep learning skills to contribute to the development of data environments and efficiently populate the MySQL database with dummy data for testing and evaluation.'