
##### This notebook demonstrates a sample project to generate cover letters based on job descriptions and candidate's information.
    It involves the following steps:
    1. Extract text from resumes, cover letters, and project documents.
    2. Encode the texts into embeddings using Sentence Transformers.
    3. Find relevant texts based on cosine similarity to the job description.
    4. Generate a cover letter using GPT-2 based on the relevant texts and job description.




In [None]:
# Import necessary libraries
from IPython import get_ipython
from IPython.display import display
# %%
# Import libraries for PDF and DOCX handling
import PyPDF2
from docx import Document
import os


In [None]:
# Mount Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Define functions to extract text from PDF and DOCX files and from a folder

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ''
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return text

def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    text = [paragraph.text for paragraph in doc.paragraphs if paragraph.text]
    return '\n'.join(text)

def extract_texts_from_folder(folder_path):
    texts = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
            texts.append(text)
        elif filename.endswith('.docx'):
            text = extract_text_from_docx(file_path)
            texts.append(text)
    return texts

# Paths to your folders in Google Drive
resume_path = '/content/drive/My Drive/RAG cover letter tool/Resumes'
cover_letter_path = '/content/drive/My Drive/RAG cover letter tool/Cover Letters'
project_path = '/content/drive/My Drive/RAG cover letter tool/Previous projects'

# Extract texts
resume_texts = extract_texts_from_folder(resume_path)
cover_letter_texts = extract_texts_from_folder(cover_letter_path)
project_texts = extract_texts_from_folder(project_path)

print('Resumes:', resume_texts)
print('Cover Letters:', cover_letter_texts)
print('Projects:', project_texts)


Resumes: ["Rutvi Vadera\nChicago, United States, +1 (872) 297-7438, vaderarutvi1996@gmail.com\nL I N K S LinkedIn, T ech Blog, Github\nP R O F I L E Results-driven professional with extensive experience in program management, event execution, and social \nentrepreneurship. Proven track record in developing innovative programs, optimizing operations, & engaging \ndiverse stakeholders. Passionate about driving social impact and enhancing program effectiveness. Excited to \ncontribute to the Rustandy Center’s mission of advancing social entrepreneurship.\nP R O F E S S I O N A L  E X P E R I E N C E \nJun 2023 — Present Consultant, Corporate Strategy, MPOWER Financing Chicago, Il\n•Developed a go-to-market strategy for the high-risk Canadian college segment, identifying 11 colleges and \n45 programs through rigorous analysis, resulting in a $460M TAM increase.\n•Designed an impact measurement framework and piloted surveys to track employee diversity and student \noutcomes Published findin

In [None]:
# Install the sentence-transformers library
!pip install sentence-transformers




In [None]:
# Load sentence transformer model and tokenizer
from transformers import AutoTokenizer, AutoModel
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sentence_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") # Changed model to sentence_model

# Define a function to encode texts into embeddings
def encode_texts(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = sentence_model(**encoded_input)
    return model_output.last_hidden_state.mean(dim=1)

# Combine all texts and labels
all_texts = resume_texts + cover_letter_texts + project_texts
labels = ['resume'] * len(resume_texts) + ['cover_letter'] * len(cover_letter_texts) + ['project'] * len(project_texts)

# Encode documents
documents_embeddings = encode_texts(all_texts)


In [None]:
# Define a function to find relevant texts based on cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch

def find_relevant_texts(query, embeddings, documents, top_n=5):
    # Encode the query using the tokenizer and sentence_model
    encoded_input = tokenizer([query], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        query_embedding = sentence_model(**encoded_input).last_hidden_state.mean(dim=1)

    cosine_scores = cosine_similarity(query_embedding, embeddings)[0]
    top_indices = np.argsort(cosine_scores)[::-1][:top_n]
    return [(documents[i], cosine_scores[i]) for i in top_indices]


In [None]:
# Example job description
job_description = "Seeking a data scientist with expertise in machine learning and natural language processing."

# Find the top 5 relevant texts for the job description
relevant_texts = find_relevant_texts(job_description, embeddings, all_texts)
for text, score in relevant_texts:
    print(f"Score: {score}\nText: {text[:500]}...\n")  # Print a snippet of each text


Score: 0.39548730850219727
Text: Dear
Hiring
Manager,
I
am
excited
to
apply
for
the
Market
Research
Operations
Associate
position
at
Putnam.
With
a
Master’s
in
Public
Policy
and
a
specialization
in
Data
Analytics
from
the
University
of
Chicago,
I
have
honed
a
robust
skill
set
in
research,
data
analysis,
and
project
management,
making
me
an
excellent
fit
for
this
role.
I
have
also
taken
advanced
PhD-level
courses
in
microeconomics
and
statistics,
which
have
further
sharpened
my
analytical
skills
and
ability
to
break
down
complex...

Score: 0.39129337668418884
Text: Dear
Hiring
Manager,
I’m
excited
to
apply
for
the
Capacity
Intern
role
at
WRI’s
Global
Restoration
Initiative
where
I
can
leverage
my
skills
in
strategy,
operations,
and
project
management
to
contribute
to
your
mission
of
restoring
degraded
land
and
building
capacity
for
locally
led
enterprises.
My
background
in
public
policy,
data
analytics,
and
grassroots
activism
equips
me
well
for
this
role.
The
TerraFund
and
Land
Acceler

In [None]:
from transformers import pipeline
# Set environment variable for Hugging Face Transformers
import os
os.environ["HF_HOME"] = "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"


In [None]:
from transformers import pipeline

# Initialize the text generation model pipeline
generator = pipeline('text-generation', model='gpt2', tokenizer='gpt2')  # You can specify the model and tokenizer

def generate_cover_letter(context, job_desc):
    # Check if context is a string and if not, convert it to a string
    if not isinstance(context, str):
        context = str(context)

    # Truncate the context to a shorter length
    max_context_length = 256  # Example: Limit context to 256 tokens
    context = context[:max_context_length]

    prompt = f"Context: {context}\nJob Description: {job_desc}\n###\nPlease write a detailed cover letter:"
    results = generator(prompt, max_new_tokens=512, num_return_sequences=1) # Use max_new_tokens instead of max_length
    return results[0]['generated_text']

# Using the relevant texts as context
# Ensure that context is a string by joining the text elements of relevant_texts
context = " ".join([text for text, _ in relevant_texts])  # Concatenate relevant texts to form the context
job_description = "Seeking a data scientist with expertise in machine learning and natural language processing."

# Generate the cover letter
cover_letter = generate_cover_letter(context, job_description)
print(cover_letter)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context: Dear
Hiring
Manager,
I
am
excited
to
apply
for
the
Market
Research
Operations
Associate
position
at
Putnam.
With
a
Master’s
in
Public
Policy
and
a
specialization
in
Data
Analytics
from
the
University
of
Chicago,
I
have
honed
a
robust
skill
set
in
research,
Job Description: Seeking a data scientist with expertise in machine learning and natural language processing.
###
Please write a detailed cover letter: http://lms.cps.nhgate.edu/
Email me: cps@nyct.edu


Purdue

Hiring Information:

Department

Office of

Communications

Office

(202) 687-4044

Please write to

Hired,

Purdue Center for

Communications and Information Administration,

H

1 Grand Lake Dr

Purdue,

Indiana 47315

Telephone:

217-638-3855

Fax:

216-638-3859

Email


University of Chicago

Hiring Information:

Department

Office of

Communications and Information Administration

Office

2-8-7

Farewell

Purdue,

Illinois 48221

Telephone:

(217) 722-7544

Fax:

(217) 722-6413

Email


University of Chicago

Hir

In [50]:
!echo "# RAG-project" >> README.md
!git init
!git add README.md
!git commit -m "first commit"
!git branch -M main
!git remote add origin https://github.com/rutvi1996/RAG-project.git
!git push -u origin main

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/
Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@c484777c6c26.(none)')
error: src refspec main does not match any
[31merror: failed to push some refs to 'https://github.com/rutvi1996/RAG-project.git'