# Resume selection

Start by downloading the documents from the shared Google Drive folder using the Drive API.

Source used: https://developers.google.com/drive/api/quickstart/python

In [1]:
# pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [2]:
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from langchain_google_genai import GoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from google.oauth2.credentials import Credentials
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.docstore.document import Document
from dotenv import load_dotenv
from langgraph.graph import StateGraph
from langchain.memory import ConversationBufferMemory
from pydantic import BaseModel, Field
from typing import Optional, List, Tuple
import json
    
load_dotenv()

True

In [3]:
# Locations
# OAuth scopes requested when authorizing against the Drive API.
# NOTE(review): this notebook only lists and downloads files, so a read-only
# scope is presumably sufficient — confirm before narrowing.
SCOPES = ["https://www.googleapis.com/auth/drive"]

# ID of the Drive folder whose files are listed and downloaded.
FOLDER_ID = "1YVPomt1xV_GBqE0IaUsydybqUX9WhaO7"

# Local directory the downloaded files are written to.
DOWNLOAD_DIR = "app/static/files/"

In [4]:
# Authenticate function
def authenticate():
    """Return a Drive v3 service client authorized for ``SCOPES``.

    Cached credentials are read from ``token.json`` when that file exists.
    If they are missing or not valid, they are refreshed (when expired and a
    refresh token is present) or obtained through the local-server OAuth flow
    configured by ``credentials.json``; the resulting credentials are then
    written back to ``token.json``.
    """
    cached = (
        Credentials.from_authorized_user_file("token.json", SCOPES)
        if os.path.exists("token.json")
        else None
    )

    # Fast path: usable credentials need neither a refresh nor a rewrite.
    if cached and cached.valid:
        return build("drive", "v3", credentials=cached)

    if cached and cached.expired and cached.refresh_token:
        cached.refresh(Request())
        fresh = cached
    else:
        oauth_flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
        fresh = oauth_flow.run_local_server(port=0)

    with open("token.json", "w") as token_file:
        token_file.write(fresh.to_json())

    return build("drive", "v3", credentials=fresh)

In [5]:
# Get files information
def list_files_in_folder(service, folder_id):
    """List every non-trashed file directly inside a Drive folder.

    Args:
        service: Drive v3 service client (as returned by ``authenticate``).
        folder_id: ID of the folder whose children are listed.

    Returns:
        A list of ``{"id": ..., "name": ...}`` dicts, one per file; empty
        when the folder contains no files.
    """
    query = f"'{folder_id}' in parents and trashed=false"
    files = []
    page_token = None

    # files.list is paginated: keep requesting pages until the API stops
    # returning a nextPageToken, otherwise large folders are truncated.
    while True:
        results = (
            service.files()
            .list(
                q=query,
                fields="nextPageToken, files(id, name)",
                pageToken=page_token,
            )
            .execute()
        )
        files.extend(results.get("files", []))
        page_token = results.get("nextPageToken")
        if not page_token:
            break

    return files

In [6]:
# Download files
def download_file(service, file_id, file_name, download_path):
    """Download one Drive file's content into ``download_path``.

    Args:
        service: Drive v3 service client.
        file_id: ID of the file to fetch.
        file_name: Name reported for the file; used for the local file name.
        download_path: Existing directory the file is written to.
    """
    # file_name comes from the remote listing and is not guaranteed to be a
    # plain file name; flatten path separators so the file always lands
    # directly inside download_path.
    safe_name = file_name
    for separator in filter(None, {os.sep, os.altsep}):
        safe_name = safe_name.replace(separator, "_")
    file_path = os.path.join(download_path, safe_name)

    # Fetch the bytes before opening the target so a failed request does not
    # leave an empty file behind.
    content = service.files().get_media(fileId=file_id).execute()

    with open(file_path, "wb") as file:
        file.write(content)

    print(f"Downloaded: {file_name}")

In [None]:
def main():
    """Authenticate, then download every file in ``FOLDER_ID`` to ``DOWNLOAD_DIR``.

    Progress and the final confirmation are printed. Drive API errors
    (``HttpError``) are reported on stdout instead of being raised.
    """
    try:
        service = authenticate()

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)

        files = list_files_in_folder(service, FOLDER_ID)

        if not files:
            print("⚠ No files found in the folder.")
            return

        for file in files:
            download_file(service, file["id"], file["name"], DOWNLOAD_DIR)

        print("\nAll files downloaded successfully!")

    except HttpError as error:
        print(f"An error occurred: {error}")

if __name__ == "__main__":
    main()


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=551693389828-kjkq02525tj6abkin3ld307bd4f72uhb.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A47313%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=337F3tmOwQd8jwGIPfLN36Kxrnrj6G&access_type=offline
Downloaded: Resume-Samples-1-36-13.pdf
Downloaded: Resume-Samples-1-36-25.pdf
Downloaded: Resume-Samples-1-36-10.pdf
Downloaded: Resume-Samples-1-36-30.pdf
Downloaded: Resume-Samples-1-36-34.pdf
Downloaded: Resume-Samples-1-36-17.pdf
Downloaded: Resume-Samples-1-36-31.pdf
Downloaded: Resume-Samples-1-36-33.pdf
Downloaded: Resume-Samples-1-36-2.pdf
Downloaded: Resume-Samples-1-36-26.pdf
Downloaded: Resume-Samples-1-36-15.pdf
Downloaded: Resume-Samples-1-36-14.pdf
Downloaded: Resume-Samples-1-36-11.pdf
Downloaded: Resume-Samples-1-36-35.pdf
Downloaded: Resume-Samples-1-36-19.pdf
Downloaded: Resume-Samples-1-36-6.pdf
Downloaded: Resume-Samp

Now I will extract the content of each document

In [7]:
# Extract information from resumes
def extract_resume_text(folder_path):
    """Read every ``.pdf`` file in ``folder_path`` and return its text.

    Returns a list of ``{"file": <file name>, "text": <page texts joined by a
    single space>}`` dicts, in the order ``os.listdir`` yields the files.
    """
    def _read_pdf(pdf_name):
        # One loader per file; each loaded page contributes its page_content.
        loaded_pages = PyPDFLoader(os.path.join(folder_path, pdf_name)).load()
        return " ".join(loaded_page.page_content for loaded_page in loaded_pages)

    return [
        {"file": entry, "text": _read_pdf(entry)}
        for entry in os.listdir(folder_path)
        if entry.endswith(".pdf")
    ]

In [8]:
# Parse every downloaded PDF into {"file", "text"} records.
processed_resumes = extract_resume_text(DOWNLOAD_DIR)

In [7]:
# LLM client used by the prompt-building functions in this notebook; the API
# key is read from the GOOGLE_API_KEY environment variable.
llm = GoogleGenerativeAI(model="models/gemini-2.0-flash", google_api_key=os.environ.get("GOOGLE_API_KEY"))

In [None]:
def summarize_resume(resume):
    """Ask the LLM for a structured candidate profile built from one resume.

    ``resume`` is interpolated verbatim at the end of the prompt; the model's
    reply is returned with surrounding whitespace stripped.
    """
    profile_request = f"""
        Based on the following resume, generate a structured and detailed profile suitable for matching future job descriptions. Organize the output into the following sections:

        - Information: Name, email, phone number
        - Candidate Summary: A brief professional summary highlighting key strengths, experience, and career focus.
        - Core Skills: List of technical and soft skills relevant to job applications.
        - Work Experience: Structured descriptions of previous roles, including job title, company name, dates of employment, responsibilities, key achievements, and technologies used.
        - Education: Degree(s), university name, graduation year, and relevant coursework if applicable.
        - Certifications & Training: Any relevant certifications or training programs completed.
        - Projects & Portfolio: Notable projects, personal or professional, with a brief description of objectives and outcomes.
        - Industry Keywords: A list of relevant industry terms, skills, and technologies that enhance job-matching capabilities.

        If the user doesn't have any of the sections, go to the next section.
        Don't add any other random text, just the answer to my request.

        Here is the resume:

        {resume}
    """

    raw_profile = llm.invoke(profile_request)
    return raw_profile.strip()

In [108]:
# Summarize each resume with the LLM, attaching the result to its record.
for file in processed_resumes:
    file["summary"] = summarize_resume(file["text"])
    print(f"Done {file['file']}")

# Persist the summaries: a later cell reloads them from
# app/static/data/processed_resumes.json, and without this write a fresh
# kernel has no such file to load.
os.makedirs("app/static/data", exist_ok=True)
with open("app/static/data/processed_resumes.json", "w", encoding="utf-8") as f:
    json.dump(processed_resumes, f, ensure_ascii=False, indent=2)

Done Resume-Samples-1-36-10.pdf
Done Resume-Samples-1-36-11.pdf
Done Resume-Samples-1-36-12.pdf
Done Resume-Samples-1-36-13.pdf
Done Resume-Samples-1-36-14.pdf
Done Resume-Samples-1-36-15.pdf
Done Resume-Samples-1-36-16.pdf
Done Resume-Samples-1-36-17.pdf
Done Resume-Samples-1-36-18.pdf
Done Resume-Samples-1-36-19.pdf
Done Resume-Samples-1-36-2.pdf
Done Resume-Samples-1-36-20.pdf
Done Resume-Samples-1-36-21.pdf
Done Resume-Samples-1-36-22.pdf
Done Resume-Samples-1-36-23.pdf
Done Resume-Samples-1-36-24.pdf
Done Resume-Samples-1-36-25.pdf
Done Resume-Samples-1-36-26.pdf
Done Resume-Samples-1-36-27.pdf
Done Resume-Samples-1-36-28.pdf
Done Resume-Samples-1-36-29.pdf
Done Resume-Samples-1-36-3.pdf
Done Resume-Samples-1-36-30.pdf
Done Resume-Samples-1-36-31.pdf
Done Resume-Samples-1-36-32.pdf
Done Resume-Samples-1-36-33.pdf
Done Resume-Samples-1-36-34.pdf
Done Resume-Samples-1-36-35.pdf
Done Resume-Samples-1-36-36.pdf
Done Resume-Samples-1-36-4.pdf
Done Resume-Samples-1-36-5.pdf
Done Resume-

## Start point after getting and processing the data

I decided to use InMemoryVectorStore because this is a small project and the data does not need to be stored after execution.

In [8]:
# Reload the processed resumes from disk; the cells below read the "summary"
# and "file" keys of each record.
with open("app/static/data/processed_resumes.json", "r", encoding="utf-8") as f:
    processed_resumes = json.load(f)

In [9]:
# Embedding model handed to the vector store below.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# One Document per resume: the summary is the indexed text and the file name
# is kept in metadata["source"].
docs = [Document(page_content=res["summary"], metadata={"source": res["file"]}) for res in processed_resumes]

# In-memory index built from the documents; nothing is written to disk here.
vector_store = InMemoryVectorStore.from_documents(docs, embeddings)

Define the agent state shared between the workflow nodes, and the conversation memory used to give the LLM context.

In [10]:
class AgentState(BaseModel):
    # State object handed from node to node in the workflow. Every field is
    # optional and starts out as None.
    user_input: Optional[str] = Field(
        default=None,
        description="User's latest input message",
    )
    response: Optional[List[str]] = Field(
        default=None,
        description="Generated response to the user",
    )
    related: Optional[str] = Field(
        default=None,
        description="Indicates relation status of the current conversation",
    )
    retrieved_files: Optional[List[Tuple[str, str]]] = Field(
        default=None,
        description="List of retrieved files (source, page_content)",
    )
    last_action: Optional[str] = Field(
        default=None,
        description="Last operation performed",
    )

    def update_user_input(self, input_text: str):
        """Record the latest message from the user."""
        self.user_input = input_text

    def update_response(self, response_text: str):
        """Store the response; a value that is not a list is wrapped in one."""
        already_list = isinstance(response_text, list)
        self.response = response_text if already_list else [response_text]

    def update_retrieved_files(self, files: List[Tuple[str, str]]):
        """Replace the stored (source, page_content) pairs."""
        self.retrieved_files = files

    def store_relation(self, relation_status: str):
        """Record the relation status of the current conversation."""
        self.related = relation_status

    def update_last_action(self, action: str):
        """Record the name of the last operation performed."""
        self.last_action = action


In [11]:
# Module-level chat history shared by the nodes below: check_context reads it
# and create_answer appends each question/answer pair.
chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

  chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [12]:
def check_context(state: AgentState) -> AgentState:
    """Decide whether the latest message opens a new position or continues the previous one.

    Stores exactly ``"New"`` or ``"Old"`` in ``state.related`` (the two keys
    the conditional edge after this node routes on) and returns the state.
    """
    if not state.retrieved_files:
        # Nothing has been retrieved yet, so there is no earlier search a
        # follow-up could refer to, and the follow-up nodes all read
        # retrieved_files. Treat the message as a new position.
        state.store_relation("New")
        print(f"check_context done: {state}")
        return state

    memory = chat_memory.load_memory_variables({}).get("chat_history", [])
    
    prompt = f"""
        Based on the message and the memory, is this request related to the one before or are we talking about a new position?
        If the user is talking about a new position return New, otherwise Old.

        User: {state.user_input}

        Memory: {memory}

        Return ONLY New or Old.
    """

    answer = llm.invoke(prompt).strip()

    # The reply is free text but is used as a routing key, so reduce it to
    # one of the two known labels; anything that is not clearly "old"
    # (ignoring case, quotes and punctuation) runs a fresh search.
    cleaned = answer.strip(" \t\r\n\"'`*.").lower()
    related = "Old" if cleaned.startswith("old") else "New"

    state.store_relation(related)

    print(f"check_context done: {state}")

    return state

In [13]:
def first_router(state: AgentState):
    """Return the routing key for the first conditional edge: ``state.related``."""
    route_key = state.related
    print(f"first_router done: {state}")
    return route_key

In [14]:
def find_top_5_best_fit(state) -> AgentState:
    """Search the vector store for the five resumes closest to the job description.

    The user's message is first restructured by the LLM, then used as the
    similarity-search query. The matches are stored on the state as
    ``(source, page_content)`` pairs and ``last_action`` is set to
    ``"find_top_5_best_fit"``.
    """
    # Drop results left over from an earlier search before running a new one.
    state.update_retrieved_files([])

    job_prompt = f"""
        Structure the job description in a way that can be used to best match resumes descriptions which have the format below.

        - Information: Name, email, phone number
        - Candidate Summary: A brief professional summary highlighting key strengths, experience, and career focus.
        - Core Skills: List of technical and soft skills relevant to job applications.
        - Work Experience: Structured descriptions of previous roles, including job title, company name, dates of employment.
        - Education: Degree(s), university name, graduation year.
        - Certifications & Training: Any relevant certifications or training programs completed.
        - Industry Keywords: A list of relevant industry terms, skills, and technologies that enhance job-matching capabilities.

        Don't add any other random text, just the answer to my request.

        {state.user_input}
    """

    structured_job = llm.invoke(job_prompt)
    matches = vector_store.similarity_search(structured_job.strip(), k=5)

    source_and_text = []
    for match in matches:
        source_and_text.append((match.metadata["source"], match.page_content))

    state.update_retrieved_files(source_and_text)
    state.update_last_action("find_top_5_best_fit")

    print(f"find_top_5_best_fit last: {state}")

    return state

In [15]:
def retrieve_request(state: AgentState) -> AgentState:
    """Classify a follow-up message and store the chosen route in ``state.last_action``.

    The stored value is always one of ``"get_resume_by_filename"``,
    ``"get_details"`` or ``"candidates_description"``. Returns the state.
    """
    prompt = f"""
        Based on the request of the user, route my next step:

        - If the user is giving a job description, return "candidates_description".
        - If the user is asking for you to be more detailed, return "get_details"
        - If he is asking specifically to answer with the resume files, return "get_resume_by_filename"

        User Text: {state.user_input}

        Return ONLY get_resume_by_filename, get_details or candidates_description.
    """

    answer = llm.invoke(prompt).strip()

    # The reply is used as a routing key, so reduce it to one of the known
    # labels: the model may wrap the label in quotes or add extra words.
    known_routes = ("get_resume_by_filename", "get_details", "candidates_description")
    lowered = answer.lower()
    # An unrecognizable reply falls back to "get_details" rather than storing
    # a key no edge can route on.
    route = next((name for name in known_routes if name in lowered), "get_details")

    print(f"Retrieve request: {route}")

    state.update_last_action(route)

    return state

In [16]:
def get_details(state: AgentState) -> AgentState:
    """Select the retrieved candidates the user wants more detail about.

    The LLM is asked for the 1-based positions of the requested candidates;
    the matching resume texts from ``state.retrieved_files`` become the new
    ``state.response`` and ``last_action`` is set to ``"get_details"``.
    Positions that are not valid are ignored; if none is usable, every
    retrieved candidate is returned.
    """
    retrieved = state.retrieved_files or []

    prompt = f"""
        Understand from the user request, which candidates the user wants more information.
        If the user specifies that wants from the first 3 for example, return 1,2,3.
        If the user specify names, return the number they are in the order of the information candidates I will provide.

        User Text: 
        {state.user_input}

        Information candidates: 
        {state.response}

        Return ONLY the numbers, with comma as separator.
    """

    numbers = llm.invoke(prompt).strip()

    candidates_info = []
    seen_positions = set()
    for token in numbers.replace(" ", "").split(","):
        # Accept only plain ASCII integers: int() on free text raises, so
        # anything else in the reply is skipped.
        if not (token.isascii() and token.isdigit()):
            continue
        position = int(token)
        # Positions are 1-based; 0 would wrap around to the last candidate
        # and out-of-range values would raise IndexError.
        if 1 <= position <= len(retrieved) and position not in seen_positions:
            seen_positions.add(position)
            candidates_info.append(retrieved[position - 1][1])

    if not candidates_info:
        # No usable selection in the reply: fall back to all candidates.
        candidates_info = [content for _, content in retrieved]

    state.update_response(candidates_info)
    state.update_last_action("get_details")
    
    return state

In [17]:
def candidates_description(state: AgentState) -> AgentState:
    """Put the text of every retrieved resume into ``state.response``.

    Also sets ``last_action`` to ``"candidates_description"``.
    """
    resume_texts = []
    for _source, resume_text in state.retrieved_files:
        resume_texts.append(resume_text)

    state.update_last_action("candidates_description")
    state.update_response(resume_texts)

    return state

In [18]:
def get_resume_by_filename(state: AgentState) -> AgentState:
    """Put the source file name of every retrieved resume into ``state.response``.

    Also sets ``last_action`` to ``"get_resume_by_filename"``.
    """
    source_names = []
    for source_name, _content in state.retrieved_files:
        source_names.append(source_name)

    state.update_last_action("get_resume_by_filename")
    state.update_response(source_names)

    return state


In [19]:
def create_answer(state: AgentState):
    """Turn ``state.response`` into the final reply for the user.

    ``last_action`` selects the format: a short pitch per candidate for
    ``"candidates_description"``, a detailed profile for ``"get_details"``,
    and a plain comma-separated file list for anything else. The
    question/answer pair is saved to the chat memory, and the state is
    returned as a dict whose ``response`` is the reply text.
    """
    if state.last_action == "candidates_description":
        prompt = f"""
            Based on the results of the retrieve, write a text with the structure below for each candidate retrieved.

            Candidates: {state.response}

            Structure to follow:

            Here are the best matches based on your description:

            Name candidate | phone number | email
            Create a short and concise text why is a good fit.
            ...
        """
    elif state.last_action == "get_details":
        prompt = f"""
            Create a detailed and structured description of the candidates.
            Try to summarize a bit the text but keep some detail.

            Candidates: {state.response}

            Structure to follow:

            Name candidate
            Short summary
            - Paragraph about work experience
            - Paragraph about studies
            - Paragraph about skills
            - Another short paragraph with some information you think is relevant
            ...
        """
    else:
        # No LLM call for this branch: the response is listed as-is.
        prompt = None

    if prompt is None:
        result = f"Here are the files: {', '.join(state.response)}"
    else:
        result = llm.invoke(prompt).strip()

    state.update_last_action("create_answer")

    chat_memory.save_context({"question": state.user_input}, {"answer": result})

    return {
        "user_input": state.user_input,
        "response": result,
        "related": state.related,
        "retrieved_files": state.retrieved_files,
        "last_action": state.last_action,
    }

In [20]:
def second_router(state: AgentState):
    """Return the routing key for the second conditional edge: ``state.last_action``."""
    next_node = state.last_action
    return next_node

In [21]:
graph = StateGraph(AgentState)

# Each node is registered under the name of the function that implements it.
graph.add_node("check_context", check_context)
graph.add_node("retrieve_request", retrieve_request)
graph.add_node("candidates_description", candidates_description)
graph.add_node("get_details", get_details)
graph.add_node("find_top_5_best_fit", find_top_5_best_fit)
graph.add_node("get_resume_by_filename", get_resume_by_filename)
graph.add_node("create_answer", create_answer)


graph.set_entry_point("check_context")
# New position -> run a fresh search; follow-up -> classify the request.
graph.add_conditional_edges(
    "check_context", 
    first_router,
    {"New": "find_top_5_best_fit", "Old": "retrieve_request"}
)
graph.add_edge("find_top_5_best_fit", "candidates_description")
# retrieve_request is prompted to answer with one of three labels, so all
# three need a target here; "candidates_description" was missing and could
# not be routed.
graph.add_conditional_edges(
    "retrieve_request", 
    second_router,
    {
        "get_details": "get_details",
        "get_resume_by_filename": "get_resume_by_filename",
        "candidates_description": "candidates_description",
    }
)
graph.add_edge("candidates_description", "create_answer")
graph.add_edge("get_resume_by_filename", "create_answer")
graph.add_edge("get_details", "create_answer")
graph.set_finish_point("create_answer")

workflow = graph.compile()


# Debug: Print graph structure
print(workflow.get_graph().nodes)
print(workflow.get_graph().edges)

{'__start__': Node(id='__start__', name='__start__', data=<class '__main__.AgentState'>, metadata=None), 'check_context': Node(id='check_context', name='check_context', data=check_context(tags=None, recurse=True, explode_args=False, func_accepts_config=False, func_accepts={}), metadata=None), 'retrieve_request': Node(id='retrieve_request', name='retrieve_request', data=retrieve_request(tags=None, recurse=True, explode_args=False, func_accepts_config=False, func_accepts={}), metadata=None), 'candidates_description': Node(id='candidates_description', name='candidates_description', data=candidates_description(tags=None, recurse=True, explode_args=False, func_accepts_config=False, func_accepts={}), metadata=None), 'get_details': Node(id='get_details', name='get_details', data=get_details(tags=None, recurse=True, explode_args=False, func_accepts_config=False, func_accepts={}), metadata=None), 'find_top_5_best_fit': Node(id='find_top_5_best_fit', name='find_top_5_best_fit', data=find_top_5_b

In [22]:
# %%
from IPython.display import Image, display

try:
    display(Image(workflow.get_graph().draw_mermaid_png()))
except Exception as error:
    # This requires some extra dependencies and is optional, so report why
    # the picture was skipped instead of failing or staying silent.
    print(f"Graph rendering skipped: {error}")

In [23]:
# Fresh state for the chat session; the chat loop below updates it every turn.
state = AgentState()

In [None]:
# Interactive chat: each message runs the workflow once, and the parts of the
# result that later turns rely on are copied back onto the session state.
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the chat like typing "exit".
        print("Exiting chat...")
        break

    if user_input.lower() == "exit":
        print("Exiting chat...")
        break

    if not user_input:
        # Nothing to answer; do not run the workflow on a blank message.
        continue

    state.update_user_input(user_input)

    invoke_workflow = workflow.invoke(state)

    state.update_retrieved_files(invoke_workflow['retrieved_files'])
    state.update_response(invoke_workflow['response'])
    state.update_last_action(invoke_workflow['last_action'])

    print(f"Bot: {invoke_workflow['response']}\n")