In [59]:
import asyncio
import operator
from typing_extensions import TypedDict
from typing import  Annotated, List, Optional, Literal
from pydantic import BaseModel, Field

from tavily import TavilyClient, AsyncTavilyClient

from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.runnables import RunnableConfig

from langgraph.constants import Send
from langgraph.graph import START, END, StateGraph
from langsmith import traceable

import os
from dataclasses import dataclass, field, fields
from typing import Any, Optional

from langchain_core.runnables import RunnableConfig
from typing_extensions import Annotated
from dataclasses import dataclass
import requests
import logging

from bs4 import BeautifulSoup
import aiohttp


import re

In [5]:
# Shared Azure OpenAI chat model used by the LLM calls in this notebook.
# The deployment name and API version are read from environment variables;
# no endpoint or key is passed here — presumably the client picks those up
# from the environment as well (TODO confirm).
# temperature=0 is set explicitly for every call made through this client.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    temperature=0,
)

In [56]:
# Structured-output schema for a single web search query.
class SearchQuery(BaseModel):
    # NOTE(review): the field is annotated `str` but its default is None, so a
    # query may be absent — confirm whether it should be required instead.
    search_query: str = Field(None, description="Query for web search.")

# Structured-output schema wrapping the list of SearchQuery items that the
# query-writer LLM call in candidate_research is asked to produce.
class Queries(BaseModel):
    # Required field (no default is given).
    queries: List[SearchQuery] = Field(
        description="List of search queries.",
    )

# Input state consumed by candidate_research; all four keys are read there.
class ResearchState(TypedDict):
    # Text of the job posting, interpolated into the query-writer prompt.
    job_description: str
    # Free-form profile text about the candidate (prompt context for both the
    # query writer and the LLM validator).
    candidate_context: str
    # Candidate's name, used for name matching when validating sources.
    candidate_full_name: str
    # How many search queries the LLM is asked to generate.
    number_of_queries: int


In [57]:
# System prompt for the query-writer LLM call. It is a str.format template with
# four placeholders: {candidate_full_name}, {candidate_context},
# {job_description} and {number_of_queries}.
report_planner_query_writer_instructions="""You are an expert at researching people online. Your goal is to find detailed information about a candidate for a job opportunity.

The candidate is:
{candidate_full_name}
{candidate_context}

The job they're being considered for is:
{job_description}

Generate {number_of_queries} search queries that will help gather comprehensive information about this candidate. 

Guidelines for creating effective person-focused queries:
1. Create simple, direct queries using key identifying information
2. Avoid complex queries with multiple keywords or technical terms
3. Focus on finding the candidate's digital presence
4. Include queries that might surface profiles, articles, or mentions from:
   - Professional organizations and news
   - University publications
   - Personal blogs
   - GitHub repositories

Make each query specific and focused on one aspect of the candidate's background."""

In [64]:
# Module-level async Tavily client shared by tavily_search_async.
# No API key is passed — presumably it is read from the environment by the
# client (TODO confirm).
tavily_async_client = AsyncTavilyClient()

@traceable
async def tavily_search_async(search_queries):
    """
    Performs concurrent web searches using the Tavily API.

    Args:
        search_queries: Iterable of query strings; one Tavily search is issued
            per entry (the caller in this notebook passes a list of `str`).

    Returns:
        list: The Tavily responses, one per query, in the same order as
        `search_queries`. An empty input yields an empty list.

    Note:
        Every search requests up to 10 results and asks Tavily to include the
        raw page content alongside each result.
    """
    # Build the coroutines first so they can all be awaited together.
    search_tasks = [
        tavily_async_client.search(
            query,
            max_results=10,
            include_raw_content=True
        )
        for query in search_queries
    ]

    # Execute all searches concurrently; gather preserves input order.
    search_docs = await asyncio.gather(*search_tasks)

    return search_docs

In [112]:
def clean_text(text: str) -> str:
    """Lower-case `text` and replace every character that is neither a word
    character nor whitespace with a single space."""
    return re.sub(r"[^\w\s]", " ", text.lower())

def heuristic_validator(
    content, title, candidate_full_name: str
) -> bool:
    """Cheap pre-filter deciding whether a search result mentions the candidate.

    Args:
        content: Snippet text of the search result (None is treated as empty).
        title: Title of the search result (None is treated as empty).
        candidate_full_name: Name to look for.

    Returns:
        bool: True when the whole cleaned name occurs in the cleaned text, or
        when every individual name part occurs as a separate word. False
        otherwise, including when the name has no usable parts.
    """
    # Collapse whitespace runs so line breaks or tabs between words do not
    # hide a match (the word check below relies on single-space separation).
    link_text = " ".join(clean_text(f"{content or ''} {title or ''}").split())
    name_parts = clean_text(candidate_full_name or "").split()

    # An empty or punctuation-only name cannot be matched; bail out instead of
    # dividing by zero below.
    if not name_parts:
        return False

    full_name = " ".join(name_parts)

    score = 0.0

    # Full-name hit is worth 1.0 on its own (substring match, kept lenient
    # because this is only a pre-filter).
    if full_name in link_text:
        score += 1.0

    # Each name part found as a whole word contributes proportionally, up to 0.5.
    link_words = set(link_text.split())
    name_part_matches = sum(1 for part in name_parts if part in link_words)
    score += (name_part_matches / len(name_parts)) * 0.5

    return score >= 0.5

# Structured-output schema returned by the LLM validator call.
class LLMValidatorOutput(BaseModel):
    # The model's verdict; llm_validator returns this value directly.
    is_valid: bool

async def llm_validator(
    content, title, raw_content, candidate_full_name: str, candidate_context: str
) -> bool:
    """Ask the LLM whether a search result is genuinely about the candidate.

    Args:
        content: Snippet text of the search result.
        title: Title of the search result.
        raw_content: Full page text, if available; None is sent as an empty
            string rather than the literal text "None".
        candidate_full_name: Name of the candidate being researched.
        candidate_context: Profile text used to disambiguate people who share
            the same name.

    Returns:
        bool: The `is_valid` field of the model's structured response.
    """
    prompt = """
You are a validator determining if a webpage's content is genuinely about a specific candidate.

Candidate Full Name: {candidate_full_name}
Candidate Profile:
{candidate_context}

Webpage Content:
Title: {title}
Content: {content}
Raw Content: {raw_content}

Use the following guidelines to validate if this webpage is about the candidate in question:
1. Name Match:
   - The webpage must explicitly mention the candidate's full name or a clear variation

2. Context Alignment:
   - Current or past employers mentioned in the candidate's profile
   - Educational institutions from the candidate's background
   - Job titles or roles from the candidate's experience
   - Projects or achievements mentioned in the candidate's profile
   - Time periods that align with the candidate's career history

3. Confidence Check:
   - Is there any conflicting information that suggests this might be about a different person?
   - Are there enough specific details to be confident this is about our candidate?
   - Could this content reasonably apply to someone else with the same name?

While you should be very careful in your evaluation, we don't want to reject a valid source. As long as you have reasonable confidence that this is about the candidate in question, you should return True.
    """

    system_message = SystemMessage(
        content=prompt.format(
            candidate_full_name=candidate_full_name,
            candidate_context=candidate_context,
            title=title,
            content=content,
            # Search results can carry raw_content=None; keep the prompt clean.
            raw_content=raw_content or "",
        )
    )
    human_message = HumanMessage(content="Validate if this webpage is about the candidate in question.")

    structured_llm = llm.with_structured_output(LLMValidatorOutput)
    # Use the async entry point: the blocking invoke() would stall the event
    # loop for the duration of the request inside this coroutine.
    output = await structured_llm.ainvoke([system_message, human_message])
    return output.is_valid


In [117]:
async def deduplicate_and_format_sources(search_response, max_tokens_per_source, candidate_full_name: str, candidate_context: str, include_raw_content: bool):
    """
    Flattens Tavily search responses, removes duplicate URLs, keeps only the
    sources that pass both validators, and formats the survivors as text.

    Args:
        search_response: Either:
            - A dict with a 'results' key containing a list of search results
            - A list whose items are such dicts, or iterables of result dicts
        max_tokens_per_source: Approximate token budget for each source's
            raw content (estimated at 4 characters per token). Only used when
            include_raw_content is True.
        candidate_full_name: Name passed to heuristic_validator and llm_validator.
        candidate_context: Candidate profile text passed to llm_validator.
        include_raw_content: Whether to append each source's (truncated)
            raw_content to the formatted string.

    Returns:
        str: Formatted string with the deduplicated, validated sources.

    Raises:
        ValueError: If search_response is neither a dict nor a list.
    """
    # Convert input to a flat list of results
    if isinstance(search_response, dict):
        sources_list = search_response['results']
    elif isinstance(search_response, list):
        sources_list = []
        for response in search_response:
            if isinstance(response, dict) and 'results' in response:
                sources_list.extend(response['results'])
            else:
                sources_list.extend(response)
    else:
        raise ValueError("Input must be either a dict with 'results' or a list of search results")

    # Deduplicate by URL, keeping the first occurrence of each
    unique_sources = {}
    for source in sources_list:
        unique_sources.setdefault(source['url'], source)

    # Validate sources: cheap name heuristic first, LLM check only for survivors
    valid_sources = {}
    for url, source in unique_sources.items():
        if not heuristic_validator(source['content'], source['title'], candidate_full_name):
            continue
        # .get(): a result without a raw_content key must not raise here,
        # consistent with how the formatting step below reads the field.
        raw_for_validation = source.get('raw_content')
        if await llm_validator(source['content'], source['title'], raw_for_validation, candidate_full_name, candidate_context):
            valid_sources[url] = source

    print(f"Unique sources: {len(unique_sources)}")
    print(f"Valid URLs: {len(valid_sources)}")

    # Format output; pieces are collected and joined once at the end
    parts = ["Sources:\n\n"]
    for source in valid_sources.values():
        parts.append(f"Source {source['title']}:\n===\n")
        parts.append(f"URL: {source['url']}\n===\n")
        parts.append(f"Most relevant content from source: {source['content']}\n===\n")
        if include_raw_content:
            # Using rough estimate of 4 characters per token
            char_limit = max_tokens_per_source * 4
            # Handle None raw_content
            raw_content = source.get('raw_content', '')
            if raw_content is None:
                raw_content = ''
                print(f"Warning: No raw_content found for source {source['url']}")
            if len(raw_content) > char_limit:
                raw_content = raw_content[:char_limit] + "... [truncated]"
            parts.append(f"Full source content limited to {max_tokens_per_source} tokens: {raw_content}\n\n")

    return "".join(parts).strip()

async def candidate_research(state: ResearchState):
    """Generate search queries for a candidate, run them, and return the
    validated sources as formatted text.

    Args:
        state: Mapping with the keys "job_description", "candidate_context",
            "candidate_full_name" and "number_of_queries".

    Returns:
        str: Output of deduplicate_and_format_sources for the search results
        (raw page content is not included; the token budget passed is 1000).
    """
    candidate_full_name = state["candidate_full_name"]
    candidate_context = state["candidate_context"]

    system_instructions_query = report_planner_query_writer_instructions.format(
        job_description=state["job_description"],
        candidate_full_name=candidate_full_name,
        candidate_context=candidate_context,
        number_of_queries=state["number_of_queries"],
    )

    structured_llm = llm.with_structured_output(Queries)
    # Async call: the blocking invoke() would stall the event loop while the
    # model responds.
    results = await structured_llm.ainvoke(
        [
            SystemMessage(content=system_instructions_query),
            HumanMessage(content="Generate search queries."),
        ]
    )

    # SearchQuery.search_query defaults to None, so skip empty entries rather
    # than forwarding them to the search API.
    query_list = [query.search_query for query in results.queries if query.search_query]

    search_docs = await tavily_search_async(query_list)

    source_str = await deduplicate_and_format_sources(
        search_docs,
        max_tokens_per_source=1000,
        candidate_full_name=candidate_full_name,
        candidate_context=candidate_context,
        include_raw_content=False,
    )

    return source_str

In [69]:
# Sample candidate name for the end-to-end run; passed to candidate_research
# as state["candidate_full_name"].
candidate_full_name = "Kevin Jin"

# Sample candidate profile text (experience and education) used as
# state["candidate_context"] in the end-to-end run. The text is passed to the
# prompts verbatim.
test_candidate = """
Software Engineer @ Pinterest

Pinterest

Vanderbilt University
San Francisco, California, United States

Software Engineer
Pinterest · Full-time
Jul 2024 - Present · 6 mos
San Francisco, California, United States
Analytics InfrastructureA
Apache Spark, Apache Flink and +3 skills

Software Engineer Intern
Apple · Internship
May 2023 - Aug 2023 · 4 mos
Cupertino, California, United States
Open Source Program Office (OSPO)
Natural Language Processing (NLP), Open-Source Software and +3 skills

Software Engineer Intern
Roblox · Internship
Mar 2023 - May 2023 · 3 mos
San Mateo, California, United States
Next.js, ASP.NET Core and +3 skills

Team Create Presence
Amazon logo
Software Engineer Intern
Amazon · Internship
Jun 2022 - Aug 2022 · 3 mos
Seattle, Washington, United States
Alexa Shopping Conversation Management
Infrastructure as code (IaC), Amazon Web Services (AWS) and +3 skills

Education
Vanderbilt University
Bachelor of Science - BS, Computer Science and Mathematics, Minor in Engineering Management (National Merit Scholar)
Aug 2020 - May 2024
Activities and societies: VandyHacks (Director of Content), ChangePlusPlus (Technical Lead), Vanderbilt Commodore Orchestra, Vanderbilt Club Swimming (National Team)Activities and societies: VandyHacks (Director of Content), ChangePlusPlus (Technical Lead), Vanderbilt Commodore Orchestra, Vanderbilt Club Swimming (National Team)
Teaching Assistant (CS/DS 1100, EECE 2123), Research Assistant (Institute for Software Integrated Systems)Teaching Assistant (CS/DS 1100, EECE 2123), Research Assistant (Institute for Software Integrated Systems)
"""

# Sample job posting text used as state["job_description"] in the end-to-end
# run. The text is passed to the query-writer prompt verbatim.
test_job = """
About the role
You will play a critical role in building our platform, driving the technical direction of our company. This is a hands-on position that will involve working on all things infrastructure, and collaborating with other engineers, designers, and founders. A great opportunity for someone to create a lasting impact not only on the primary product but also on the company's values and vision, given our early stage - we hope Pump would be that career defining opportunity for all of us!

Responsibilities:

Work closely with the rest of the team to design and implement our infrastructure.
Collaborate with other engineers and designers to build a user-friendly and intuitive platform that meets our customers' needs.
Help define and execute the technical roadmap for our platform, balancing technical debt and innovation to ensure we can deliver on our business goals.
Inventing, extending, and maintaining development processes, tools, and workflows.
Contribute to product roadmap
Qualifications:

BS or MS in Computer Science, Engineering, or a related field.
5+ years of experience as a DevOps engineer, with a focus on designing cloud native infrastructures.
Experience with AWS, GCP, and Azure, and knowledge of AWS, GCP, and Azure cost and billing is a plus.
Experience with CloudFormation, Terraform, etc.
Experience with Kubernetes, docker, and distributed systems.
Having AWS, GCP, and Azure certifications is a plus.
Strong communication skills and ability to collaborate effectively with cross-functional teams.
Experience with and enjoys creating prototypes and experimenting quickly.
"""



In [118]:
# NOTE(review): nest_asyncio is applied before the top-level await below —
# presumably to allow nested event-loop use inside the notebook kernel;
# confirm it is still required.
import nest_asyncio
nest_asyncio.apply()

# Run the research pipeline end to end on the sample candidate and job, asking
# for 5 search queries, then print the formatted sources.
result = await candidate_research({"job_description": test_job, "candidate_context": test_candidate, "number_of_queries": 5, "candidate_full_name": candidate_full_name})
print(result)

Unique sources: 45
Valid URLs: 10
Sources:

Source Kevin Jin - Medium:
===
URL: https://medium.com/@kevin-jin
===
Most relevant content from source: Kevin Jin – Medium Write Kevin Jin From Freelance To FAANG: How My Resume Evolved Through College -------------------------------------------------------------- ### Sharing my journey to help inspire yours. A Complete Guide To Writing Tests in Coding Interviews ------------------------------------------------------ ### Building a strategy for writing tests that cover all bases. A Complete Student Guide to Software Engineering Behavioral Interviews ---------------------------------------------------------------------- ### During my last 2 years of college, I’ve done behavioral-specific interviews for software engineering (SWE) roles with over 30 companies of… Why Every Computer Science Student Should Read “Clean Code” ----------------------------------------------------------- ### Writing clean code is the most important thing schools don’t