### load the data

In [2]:
import os
from json import load
# Project root = parent of the directory the notebook is executed from.
# os.path.dirname uses the platform's separator, unlike splitting on a hard-coded "/".
root = os.path.dirname(os.getcwd())
experience_path = os.path.join(root, "data", "processed", "candidates_experience.json")
# Explicit UTF-8: the experience text contains non-ASCII characters (e.g. "Bogotá", "–")
# that a platform-default codec could mis-decode.
with open(experience_path, mode="r", encoding="utf-8") as experience_file:
    experience_dict = load(experience_file)
# Bare last expression: display the top-level keys of the loaded dictionary.
experience_dict.keys()

dict_keys(['c69f2087775fc760', '1', 'b5142b9b9676a4f9', 'c5f45dfe145427e1'])

In [3]:
# Pick one entry of experience_dict (key '1') to use as the worked example in the cells below.
sample_experience = experience_dict['1']
# Bare last expression so the notebook displays the selected entry.
sample_experience

{'experience': [{'job_title': 'Data Analyst Assistant',
   'company_name': 'UTS, Sydney, Australia',
   'years_of_experience': 'Sep 2025 – Current',
   'responsibilities': ['Redesigned and deployed robust pipelines for UTS Alumni, Donors, Student Data from NXT CRM into a data warehouse using on premise SQL server.',
    'Used Airflow for pipeline orchestration.']},
  {'job_title': 'Data Engineer III',
   'company_name': 'Wizeline - Nike, Bogotá, CO',
   'years_of_experience': 'Dec 2024 – Jun 2025',
   'responsibilities': ['Oversaw a team of three Level 2 data engineers within a 14-member team, providing technical mentorship.',
    'Orchestrated the migration of core Airflow DAGs to Databricks Workflows.',
    'Engineered and maintained seven core data pipelines ingesting global data for locations, employees, leasing, building access, and IoT sensor-based headcounts.',
    'Structured a medallion data model with Delta Lake and enforced governance with Unity Catalog to facilitate self-se

In [7]:
from functions import text_extraction
# Location of the job posting text under the project root.
job_description_path = os.path.join(root, "data", "raw", "job", "data_role_des.txt")
# text_extraction is imported from the local `functions` module; its result is later
# passed to the prompts as the {job_description} variable.
job_description = text_extraction(job_description_path)
print(job_description)

Graduate Consultant - Data & Analytics
Are you a recent graduate with a passion for data and a desire to build a career in analytics and business transformation? Do you enjoy solving complex problems and delivering innovative, data-driven solutions that empower businesses? If so, Synogize is excited to connect with you!
 
We are seeking a motivated Graduate Consultant to join our growing team of Data & Analytics professionals in Melbourne. This is an exciting opportunity to launch your career in a dynamic field, working alongside experienced consultants on impactful projects. Applicants must have full working rights in Australia.
 
About You
You are someone who:
Is eager to collaborate with clients to develop tailored data and analytics solutions.
Enjoys problem-solving and continuously seeks to optimize the use of data.
Has a strong interest in understanding business challenges and translating them into actionable insights.
Is seeking a role that supports both personal growth and care

### what should we look for in the answers?

In [8]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from functions import format_experience_for_prompt
from pydantic import BaseModel, Field, field_validator
from typing import List

# Load variables from a .env file into the process environment before the client is built
# (presumably this is where the OpenAI credentials come from — confirm).
load_dotenv()

# Chat model used by the question-generation chain defined later in this cell.
llm = ChatOpenAI(model="gpt-4o-2024-08-06")

class InterviewQuestions(BaseModel):
    """generation of interview questions"""
    # NOTE: the docstring above and the Field descriptions below are kept verbatim on
    # purpose — pydantic copies them into the model's JSON schema, so they are runtime
    # text rather than plain documentation.
    experience_questions : List[str] = Field(
        description = "List of questions aimed to clarify or expand upon the candidate's professional experience"
    )
    situational_questions : List[str] = Field(
        description = "List of questions aimed to understand the candidate's behavior, problem-solving abilities, and soft skills in relation to the target role"
    )

    @field_validator("experience_questions", "situational_questions")
    @classmethod
    def check_questions(cls, v: List[str]) -> List[str]:
        # Applied to both question lists: each one is a list of strings (the previous
        # List[int] annotation contradicted the field types).
        # Reject an empty list so a result never carries a section without questions.
        if not v:
            raise ValueError("The list should contain at least one value")
        return v

# System role: frames the model as a recruiter producing interview questions.
interview_system_message = (
    "You are an expert recruiter. Your task is to analyze a candidate's experience and job description to "
    "generate a set of interview questions. Your output must be a list of questions."
)

# Human role: describes the two question groups and carries the two prompt variables,
# {job_description} and {candidate_experience}.
interview_human_message = "".join(
    [
        "Please provide a JSON object with the following keys.\n",
        "- **Experience questions**: <List of 4 questions aimed to clarify or expand upon the candidate's professional experience, ",
        "skills, and achievements. These questions should help assess the candidate's expertise and past job performance.>\n",
        "- **Situational questions**: <List of 4 questions aimed to understand the candidate's behavior, problem-solving abilities, ",
        "and soft skills in relation to the target role. These questions should explore how the candidate handles specific situations and challenges.>\n",
        "\n\n**Job Description:**\n{job_description}",
        "\n\n**Candidate Experience:**\n{candidate_experience}",
    ]
)

# (role, text) pairs in the order they are sent to the chat model.
interview_questions_template = [
    ("system", interview_system_message),
    ("human", interview_human_message),
]
# Turn the (role, text) pairs into a chat prompt with the {job_description} and
# {candidate_experience} variables.
interview_questions_prompt = ChatPromptTemplate.from_messages(interview_questions_template)

# Ask the model for output shaped like the InterviewQuestions schema instead of free text.
llm_constrained = llm.with_structured_output(InterviewQuestions)

# Pipeline: prompt -> schema-constrained model.
questions_chain = interview_questions_prompt | llm_constrained

# Values for the two prompt variables; the candidate entry is rendered to text by the
# project helper format_experience_for_prompt.
question_inputs = {
    "job_description": job_description,
    "candidate_experience": format_experience_for_prompt(sample_experience),
}
sample_questions = questions_chain.invoke(question_inputs)
# Bare last expression: display the structured result.
sample_questions

InterviewQuestions(experience_questions=['Can you describe the process and outcomes of migrating on-premises solutions to cloud environments, like the strategic migration to Azure you led at SII Colombia?', 'What specific challenges did you encounter while managing data governance with Unity Catalog for Wizeline, and how did you address them?', 'Tell us more about your experience with tools like Airflow for pipeline orchestration. How did you utilize these tools in your roles at UTS or Wizeline?', "Could you elaborate on your experience with data transformation tools and how you've applied SQL-based transformations in your previous roles?"], situational_questions=['Imagine you are working on a project where the data sources are inconsistent and lack standardization. How would you tackle this issue to ensure reliable data analytics solutions for the client?', 'A client is not satisfied with the current insights generated from their data analytics setup. How would you approach understand

In [13]:
from typing import List
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI  # or your LLM client

# 1) Define a clean schema
# One interview question paired with guidance on how to judge the answer.
# The Field descriptions are runtime strings, so they are left exactly as written.
class QAItem(BaseModel):
    question: str = Field(description="Interview question")
    look_for: str = Field(description="What the recruiter should look for in the candidate's answer")

# Structured output for the second prompt: two lists of QAItem (experience and technical).
# The "4 questions" wording lives only in the descriptions; nothing in this class
# enforces a list length.
# NOTE(review): unlike InterviewQuestions, this model has no validator rejecting empty
# lists — confirm that is intended.
class InterviewQuestionsV2(BaseModel):
    experience_questions: List[QAItem] = Field(
        description="4 questions to expand/clarify candidate experience, with what to look for"
    )
    technical_questions: List[QAItem] = Field(
        description="4 questions to assess job-relevant tools/skills, with what to look for"
    )

# 2) Build prompt
# System role: recruiter persona that must return questions plus evaluation guidance.
system_message = (
    "You are an expert recruiter. Analyze the candidate's experience and job description to "
    "produce interview questions and what to look for in responses."
)

# Human role: the request itself, followed by the two prompt variables.
human_message = "".join(
    [
        "Generate 4 experience questions and 4 technical questions. Each must include what to look for.\n",
        "**Job Description:**\n{job_description}\n",
        "**Candidate Experience:**\n{candidate_experience}",
    ]
)

# (role, text) pairs in sending order.
template = [
    ("system", system_message),
    ("human", human_message),
]
prompt = ChatPromptTemplate.from_messages(template)

# 3) Constrain output to the schema
# NOTE(review): `llm`, `llm_constrained` and (below) `sample_questions` are also assigned
# by the earlier question-generation cell; running this cell rebinds them, so what they
# hold afterwards depends on which cell ran last.
llm = ChatOpenAI(model="gpt-4o-mini")  # pick your model
llm_constrained = llm.with_structured_output(InterviewQuestionsV2)

# Pipeline: prompt -> schema-constrained model.
chain = prompt | llm_constrained

# 4) Invoke
chain_inputs = {
    "job_description": job_description,
    "candidate_experience": format_experience_for_prompt(sample_experience),
}
sample_questions = chain.invoke(chain_inputs)


In [14]:
# Bare expression: display whatever `sample_questions` currently holds
# (the captured output below is an InterviewQuestionsV2 instance).
sample_questions

InterviewQuestionsV2(experience_questions=[QAItem(question='Can you describe your experience redesigning and deploying robust data pipelines in your current role? What challenges did you face, and how did you overcome them?', look_for='Look for specific examples of problem-solving, innovative approaches, and the ability to adapt to obstacles. The candidate should highlight both technical skills and teamwork.'), QAItem(question='In your role as a Data Engineer III, how did you communicate technical needs to non-technical stakeholders? Can you provide an example of a project where effective communication made a difference?', look_for="Assess the ability to simplify complex concepts and gauge the candidate's interpersonal skills. Look for examples that illustrate successful outcomes stemming from clear communication."), QAItem(question="Tell me about a time you worked in a team-oriented environment. What role did you play, and how did your contributions affect the project's success?", loo

In [16]:
# Visual separator printed after every question.
divider = "-" * 20
for item in sample_questions.technical_questions:
    # A single print with sep="\n" emits the same three lines as three separate prints.
    print(
        f"question: {item.question}",
        f"what to look for: {item.look_for}",
        divider,
        sep="\n",
    )

question: Can you explain the ETL process and your experience with tools like Python, SQL, and Azure Data Factory in carrying out ETL tasks?
what to look for: Look for a clear understanding of the ETL process, familiarity with the mentioned tools, and practical experiences that demonstrate competence in executing ETL workflows.
--------------------
question: Describe your experience with data transformation tools such as dbt or Matillion. Can you provide an example of how you utilized these tools effectively in a project?
what to look for: Assess familiarity with specific tools, depth of experience in data transformation processes, and practical challenges navigated during project execution.
--------------------
question: What data visualization tools have you used, and how have you applied them to present data insights to stakeholders? Can you give an example of a dashboard or report you've created?
what to look for: Evaluate experience and comfort level with visualization tools. Look

### candidates tournament

In [5]:
import random
from math import floor, ceil
# Simulate a batch tournament for a fixed number of rounds: shuffle the pool, cut it
# into batches, keep a random subset of each batch, repeat with the survivors.
random.seed(2000)
# parameters
population = 28
batch_size = 5
selected_per_batch = 2
num_batches = ceil(population / batch_size)
num_rounds = 4
# example
candidates = list(range(1, population + 1))
# controlling the rounds
candidates_copy = candidates.copy()
# `round_number` instead of `round`, so the builtin round() is not shadowed for later cells.
for round_number in range(1, num_rounds + 1):
    print(f"round: {round_number}")
    # random ids
    random.shuffle(candidates_copy)
    print(f"initial candidates: {len(candidates_copy)}")
    print("*" * 20)
    # batches: stop at len(pool), not len(pool) + 1, otherwise a pool whose size is an
    # exact multiple of batch_size gets a spurious empty trailing batch.
    batches = [
        candidates_copy[start : start + batch_size]
        for start in range(0, len(candidates_copy), batch_size)
    ]
    # selection of the candidates per batch
    round_winners = []
    for b in batches:
        print(f"for the batch: {b}")
        # a short last batch may hold fewer members than selected_per_batch
        winners = random.sample(b, k = min(selected_per_batch, len(b)))
        print(f"the winners were: {winners}")
        print("-"*20)
        round_winners.extend(winners)
    # survivors of this round form the pool of the next one
    candidates_copy = round_winners

round: 1
initial candidates: 28
********************
for the batch: [17, 14, 3, 27, 21]
the winners were: [17, 14]
--------------------
for the batch: [28, 10, 26, 8, 12]
the winners were: [12, 28]
--------------------
for the batch: [4, 1, 18, 24, 22]
the winners were: [18, 24]
--------------------
for the batch: [11, 20, 7, 13, 5]
the winners were: [13, 11]
--------------------
for the batch: [19, 6, 23, 9, 16]
the winners were: [23, 6]
--------------------
for the batch: [25, 2, 15]
the winners were: [25, 2]
--------------------
round: 2
initial candidates: 12
********************
for the batch: [14, 2, 28, 13, 12]
the winners were: [14, 12]
--------------------
for the batch: [24, 23, 18, 25, 17]
the winners were: [17, 25]
--------------------
for the batch: [6, 11]
the winners were: [6, 11]
--------------------
round: 3
initial candidates: 6
********************
for the batch: [11, 17, 25, 14, 6]
the winners were: [17, 6]
--------------------
for the batch: [12]
the winners were: 

In [9]:
import random
from math import floor, ceil
# Simulate the batch tournament until the pool is down to selected_per_batch + 1
# candidates or fewer, printing every batch and its winners.
random.seed(2000)
# parameters
population = 28
batch_size = 5
selected_per_batch = 2
num_batches = ceil(population / batch_size)
# If every batch kept all of its members the pool would never shrink and the loop
# below would not terminate, so reject such parameters up front.
if not 0 < selected_per_batch < batch_size:
    raise ValueError("selected_per_batch must be positive and smaller than batch_size")
# example
candidates = list(range(1, population + 1))
# controlling the rounds
candidates_copy = candidates.copy()
# Completed rounds so far; named `rounds_played` so the builtin round() is not shadowed.
rounds_played = 0
while len(candidates_copy) > selected_per_batch + 1:
    print(f"round: {rounds_played + 1}")
    # random ids
    random.shuffle(candidates_copy)
    print(f"initial candidates: {len(candidates_copy)}")
    print("*" * 20)
    # batches: stop at len(pool), not len(pool) + 1, otherwise a pool whose size is an
    # exact multiple of batch_size gets a spurious empty trailing batch.
    batches = [
        candidates_copy[start : start + batch_size]
        for start in range(0, len(candidates_copy), batch_size)
    ]
    # selection of the candidates per batch
    round_winners = []
    for b in batches:
        print(f"for the batch: {b}")
        # a short last batch may hold fewer members than selected_per_batch
        winners = random.sample(b, k = min(selected_per_batch, len(b)))
        print(f"the winners were: {winners}")
        print("-"*20)
        round_winners.extend(winners)
    print(f"the remaining group represents: {len(round_winners)/population:.2%}")
    # survivors of this round form the pool of the next one
    candidates_copy = round_winners
    rounds_played += 1

round: 1
initial candidates: 28
********************
for the batch: [17, 14, 3, 27, 21]
the winners were: [17, 14]
--------------------
for the batch: [28, 10, 26, 8, 12]
the winners were: [12, 28]
--------------------
for the batch: [4, 1, 18, 24, 22]
the winners were: [18, 24]
--------------------
for the batch: [11, 20, 7, 13, 5]
the winners were: [13, 11]
--------------------
for the batch: [19, 6, 23, 9, 16]
the winners were: [23, 6]
--------------------
for the batch: [25, 2, 15]
the winners were: [25, 2]
--------------------
the remaining group represents: 42.86%
round: 2
initial candidates: 12
********************
for the batch: [14, 2, 28, 13, 12]
the winners were: [14, 12]
--------------------
for the batch: [24, 23, 18, 25, 17]
the winners were: [17, 25]
--------------------
for the batch: [6, 11]
the winners were: [6, 11]
--------------------
the remaining group represents: 21.43%
round: 3
initial candidates: 6
********************
for the batch: [11, 17, 25, 14, 6]
the win

In [17]:
import random
from math import floor, ceil
# Simulate the batch tournament until at most selected_per_batch candidates remain,
# printing one summary line per round.
random.seed(2000)
# parameters
population = 28
batch_size = 5
selected_per_batch = 2
num_batches = ceil(population / batch_size)
# If every batch kept all of its members the pool would never shrink and the loop
# below would not terminate, so reject such parameters up front.
if not 0 < selected_per_batch < batch_size:
    raise ValueError("selected_per_batch must be positive and smaller than batch_size")
# example
candidates = list(range(1, population + 1))
# controlling the rounds
candidates_copy = candidates.copy()
# Completed rounds so far; named `rounds_played` so the builtin round() is not shadowed.
rounds_played = 0
print("tournament simulation:")
while len(candidates_copy) > selected_per_batch:
    # random ids
    random.shuffle(candidates_copy)
    # batches: stop at len(pool), not len(pool) + 1, otherwise a pool whose size is an
    # exact multiple of batch_size gets a spurious empty trailing batch.
    batches = [
        candidates_copy[start : start + batch_size]
        for start in range(0, len(candidates_copy), batch_size)
    ]
    # selection of the candidates per batch
    round_winners = []
    for b in batches:
        # a short last batch may hold fewer members than selected_per_batch
        round_winners.extend(random.sample(b, k = min(selected_per_batch, len(b))))
    print(f"round {rounds_played + 1}. Initial candidates: {len(candidates_copy)} -> Final candidates: {len(round_winners)} ({len(round_winners)/population:.2%} of population)")
    # survivors of this round form the pool of the next one
    candidates_copy = round_winners
    rounds_played += 1


tournament simulation:
round 1. Initial candidates: 28 -> Final candidates: 12 (42.86% of population)
round 2. Initial candidates: 12 -> Final candidates: 6 (21.43% of population)
round 3. Initial candidates: 6 -> Final candidates: 3 (10.71% of population)
round 4. Initial candidates: 3 -> Final candidates: 2 (7.14% of population)


In [20]:
# Re-execute the local `functions` module so edits made to it after the kernel started
# are picked up without a restart. Names previously bound with `from functions import ...`
# keep pointing at the old objects until they are imported again.
from importlib import reload
import functions
reload(functions)

<module 'functions' from '/Users/santiagocardenas/Documents/MDSI/202502/internship/internship_project/notebooks/functions.py'>

In [25]:
from functions import candidate_simulation
def _ask_positive_int(prompt: str) -> int:
    """Keep asking `prompt` until the user types a whole number greater than zero.

    Args:
        prompt: text passed to input().

    Returns:
        The parsed integer, always >= 1.
    """
    while True:
        raw_answer = input(prompt)
        try:
            value = int(raw_answer)
        except ValueError:
            # A typo should re-prompt instead of aborting the whole cell.
            print(f"'{raw_answer}' is not a whole number, please try again.")
            continue
        if value > 0:
            return value
        print("the value must be greater than zero, please try again.")


# Interactive tuning: show the simulation for the current parameters and repeat with
# new values until the user approves them.
population = 28
batch_size = 5
selected_per_batch = 2
parameters_approval = False
while not parameters_approval:
    print(f"the candidate parameters are:")
    print(f"-batch size: {batch_size}")
    print(f"-selection per batch: {selected_per_batch}")
    print("-"*20)
    candidate_simulation(
        population = population,
        batch_size = batch_size,
        selected_per_batch = selected_per_batch
    )
    parameters_feedback = input("Do you approve this parameters?")
    # strip() so an answer such as "yes " is still recognised
    if parameters_feedback.strip().lower() in ['yes', 'y']:
        # only defined once the parameters have been approved
        suggested_rounds = _ask_positive_int("how many rounds to you wish to run?")
        parameters_approval = True
    else:
        print("what parameters do you wish to run?")
        batch_size = _ask_positive_int("What batch size do you want to use?")
        selected_per_batch = _ask_positive_int("How many candidates do you want to select per batch?")
        # A batch that keeps all of its members eliminates nobody.
        # NOTE(review): candidate_simulation presumably loops until the pool shrinks,
        # so such values could make it run forever — confirm against functions.py.
        while selected_per_batch >= batch_size:
            print(f"the selection per batch must be smaller than the batch size ({batch_size}).")
            selected_per_batch = _ask_positive_int("How many candidates do you want to select per batch?")

the candidate parameters are:
-batch size: 5
-selection per batch: 2
--------------------
tournament simulation:
round 1. Initial candidates: 28 -> Final candidates: 12 (42.86% of population)
round 2. Initial candidates: 12 -> Final candidates: 6 (21.43% of population)
round 3. Initial candidates: 6 -> Final candidates: 3 (10.71% of population)
round 4. Initial candidates: 3 -> Final candidates: 2 (7.14% of population)
what parameters do you wish to run?
the candidate parameters are:
-batch size: 4
-selection per batch: 2
--------------------
tournament simulation:
round 1. Initial candidates: 28 -> Final candidates: 14 (50.00% of population)
round 2. Initial candidates: 14 -> Final candidates: 8 (28.57% of population)
round 3. Initial candidates: 8 -> Final candidates: 4 (14.29% of population)
round 4. Initial candidates: 4 -> Final candidates: 2 (7.14% of population)
