# Treatment Generation Script - Notebook Version

This notebook converts the `treatment_generation.py` script into modular, testable code blocks. Each utility function and main workflow step is separated for clarity and easier debugging.

You can run and test each section independently.

## 1. Imports and Environment Setup

This section sets up the environment and imports all required libraries.

In [1]:
# Add parent and libs directory to sys.path for imports
import sys
import os
sys.path.append('..')
sys.path.append('../libs')
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from libs.mongodb import get_all_file_ids, _get_mongo_client, get_document_by_fileid, _clean_raw_llm_response
from libs.gemini_processor import GeminiProcessor
from utils import get_logger
import json
import datetime
import random
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

NameError: name '__file__' is not defined

## 2. Configuration and Constants

Set up sector, MongoDB, Gemini model, and file paths.

In [None]:
# Logger
logger = get_logger(__name__)

# Sector we are processing
SECTOR = "ITC"

# MongoDB configuration
MONGO_CLIENT = _get_mongo_client()
DB_NAME = "Resume_study"
SOURCE_COLLECTION_NAME = "Standardized_resume_data"
TARGET_COLLECTION_NAME = "Treated_resumes"

# Gemini model configuration
GEMINI_MODEL_NAME = "gemini-2.5-flash"
GEMINI_TEMPERATURE = 0.7
ENABLE_GOOGLE_SEARCH = False
PROMPT_TEMPLATE_PATH = "Phase 2 Workflow/Prompts/prompt_treatment_generation.md"
MAX_RETRIES = 2
STYLE_MODIFIERS = [
    "using strong, action-oriented verbs and focusing on quantifiable outcomes",
    "using a direct, concise, and professional tone, prioritizing clarity and brevity",
    "by emphasizing collaborative efforts and cross-functional teamwork",
    "by highlighting strategic thinking and the business impact of the work",
    "by describing the technical aspects of the work with more precision and detail",
    "by framing the accomplishments as a narrative of challenges, actions, and results"
]

# Treatment file paths
TREATMENT_CEC_FILE = "Phase 2 Workflow/Education_treatment_dummy.xlsx"
TREATMENT_CWE_FILE = "Phase 2 Workflow/WE_treatment_dummy.xlsx"

## 3. Load Treatment Data and Similarity Model

This section loads the treatment data from Excel files and initializes the sentence transformer model for cosine similarity calculations.

In [None]:
# Load treatment data from Excel files in pandas DataFrames
cec_treatment_df = pd.read_excel(TREATMENT_CEC_FILE)
cec_treatment_df = cec_treatment_df[cec_treatment_df['sector'] == SECTOR].reset_index(drop=True)
cwe_treatment_df = pd.read_excel(TREATMENT_CWE_FILE)
cwe_treatment_df = cwe_treatment_df[cwe_treatment_df['sector'] == SECTOR].reset_index(drop=True)

# Load the model once to be reused in the loop for cosine similarity calculations
SIMILARITY_MODEL = SentenceTransformer(
    r'Phase 2 Workflow/models/all-MiniLM-L6-v2'
 )

## 4. Utility Functions

The following cells define utility functions used throughout the workflow.

In [None]:
def calculate_focused_similarity(control_resume_data, treated_resume_data):
    control_text = extract_rephrased_text(control_resume_data)
    treated_text = extract_rephrased_text(treated_resume_data)
    
    if not control_text or not treated_text:
        return 0.0

    control_embedding = SIMILARITY_MODEL.encode(control_text)
    treated_embedding = SIMILARITY_MODEL.encode(treated_text)
    
    score = cosine_similarity([control_embedding], [treated_embedding])[0][0]
    return score

In [None]:
def select_and_prepare_treatments(
    cec_treatment_df: pd.DataFrame,
    cwe_treatment_df: pd.DataFrame,
    source_resume_data: dict,
    treatment_prompt_template: str,
    style_modifiers: list[str]
    ):
    """
    Selects random treatments, assigns unique style modifiers, and prepares prompts
    for the 3 treated resume variations (Type I, Type II, Type III).

    The 'Control' version is the original source resume and is handled separately.
    """
    if cec_treatment_df.empty or cwe_treatment_df.empty:
        logger.error("No treatments available for CEC or CWE.")
        return None

    try:
        cec_treatments = cec_treatment_df.sample(n=2, replace=False).to_dict('records')
        cwe_treatments = cwe_treatment_df.sample(n=2, replace=False).to_dict('records')
    except ValueError:
        logger.error("Not enough unique treatments available in the dataframes to sample.")
        return None

    if len(style_modifiers) < 3:
        logger.error("Not enough style modifiers to ensure unique styles for all treatments.")
        return None
    shuffled_styles = random.sample(style_modifiers, 3)

    treatment_prompts = {}
    base_prompt = treatment_prompt_template.replace(
        "{JSON_resume_object}", json.dumps(source_resume_data, indent=2)
    )

    # Type I (CEC)
    cec_treatment_idx = random.randint(0, 1)
    cec_treatment = cec_treatments[cec_treatment_idx]
    type_i_prompt = base_prompt.replace("{Treatment_object}", str(cec_treatment))
    type_i_style_guide = shuffled_styles.pop()
    type_i_prompt = type_i_prompt.replace("{style_guide}", type_i_style_guide)
    treatment_prompts["Type_I"] = {
        "prompt": type_i_prompt,
        "style_guide": type_i_style_guide,
        "treatment_applied": {"Canadian_Education": cec_treatment}
    }
    cec_treatment_idx = 1 - cec_treatment_idx

    # Type II (CWE)
    cwe_treatment_idx = random.randint(0, 1)
    cwe_treatment = cwe_treatments[cwe_treatment_idx]
    type_ii_prompt = base_prompt.replace("{Treatment_object}", str(cwe_treatment))
    type_ii_style_guide = shuffled_styles.pop()
    type_ii_prompt = type_ii_prompt.replace("{style_guide}", type_ii_style_guide)
    treatment_prompts["Type_II"] = {
        "prompt": type_ii_prompt,
        "style_guide": type_ii_style_guide,
        "treatment_applied": {"Canadian_Work_Experience": cwe_treatment}
    }
    cwe_treatment_idx = 1 - cwe_treatment_idx

    # Type III (CEC + CWE)
    mixed_treatment_payload = {
        "task": "ADD_EDUCATION_AND_EXPERIENCE",
        "payload": {
            "education": cec_treatments[cec_treatment_idx],
            "experience": cwe_treatments[cwe_treatment_idx]
        }
    }
    type_iii_prompt = base_prompt.replace("{Treatment_object}", str(mixed_treatment_payload))
    type_iii_style_guide = shuffled_styles.pop()
    type_iii_prompt = type_iii_prompt.replace("{style_guide}", type_iii_style_guide)
    treatment_prompts["Type_III"] = {
        "prompt": type_iii_prompt,
        "style_guide": type_iii_style_guide,
        "treatment_applied": {
            "Canadian_Education": cec_treatments[cec_treatment_idx],
            "Canadian_Work_Experience": cwe_treatments[cwe_treatment_idx]
        }
    }

    logger.info(f"Successfully prepared 3 unique treatment prompts for the resume.")
    return treatment_prompts

## 5. Main Workflow

The following cells walk through the main workflow for generating and saving treated resumes.

In [None]:
# 1. Import all files from the source collection for the specified sector
all_files = get_all_file_ids(
    db_name=DB_NAME,
    collection_name=SOURCE_COLLECTION_NAME,
    mongo_client=MONGO_CLIENT
)

sector_files = [f for f in all_files if SECTOR in f]
if not sector_files:
    logger.error(f"No files found for sector {SECTOR}. Exiting.")
    raise Exception(f"No files found for sector {SECTOR}.")
logger.info(f"Found {len(sector_files)} files for sector: {SECTOR}.")

# 2. Initialize the GeminiProcessor
treatment_model = GeminiProcessor(
    model_name=GEMINI_MODEL_NAME,
    temperature=GEMINI_TEMPERATURE,
    enable_google_search=ENABLE_GOOGLE_SEARCH
)
treatment_prompt = treatment_model.load_prompt_template(prompt_file_path=PROMPT_TEMPLATE_PATH)

target_collection = MONGO_CLIENT[DB_NAME][TARGET_COLLECTION_NAME]

In [None]:
# Main processing loop for a single file (for testing)
file = sector_files[0]
logger.info(f"Processing file: {file}")
file_data = get_document_by_fileid(
    db_name=DB_NAME,
    collection_name=SOURCE_COLLECTION_NAME,
    file_id=file,
    mongo_client=MONGO_CLIENT
)

source_resume_data = file_data.get('resume_data', {})
if not source_resume_data:
    logger.error(f"No resume data found for file {file}. Skipping.")
    raise Exception(f"No resume data found for file {file}.")

documents_to_save = []
common_metadata = {
    'original_file_id': file,
    'industry_prefix': file_data.get('industry_prefix'),
    'file_size_bytes': file_data.get('file_size_bytes'),
    'source_file_hash': file_data.get('file_hash'),
}

control_resume_target_collection = {
        **common_metadata,
        "document_id": f"{file}_control",
        "treatment_type": "control",
        "generation_timestamp": datetime.datetime.now(),
        "validation": {
            "focused_similarity_score": "",
            "passed_threshold": "N/A"
        },
        "treatment_applied": "N/A",
        "resume_data": source_resume_data['resume_data'] 
}
documents_to_save.append(control_resume_target_collection)

In [None]:
# Get two random treatments for Canadian Education and Canadian Work Experience
treatment_prompts = select_and_prepare_treatments(
    cec_treatment_df,
    cwe_treatment_df,
    source_resume_data,
    treatment_prompt,
    STYLE_MODIFIERS
)
if not treatment_prompts:
    logger.error(f"No treatments available for file {file}. Skipping.")
    raise Exception(f"No treatments available for file {file}.")

for key, value in treatment_prompts.items():
    logger.info(f"Generating treatment {key} for file {file}.")
    # Generate the treated resume using GeminiProcessor
    response = treatment_model.generate_content(
        prompt=value['prompt'],
)
    
    if not response or not response.text:
        logger.error(f"Failed to generate content for treatment {key} in file {file}.")
        continue
    
    # Clean the raw response
    treated_resume_data = _clean_raw_llm_response(response.text)
    
    # Validate the rephrasing with cosine similarity
    focused_similarity_score = calculate_focused_similarity(
        source_resume_data['resume_data'], treated_resume_data['resume_data']
    )
    focused_similarity_score = float(focused_similarity_score)
    
    if focused_similarity_score < 0.85:
        logger.warning(f"Low similarity score ({focused_similarity_score}) for treatment {key} in file {file}. Please add retry logic")
    
    final_doc_for_this_version = {
            **common_metadata,
            "document_id": f"{file.replace('.pdf', '')}_{key}",
            "treatment_type": key,
            "generation_timestamp": datetime.datetime.now(),
            "validation": {
                "focused_similarity_score": focused_similarity_score,
                "passed_threshold": True 
            },
            "style_guide": value['style_guide'],
            "treatment_applied": value['treatment_applied'],
            "resume_data": treated_resume_data 
        }
    documents_to_save.append(final_doc_for_this_version)
    logger.info(f"  -> Successfully prepared '{key}' for saving.")

if documents_to_save:
    target_collection.insert_many(documents_to_save)
    logger.info(f"Successfully saved {len(documents_to_save)} treated resumes for file {file}.")
else:
    logger.error(f"No documents to save for file {file}. Skipping saving step.")
    raise Exception(f"No documents to save for file {file}.")

In [None]:
def extract_rephrased_text(resume_data):
    text_parts = []
    if 'basics' in resume_data and 'summary' in resume_data['basics']:
        text_parts.append(resume_data['basics']['summary'])
    if 'work_experience' in resume_data:
        for job in resume_data['work_experience']:
            if 'highlights' in job and isinstance(job['highlights'], list):
                text_parts.append(" ".join(job['highlights']))
    return " ".join(text_parts)