# Imports

In [125]:
import spacy
from spacy.matcher import Matcher
import pandas as pd
import numpy as np
import docx
from docx import Document
nlp = spacy.load("en_core_web_sm")

import openai

# CV TAILORING

## First tries
I want to keep these functions here, as they were my first attempts to replace the action verbs of my CV with action verbs from the job offer. I had some success, but dealing with verb conjugation was really tough, so I decided it was more effective to rewrite the CV bullet points with OpenAI's GPT-4o; since very little text is generated, the extra cost is not even 1 cent. However, going forward it would be interesting to complement these functions and turn them into my original idea.

In [71]:
import spacy
from spacy.matcher import Matcher
from docx import Document

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

def extract_text(file_path):
    """Return the text of every paragraph in a Word document.

    Args:
        file_path: Location of the .docx file, passed straight to
            ``Document``.

    Returns:
        A single string with the paragraph texts separated by newlines.
    """
    document = Document(file_path)
    return "\n".join(para.text for para in document.paragraphs)

def extract_job_info(file_path):
    """Pull skills, named entities and action verbs out of a job offer.

    Args:
        file_path: Path of the .docx job offer, read with ``extract_text``.

    Returns:
        A dict with four lists (unordered, de-duplicated):
        ``skills`` (matched skill phrases as written in the offer),
        ``values`` and ``initiatives`` (entity texts whose label is in the
        same fixed label set, so both lists hold the same items) and
        ``action_verbs`` (lemmas of every token tagged ``VERB``).
    """
    # Extract text from the document and flatten it to a single line
    job_offer_text = extract_text(file_path)
    job_offer_text = job_offer_text.replace('\n', ' ').replace('\r', '')

    # Apply NLP model
    doc = nlp(job_offer_text)

    # Skill phrases to look for. A Matcher pattern holds one dict per token,
    # so a multi-word phrase has to be split into one LOWER entry per word;
    # a single {"LOWER": "machine learning"} entry can never match a token.
    skill_phrases = [
        "python",
        "machine learning",
        "data analysis",
        "project management",
        "sql",
        "excel",
        # Add more phrases as needed
    ]
    skill_patterns = [
        [{"LOWER": word} for word in phrase.split()]
        for phrase in skill_phrases
    ]

    matcher = Matcher(nlp.vocab)
    matcher.add("SKILL", skill_patterns)

    # Collect the matched spans exactly as they appear in the text
    skills = {doc[start:end].text for _match_id, start, end in matcher(doc)}

    # Extract entities using spaCy's built-in NER. "values" and "initiatives"
    # are filtered with the same label set, so they are computed once.
    entity_labels = {"ORG", "NORP", "FAC", "EVENT", "LAW", "LOC", "PRODUCT",
                     "WORK_OF_ART", "LANGUAGE"}
    entities = {ent.text for ent in doc.ents if ent.label_ in entity_labels}

    # Extract action verbs (lemmatised so inflected forms collapse together)
    action_verbs = {token.lemma_ for token in doc if token.pos_ == "VERB"}

    return {
        "skills": list(skills),
        "values": list(entities),
        "initiatives": list(entities),
        "action_verbs": list(action_verbs)
    }

In [103]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import wordnet
# NOTE(review): `inflect` is not imported in any cell shown here, so this line
# raises NameError unless it was imported elsewhere in the session — confirm.
inflect_engine = inflect.engine()

# Action verbs that appear in my CV; these are the words to be replaced.
my_action_verbs = {
    "automated", "implemented", "developed", "created", "directed", "achieved",
    "led", "deployed", "enhances", "engaged", "integrated", "interpreted", 
    "built", "leveraged", "formulated"}

# Look up synonyms with WordNet
def get_synonyms(word):
    """Collect the WordNet lemma names found for ``word``.

    Args:
        word: The term to look up with ``wordnet.synsets``.

    Returns:
        A set of lemma names from every synset returned for ``word``, with
        underscores replaced by spaces.
    """
    return {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

# extract verbs from text using spaCy
def extract_verbs(text):
    """Return the distinct surface forms of the verbs in ``text``.

    Args:
        text: Raw text to run through the loaded spaCy pipeline ``nlp``.

    Returns:
        A set with the text of every token whose part of speech is ``VERB``.
    """
    parsed = nlp(text)
    return {tok.text for tok in parsed if tok.pos_ == 'VERB'}

# replace verbs in the document
def replace_verbs(doc, replacements):
    """Swap selected verbs in every paragraph of a Word document.

    Args:
        doc: A python-docx document; its paragraphs are modified in place.
        replacements: Mapping from a lower-case verb as it appears in the CV
            to the verb that should replace it.

    Returns:
        The same ``doc`` object, after the replacements.

    Notes:
        ``conjugate_verb`` is called for every matched token but is not
        defined in the cells shown here — it must exist at call time.
    """
    for paragraph in doc.paragraphs:
        original_text = paragraph.text

        # Cheap pre-filter before running the NLP pipeline. It is
        # case-insensitive so that a verb capitalised at the start of a
        # bullet ("Automated ...") is not skipped.
        lowered = original_text.lower()
        if not any(key in lowered for key in replacements):
            continue

        pieces = []
        changed = False
        for token in nlp(original_text):
            replacement = replacements.get(token.text.lower())
            if replacement is None:
                # text_with_ws keeps the token's original trailing whitespace
                pieces.append(token.text_with_ws)
            else:
                pieces.append(conjugate_verb(token.text, replacement) + token.whitespace_)
                changed = True

        # Only touch the paragraph when a whole token matched. Rebuilding the
        # text from the tokens' own whitespace keeps punctuation attached,
        # which joining tokens with " " did not ("done , twice .").
        if changed:
            paragraph.text = "".join(pieces)
    return doc


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rodri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rodri\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## CV Tailoring Functions

In [266]:
def read_cv(cv_path):
    """Collect the bullet-point lines of the CV, grouped by section.

    A paragraph containing "experience" switches to the experience section
    and one containing "relevant projects" switches to the projects section;
    those paragraphs themselves are never collected. Inside a section, empty
    lines and lines starting with one of the known title prefixes are
    skipped; in the experience section lines starting with a digit are
    skipped as well.

    Args:
        cv_path: Path of the .docx CV.

    Returns:
        ``{'experience': [...], 'projects': [...]}`` with the kept lines in
        document order.
    """
    # Lower-case prefixes of the company / role lines to leave out
    experience_titles = ('logistic', 'atria corp', 'luz del sur', 'operations and data intern', 'falcon management partners', 'consulting analyst')
    # Lower-case prefixes of the project title lines to leave out
    project_titles = ('development of machine learning algorithms', 'cv and cover letter tailoring', 'spotify:', 'big foot sightings')

    sections = {'experience': [], 'projects': []}
    active = None

    for paragraph in docx.Document(cv_path).paragraphs:
        line = paragraph.text.strip()
        lowered = line.lower()

        # NOTE(review): this is a substring test, so any line that merely
        # mentions "experience" is also treated as a section marker — confirm
        # that is intended.
        if "experience" in lowered:
            active = 'experience'
            continue
        if "relevant projects" in lowered:
            active = 'projects'
            continue

        if active is None or not line:
            continue

        if active == 'experience':
            if line[0].isdigit() or lowered.startswith(experience_titles):
                continue
        elif lowered.startswith(project_titles):
            continue

        sections[active].append(line)

    return sections

def update_cv_sections(cv_text, job_description):
    """Ask GPT-4o to rewrite the CV bullet points for a job description.

    Args:
        cv_text: Dict with ``'experience'`` and ``'projects'`` lists of
            bullet-point strings (the shape returned by ``read_cv``).
        job_description: Full text of the job offer.

    Returns:
        The raw text of the model's reply.
    """
    experience = cv_text['experience']
    projects = cv_text['projects']

    # Ask for as many bullets as were actually sent rather than a fixed 17.
    bullet_count = len(experience) + len(projects)

    # One bullet per line, so the model can tell where each bullet ends.
    experience_block = "\n".join(experience)
    projects_block = "\n".join(projects)

    # Each fragment ends with its own separator; adjacent string literals are
    # concatenated as-is, which previously fused sentences together
    # ("...as inputedMaintain the ideas...").
    prompt_text = (f"Given the job description below, update the CV bullet points accordingly. "
                   f"Ensure you produce the same number ({bullet_count}) of bullet points as inputed. "
                   f"Maintain the ideas and lengths (LESS THAN 15 WORDS) of the bullet points, but use the ACTION VERBS AND KEY WORDS FROM THE JOB OFFER. "
                   f"Do not make drastic changes and do not repeat the same action verb more than twice.\n\n"
                   f"Job Description: {job_description}\n\n"
                   f"CV Experience Section:\n{experience_block}\n\n"
                   f"CV Projects Section:\n{projects_block}")

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are a recruiter with 20 years of experience in big tech companies, expert in CV tailoring."},
                  {"role": "user", "content": prompt_text}],
        max_tokens=450)

    return response.choices[0].message.content

def update_cv(cv_path, new_bullets_text):
    """Write the generated bullet points over the current ones in the CV.

    Args:
        cv_path: Path of the .docx CV.
        new_bullets_text: Raw model reply, one bullet point per line.

    Returns:
        The modified python-docx document (not saved to disk). If the reply
        holds fewer bullets than the CV, the remaining CV bullets are left
        untouched.
    """
    # Local import: this function uses `re`, which the file imports only in
    # a later cell.
    import re

    # Leading list marker such as "- ", "• ", "* " or "1. " / "1) "
    marker = re.compile(r'^\s*(?:[-*•–·]|\d+[.)])\s+')

    # extract new bullet points from the generated text, skipping markdown headings
    stripped_lines = (line.strip() for line in new_bullets_text.split('\n'))
    new_bullets = [line for line in stripped_lines if line and not line.startswith("###")]

    # format out the titles: when more than 17 lines came back, indices 0 and
    # 10 are dropped as section titles. NOTE(review): the slices are tuned to
    # one specific reply layout — confirm.
    if len(new_bullets) > 17:
        bullets1 = new_bullets[1:10]
        bullets2 = new_bullets[11:20]
        new_bullets = bullets1 + bullets2

    # extract current bullet points using read_cv
    current_cv = read_cv(cv_path)
    current_bullets = set(current_cv['experience'] + current_cv['projects'])

    doc = docx.Document(cv_path)
    bullet_idx = 0

    for para in doc.paragraphs:
        if para.text.strip() not in current_bullets:
            continue
        if bullet_idx >= len(new_bullets):
            # Fewer replacements than bullets: keep the rest of the CV as is
            # instead of failing with IndexError.
            break

        # Remove the list marker only when one is present; slicing off two
        # characters unconditionally would eat the start of unmarked lines.
        new_text = marker.sub('', new_bullets[bullet_idx], count=1)

        # Remember the first run so its character formatting can be copied.
        source_run = para.runs[0] if para.runs else None

        para.clear()
        new_run = para.add_run(new_text)
        if source_run is not None:
            new_run.bold = source_run.bold
            new_run.italic = source_run.italic
            new_run.underline = source_run.underline
            new_run.font.size = source_run.font.size
            new_run.font.name = source_run.font.name
            rgb = source_run.font.color.rgb
            if rgb is not None:
                new_run.font.color.rgb = rgb

        bullet_idx += 1

    return doc
    
def generate_cv(cv_path, job_offer):
    """Run the whole tailoring flow: read the CV, rewrite it, write it back.

    Args:
        cv_path: Path of the .docx CV.
        job_offer: Text of the job offer handed to ``update_cv_sections``.

    Returns:
        Whatever ``update_cv`` returns for the generated bullet points.
    """
    tailored_bullets = update_cv_sections(read_cv(cv_path), job_offer)
    return update_cv(cv_path, tailored_bullets)

def count_words(line):
    """Count the words in ``line``.

    A word is a maximal run of ``\\w`` characters delimited by word
    boundaries, so punctuation and hyphens split words.

    Args:
        line: The string to inspect.

    Returns:
        The number of words found.
    """
    return sum(1 for _ in re.finditer(r'\b\w+\b', line))

def extract_bullet_points(cv_path, min_words=11):
    """Extract the long lines of the experience and projects sections.

    A paragraph equal to "work experience" (case-insensitive) starts the
    experience section and one equal to "relevant projects" starts the
    projects section. Within a section, every paragraph with at least
    ``min_words`` words is kept, with tabs turned into spaces.

    Args:
        cv_path: Path of the .docx CV.
        min_words: Minimum word count (per ``count_words``) for a line to
            count as a bullet point. The same threshold now applies to both
            sections; the projects section previously required one word
            more, which disagreed with the ``>= 11`` test that the later
            ``update_cv`` applies to both sections.

    Returns:
        ``{'experience': [...], 'relevant_projects': [...]}``.
    """
    bullet_points = {'experience': [], 'relevant_projects': []}
    section = None  # key of the section currently being captured

    for para in Document(cv_path).paragraphs:
        text = para.text.strip()
        heading = text.lower()

        # A heading switches section outright, so an experience heading that
        # comes after the projects heading also ends project capture.
        if heading == 'work experience':
            section = 'experience'
            continue
        if heading == 'relevant projects':
            section = 'relevant_projects'
            continue

        if section is not None and count_words(text) >= min_words:
            bullet_points[section].append(text.replace('\t', ' '))

    return bullet_points

In [495]:
import re
import docx
from docx import Document
import json

def update_cv_sections(cv_json, job_description):
    """Ask GPT-4o to rewrite the CV activities and return them as JSON data.

    Args:
        cv_json: JSON-serialisable CV structure; it is embedded in the prompt
            with ``json.dumps``.
        job_description: Full text of the job offer.

    Returns:
        The model's reply parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON once an optional
            markdown code fence has been removed.
    """
    prompt_text = (
        f"Given the job description below, update the json CV activities accordingly. "
        f"Ensure you produce exactly 3 bullet points for the experience section "
        f"and 2 bullet points for the projects section. "
        f"Maintain the main ideas, writing style and lengths (LESS THAN 16 WORDS) of the original activities. Most importantly use the ACTION VERBS "
        f"AND KEYWORDS FROM THE JOB OFFER when possible. Do not make drastic changes and do not repeat the same action verb more than twice. OUTPUT A VALID JSON WITH THE SAME INPUT FORMAT WITH THE UPDATED INFORMATION\n\n"
        f"Job Description: {job_description}\n\n"
        f"CV Experience Section: {json.dumps(cv_json)}")

    # API call
    response = openai.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "system", "content": "You are a recruiter with 20 years of experience in big tech companies, expert in CV tailoring."},
                  {"role": "user", "content": prompt_text}],
        max_tokens=600)

    # Strip surrounding whitespace first: a reply ending in "```\n" would
    # otherwise keep its closing fence and fail to parse.
    new_bullet_points = response.choices[0].message.content.strip()

    # Remove an optional markdown fence, with or without the "json" tag.
    if new_bullet_points.startswith("```"):
        new_bullet_points = new_bullet_points[3:]
        if new_bullet_points[:4].lower() == "json":
            new_bullet_points = new_bullet_points[4:]
    if new_bullet_points.endswith("```"):
        new_bullet_points = new_bullet_points[:-3]

    # debug line; printed before parsing so a malformed reply is still visible
    print(new_bullet_points)

    return json.loads(new_bullet_points)

def extract_activities(new_bullets):
    """Flatten the per-entry activity lists of both CV sections.

    Args:
        new_bullets: Dict with ``"experience"`` and ``"relevant_projects"``
            keys, each a list of entries that carry an ``"activities"`` list.

    Returns:
        A dict with the same two keys, each mapped to one flat list of
        activities in their original order.
    """
    def _flatten(entries):
        # Concatenate the "activities" of every entry, keeping order
        return [activity for entry in entries for activity in entry["activities"]]

    from_experience = _flatten(new_bullets["experience"])
    from_projects = _flatten(new_bullets["relevant_projects"])
    return {"experience": from_experience, "relevant_projects": from_projects}


def _rewrite_bullet(para, new_text):
    """Replace a paragraph's content with one run, copying the first run's font."""
    # Read the formatting before the paragraph is emptied. A paragraph with
    # no runs has nothing to copy, so the new run keeps its defaults instead
    # of picking up values left over from a previously processed paragraph.
    source_run = para.runs[0] if para.runs else None
    if source_run is not None:
        font_name = source_run.font.name
        font_size = source_run.font.size
        bold = source_run.bold
        italic = source_run.italic

    # Remove every child element of the paragraph, then add the new run.
    # NOTE(review): this also drops any paragraph-properties child (style,
    # list numbering) — presumably why the bullet glyph is written as text;
    # confirm.
    element = para._element
    for child in list(element):
        element.remove(child)
    run = para.add_run(new_text)

    if source_run is not None:
        run.font.name = font_name
        run.font.size = font_size
        run.bold = bold
        run.italic = italic


def update_cv(cv_path, new_bullets):
    """Overwrite the CV's bullet points with the tailored activities.

    Paragraphs after the "WORK EXPERIENCE" heading are filled from
    ``new_bullets['experience']`` and those after "RELEVANT PROJECTS" from
    ``new_bullets['relevant_projects']``, in order. Only paragraphs with at
    least 11 words and no tab character are treated as bullet points. Once a
    section's list is used up, its remaining paragraphs are left unchanged.

    Args:
        cv_path: Path of the .docx CV.
        new_bullets: Dict with ``'experience'`` and ``'relevant_projects'``
            lists of strings (the shape returned by ``extract_activities``).

    Returns:
        The modified python-docx document (not saved to disk).
    """
    doc = Document(cv_path)
    section = None  # key of new_bullets for the section being processed
    next_index = {'experience': 0, 'relevant_projects': 0}

    def count_words(line):
        words = re.findall(r'\b\w+\b', line)
        return len(words)

    # iterate through paragraphs in the doc
    for para in doc.paragraphs:
        text = para.text.strip()
        heading = text.lower()

        # section headings switch the active list and are never rewritten
        if heading == 'work experience':
            section = 'experience'
            continue
        if heading == 'relevant projects':
            section = 'relevant_projects'
            continue

        # skip everything that does not look like a bullet point
        if section is None or '\t' in text or count_words(text) < 11:
            continue

        bullets = new_bullets[section]
        index = next_index[section]
        if index >= len(bullets):
            continue

        # write a bullet point, preserving the character formatting
        _rewrite_bullet(para, '• ' + bullets[index])
        next_index[section] = index + 1

    return doc

def generate_cv(cv_path, cv_json, job_offer):
    """Produce a tailored CV document for one job offer.

    Args:
        cv_path: Path of the .docx CV to rewrite.
        cv_json: CV structure sent to the model by ``update_cv_sections``.
        job_offer: Text of the job offer.

    Returns:
        The document returned by ``update_cv`` after the flattened,
        model-updated activities have been written into the CV.
    """
    tailored_json = update_cv_sections(cv_json, job_offer)
    return update_cv(cv_path, extract_activities(tailored_json))

### CV tailoring
**Total Cost**: 0.83 --> 0.84 ; Delta=0.01
<br>Price: **0.01**

# COVER LETTER

## APPROACH 1 (SECTION GENERATION AND THEN REVIEW BY OPEN AI)

In [281]:
def replace_text_in_docx(filename, company, position):
    """Fill the ``[COMPANY]`` and ``[POSITION]`` placeholders of a document.

    Args:
        filename: Path of the .docx file to open.
        company: Text that replaces ``[COMPANY]``.
        position: Text that replaces ``[POSITION]``.

    Returns:
        The opened document with the placeholders replaced in its
        paragraphs. Nothing is saved to disk.
    """
    document = Document(filename)
    placeholders = (('[COMPANY]', company), ('[POSITION]', position))
    for paragraph in document.paragraphs:
        for placeholder, substitute in placeholders:
            if placeholder in paragraph.text:
                paragraph.text = paragraph.text.replace(placeholder, substitute)
    return document
    
def extract_paragraph(filename):
    """Return the paragraph that follows the "Alignment with Values:" heading.

    Args:
        filename: Path of the .docx file to search.

    Returns:
        The text of the paragraph right after the first paragraph containing
        the heading, or ``None`` when there is no such paragraph.
    """
    heading_seen = False
    for paragraph in Document(filename).paragraphs:
        if heading_seen:
            return paragraph.text
        heading_seen = "Alignment with Values:" in paragraph.text
    return None
            
def format_paragraph_with_gpt4o(cover_letter_path, job_offer):
    """Generate a company-specific "values" paragraph with GPT-4o.

    The current values paragraph is read from the cover letter with
    ``extract_paragraph`` and sent along with the job offer as a style
    reference.

    Args:
        cover_letter_path: Path of the .docx cover letter.
        job_offer: Text of the job offer.

    Returns:
        The text of the model's reply.
    """
    style_sample = extract_paragraph(cover_letter_path)

    # Fixed instructions; the job offer and the style sample are appended below
    instructions = (
        "You are a recruiter with 20 years of experience in big tech companies. "
        "Based on the detailed job information provided below, craft a concise and compelling paragraph with around 150-200 words for a cover letter. "
        "This paragraph should articulate why I am drawn to the company, specifically citing relevant company values "
        "and initiatives that you can find mainly online or mentioned in the job offer. Please align this with the professional writing style "
        "outlined in the provided values paragraph. Ensure the response is tailored to reflect the unique aspects "
        "of both the job information and the values paragraph.\n\n")
    user_prompt = f"{instructions}Job Information: {job_offer}\n\nValues Paragraph: {style_sample}"

    conversation = [
        {"role": "system", "content": "You are a recruiter with 20 years of experience in big tech companies, expert in CV and cover letter taloring"},
        {"role": "user", "content": user_prompt},
    ]

    # API call
    completion = openai.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        max_tokens=250)

    return completion.choices[0].message.content


def replace_para_in_docx(filename, new_text):
    """Overwrite the paragraph after the "Alignment with Values:" heading.

    Args:
        filename: Path of the .docx file to open.
        new_text: Replacement text for the paragraph following the heading.

    Returns:
        The opened document; only the first paragraph after the first
        heading match is changed. Nothing is saved to disk.
    """
    document = Document(filename)
    previous_was_heading = False

    for paragraph in document.paragraphs:
        if previous_was_heading:
            paragraph.text = new_text
            break
        previous_was_heading = "Alignment with Values:" in paragraph.text

    return document

In [289]:
def review_cover_letter(job_offer, cover_letter_path='Cover Letter.docx'):
    """Have GPT-4o review and adapt a cover letter for a job offer.

    Args:
        job_offer: Text of the job offer.
        cover_letter_path: .docx cover letter to review. Defaults to the
            template file that used to be hard-coded, so existing calls
            behave as before; pass another path to review a different
            letter.

    Returns:
        The reviewed cover letter text from the model's reply.
    """
    # cover letter text
    cover_letter_template = extract_text(cover_letter_path)

    # prompt for GPT-4o
    prompt_text = (f"You are a recruiter with 20 years of experience in big tech companies. "
               f"Using the cover letter below and the Job Offer Details, review the cover letter and leverage your expertise to make pertinent changes. "
               f"Do not make any drastic changes and always match the tone and writing style to keep it the same. "
               f"The cover letter should articulate why the applicant is drawn to the company and showcase their skills, experiences, and competencies, "
               f"specifically citing relevant company values and initiatives mentioned in the job offer, but mainly those found online. "
               f"Keep it under 470 words, so trim information that is not relevant to the job offer and make sure you return a cover letter from a person you would definitely hire.\n\n"
               f"Job Offer Details: {job_offer}\n\n"
               f"Cover Letter Template: {cover_letter_template}")

    # API call
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are a recruiter with 20 years of experience in big tech companies, expert in CV and cover letter taloring"},
                  {"role": "user", "content": prompt_text}],
        max_tokens=600)

    return response.choices[0].message.content

In [290]:
def two_step_tailor():
    """Generate a two-step cover letter for every row of the job list CSV.

    For each row of 'Cover Letter List.csv' (columns ``Company``,
    ``Position`` and ``Job Offer``) the letter text returned by
    ``review_cover_letter`` is saved as
    ``2STEP{company}_{position}_Cover_Letter.docx``.
    """
    # Defined here so the function does not rely on a notebook global; this
    # is the same template file review_cover_letter reads.
    cover_letter_path = 'Cover Letter.docx'

    job_df = pd.read_csv('Cover Letter List.csv', encoding='latin-1')
    for _, row in job_df.iterrows():
        company = row['Company']
        position = row['Position']
        job_offer = row['Job Offer']

        # NOTE(review): the documents returned by replace_text_in_docx and
        # replace_para_in_docx are never saved or passed on, and
        # review_cover_letter reads the template from disk again, so the
        # first step does not reach the final letter — confirm the intent.
        replace_text_in_docx(cover_letter_path, company, position)
        new_para = format_paragraph_with_gpt4o(cover_letter_path, job_offer)
        replace_para_in_docx(cover_letter_path, new_para)
        cover_letter_text = review_cover_letter(job_offer)

        # save the generated cover letter to a file
        doc = Document()
        doc.add_paragraph(cover_letter_text)
        doc.save(f'2STEP{company}_{position}_Cover_Letter.docx')
        print(f'Cover Letter for {company} for {position} done!')


In [291]:
# Run the two-step generation for every row of the job list CSV.
two_step_tailor()

Cover Letter for Facebook for Data Engineer done!
Cover Letter for Hazen Research for Entry Level Data Scientist done!
Cover Letter for Facebook2 for Data Engineer done!
Cover Letter for Hazen Research2 for Entry Level Data Scientist done!


## APPROACH 2 (ALL GENERATION BY OPEN AI)

In [296]:
def generate_cover_letter(company, position, job_offer, cover_letter_path='Cover Letter.docx'):
    """Generate a complete cover letter with a single GPT-4o call.

    Args:
        company: Company name inserted into the prompt.
        position: Job title inserted into the prompt.
        job_offer: Text of the job offer.
        cover_letter_path: .docx template whose text is sent as the style and
            content reference. Defaults to the file that used to be
            hard-coded, so existing three-argument calls behave as before.

    Returns:
        The cover letter text from the model's reply.
    """
    # cover letter text
    cover_letter_template = extract_text(cover_letter_path)

    # prompt for GPT-4o. Every fragment ends with its own separator: adjacent
    # string literals are concatenated as-is, which previously produced
    # "{position}and", "hire.Maintain" and "templateMoreover".
    # NOTE(review): the prompt mentions attached CV bullet points, but none
    # are added to it — confirm whether they should be.
    prompt_text = (f"Using the cover letter template and Job Offer Details provided below, craft a complete, compelling cover letter. "
                   f"The cover letter should articulate why the applicant is drawn to {company}, and fit to be a {position} and showcase my skills, experiences and competences. "
                   f"Leverage all my qualities, also showing my softskills, and dont invent false information, "
                   f"specifically citing relevant company values and initiatives mentioned in the job offer but mainly ones found online. "
                   f"Keep it under 470 words so trim information on the template that is not relevant to the job offer and make sure you return a cover letter from a person you would definitely hire. "
                   f"Maintain a professional writing style and match the writing style of the template. "
                   f"Moreover, I am attacching bullet points from my cv, please adapt them with the info of the cover letter and my cover letter\n\n"
                   f"Job Offer Details: {job_offer}\n\n"
                   f"Cover Letter Template: {cover_letter_template}")

    # API call
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are a recruiter with 20 years of experience in big tech companies, expert in CV and cover letter taloring"},
                  {"role": "user", "content": prompt_text}],
        max_tokens=620)

    return response.choices[0].message.content

### 1 Step generation = Faster!
**Total Cost**: 0.55 --> 0.62 ; Delta=0.07
<br>**Full Job Offer**
<br>Price: **0.02**
<br>Quality: 5/5
<br>**Summarized Job Offer**
<br>Price: **0.015**
<br>Quality: 3.5/5

### 2 Step generation
**Total Cost**: 0.45 --> 0.55 ; Delta=0.10
<br>**Full Job Offer**:
<br>Price: **0.03**
<br>Quality: 4.5/5
<br>**Summarized Job Offer**
<br>Price: **0.02**
<br>Quality: 3/5

# Winner: 1 Step Generation with Full Offer!

## FINAL FUNCTION

In [None]:
def _cover_letter_requested(value):
    """Interpret the 'Generate Cover Letter' CSV cell as a yes/no flag."""
    if isinstance(value, str):
        # Text such as "No" or "False" is truthy in Python, so compare it.
        return value.strip().lower() not in {"", "false", "no", "n", "0", "nan", "none"}
    if pd.isna(value):
        # A blank cell is read as NaN, which is truthy as well.
        return False
    return bool(value)


def tailor():
    """Create a tailored CV, and optionally a cover letter, for every job.

    Reads 'Cover Letter List.csv' (columns ``Company``, ``Position``,
    ``Job Offer`` and ``Generate Cover Letter``). For each row a tailored CV
    is saved as ``CV_Rodrigo_Ugarte_{company}_{position}.docx``; when the
    row's flag is set, a cover letter is saved as
    ``1STEP_Cover_Letter_{company}_{position}.docx``.
    """
    job_df = pd.read_csv('Cover Letter List.csv', encoding='latin-1')

    # The CV inputs are the same for every row, so load them once; the
    # context manager also closes the JSON file.
    cv_path = 'CV Rodrigo Ugarte.docx'
    with open('cv_data.json', encoding='utf-8') as cv_file:
        cv_json = json.load(cv_file)

    for _, row in job_df.iterrows():
        company = row['Company']
        position = row['Position']
        job_offer = row['Job Offer']
        # Stored under its own name: assigning the flag to
        # `generate_cover_letter` shadowed the function and made the call
        # below fail.
        wants_cover_letter = _cover_letter_requested(row['Generate Cover Letter'])

        # CV
        cv = generate_cv(cv_path, cv_json, job_offer)
        cv.save(f'CV_Rodrigo_Ugarte_{company}_{position}.docx')
        print(f'CV for {company} for {position} done!')

        # COVER LETTER
        if wants_cover_letter:
            cover_letter = generate_cover_letter(company, position, job_offer)
            doc = Document()
            doc.add_paragraph(cover_letter)
            doc.save(f'1STEP_Cover_Letter_{company}_{position}.docx')
            print(f'Cover Letter for {company} for {position} done!')