## Google Colab SETUP ONLY (LOCAL BELOW)

Run the next 2 cells only if you are using Google Colab. If you are running locally, skip ahead to "LOCAL SETUP ONLY".

In [None]:
!pip install pdfplumber beautifulsoup4 python-docx python-dotenv

In [None]:
import os
import re
import traceback
import pdfplumber
import requests
from bs4 import BeautifulSoup
from docx import Document
from IPython.display import Markdown, display

from dotenv import load_dotenv
from datetime import datetime
import sys
if 'google.colab' in sys.modules:
    from google.colab import files

    # Upload your .env file
    uploaded = files.upload()  # This will prompt you to upload your .env file

    # Format the date as "Month Day, Year", e.g. "May 15, 2024"
    APPLICATION_DATE = datetime.now().strftime("%B %d, %Y")
    print("Application Date:", APPLICATION_DATE)

    # Now load the environment variables from the .env file
    load_dotenv()

    # Later cells (model creation, cover letter generation) read these names,
    # so they must be defined on the Colab path as well.
    # os.getenv returns None for an unset variable, hence the `or ""` before strip().
    OPENAI_API_KEY = (os.getenv("OPENAI_API_KEY") or "").strip()
    if not OPENAI_API_KEY:
        raise ValueError("⚠️ OpenAI API Key is missing or invalid!")

    # Personal Details (same fallbacks as the local setup cell)
    NAME = os.getenv("NAME", "Greg Barker")
    ADDRESS = os.getenv("ADDRESS", "[Your Address]")
    CITY_STATE_ZIP = os.getenv("CITY_STATE_ZIP", "[City, State, Zip]")
    EMAIL = os.getenv("EMAIL", "gregcbarker@gmail.com")
    PHONE = os.getenv("PHONE", "+1 403-828-9041")
    GITHUB = os.getenv("GITHUB", "https://github.com/savevsgames")
    LINKEDIN = os.getenv("LINKEDIN", "https://www.linkedin.com/in/greg-barker-savevsgames/")

    # Confirm the key was loaded without echoing the secret into the cell output.
    print("OPENAI_API_KEY: loaded (value hidden)")
    print("NAME:", NAME)
    print("Google Colab setup complete.")
else:
    print("This block should only be run in Google Colab.")

## LOCAL SETUP ONLY

Skip this cell if you are using Google Colab.
This cell loads the necessary modules and environment variables.

Copy the following command into your CLI:

pip install pdfplumber beautifulsoup4 python-docx python-dotenv langchain langchain-openai

In [None]:
import os
import re
import traceback
import pdfplumber
import requests
from bs4 import BeautifulSoup
from docx import Document
from dotenv import load_dotenv
from IPython.display import Markdown, display
from datetime import datetime

# Load environment variables from .env file
load_dotenv()

# Format the date as "Month Day, Year", e.g. "May 15, 2024"
APPLICATION_DATE = datetime.now().strftime("%B %d, %Y")
print("Application Date:", APPLICATION_DATE)

# Set OpenAI API key.
# os.getenv returns None when the variable is unset; fall back to "" before strip()
# so a missing key reaches the ValueError below instead of raising AttributeError.
OPENAI_API_KEY = (os.getenv("OPENAI_API_KEY") or "").strip()
if not OPENAI_API_KEY:
    raise ValueError("⚠️ OpenAI API Key is missing or invalid!")


# Personal Details (each value can be overridden in the .env file)
NAME = os.getenv("NAME", "Greg Barker")
ADDRESS = os.getenv("ADDRESS", "[Your Address]")
CITY_STATE_ZIP = os.getenv("CITY_STATE_ZIP", "[City, State, Zip]")
EMAIL = os.getenv("EMAIL", "gregcbarker@gmail.com")
PHONE = os.getenv("PHONE", "+1 403-828-9041")
GITHUB = os.getenv("GITHUB", "https://github.com/savevsgames")
LINKEDIN = os.getenv("LINKEDIN", "https://www.linkedin.com/in/greg-barker-savevsgames/")

print(NAME, ADDRESS, CITY_STATE_ZIP, EMAIL, PHONE, GITHUB, LINKEDIN, sep="\n")
print("✅ Setup complete!")


In [None]:
# Report only whether the key is loaded and how long it is; displaying the key itself
# would store the secret in the saved notebook output.
f"OPENAI_API_KEY loaded ({len(OPENAI_API_KEY)} characters)" if OPENAI_API_KEY else "OPENAI_API_KEY is not set"

## Initialize LLM Model with LangChain

- This cell instantiates the model once so that all subsequent calls can use it.
- All utility functions will use this instance.

In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage

# Build the one shared chat model; the utility functions below use it through the global `model`.
model = ChatOpenAI(
    api_key=OPENAI_API_KEY.strip(),
    temperature=0,
    stream_usage=True,
)
print("✅ LLM model instantiated.")

# Smoke test: a single round trip that confirms the key and connection work.
try:
    test_response = model.invoke(
        [
            SystemMessage(content="Connection test: You are a helpful connection tester."),
            HumanMessage(content="Please return your model name and confirm your connection status."),
        ]
    )
    print("✅ Test call successful. Response:")
    print(test_response.content)
except Exception:
    # Report the failure but let the notebook continue.
    print("⚠️ Error during LLM test call:")
    print(traceback.format_exc())

## Utility Functions

This cell defines helper functions for user interaction, file extraction, web scraping, OpenAI calls, and saving the output.

In [None]:
def print_markdown(message):
    """Render `message` as Markdown in the notebook output area."""
    rendered = Markdown(message)
    display(rendered)


# --- User Input Helpers ---
def select_job_source():
    """
    Prompts the user to enter the number corresponding to the job source.
    (Assumes that the available job sources have already been printed.)

    Keeps asking until a valid 1-based number is entered.

    Returns:
        str: The name of the selected job source.
    """
    job_sources = [
        "Indeed", "LinkedIn", "Glassdoor", "JobBank.ca", "AngelList",
        "Stack Overflow Jobs", "WeWorkRemotely", "Referred by Friend",
        "Company Website", "Other"
    ]
    while True:
        # Surrounding whitespace is tolerated so " 2 " counts as "2".
        choice = input("Enter the number corresponding to the job source: ").strip()
        # isdecimal() (unlike isdigit()) rejects characters such as "²" that
        # int() cannot convert, so invalid input re-prompts instead of raising.
        if choice.isdecimal() and 1 <= int(choice) <= len(job_sources):
            return job_sources[int(choice) - 1]
        print("❌ Invalid choice. Please enter a number from the list.")


def get_user_inputs():
    """
    Prompts the user for job-related inputs.

    The job source is asked first (the options are expected to have been
    printed already), then the job posting text, the company website and up
    to 3 research links.

    Returns:
        tuple: (job_posting, company_website, context_links, job_source), where
        context_links is a list holding only the non-blank links entered.
    """
    job_source = select_job_source()

    print_markdown("### Paste the full job posting text below:")

    job_posting = input("Job Posting: ")

    # Strip so a URL pasted with stray spaces is usable and blank input means "skip".
    company_website = input("Enter the company website (or press Enter to skip): ").strip()

    context_links = []
    print("Enter up to 3 additional research links (press Enter to skip each one):")
    for i in range(3):
        link = input(f"Link {i + 1}: ").strip()
        # Whitespace-only answers are treated as skipped rather than kept as links.
        if link:
            context_links.append(link)

    return job_posting, company_website, context_links, job_source


# --- PDF Extraction ---
def extract_text_from_pdf(pdf_path):
    """
    Reads a PDF and returns the text of all its pages.

    Pages that yield no text are skipped; each remaining page's text is followed
    by a newline and the combined result is stripped. Returns "" (after printing
    a warning) when `pdf_path` does not exist.
    """
    if os.path.exists(pdf_path):
        with pdfplumber.open(pdf_path) as pdf:
            extracted_pages = (page.extract_text() for page in pdf.pages)
            page_texts = [content for content in extracted_pages if content]
        return "".join(content + "\n" for content in page_texts).strip()

    print(f"⚠️ PDF not found at: {pdf_path}")
    return ""

# --- LLM-Token Tracking Function ---
def print_token_usage(response):
    """
    Prints token usage for an LLM response.

    Looks at `response.usage_metadata` first, then at the "token_usage" entry of
    `response.response_metadata`; prints a fallback message when neither has data.
    """
    primary_usage = getattr(response, "usage_metadata", None)
    if primary_usage:
        print("Token usage:", primary_usage)
        return

    # Fall back to the token_usage entry of response_metadata, when present.
    if hasattr(response, "response_metadata"):
        if response.response_metadata.get("token_usage"):
            print("Token usage:", response.response_metadata["token_usage"])
            return

    print("Token usage information not available.")


# --- LLM-Powered Functions using the global model ---
def _extract_labeled_field(text, label, single_line=False):
    """Returns the stripped text that follows `label:` in `text`, or None if the label is absent."""
    # Tolerate markdown bold around the label, e.g. "**Company Name:** Acme"
    # or "**Company Name**: Acme", which chat models sometimes produce.
    match = re.search(r"\*{0,2}" + re.escape(label) + r"\*{0,2}:\*{0,2}", text)
    if match is None:
        return None
    value = text[match.end():]
    if single_line:
        value = value.split("\n", 1)[0]
    return value.strip()


def process_job_posting(job_posting_text, max_chars=3000):
    """
    Extracts the company name and summarizes the job posting using the global model.
    Expected output format (as generated by the LLM):

    Company Name: [Extracted Company Name]
    Job Summary: [Concise Job Summary]

    Args:
        job_posting_text: Raw job posting text.
        max_chars: Maximum number of posting characters sent to the model.

    Returns:
        tuple: (company_name, job_summary). company_name is "Generic" when no
        name could be extracted; job_summary is "" when no summary was found.
        ("Generic", "") is returned when the posting is empty or the LLM call fails.
    """
    # Nothing to analyse: skip the LLM call rather than spend tokens on an empty prompt.
    if not job_posting_text or not job_posting_text.strip():
        print("⚠️ No job posting text provided; skipping LLM call.")
        return "Generic", ""

    # Limit the posting to the first `max_chars` characters to avoid token issues.
    truncated_posting = job_posting_text[:max_chars]
    prompt = f"""
You are a skilled job assistant.

The user has provided the following job posting text:
---
{truncated_posting}
---

Tasks:
1. Extract the company name.
2. Summarize the job posting concisely with all key details required to find the right candidate (include qualifications, required skills, and other relevant job details).

Output in the following format:
Company Name: [Extracted Company Name]
Job Summary: [Concise Job Summary]
    """
    try:
        response = model.invoke([
            SystemMessage(content="You are a skilled job assistant."),
            HumanMessage(content=prompt)
        ])
        print_token_usage(response)
        result = response.content
        print("LLM output for job posting processing:")
        print(result)
    except Exception:
        print("⚠️ LLM error in process_job_posting:")
        print(traceback.format_exc())
        return "Generic", ""

    # The company name is the rest of the line after its label; an empty name
    # falls back to "Generic" so callers always get a usable value.
    company_name = _extract_labeled_field(result, "Company Name", single_line=True) or "Generic"
    # Everything after the "Job Summary" label is considered the summary.
    job_summary = _extract_labeled_field(result, "Job Summary") or ""

    return company_name, job_summary



def _clip_for_prompt(text, limit):
    """Returns `text` cut to `limit` characters, marked as truncated only when it was cut."""
    text = text or ""
    if len(text) <= limit:
        return text
    return text[:limit] + "... (truncated)"


def generate_cover_letter(resume_text, job_summary, company_info, context_info, job_source, company_name, NAME, EMAIL, ADDRESS, CITY_STATE_ZIP, PHONE, GITHUB, LINKEDIN, APPLICATION_DATE, context_limit=2000):
    """
    Generates a personalized cover letter using the global model.

    Args:
        resume_text: Full resume text.
        job_summary: Summary of the job posting.
        company_info: Text about the company (may be empty or None).
        context_info: Additional research text (may be empty or None).
        job_source: Where the posting was found.
        company_name: Name of the hiring company.
        NAME, EMAIL, ADDRESS, CITY_STATE_ZIP, PHONE, GITHUB, LINKEDIN: Candidate details.
        APPLICATION_DATE: Date string to put on the letter.
        context_limit: Maximum characters of company_info and of context_info
            included in the prompt.

    Returns:
        str: The generated cover letter, or "" if the LLM call fails.
    """
    # Only label the text as truncated when it really was cut, so the model is
    # not told that short (or empty) context is incomplete.
    company_block = _clip_for_prompt(company_info, context_limit)
    context_block = _clip_for_prompt(context_info, context_limit)
    prompt = f"""
You are an expert cover letter writer.

Date for Cover Letter:
- Date: {APPLICATION_DATE}

Candidate Details:
- Name: {NAME}
- Email: {EMAIL}
- Phone: {PHONE}
- Address: {ADDRESS}
- City/State/Postal-ZIP: {CITY_STATE_ZIP}
- GitHub: {GITHUB}
- LinkedIn: {LINKEDIN}

Resume:
- Full Resume: {resume_text}

Job Posting Details:
- Company: {company_name}
- Job Source: {job_source}
- Job Summary: {job_summary}
Company Info:
{company_block}

Additional Context:
{context_block}

Instructions:
- DO NOT USE GENERIC PLACEHOLDERS.
- Replace ALL placeholder text with the candidate’s actual details as provided (Example: no [ ] placeholder tags).
- IF ANY OF THE CONTEXT OR SUMMARY DATA IS RELATED TO SECURITY(https), LOG-IN, SIGN-UP INFORMATION, ETC. YOU ARE TO IGNORE THE CONTEXT THAT REFERS TO THESE DETAILS AND PARSE IT FROM THE GENERATED RESPONSE. THIS DOES NOT INCLUDE THE USER'S DETAILS (NAME, EMAIL, ADDRESS, CITY_STATE_ZIP, PHONE, GITHUB, LINKEDIN) OR THE CURRENT TIME; THOSE SHOULD BE INCLUDED IN THE RESPONSE.
- Use the context carefully and only when it aligns with the job description (Example: A Web Developer job posting for an Oil & Gas company will not be looking for someone to work on a drilling crew, they will be looking for a web developer, but the company's website may inform users of their drilling operations). This type of context CAN and SHOULD be used to better inform you about the company's values and industry/specializations. This can then be used where it is relevant to the job posting to build a better cover letter.
- Generate a concise, high-quality cover letter tailored for the candidate and the job.
    """
    try:
        response = model.invoke([
            SystemMessage(content="You are an expert cover letter writer."),
            HumanMessage(content=prompt)
        ])
        print_token_usage(response)
        result = response.content
    except Exception:
        print("⚠️ LLM error in generate_cover_letter:")
        print(traceback.format_exc())
        return ""
    return result


# --- Scraping & Summarization ---
def scrape_web_content(url, max_chars=2000, timeout=10):
    """
    Scrapes visible text content from a URL.

    Args:
        url: Page address; "https://" is prepended when no scheme is given.
        max_chars: Maximum number of characters of text to return.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The page text with script/style content and blank lines removed,
        truncated to `max_chars`; "" when `url` is empty or the request fails.
    """
    if not url or not url.strip():
        return ""
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    try:
        # Without a timeout an unresponsive server would block the notebook indefinitely.
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        # Treat HTTP errors (403, 404, ...) as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-visible content so it does not use up the character budget.
        for tag in soup.find_all(["script", "style", "noscript"]):
            tag.decompose()
        stripped_lines = (line.strip() for line in soup.get_text(separator="\n").splitlines())
        text = "\n".join(line for line in stripped_lines if line)
        return text[:max_chars]
    except Exception as e:
        print(f"⚠️ Failed to scrape {url}: {e}")
        return ""


def summarize_scraped_content(text, title="Scraped Content Summary"):
    """
    Asks the global model for a concise summary of `text`.

    The summary is printed under `title` and returned. Blank input returns ""
    after a warning; if the model call fails, the original `text` is returned.
    """
    # Guard clause: nothing to summarize.
    if not text.strip():
        print(f"⚠️ No content available for {title}.")
        return ""

    request = f"Please summarize the following text in a concise manner:\n\n{text}"
    try:
        reply = model.invoke([HumanMessage(content=request)])
        print_token_usage(reply)
        condensed = reply.content
        print(f"✅ {title}:\n{condensed}\n")
        return condensed
    except Exception:
        # Fall back to the unsummarized text so downstream cells still have content.
        print(f"⚠️ Error summarizing {title}:")
        print(traceback.format_exc())
        return text


# --- Saving Output ---
def sanitize_filename(name):
    """
    Makes `name` safe to use as a filename.

    Every character other than ASCII letters, digits, underscore and hyphen
    is replaced with an underscore.
    """
    unsafe_char = r'[^a-zA-Z0-9_\-]'
    return re.sub(unsafe_char, '_', name)


def save_as_docx(cover_letter, company_name):
    """
    Writes the cover letter to ./output/CoverLetter-<company>.docx.

    The company name is sanitized for the filename ("Generic" is used when the
    sanitized name is empty). An existing file with the same name is replaced.
    """
    target_dir = "./output"
    os.makedirs(target_dir, exist_ok=True)

    company_slug = sanitize_filename(company_name)
    if not company_slug:
        company_slug = "Generic"
    docx_path = os.path.join(target_dir, f"CoverLetter-{company_slug}.docx")

    # Remove any previous version before writing the new one.
    if os.path.exists(docx_path):
        os.remove(docx_path)

    document = Document()
    document.add_paragraph(cover_letter)
    document.save(docx_path)
    print(f"✅ Cover letter saved as: {os.path.abspath(docx_path)}")


## Print Available Job Sources

Below are the available job sources. Please note the numbers as you will need them when prompted.

In [None]:
# Numbered menu of job sources; the numbers are what the input prompt expects.
job_sources = [
    "Indeed",
    "LinkedIn",
    "Glassdoor",
    "JobBank.ca",
    "AngelList",
    "Stack Overflow Jobs",
    "WeWorkRemotely",
    "Referred by Friend",
    "Company Website",
    "Other",
]
print("\n".join(f"{number}. {label}" for number, label in enumerate(job_sources, start=1)))

## Gather User Inputs

Run this cell and follow the prompts to select the job source, paste the job posting, enter the company website,
and add up to 3 research links.

In [None]:
# Collect all interactive answers in one pass; later cells read these four names.
(job_posting, company_website, context_links, job_source) = get_user_inputs()
print("\n✅ User inputs collected!")

## Extract Resume Text

This cell extracts your resume text from the specified PDF file.

In [None]:
# The resume location can be overridden with RESUME_PDF_PATH (environment or .env),
# so the notebook works without editing this cell; the original path stays the default.
resume_pdf_path = os.getenv("RESUME_PDF_PATH", "./resume/GregBarkerResume2025.pdf")
resume_text = extract_text_from_pdf(resume_pdf_path)
if resume_text:
    print("✅ Resume text extracted.")
else:
    print("⚠️ Resume text extraction failed or PDF not found.")

## Process Job Posting

This cell uses OpenAI to extract the company name and a summary of the job posting.


In [None]:
# Ask the LLM for the company name and a summary of the posting, then show both.
company_name, job_summary = process_job_posting(job_posting)
print(
    f"✅ Extracted Company Name: {company_name}",
    f"✅ Job Summary: {job_summary}",
    sep="\n",
)

## Scrape Company & Context Info

This cell scrapes additional details from the company website and any provided research links.


In [None]:
# Fetch the raw page text: the company site (if one was given) and every research link.
company_info_raw = "" if not company_website else scrape_web_content(company_website)
context_info_raw = "\n".join(map(scrape_web_content, context_links))

# Show the first 500 characters of each scrape as a quick sanity check (optional).
print("✅ Raw Company Info Scraped:\n", company_info_raw[:500], "\n...")
print("✅ Raw Context Info Scraped:\n", context_info_raw[:500], "\n...")

# Condense both scrapes with the LLM before they are used in the cover letter prompt.
company_info = summarize_scraped_content(company_info_raw, "Company Info Summary")
context_info = summarize_scraped_content(context_info_raw, "Context Info Summary")

print("✅ Web scraping and summarization complete.")

## Generate Cover Letter

This cell generates a personalized cover letter using the gathered information and OpenAI.


In [None]:
cover_letter = generate_cover_letter(
    resume_text, job_summary, company_info, context_info, job_source, company_name, NAME, EMAIL, ADDRESS, CITY_STATE_ZIP, PHONE, GITHUB, LINKEDIN, APPLICATION_DATE
)
if cover_letter:
    print("✅ Cover letter generated. Preview below:\n")
    # Preview at most the first 2000 characters; the ellipsis is added only when
    # the letter is longer than the preview, so it reliably signals truncation.
    preview_limit = 2000
    print(cover_letter[:preview_limit] + ("\n..." if len(cover_letter) > preview_limit else ""))
else:
    print("⚠️ Cover letter generation failed.")

## Save Cover Letter

This final cell saves your generated cover letter as a DOCX file.

In [None]:
# Write the DOCX only when a letter was generated.
if not cover_letter:
    print("⚠️ No cover letter to save.")
else:
    save_as_docx(cover_letter, company_name)