In [2]:
# === EDIT ME ONCE ===
# Candidate profile. build_context() copies these fields into the [PROFILE] section of the
# prompt context, and the "skills" list is also fuzzy-matched against the skills extracted
# from the job description.
USER_PROFILE = {
    "name": "Sheikh Abdul Munim",
    "title": "AI/ML & Robotics Engineer",
    "location": "Melbourne, Australia",
    # NOTE(review): email, phone and the LinkedIn link look like placeholders — replace
    # them before sending anything generated from this profile.
    "email": "you@example.com",
    "phone": "+61-4xx-xxx-xxx",
    "links": ["https://github.com/SheikhMunim", "https://www.linkedin.com/in/your-handle"],
    # Mixed hard and soft skills in one flat list.
    "skills": [
        "Python","PyTorch","TensorFlow","Jupyter","LangChain","RAG","Transformers","BERT","Llama 3","Ollama",
        "Vector databases","ChromaDB","Docker","FastAPI","ROS2","Gazebo","PDDL","Fast Downward","PlanSys2",
        "NLP","Topic modeling","Sentiment analysis","OpenCV","Communication","Stakeholder collaboration",
        "Technical writing","Teaching/mentoring"
    ],
    # Rendered as a bullet list in the prompt context.
    "achievements": [
        "Built a neural-symbolic pipeline combining BERT NLU with PDDL planning for service robots.",
        "90%+ task success across 8 scenarios in simulation; validated on a real robot.",
        "Multi-head intent parser with slot tagging for motion-level control in ROS2.",
        "Containerized a RAG job-assistant; reproducible local GPU inference via Ollama."
    ],
    # Two adjacent string literals concatenated into a single sentence.
    "pitch": (
        "I design reliable, explainable AI systems that combine strong language understanding with "
        "symbolic planning to produce safe, human-aligned behavior."
    ),
}

# Where your supporting docs live (resume, project notes, prior cover letters, etc.)
from pathlib import Path

# Move one level up from notebook folder (to project root).
# This is derived from the current working directory, so it assumes the kernel was started
# inside a direct sub-folder of the project root.
ROOT = Path.cwd().parent.resolve()

# Define main data directories (each is created on first run if missing)
DATA_DIR = ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

# Generated Markdown files are written here.
OUT_DIR = DATA_DIR / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Working folder for retrieval: holds the profile documents, jd.txt and the vector store.
RAG_DIR = DATA_DIR / "job_rag"
RAG_DIR.mkdir(parents=True, exist_ok=True)

# Put resume / project notes / prior cover letters (.pdf, .txt, .md) in this folder.
PROFILE_DIR = RAG_DIR / "profile_docs"
PROFILE_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is noticed immediately.
print("ROOT directory:       ", ROOT)
print("DATA directory:       ", DATA_DIR)
print("Outputs directory:    ", OUT_DIR)
print("RAG workspace:        ", RAG_DIR)
print("Profile documents in: ", PROFILE_DIR)



ROOT directory:        D:\+Job\rag-job-assistant
DATA directory:        D:\+Job\rag-job-assistant\data
Outputs directory:     D:\+Job\rag-job-assistant\data\outputs
RAG workspace:         D:\+Job\rag-job-assistant\data\job_rag
Profile documents in:  D:\+Job\rag-job-assistant\data\job_rag\profile_docs


In [3]:
import re, json
from collections import Counter
import subprocess, sys

def pip_install(pkg):
    """Install ``pkg`` with pip, using the interpreter that runs this kernel.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install", pkg]
    subprocess.check_call(command)

# NLTK + RapidFuzz
try:
    import nltk
except ImportError:
    pip_install("nltk==3.9.1"); import nltk
try:
    from rapidfuzz import process, fuzz
except ImportError:
    pip_install("rapidfuzz==3.9.7"); from rapidfuzz import process, fuzz

# Fetch the NLTK data used by the helpers below. Recent NLTK releases (including the 3.9.1
# pinned above) make word_tokenize load the "punkt_tab" resource instead of the older
# "punkt" models, so both are requested; asking an older NLTK for a resource it does not
# know is expected to fail quietly rather than raise.
for _resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource, quiet=True)
from nltk.corpus import stopwords
# English stopwords as a set for O(1) membership checks in tokenize_lower().
STOPWORDS = set(stopwords.words("english"))
from nltk.tokenize import word_tokenize

def normalize_text(text: str) -> str:
    """Flatten ``text`` onto one line.

    Line breaks become spaces, every run of whitespace collapses to a single space,
    and leading/trailing whitespace is removed.
    """
    single_line = re.sub(r"[\r\n]+", " ", text)
    collapsed = re.sub(r"\s+", " ", single_line)
    return collapsed.strip()

def tokenize_lower(text: str):
    """Split ``text`` into lowercase terms suitable for keyword counting.

    Every token produced by ``word_tokenize`` is lowercased and stripped of leading and
    trailing non-word characters. Empty results, stopwords and purely numeric tokens
    are discarded.

    Returns:
        List of the surviving terms, in their original order.
    """
    kept = []
    for raw in word_tokenize(text):
        term = re.sub(r"^\W+|\W+$", "", raw.lower())
        if not term or term in STOPWORDS or term.isdigit():
            continue
        kept.append(term)
    return kept

# Canonical names of technical skills/tools. Terms extracted from the job description are
# fuzzy-matched against this set, and matches are reported using the spelling given here.
HARD_SKILL_LEXICON = {
    "Python","PyTorch","TensorFlow","NumPy","Pandas","scikit-learn","Jupyter",
    "Transformers","BERT","Llama","RAG","LangChain","Ollama","OpenAI","Hugging Face",
    "Vector DB","Chroma","FAISS","Pinecone","Weaviate","Milvus",
    "Docker","FastAPI","Flask","REST API","GraphQL",
    "MLflow","Weights & Biases","W&B","Ray","Dask",
    "LLM","Prompt Engineering","Reranking","Guardrails","Retrieval","Chunking",
    "CI/CD","GCP","AWS","Azure","Kubernetes","GPU","CUDA",
    "ROS2","Gazebo","PDDL","Fast Downward","PlanSys2","OpenCV"
}
# Canonical names of interpersonal/soft skills, matched the same way as the hard skills.
SOFT_SKILL_LEXICON = {
    "Communication","Collaboration","Leadership","Problem solving","Stakeholder management",
    "Teamwork","Time management","Attention to detail","Documentation","Mentoring","Ownership"
}

def top_terms(tokens, topn=30, min_len=2):
    """Return the most frequent tokens, most frequent first.

    Args:
        tokens: iterable of string tokens.
        topn: maximum number of terms to return.
        min_len: tokens shorter than this many characters are ignored.

    Returns:
        List of at most ``topn`` distinct tokens ordered by descending count.
    """
    counts = Counter()
    for tok in tokens:
        if len(tok) >= min_len:
            counts[tok] += 1
    ranked = counts.most_common(topn)
    return [term for term, _freq in ranked]

def fuzzy_match_candidates(candidates, lexicon, cutoff=86):
    """Map free-text candidates onto canonical lexicon entries via fuzzy matching.

    Args:
        candidates: iterable of strings to look up (e.g. lowercased JD tokens).
        lexicon: collection of canonical names to match against.
        cutoff: minimum ``fuzz.WRatio`` score (0-100) for a match to count.

    Returns:
        Sorted list of the distinct lexicon entries (in their original spelling)
        that were the best match for at least one candidate.
    """
    found = set()
    for cand in candidates:
        # RapidFuzz 3.x does not preprocess strings by default, so compare
        # case-insensitively; otherwise a lowercased token such as "python" scores
        # below the cutoff against the lexicon entry "Python".
        best = process.extractOne(cand, lexicon, scorer=fuzz.WRatio, processor=str.lower)
        # extractOne returns None when there is nothing to match against.
        if best is not None and best[1] >= cutoff:
            found.add(best[0])
    return sorted(found)

def bullet_list(items):
    """Render ``items`` as a plain-text bullet list, one ``- item`` per line."""
    lines = []
    for entry in items:
        lines.append(f"- {entry}")
    return "\n".join(lines)


In [4]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
import torch, uuid

# On-disk location of the Chroma vector store (created if missing).
DB_DIR = RAG_DIR / "chroma_db"; DB_DIR.mkdir(exist_ok=True, parents=True)

# Run the embedding model on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the captured cell output shows a warning pointing at this line — possibly
# a deprecation notice for this embeddings class; confirm and migrate if so.
emb = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={"device": device})
# Splitter shared by profile and JD indexing; separators are tried from paragraph breaks
# down to the empty string.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150, separators=["\n\n","\n"," ",""])

def load_docs_from(folder: Path, doc_type: str):
    """Load every ``.pdf``, ``.txt`` and ``.md`` file located directly inside ``folder``.

    Entries are visited in sorted path order and anything with another suffix is skipped.
    Each loaded document's metadata receives:

    - ``source``: kept if the loader already set a truthy value, else the folder path;
    - ``doc_type``: the value passed in;
    - ``uid``: the first 8 characters of a random UUID4.

    Returns:
        List of loaded documents (empty if nothing matched).
    """
    text_suffixes = {".txt", ".md"}
    loaded = []
    for path in sorted(folder.glob("*")):
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            loaded.extend(PyPDFLoader(str(path)).load())
        elif suffix in text_suffixes:
            loaded.extend(TextLoader(str(path), encoding="utf-8").load())
    for doc in loaded:
        meta = doc.metadata
        meta["source"] = meta.get("source") or str(folder)
        meta["doc_type"] = doc_type
        meta["uid"] = str(uuid.uuid4())[:8]
    return loaded

# Open (or create) the Chroma store kept under DB_DIR so it can be reused across runs.
# NOTE(review): because persist_directory is set, the store presumably still contains
# whatever earlier runs added rather than starting empty — confirm.
vectordb = Chroma(embedding_function=emb, persist_directory=str(DB_DIR))
# Retriever returning 6 results per query; not referenced by any other visible cell.
retriever_all = vectordb.as_retriever(search_kwargs={"k": 6})

def index_profile_docs():
    """(Re)index the documents in ``PROFILE_DIR`` with ``doc_type="profile"``.

    The vector store is persisted on disk, so chunks indexed by earlier calls are
    removed before the current ones are added. Without that step every call appends
    another copy of each chunk and retrieval fills up with duplicates.

    Returns:
        Number of profile chunks added by this call.
    """
    docs = load_docs_from(PROFILE_DIR, "profile")
    chunks = splitter.split_documents(docs)
    # Drop profile chunks left over from previous calls so the operation is idempotent.
    stale_ids = vectordb.get(where={"doc_type": "profile"})["ids"]
    if stale_ids:
        vectordb.delete(ids=stale_ids)
    if chunks:
        vectordb.add_documents(chunks)
    return len(chunks)

def index_jd_text(jd_text: str):
    """Index ``jd_text`` as the single current job description (``doc_type="jd"``).

    The text is written to ``RAG_DIR / "jd.txt"`` and read back through
    ``load_docs_from`` so it gets the same metadata as every other document. Chunks of
    any previously indexed job description are deleted first, so retrieval with
    ``doc_type="jd"`` only ever sees the JD passed to the latest call.

    Returns:
        Number of JD chunks added by this call.
    """
    jd_path = RAG_DIR / "jd.txt"
    jd_path.write_text(jd_text, encoding="utf-8")
    # load_docs_from scans the whole folder; keep only the file just written. Compare the
    # exact file name so that e.g. "old_jd.txt" in the same folder is not picked up.
    docs = [
        d for d in load_docs_from(RAG_DIR, "jd")
        if Path(d.metadata.get("source", "")).name == jd_path.name
    ]
    chunks = splitter.split_documents(docs)
    # Remove the chunks of earlier job descriptions from the persistent store.
    stale_ids = vectordb.get(where={"doc_type": "jd"})["ids"]
    if stale_ids:
        vectordb.delete(ids=stale_ids)
    if chunks:
        vectordb.add_documents(chunks)
    return len(chunks)

def retrieve(query: str, k=6, doc_type=None):
    """Return up to ``k`` chunks most similar to ``query``.

    Args:
        query: natural-language search text.
        k: maximum number of chunks to return.
        doc_type: when given, only chunks whose ``doc_type`` metadata equals this value
            are considered.

    The ``doc_type`` restriction is handed to the vector store as a metadata filter, so
    the result is the best ``k`` chunks *of that type*. Filtering a fixed-size candidate
    list afterwards can return fewer than ``k`` chunks (or none) whenever other document
    types dominate the nearest neighbours.
    """
    if doc_type:
        return vectordb.similarity_search(query, k=k, filter={"doc_type": doc_type})
    return vectordb.similarity_search(query, k=k)

def format_docs(docs):
    """Join the documents' text into one string, each prefixed with its ``[n]`` index.

    Numbering starts at 1, the same numbering ``cite_sources`` uses, so a ``[n]`` marker
    in the formatted text and in the source list refer to the same document. Entries are
    separated by a blank line.
    """
    # enumerate already starts at 1; adding 1 again would label the first snippet "[2]".
    return "\n\n".join(f"[{i}] {d.page_content}" for i, d in enumerate(docs, 1))

def cite_sources(docs):
    """List each document's ``source`` metadata, one ``[n] source`` line per document.

    Numbering starts at 1; a document without a ``source`` entry yields an empty source.
    """
    return "\n".join(
        f"[{idx}] {doc.metadata.get('source','')}"
        for idx, doc in enumerate(docs, start=1)
    )

# LLM connector
import os
# Server URL and model tag come from the environment, with local defaults as fallback.
OLLAMA_HOST = os.getenv("OLLAMA_HOST","http://localhost:11434")
LLM_MODEL   = os.getenv("LLM_MODEL","llama3.2:3b")
# Single chat-model client shared by every generation helper.
llm = ChatOllama(base_url=OLLAMA_HOST, model=LLM_MODEL, temperature=0.3)
# Final stage of the chain built in run_prompt().
parser = StrOutputParser()

def run_prompt(system_prompt: str, user_text: str):
    """Send one system + user message pair to the LLM and return the parsed reply.

    Args:
        system_prompt: instructions for the model, used verbatim.
        user_text: content of the user message, used verbatim.

    Both texts are supplied as template *variables* rather than being embedded in the
    template itself, so literal ``{`` / ``}`` characters in a system prompt (for example
    a JSON output example) are not parsed as placeholders.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", "{system}"),
        ("user", "{input}")
    ])
    chain = prompt | llm | parser
    return chain.invoke({"system": system_prompt, "input": user_text})

print("RAG ready | device:", device, "| DB:", DB_DIR)


  emb = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={"device": device})
  from tqdm.autonotebook import tqdm, trange
  vectordb = Chroma(embedding_function=emb, persist_directory=str(DB_DIR))
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


RAG ready | device: cuda | DB: D:\+Job\rag-job-assistant\data\job_rag\chroma_db


In [5]:
def build_context(profile: dict, jd_text: str, jd_snips: str, prof_snips: str,
                  jd_hard, jd_soft, keywords, have_hard, have_soft, gaps):
    """Assemble the bracket-sectioned context string handed to every generation prompt.

    Args:
        profile: dict with ``name``, ``title``, ``location``, ``email``, ``phone``,
            ``links``, ``pitch``, ``skills`` and ``achievements`` entries.
        jd_text: raw job-description text.
        jd_snips: pre-formatted retrieved job-description snippets.
        prof_snips: pre-formatted retrieved profile snippets.
        jd_hard, jd_soft: hard/soft skill names extracted from the job description.
        keywords: additional JD keywords; only the first 30 are included.
        have_hard, have_soft: extracted skills the candidate already lists.
        gaps: extracted skills the candidate does not list.

    Returns:
        Multi-line string with [PROFILE], [JOB_DESCRIPTION_RAW], [RETRIEVED_JD_SNIPPETS],
        [RETRIEVED_PROFILE_SNIPPETS], [EXTRACTED_FROM_JD] and [ALIGNMENT_SUMMARY] sections.
    """
    # Pre-render every list-valued field so the template below only interpolates strings.
    links_line = ", ".join(profile['links'])
    skills_block = bullet_list(profile['skills'])
    achievements_block = bullet_list(profile['achievements'])
    hard_line = ", ".join(jd_hard)
    soft_line = ", ".join(jd_soft)
    keyword_line = ", ".join(keywords[:30])
    have_hard_line = ", ".join(have_hard)
    have_soft_line = ", ".join(have_soft)
    gaps_line = ", ".join(gaps)
    return f"""
[PROFILE]
Name: {profile['name']} | Title: {profile['title']} | Location: {profile['location']}
Email: {profile['email']} | Phone: {profile['phone']} | Links: {links_line}

Pitch:
{profile['pitch']}

Skills:
{skills_block}

Achievements:
{achievements_block}

[JOB_DESCRIPTION_RAW]
{jd_text}

[RETRIEVED_JD_SNIPPETS]
{jd_snips}

[RETRIEVED_PROFILE_SNIPPETS]
{prof_snips}

[EXTRACTED_FROM_JD]
Hard skills: {hard_line}
Soft skills: {soft_line}
Extra keywords: {keyword_line}

[ALIGNMENT_SUMMARY]
You already have (hard): {have_hard_line}
You already have (soft): {have_soft_line}
Gaps to phrase carefully: {gaps_line}
"""

# System prompts, one per generated artefact. Each is paired with the context string from
# build_context(); the bracketed names ([PROFILE], [RETRIEVED_*], ...) refer to the
# section headers of that context.

# Skills and ATS keyword extraction.
SYSTEM_SKILLS = """
You are a job-application copilot. From the context:
1) HARD skills explicitly relevant to the JD and present in candidate/profile.
2) SOFT skills tailored to the JD.
3) 15–25 SEO keywords for CV/ATS.
Rules:
- Ground items in [RETRIEVED_*] where possible. No fabrications.
- Use canonical names. Output as three sections with bullet lists.
"""

# Cover letter.
SYSTEM_COVER = """
You are an expert cover-letter writer. Using the context:
- Ground claims in [RETRIEVED_JD_SNIPPETS] and [RETRIEVED_PROFILE_SNIPPETS].
- <=350 words, 3–5 short paragraphs + a 'Relevant Highlights' bullet list (3–5).
- Quote exact JD terms where helpful. No invented experience.
- Confident and specific; clear call-to-action.
"""

# Application, recruiter-outreach and follow-up emails in a fixed "=== Email N ===" layout.
SYSTEM_EMAILS = """
Write three short emails tailored to the JD and candidate:
1) Application email (80–140 words) + 2–3 subject options.
2) Cold recruiter outreach (40–80 words) + 2–3 subject options.
3) Follow-up after 7–10 days (50–90 words) + 2–3 subject options.
Ground skills in [RETRIEVED_*]. No exaggeration. Clean signature from [PROFILE].
Format:
=== Email 1 ===
Subject: ...
Body:
...
=== Email 2 ===
...
=== Email 3 ===
...
"""

# Short resume summary for applicant-tracking systems.
SYSTEM_ATS = """
Create a compact ATS-friendly resume summary:
- 3 bullets (outcomes-focused) aligned to JD.
- One 'Core Stack' line (comma-separated tools).
Keep to 80–120 words. Ground in [RETRIEVED_*]; no fabrications.
"""

def gen_skills(context):
    """Generate the skills and ATS-keyword sections for ``context``."""
    return run_prompt(SYSTEM_SKILLS, context)


def gen_cover(context):
    """Generate the cover letter for ``context``."""
    return run_prompt(SYSTEM_COVER, context)


def gen_emails(context):
    """Generate the three application emails for ``context``."""
    return run_prompt(SYSTEM_EMAILS, context)


def gen_ats(context):
    """Generate the ATS resume summary for ``context``."""
    return run_prompt(SYSTEM_ATS, context)


In [6]:
from datetime import datetime

# 1) paste JD text here and run this cell only:
# (the text between the triple quotes is a placeholder until it is replaced)
JD_INPUT = """
PASTE THE FULL JOB DESCRIPTION HERE.
"""

# 2) index JD (fresh) and ensure your profile docs are indexed once
# Both helpers return a chunk count, which is discarded here.
_ = index_profile_docs()
_ = index_jd_text(JD_INPUT)

# 3) retrieve focused snippets
# One query per document type; the formatted strings feed the [RETRIEVED_*] sections
# of the prompt context.
jd_focus     = retrieve("List must-have requirements and responsibilities.", k=6, doc_type="jd")
profile_focus= retrieve("Find bullets that prove impact, results, metrics.", k=6, doc_type="profile")
RAG_JD       = format_docs(jd_focus)
RAG_PROFILE  = format_docs(profile_focus)

# 4) keyword/skill extraction from JD text
# Pass 1: the 80 most frequent lowercased tokens, fuzzy-matched against both lexicons.
JD_N = normalize_text(JD_INPUT)
TOKS = tokenize_lower(JD_N)
cands = top_terms(TOKS, topn=80, min_len=2)
jd_hard = fuzzy_match_candidates(cands, HARD_SKILL_LEXICON, cutoff=86)
jd_soft = fuzzy_match_candidates(cands, SOFT_SKILL_LEXICON, cutoff=86)

# Pass 2: capitalised words from the raw JD (likely tool/product names) are lowercased and
# matched against a lowercased copy of the hard-skill lexicon with a stricter cutoff; hits
# are mapped back to their canonical spelling and merged into jd_hard.
caps = sorted(set(re.findall(r"\b([A-Z][a-zA-Z0-9\-\+&/]{1,})\b", JD_INPUT)))
extra = fuzzy_match_candidates([c.lower() for c in caps], {s.lower():s for s in HARD_SKILL_LEXICON}.keys(), cutoff=90)
extra_cased = [next((x for x in HARD_SKILL_LEXICON if x.lower()==e), e) for e in extra]
jd_hard = sorted(set(jd_hard) | set(extra_cased))

# Remaining frequent terms (3+ characters) that are not already recognised skills.
known = {t.lower() for t in jd_hard + jd_soft}
keywords = [t for t in cands if t not in known and len(t) >= 3]

# 5) alignment summary
from rapidfuzz import process, fuzz
def fuzzy_overlap(yours, theirs, cutoff=88):
    """Pair each of ``yours`` with its best fuzzy match in ``theirs``.

    Args:
        yours: iterable of your own skill names.
        theirs: collection of skill names to match against.
        cutoff: minimum ``fuzz.WRatio`` score for a pair to be kept.

    Returns:
        List of ``(your_item, matched_item, score)`` tuples, in the order of ``yours``,
        for every item whose best match reaches ``cutoff``.
    """
    scored = (
        (item, process.extractOne(item, theirs, scorer=fuzz.WRatio))
        for item in yours
    )
    return [
        (item, hit[0], hit[1])
        for item, hit in scored
        if hit and hit[1] >= cutoff
    ]

# JD skills that fuzzy-match something in the profile's skill list (m[1] is the JD-side name).
have_hard = sorted({m[1] for m in fuzzy_overlap(USER_PROFILE["skills"], jd_hard, 88)})
have_soft = sorted({m[1] for m in fuzzy_overlap(USER_PROFILE["skills"], jd_soft, 88)})
# JD skills with no match in the profile.
gaps = [s for s in jd_hard+jd_soft if s not in set(have_hard+have_soft)]

# 6) build context and generate all outputs
CTX = build_context(USER_PROFILE, JD_INPUT, RAG_JD, RAG_PROFILE, jd_hard, jd_soft, keywords, have_hard, have_soft, gaps)

# Four separate LLM calls over the same context, one per artefact.
skills_out = gen_skills(CTX)
cover_out  = gen_cover(CTX)
emails_out = gen_emails(CTX)
ats_out    = gen_ats(CTX)

# 7) show and save
# A shared timestamp prefix keeps the four files of one run together in OUT_DIR.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
(OUT_DIR / f"{ts}_skills_keywords.md").write_text(skills_out, encoding="utf-8")
(OUT_DIR / f"{ts}_cover_letter.md").write_text(cover_out, encoding="utf-8")
(OUT_DIR / f"{ts}_emails.md").write_text(emails_out, encoding="utf-8")
(OUT_DIR / f"{ts}_ats_summary.md").write_text(ats_out, encoding="utf-8")

print("=== SKILLS & KEYWORDS ===\n", skills_out, "\n")
print("=== COVER LETTER ===\n", cover_out, "\n")
print("=== EMAILS ===\n", emails_out, "\n")
print("=== ATS SUMMARY ===\n", ats_out, "\n")
print("Saved files to:", OUT_DIR)


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


=== SKILLS & KEYWORDS ===
 **Hard Skills**

* Python
* PyTorch
* TensorFlow
* Jupyter
* LangChain
* RAG
* Transformers
* BERT
* Llama 3
* Ollama
* Vector databases
* ChromaDB
* Docker
* FastAPI
* ROS2
* Gazebo
* PDDL
* Fast Downward
* PlanSys2
* NLP
* Topic modeling
* Sentiment analysis
* OpenCV

**Soft Skills**

* Communication
* Stakeholder collaboration
* Technical writing
* Teaching/mentoring

**15–25 SEO Keywords for CV/ATS**

* Machine learning engineer
* Computer vision scientist
* Generative AI models
* Deep learning frameworks
* Python programming
* ROS2 development
* Docker containerization
* Cloud computing resources
* Amazon data analysis
* Content automation
* Business stakeholder collaboration
* Research and development
* Novel machine learning techniques
* Tier-1 CV/ML conferences 

=== COVER LETTER ===
 Here's a cover letter tailored to the job description:

Dear Hiring Manager,

I am thrilled to apply for the Computer Vision Scientist position at Amazon Australia, wher