In [61]:
import os
from pathlib import Path

import openai
from dotenv import load_dotenv

load_dotenv()  # load variables from .env into the environment

# Read runtime settings from the environment. The folder settings fall back to
# the project's conventional relative folder names, because Path(None) raises
# TypeError when a variable is missing from .env.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    print("⚠️ OPENAI_API_KEY is not set; OpenAI calls will fail until it is provided.")

input_folder = Path(os.getenv("INPUT_FOLDER", "InputResume_Folder"))
output_folder = Path(os.getenv("OUTPUT_FOLDER", "OutputResume_Folder"))
csv_output_folder = Path(os.getenv("CSV_FOLDER", "Resume_Parsed_CSVs"))


In [2]:
import os
from langchain import LLMChain, PromptTemplate
from langchain.docstore.document import Document
from langchain.llms import OpenAI
import google.generativeai as genai
from PyPDF2 import PdfReader

In [3]:
# Prefer a key supplied through the environment over editing the notebook;
# the placeholder is kept only as a fallback so the cell behaves as before
# when GOOGLE_API_KEY is not set.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "WRITE_YOUR_API_KEY"))

In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        pdf_path: Path (str or path-like) of the PDF to read.

    Returns:
        str: The text of all pages joined in page order; an empty string
        when no page yields any text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # A page without extractable text may yield None/empty; treat it as ""
        # instead of letting string concatenation fail. Joining once also
        # avoids repeated string re-allocation on long documents.
        return "".join(page.extract_text() or "" for page in reader.pages)

In [5]:
import tiktoken

def chunk_text(text, model="gpt-3.5-turbo", max_tokens=3000):
    """Split ``text`` into pieces of at most ``max_tokens`` tokens each.

    The text is encoded with the tiktoken encoding registered for ``model``,
    the token list is cut into consecutive windows of ``max_tokens`` tokens,
    and every window is decoded back to a string.

    Returns:
        list[str]: The decoded chunks in order; empty when ``text`` encodes
        to no tokens.
    """
    encoder = tiktoken.encoding_for_model(model)
    token_ids = encoder.encode(text)

    return [
        encoder.decode(token_ids[start : start + max_tokens])
        for start in range(0, len(token_ids), max_tokens)
    ]


In [6]:
def summarize_text(text,jd):
    """Evaluate resume text against a job description, chunk by chunk.

    The resume text is split with ``chunk_text`` and every chunk is sent to
    the LLM together with the job description; the per-chunk replies are
    joined with a single space.

    Args:
        text: Full resume text.
        jd: Job description text the resume is evaluated against.

    Returns:
        str: The LLM replies for all chunks joined by spaces; an empty string
        when the text produces no chunks.
    """
    # The template and the LLM client do not depend on the chunk, so build
    # them once instead of on every iteration.
    prompt_template = PromptTemplate(
        input_variables=["text","jd"],
        template="""
            Hey, act like a highly skilled ATS (Applicant Tracking System) with deep expertise in software engineering, 
            data science, and AI hiring. Evaluate the resume against the provided job description. The job market is 
            competitive, so provide the most insightful recommendations for improvement. 
    
            Assign a **percentage match** between the resume and the JD, categorize missing keywords, 
            and highlight key strengths and potential concerns.
    
            Resume: {text}
            Job Description: {jd}
    
            I want the response strictly in **JSON format** with the following structure:
    
            {{
                "Name": "<Candidate Name>",
                "JD Match": "<% Match>",
                "Missing Keywords": {{
                    "Technical Skills": [],
                    "Tools & Technologies": [],
                    "Concepts & Methodologies": []
                }},
                "Profile Summary": "<Brief summary of the candidate if it is related to jd> ",
                "Projects": [
                    {{
                        "Project Name": "<Project Title>",
                        "Relevance to JD": "<High/Medium/Low>",
                        "Technologies Used": [],
                        "Impact": "<Brief description of project outcomes>"
                    }}
                ],
                "Certifications & Courses": [
                    "<Relevant Certifications or Courses>"
                ],
                "Skills That Will Contribute to the Company": [],
                "Soft Skills & Leadership Qualities": [
                    "<Communication, Leadership, Problem-Solving, Teamwork, etc.>"
                ],
                "Industry Experience": "<Relevant experience in specific industries like Finance, Healthcare, etc.>",
                "Culture Fit Assessment": "<High/Medium/Low> – Explanation of adaptability to company values",
                "Potential Concerns": ["Any gaps, missing skills, or weaknesses"],
                "Red Flags & Risk Analysis": ["Any major issues that could impact hiring decision"],
                "Candidate’s Growth Potential": "<How much they can grow in the company>",
                "Effort Needed by the Company": "<Low/Medium/High – Explanation>"
            }}
            """

    )
    llm = OpenAI(temperature=0.7, max_tokens=1000,model="gpt-3.5-turbo")  # Reduce max_tokens for output

    summaries = []
    # The LLM call must happen inside the loop: previously it ran once after
    # the loop, so only the last chunk was summarized and an empty chunk list
    # raised a NameError.
    for chunk in chunk_text(text, max_tokens=3000):
        prompt = prompt_template.format(text=chunk, jd=jd)
        summaries.append(llm(prompt))

    return " ".join(summaries)

In [7]:
# Job description text for a "Machine Learning Developer" role, written as
# plain text with Markdown-style emphasis and trailing-space line breaks.
# NOTE(review): summarize_pdf carries its own copy of this text, so this
# module-level value does not appear to be read by it; confirm before editing
# only one of the two copies.
jd = """
Machine Learning Developer Job Description

Job Title: Machine Learning Developer  
Location: [Your Location or Remote]  
Job Type: Full-time  

About the Role:  
We are seeking a **Machine Learning Developer** to design, develop, and deploy intelligent models that enhance our products and services. You will work on cutting-edge projects involving **data processing, model training, and deployment** to solve real-world problems.  

Responsibilities:  
- Develop, train, and optimize machine learning models for various applications.  
- Process, clean, and analyze large datasets to extract meaningful insights.  
- Design and implement scalable ML pipelines and deploy models in production.  
- Collaborate with data scientists, engineers, and product teams to integrate ML solutions.  
- Monitor and improve model performance using evaluation metrics.  
- Stay updated with the latest advancements in ML, AI, and deep learning.  

Required Skills & Qualifications:  
- Bachelor's/Master’s degree in Computer Science, Data Science, or a related field.  
- **Strong programming skills** in Python (NumPy, Pandas, Scikit-learn, TensorFlow, PyTorch).  
- **Experience with ML frameworks** like TensorFlow, PyTorch, or Keras.  
- **Understanding of algorithms** like regression, classification, clustering, and deep learning.  
- Experience with **data preprocessing and feature engineering**.  
- Knowledge of **cloud platforms** (AWS, GCP, Azure) for model deployment.  
- Experience with **APIs, Docker, Kubernetes** is a plus.  

Preferred Qualifications:  
- Experience with **NLP, computer vision, or reinforcement learning**.  
- Knowledge of **big data technologies** like Spark, Hadoop.  
- Experience with **MLOps, CI/CD pipelines for ML models**.  
"""

In [8]:
def summarize_pdf(pdf_path, jd=None):
    """Extract the text of one resume PDF and evaluate it against a job description.

    Args:
        pdf_path: Path of the resume PDF.
        jd: Job description text to evaluate against. When omitted, the
            built-in "Machine Learning Developer" description below is used,
            which is the behaviour this function had before the parameter
            existed.

    Returns:
        Whatever ``summarize_text`` returns for the extracted text and ``jd``.
    """
    if jd is None:
        # Built-in default job description, kept verbatim.
        jd = """
Machine Learning Developer Job Description

Job Title: Machine Learning Developer  
Location: [Your Location or Remote]  
Job Type: Full-time  

About the Role:  
We are seeking a **Machine Learning Developer** to design, develop, and deploy intelligent models that enhance our products and services. You will work on cutting-edge projects involving **data processing, model training, and deployment** to solve real-world problems.  

Responsibilities:  
- Develop, train, and optimize machine learning models for various applications.  
- Process, clean, and analyze large datasets to extract meaningful insights.  
- Design and implement scalable ML pipelines and deploy models in production.  
- Collaborate with data scientists, engineers, and product teams to integrate ML solutions.  
- Monitor and improve model performance using evaluation metrics.  
- Stay updated with the latest advancements in ML, AI, and deep learning.  

Required Skills & Qualifications:  
- Bachelor's/Master’s degree in Computer Science, Data Science, or a related field.  
- **Strong programming skills** in Python (NumPy, Pandas, Scikit-learn, TensorFlow, PyTorch).  
- **Experience with ML frameworks** like TensorFlow, PyTorch, or Keras.  
- **Understanding of algorithms** like regression, classification, clustering, and deep learning.  
- Experience with **data preprocessing and feature engineering**.  
- Knowledge of **cloud platforms** (AWS, GCP, Azure) for model deployment.  
- Experience with **APIs, Docker, Kubernetes** is a plus.  

Preferred Qualifications:  
- Experience with **NLP, computer vision, or reinforcement learning**.  
- Knowledge of **big data technologies** like Spark, Hadoop.  
- Experience with **MLOps, CI/CD pipelines for ML models**.  
"""

    text = extract_text_from_pdf(pdf_path)
    return summarize_text(text, jd)

In [9]:
from pathlib import Path

def summarize_multiple_pdfs(pdf_paths):
    """Summarize each PDF in ``pdf_paths`` and return one labelled string per file.

    Every entry is announced on stdout, passed to ``summarize_pdf`` and
    rendered as ``"Summary for <file name>:\\n<summary>\\n"``. Entries may be
    strings or path objects; results keep the input order.
    """

    def _labelled_summary(raw_path):
        # Normalise to Path so .name works for plain strings as well.
        resume_file = Path(raw_path)
        print(f"Summarizing: {resume_file.name}")
        resume_summary = summarize_pdf(resume_file)
        return f"Summary for {resume_file.name}:\n{resume_summary}\n"

    return [_labelled_summary(entry) for entry in pdf_paths]


In [10]:
#from pathlib import Path
#import shutil

# Paths
#input_folder = Path(r'C:\Users\adity\BE Project\Final\InputResume_Folder')
#output_folder = Path(r'C:\Users\adity\BE Project\Final\OutputResume_Folder')

# Ensure output folder exists
#output_folder.mkdir(parents=True, exist_ok=True)

# Get all PDFs
#pdf_paths = list(input_folder.glob('*.pdf'))

# Process each one
#for pdf_path in pdf_paths:
 #   filename = pdf_path.name  # Just the file name, like "resume01.pdf"
  #  print(f"Processing: {filename}")
    
   # summaries = summarize_multiple_pdfs([str(pdf_path)])
    #for summary in summaries:
     #   print(summary)
    
    # Move the file
    #shutil.move(str(pdf_path), str(output_folder / filename))
    #print(f"Moved {filename} to OutputResume_Folder\n")


In [46]:
from datetime import datetime
from datetime import datetime
from pathlib import Path

# === Log Folder and File Path ===
# Defined before anything writes to the log, so the cell also works on a
# fresh kernel (the header write used to run before log_path existed).
log_folder = Path("logs")
log_folder.mkdir(parents=True, exist_ok=True)

timestamp_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_path = log_folder / f"log_{timestamp_str}.txt"


def timestamp():
    """Return the current local time formatted as ``[YYYY-MM-DD HH:MM:SS]``."""
    return datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")


def write_log(message, status=None, jd_match=None, error=None, details=None, final=False):
    """Append one timestamped line to the run's log file.

    This single definition replaces two same-named functions of which the
    later one silently shadowed the earlier; both call styles are supported.

    Args:
        message: With no ``status``, the text written verbatim. With a
            ``status``, the resume file name the event refers to.
        status: Optional event kind: ``"START"``, ``"DETAILS"``, ``"END"``,
            ``"FAILED"`` or ``"FINAL"``. Any other value writes nothing
            unless ``final`` is true.
        jd_match: Accepted for backward compatibility; not written.
        error: Error text included in the ``"FAILED"`` line.
        details: Summary text included in the ``"DETAILS"`` line.
        final: When true (and no earlier status matched), write the
            end-of-run line.
    """
    if status is None and not final:
        entry = f"{message}"
    elif status == "START":
        entry = f"Parsing Started: {message}"
    elif status == "DETAILS":
        entry = f"Resume Summary: {details}"
    elif status == "END":
        entry = f"Parsing Completed: {message}"
    elif status == "FAILED":
        entry = f"Failed to Parse: {message} | Error: {error}"
    elif status == "FINAL" or final:
        entry = "All Resumes Processed — Parsing Ended"
    else:
        return  # unrecognised status: nothing to record

    # Explicit UTF-8 so non-ASCII log text does not depend on the platform's
    # default encoding.
    with open(log_path, "a", encoding="utf-8") as log_file:
        log_file.write(f"{timestamp()} {entry}\n")


# === Start of the log ===
with open(log_path, "w", encoding="utf-8") as f:
    f.write(f"{timestamp()} BE Project — Resume Parsing Started\n")


In [65]:
import os
import shutil
from pathlib import Path
import json
import pandas as pd
from PyPDF2 import PdfReader
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

# === Configuration ===
jd_folder = Path("JD_Folder")
import openai

# Take the key and folders from the environment (e.g. loaded from .env) rather
# than overriding them with literals; each literal is kept as the fallback so
# the cell behaves as before when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY", "your_openai_api_key_here")

input_folder = Path(os.getenv("INPUT_FOLDER", "InputResume_Folder"))
output_folder = Path(os.getenv("OUTPUT_FOLDER", "OutputResume_Folder"))
csv_output_folder = Path(os.getenv("CSV_FOLDER", "Resume_Parsed_CSVs"))
csv_path = csv_output_folder / "resume_summary.csv"

# Make sure the destination folders exist before anything is moved or saved.
output_folder.mkdir(parents=True, exist_ok=True)
csv_output_folder.mkdir(parents=True, exist_ok=True)

# === PDF Text Extraction ===
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at ``pdf_path``.

    Pages that yield no text (``None`` or an empty string) are skipped.

    Args:
        pdf_path: Path of the PDF; converted with ``str()`` before opening.

    Returns:
        str: Page texts joined in page order, or ``""`` if nothing was
        extracted.
    """
    reader = PdfReader(str(pdf_path))
    # Extract each page exactly once; the previous version ran the extraction
    # twice per page (once for the truthiness check, once for the value).
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(page_text for page_text in page_texts if page_text)

# === Text Chunking for API limit ===
def chunk_text(text, max_tokens=3000):
    """Split ``text`` on sentence boundaries into chunks for separate LLM calls.

    Sentences are delimited by ``". "`` and packed greedily: a sentence joins
    the current chunk while the chunk length plus the sentence length stays
    below ``max_tokens``; otherwise the chunk is closed and a new one starts.

    Note:
        Despite its name, ``max_tokens`` is compared against *character*
        counts (``len``), not model tokens. A single sentence longer than the
        limit is kept whole in its own chunk.

    Args:
        text: Text to split. ``None`` or empty text yields no chunks.
        max_tokens: Size limit per chunk, in characters.

    Returns:
        list[str]: Non-empty, whitespace-stripped chunks in original order.
    """
    if not text:
        return []

    separator = ". "
    sentences = text.split(separator)
    last_index = len(sentences) - 1

    chunks = []
    current_chunk = ""
    for index, sentence in enumerate(sentences):
        # Close the running chunk when this sentence would not fit. Requiring
        # a non-empty chunk avoids emitting "" when the very first sentence
        # is already over the limit.
        if current_chunk and len(current_chunk) + len(sentence) >= max_tokens:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        # Restore the separator that split() removed, except after the final
        # piece, which never had one (this used to add a spurious ".").
        current_chunk += sentence if index == last_index else sentence + separator

    final_chunk = current_chunk.strip()
    if final_chunk:
        chunks.append(final_chunk)
    return chunks


# === Summarization with OpenAI ===
def _parse_llm_json(raw_response):
    """Parse an LLM reply into a dict; return None when no JSON object can be read."""
    candidate = raw_response.strip()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Replies are sometimes wrapped in code fences or prose; retry on the
        # outermost {...} span.
        start, end = candidate.find("{"), candidate.rfind("}")
        if start == -1 or end <= start:
            return None
        try:
            parsed = json.loads(candidate[start:end + 1])
        except json.JSONDecodeError:
            return None
    # Callers add keys to the result, so anything but an object is unusable.
    return parsed if isinstance(parsed, dict) else None


def summarize_pdf(pdf_path,jd):
    """Evaluate one resume PDF against a job description and return structured data.

    The resume text is split into chunks, each chunk is summarized by the LLM
    in relation to ``jd``, and a second LLM call merges the partial summaries
    into a single JSON document following the schema in the merge prompt.

    Args:
        pdf_path: Path of the resume PDF.
        jd: Job description text.

    Returns:
        dict: The parsed JSON object produced by the model, or ``{}`` when the
        PDF has no extractable text or the reply cannot be parsed as a JSON
        object.
    """
    resume_name = Path(pdf_path).name
    resume_text = extract_text_from_pdf(pdf_path)
    if not resume_text.strip():
        # Nothing to evaluate (e.g. an image-only PDF): skip the LLM calls.
        print(f"⚠️ No extractable text in {resume_name}")
        return {}

    # Template and client are identical for every chunk; build them once.
    prompt_template = PromptTemplate(
        input_variables=["text", "jd"],
        template="""
You are an AI assistant. Summarize the relevant resume details below in relation to the following job description.

Resume Chunk:
\"\"\"
{text}
\"\"\"

Job Description:
\"\"\"
{jd}
\"\"\"

Extract the key points in concise bullet form, JSON-style if possible. Do NOT return full JSON yet.
"""
    )
    chunk_llm = ChatOpenAI(temperature=0.7, max_tokens=1000, model="gpt-3.5-turbo")

    summaries = [
        chunk_llm.predict(prompt_template.format(text=chunk, jd=jd))
        for chunk in chunk_text(resume_text)
    ]
    # Keep partial summaries visibly separated; joining with "" glued the end
    # of one summary to the start of the next.
    combined_summaries = "\n\n".join(summaries)

    # === Final prompt: Ask GPT to merge partial summaries into final JSON ===
    # The schema example has no trailing comma after the last key, so the
    # example itself is valid JSON.
    merge_prompt = f"""
You are an expert resume evaluator.

Below are summaries of different chunks of a resume, all based on the same candidate:

\"\"\"
{combined_summaries}
\"\"\"

Job Description:
\"\"\"
{jd}
\"\"\"

Using these summaries, return a clean and complete JSON output with this structure:

{{
    "Name": "<Candidate Name>",
    "JD Match": "<% Match>",
    "Missing Keywords": {{
        "Technical Skills": [],
        "Tools & Technologies": [],
        "Concepts & Methodologies": []
    }},
    "Top Matching Keywords": [],
    "Profile Summary": "<Brief summary related to JD>",
    "Projects": [
        {{
            "Project Name": "<Title>",
            "Relevance to JD": "<High/Medium/Low>",
            "Technologies Used": [],
            "Impact": "<Project outcomes>"
        }}
    ],
    "Certifications & Courses": ["<Relevant Certifications>"],
    "Skills That Will Contribute to the Company": [],
    "Soft Skills & Leadership Qualities": ["<Communication, Leadership, etc.>"],
    "Industry Experience": "<Relevant industries like Finance, Healthcare>",
    "Culture Fit Assessment": "<High/Medium/Low – Explanation>",
    "Potential Concerns": ["<Gaps, missing skills, weaknesses>"],
    "Red Flags & Risk Analysis": ["<Major issues>"],
    "Candidate’s Growth Potential": "<How much they can grow in the company>",
    "Effort Needed by the Company": "<Low/Medium/High – Explanation>",

    "Resume Strength Score": "<Numeric score between 0.0 and 10.0 in decimal format like 7.5>",
    "Relevant Experience (yrs)": "<Years of directly relevant experience>",
    "Employment Gaps Detected": true,
    "Relevant Projects Count": 0,
    "Resume Format Quality": "<Good/Average/Poor>",
    "Candidate Type": "<Junior/Mid-Level/Senior>",
    "HR Notes": "<Any special observations for HR>"
}}
Return only the JSON.
"""

    merge_llm = ChatOpenAI(temperature=0.3, max_tokens=1500,model="gpt-3.5-turbo")
    final_response = merge_llm.predict(merge_prompt)

    structured_data = _parse_llm_json(final_response)
    if structured_data is None:
        print(f"⚠️ Failed to parse JSON for {resume_name}")
        structured_data = {}

    return structured_data
# === Load JD Dynamically ===
# The most recently modified .txt file in jd_folder is used as the job description.
jd_files = list(jd_folder.glob("*.txt"))
if not jd_files:
    print("❌ No JD file found. Please add a .txt JD in JD_Folder.")
    # SystemExit stops this cell/script; exit() is an interactive-shell helper
    # and, inside Jupyter, asks the kernel itself to shut down.
    raise SystemExit

latest_jd_file = max(jd_files, key=lambda f: f.stat().st_mtime)
with open(latest_jd_file, "r", encoding="utf-8") as f:
    jd = f.read()
print(f"📄 Loaded JD from: {latest_jd_file.name}")

# === Main Process ===
pdf_paths = list(input_folder.glob('*.pdf'))

# ✅ Check if folder is empty
if not pdf_paths:
    print("📭 No resumes found in the input folder. Nothing to process.")
    raise SystemExit

all_data = []

write_log("BE Project Resume Parsing Start")
for pdf_path in pdf_paths:
    filename = pdf_path.name
    write_log(f"Parsing started: {filename}")
    print(f"Processing: {filename}")

    try:
        parsed_data = summarize_pdf(pdf_path,jd)
        parsed_data["resume_name"] = filename
        all_data.append(parsed_data)
        write_log(f"Details: {json.dumps(parsed_data, indent=2)}")
        write_log(f"Parsing completed: {filename}")
    except Exception as e:
        # Best effort: one bad resume must not stop the batch, but the failure
        # is recorded instead of being dropped silently. The file stays in the
        # input folder so it can be retried.
        write_log(f"Failed to parse: {filename} | Error: {e}")
        print(f"⚠️ Failed to process {filename}: {e}")
        continue

    # Move only successfully processed resumes out of the input folder.
    shutil.move(str(pdf_path), str(output_folder / filename))
    print(f"Moved {filename} to {output_folder}\n")


# === Save to CSV (Append Mode) ===
df = pd.DataFrame(all_data)

try:
    if csv_path.exists() and os.path.getsize(csv_path) > 0:
        # Append this run's rows to whatever earlier runs saved.
        existing_df = pd.read_csv(csv_path)
        combined_df = pd.concat([existing_df, df], ignore_index=True)
    else:
        combined_df = df
except pd.errors.EmptyDataError:
    print("⚠️ Existing CSV is empty or corrupt. Starting fresh.")
    combined_df = df

# ✅ Save result to CSV
combined_df.to_csv(csv_path, index=False)
print(f"✅ Data appended and saved to: {csv_path.resolve()}")


📄 Loaded JD from: jd.txt
Processing: resume_15.pdf
Moved resume_15.pdf to OutputResume_Folder

✅ Data appended and saved to: C:\Users\adity\BE Project\Final\Resume_Parsed_CSVs\resume_summary.csv


In [13]:
import pandas as pd

# Load the CSV
csv_path = "Resume_Parsed_CSVs/resume_summary.csv"
df = pd.read_csv(csv_path)

# Scores may be stored as text such as "7/10". Pull out the leading number
# before converting: to_numeric(errors="coerce") alone turns such strings into
# NaN, and saving the frame below would then erase the score from the CSV.
score_text = (
    df["Resume Strength Score"]
    .astype(str)
    .str.extract(r"(\d+(?:\.\d+)?)", expand=False)
)
df["Resume Strength Score"] = pd.to_numeric(score_text, errors="coerce")

# Sort by Resume Strength Score in descending order (rows without a score go last)
df_sorted = df.sort_values(by="Resume Strength Score", ascending=False)

# Save the sorted DataFrame back to CSV
df_sorted.to_csv(csv_path, index=False)

print("✅ Resume CSV sorted by 'Resume Strength Score' and updated successfully.")


✅ Resume CSV sorted by 'Resume Strength Score' and updated successfully.
