In [2]:
from langchain_community.llms import Ollama
MODEL = "llama3"
model = Ollama(model=MODEL)


  model = Ollama(model=MODEL)


In [7]:
import re
from PyPDF2 import PdfReader

# Step 1: Extract text from PDF using PyPDF2
def read_pdf(file):
    """Reads the PDF and extracts text from it."""
    try:
        reader = PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ''

# Step 2: Function to extract information from resumes using LLaMA
def extract_resume_info(resume_text):
    """Extracts specific details from resume text using LLaMA."""
    prompt = f"""
    Extract the following information from this resume:
    - Name
    - Email
    - Phone Number
    - List of Job Titles (in a comma-separated format)
    - List of Skills (in a comma-separated format)
    - Years of Experience (in numbers, no text, just the number of years)
    - List of Companies worked with (in a comma-separated format)

    Please ensure that the 'Years of Experience' includes only professional job or internship experience, not education experience.

    Here is the resume:
    {resume_text}
    """
    try:
        # Get the response from the LLaMA model
        response = model.invoke(prompt)
        return response.strip()  # Returning raw string response
    except Exception as e:
        print(f"Error in extracting resume info: {e}")
        return ''


# Step 3: Extract job description info using LLaMA
def extract_jd_info(jd_text):
    """Extracts specific details from a job description using LLaMA."""
    prompt = f"""
    Extract the following information from this job description:
    - Company Name
    - Email
    - Phone Number
    - Job Title (in a comma-separated format)
    - List of Required Skills (in a comma-separated format)
    - Years of Experience required (in numbers, no text, just the number of years)

    Here is the job description:
    {jd_text}
    """
    try:
        # Get the response from the LLaMA model
        response = model.invoke(prompt)
        return response.strip()  # Returning raw string response
    except Exception as e:
        print(f"Error in extracting JD info: {e}")
        return ''


# Step 4: Function to save extracted information to a .txt file
def save_extracted_info(info_text, output_file):
    """Saves the extracted information to a text file."""
    try:
        with open(output_file, 'w') as f:
            f.write(info_text)  # Save raw string info directly
        print(f"Information saved to {output_file}")
    except Exception as e:
        print(f"Error saving info: {e}")


# Step 5: Helper function to parse the extracted information from text to dictionary
def parse_extracted_info(text):
    """Parses the extracted information into a dictionary."""
    info = {}
    try:
        # Simple parsing by splitting lines and using key-value pairs
        lines = text.split('\n')
        for line in lines:
            if ':' in line:
                key, value = line.split(':', 1)
                info[key.strip()] = value.strip()
    except Exception as e:
        print(f"Error parsing extracted info: {e}")
    return info


# Step 6: Example usage of the functions

# Define the PDF file paths
resume_pdf_path = 'resume3.pdf'
jd_pdf_path = 'jd2.pdf'

# Extract text from the resume and job description PDFs
resume_text = read_pdf(resume_pdf_path)
jd_text = read_pdf(jd_pdf_path)

if resume_text and jd_text:
    # Extract structured information from both the resume and the job description
    resume_info_raw = extract_resume_info(resume_text)
    jd_info_raw = extract_jd_info(jd_text)

    # Parse the raw extracted information into dictionaries (optional for verification)
    resume_info = parse_extracted_info(resume_info_raw)
    jd_info = parse_extracted_info(jd_info_raw)

    # Step 7: Save extracted resume and JD information to .txt files
    save_extracted_info(resume_info_raw, 'extracted_resume_info.txt')
    save_extracted_info(jd_info_raw, 'extracted_jd_info.txt')

else:
    print("Error: Could not extract text from one or both PDFs.")


Information saved to extracted_resume_info.txt
Information saved to extracted_jd_info.txt


In [4]:
from transformers import pipeline

In [106]:
import re
from difflib import SequenceMatcher

# Helper function for fuzzy matching
def fuzzy_match(a, b):
    """Returns a ratio of similarity between two strings using fuzzy matching."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# Helper function to clean and normalize extracted text
def clean_text(text):
    """Removes unwanted symbols like **, *, etc., and normalizes the text."""
    text = re.sub(r'[\*\*]+', '', text)  # Remove any markdown symbols like ** or *
    text = re.sub(r'\s+', ' ', text)     # Replace multiple spaces with a single space
    return text.strip()                  # Strip leading/trailing spaces

# Function to extract skills from text using multiple regex patterns
def extract_skills(text, type="resume"):
    """Extracts skills from the text using multiple patterns for flexibility."""
    patterns = [
        r'Skills\s*\(comma-separated\):\s*(.+)',  # Pattern 1: Skills (comma-separated)
        r'\*\*Skills:\*\*\s*(.+)',                # Pattern 2: **Skills:** (markdown format)
        r'\* Skills:\s*(.+)',                     # Pattern 3: * Skills: (generic bullet format)
        r'Skills:\s*(.+)',                        # Pattern 4: Simple Skills: with no special chars
        r'\*\*Required Skills\*\*:\s*(.+)',       # Pattern 5: **Required Skills:** (JD markdown format)
        r'Required Skills:\s*(.+)',               # Pattern 6: Required Skills (JD plain format)
        r'\*\*Skills\*\*:\s*(.+)',                # Pattern 7: **Skills**: (new format with colon-separated info)
        r'List of Skills:\s*(.+)',                # Pattern 8: List of Skills format
        r'List of Skills: (.+)',                  # Pattern 9: Similar variant for skills
        r'Skills:\s*(.*)\n',                      # Pattern 10: Skills followed by new line
        r'\* Skills\s*:\s*(.+)',                  # Pattern 11: Bullet point Skills:
        r'Skills\s*(?:\(.+?\):)?\s*(.+)',           # Pattern 1: Skills (comma-separated) or plain skills section
        r'\*\*Skills:\*\*\s*(.+)',                  # Pattern 2: **Skills:** (markdown format)
        r'\* Skills:\s*(.+)',                       # Pattern 3: * Skills: (generic bullet format)
        r'Skills:\s*(.+)',                          # Pattern 4: Simple Skills: with no special chars
        r'Required Skills?:\s*(.+)',                # Pattern 5: Required Skills
        r'\*\*Skills\*\*:\s*(.+)',                  # Pattern 6: **Skills**: (JD format with colon)
        r'List of Skills:\s*(.+)',                  # Pattern 7: List of Skills
        r'Skills\s*:\s*([\w\s,/-]+)',               # Pattern 8: Any variant with slashes or commas
        r'\* Skills\s*:\s*(.+)',                    # Pattern 9: Bullet point Skills: format
        r'(?:Technical|Key|Core) Skills:\s*(.+)',   # Pattern 10: Variants with technical/core skills heading 
    ]
    
    for pattern in patterns:
        skills_regex = re.search(pattern, text)
        if skills_regex:
            # Clean and normalize extracted skills
            return [clean_text(skill.strip().lower()) for skill in skills_regex.group(1).split(',')]
    
    return []  # Return empty list if no match

# Function to extract job titles from text using multiple regex patterns
def extract_job_titles(text, type="resume"):
    """Extracts job titles from the text using multiple patterns for flexibility."""
    patterns = [
        r'Job Titles\s*\(comma-separated\):\s*(.+)',  # Pattern 1: Job Titles (comma-separated)
        r'\*\*Job Titles:\*\*\s*(.+)',               # Pattern 2: **Job Titles:** (markdown format)
        r'\* Job Titles:\s*(.+)',                    # Pattern 3: * Job Titles: (generic bullet format)
        r'Job Titles:\s*(.+)',                       # Pattern 4: Simple Job Titles: with no special chars
        r'\*\*Job Title\*\*:\s*(.+)',                # Pattern 5: **Job Title:** (JD markdown format)
        r'Job Title:\s*(.+)',                        # Pattern 6: Job Title (JD plain format)
        r'\*\*Job Titles\*\*:\s*(.+)',               # Pattern 7: **Job Titles**: (new format with colon-separated info)
        r'List of Job Titles:\s*(.+)',               # Pattern 8: List of Job Titles: (JD format)
        r'List of Job Titles\s*:\s*(.+)',            # Pattern 9: Variation with spaces after colon
        r'Job Titles?\s*(?:\(.+?\):)?\s*(.+)',       # Pattern 1: Job Titles (comma-separated)
        r'\*\*Job Titles?:\*\*\s*(.+)',             # Pattern 2: **Job Titles:** (markdown format)
        r'\* Job Titles?:\s*(.+)',                  # Pattern 3: * Job Titles: (bullet point format)
        r'Job Titles?:\s*(.+)',                     # Pattern 4: Simple Job Titles: with no special chars
        r'Position Titles?:\s*(.+)',                # Pattern 5: Position Titles (JD format)
        r'\*\*Job Title\*\*:\s*(.+)',               # Pattern 6: **Job Title:** (JD format)
        r'Position\s*(?:Held|Held\s*:\s*|Titles?)\s*:\s*(.+)', # Pattern 7: Variants for positions/titles
        r'\b(?:Work Experience|Employment History)\b\s*:\s*(.+)', # Pattern 8: Work Experience/Employment history
    ]
    
    for pattern in patterns:
        job_titles_regex = re.search(pattern, text)
        if job_titles_regex:
            # Clean and normalize extracted job titles
            return [clean_text(title.strip().lower()) for title in job_titles_regex.group(1).split(',')]
    
    return []  # Return empty list if no match

# Function to extract years of experience from text using multiple regex patterns
def extract_experience(text, type="resume"):
    """Extracts years of experience from the text using multiple patterns."""
    patterns = [
        r'Years of Experience:\s*(\d+)',               # Pattern 1: Years of Experience:
        r'\*\*Years of Experience:\*\*\s*(\d+)',       # Pattern 2: **Years of Experience:** (markdown format)
        r'\* Years of Experience:\s*(\d+)',            # Pattern 3: * Years of Experience:
        r'Experience:\s*(\d+)',                        # Pattern 4: Simple Experience:
        r'Experience required:\s*(\d+)',               # Pattern 5: Used in job descriptions (JD)
        r'\*\*Years of Experience required\*\*:\s*(\d+)',  # Pattern 6: **Years of Experience required** (JD markdown format)
        r'\*\*Years of Experience\*\*:\s*(\d+)\s*years?',  # Pattern 7: **Years of Experience**: 10 years (new format with colon-separated info)
        r'Years of Experience\s*:\s*(\d+)',            # Pattern 8: Variant format with spaces after colon
        r'\* Years of Experience\s*:\s*(\d+)',         # Pattern 9: Bullet-point format with * symbol
        r'Years of Experience:\s*(\d+)',               # Pattern 1: Years of Experience:
        r'\*\*Years of Experience:\*\*\s*(\d+)',       # Pattern 2: **Years of Experience:** (markdown format)
        r'Experience\s*(?:Required|Needed|Desired)?\s*:\s*(\d+)',  # Pattern 3: General Experience required
        r'\d+\+?\s*years? (?:of)? experience',         # Pattern 4: 10+ years experience
        r'\d+\s*years? experience(?: required)?',      # Pattern 5: Explicit experience required
        r'\b(?:Experience|Professional Experience)\b\s*:\s*(\d+)',  # Pattern 6: Variants with "Professional"
    ]
    
    for pattern in patterns:
        experience_regex = re.search(pattern, text)
        if experience_regex:
            return int(clean_text(experience_regex.group(1)))  # Clean and extract the number
    
    return 0  # Return 0 if no match


def calculate_resume_score(resume_text, jd_text):
    """Calculates a score for the resume based on the job description."""
    try:
        # Extract resume info
        resume_skills = extract_skills(resume_text, type="resume")
        resume_job_titles = extract_job_titles(resume_text, type="resume")
        resume_experience = extract_experience(resume_text, type="resume")
        
        # Extract JD info
        jd_skills = extract_skills(jd_text, type="jd")
        jd_job_titles = extract_job_titles(jd_text, type="jd")
        jd_experience_required = extract_experience(jd_text, type="jd")

        # Print extracted information for debugging
        print("Resume Skills:", resume_skills)
        print("Resume Job Titles:", resume_job_titles)
        print("Resume Experience (Years):", resume_experience)

        print("\nJD Skills:", jd_skills)
        print("JD Job Titles:", jd_job_titles)
        print("JD Experience Required (Years):", jd_experience_required)

        # Initialize score variables
        score = 0
        max_score = 100
        weights = {
            "skills": 0.6,  # 60% for skills
            "experience": 0.2,  # 20% for experience
            "job_title": 0.2  # 20% for job titles
        }

        # --- Skills Matching ---
        resume_skills_set = set(resume_skills)
        jd_skills_set = set(jd_skills)

        # Exact skill matches
        exact_skill_matches = resume_skills_set.intersection(jd_skills_set)
        partial_skill_matches = set()

        # Fuzzy matching for partial skill matches
        for resume_skill in resume_skills_set:
            for jd_skill in jd_skills_set:
                if fuzzy_match(resume_skill, jd_skill) > 0.55:  # Fuzzy match with a higher threshold
                    partial_skill_matches.add(resume_skill)

        # Combine exact and partial matches, avoiding double-counting
        total_skill_matches = exact_skill_matches.union(partial_skill_matches)
        skill_match_ratio = len(total_skill_matches) / len(jd_skills_set) if jd_skills_set else 0
        
        # Skills score is based on the match ratio, with more weight given to exact matches
        skill_score = skill_match_ratio * weights['skills'] * max_score
        score += skill_score

        print("\nExact Skill Matches:", exact_skill_matches)
        print("Partial Skill Matches:", partial_skill_matches)
        print("Skill Match Ratio:", skill_match_ratio)
        print("Skill Score:", skill_score)

        # --- Experience Matching ---
        experience_match_ratio = resume_experience / jd_experience_required if jd_experience_required else 0

        if resume_experience >= jd_experience_required:
            # Full marks for experience if the resume experience is greater than or equal to the JD requirement
            experience_score = weights['experience'] * max_score
        else:
            # Proportional score if resume experience is less than required
            experience_score = experience_match_ratio * weights['experience'] * max_score

        score += experience_score

        print("Experience Match Ratio:", experience_match_ratio)
        print("Experience Score:", experience_score)

        # --- Job Title Matching ---
        best_title_match = 0
        for resume_title in resume_job_titles:
            for jd_title in jd_job_titles:
                best_title_match = max(best_title_match, fuzzy_match(resume_title, jd_title))

        # Add job title score if a good match exists
        job_title_score = best_title_match * weights['job_title'] * max_score if best_title_match > 0.7 else 0
        score += job_title_score

        print("Best Job Title Match Score:", best_title_match)
        print("Job Title Score:", job_title_score)

        # Return final score rounded to 2 decimal places
        final_score = round(score, 2)
        print("\nFinal Resume Score:", final_score)
        return final_score

    except Exception as e:
        print(f"Error calculating resume score: {e}")
        return 0

# Function to read the contents of a file
def read_file(file_path):
    """Reads and returns the content of a file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

# File paths for resume and job description files
resume_file = 'extracted_resume_info.txt'  # Path to the resume text file
jd_file = 'extracted_jd_info.txt'

# Read resume and job description content
resume_text = read_file(resume_file)
jd_text = read_file(jd_file)

# If both files are successfully read, calculate and print the resume score
if resume_text and jd_text:
    score = calculate_resume_score(resume_text, jd_text)
    print(f"\nResume Score: {score}")
else:
    print("Error: One or both files could not be read.")


Resume Skills: ['python', 'c', 'sql', 'mysql', 'postgresql', 'redshift', 'aws', 'azure cloud', 'pytorch', 'scikit-learn', 'tensorflow', 'nlp', 'opencv', 'yolo', 'docker', 'vs code', 'dbeaver', 'jupyter notebook', 'git', 'problem-solving', 'analytical thinking', 'communication', 'team collaboration (comma-separated)']
Resume Job Titles: ['machine learning engineer', 'data engineer', 'intern (comma-separated)']
Resume Experience (Years): 2

JD Skills: ['python', 'tensorflow', 'pytorch', 'scikit-learn', 'cloud platforms (aws', 'gcp', 'azure)', 'ml libraries/frameworks', 'supervised and unsupervised learning techniques', 'deep learning', 'reinforcement learning', 'natural language processing', 'data preprocessing techniques', 'feature engineering', 'model evaluation metrics', 'sql and nosql databases']
JD Job Titles: ['machine learning engineer', 'software engineer', 'product manager']
JD Experience Required (Years): 2

Exact Skill Matches: {'python', 'scikit-learn', 'tensorflow', 'pytorch

In [91]:
import re
from difflib import SequenceMatcher

# Helper function for fuzzy matching
def fuzzy_match(a, b):
    """Returns a ratio of similarity between two strings using fuzzy matching."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# Helper function to clean and normalize extracted text
def clean_text(text):
    """Removes unwanted symbols like **, *, etc., and normalizes the text."""
    text = re.sub(r'[\*\*]+', '', text)  # Remove markdown symbols like ** or *
    text = re.sub(r'\s+', ' ', text)     # Replace multiple spaces with a single space
    return text.strip()                  # Strip leading/trailing spaces

# Function to extract skills from text using multiple regex patterns
def extract_skills(text, type="resume"):
    """Extracts skills from the text using multiple patterns for flexibility."""
    patterns = [
        r'Skills\s*(?:\(.+?\):)?\s*(.+)',           # Pattern 1: Skills (comma-separated) or plain skills section
        r'\*\*Skills:\*\*\s*(.+)',                  # Pattern 2: **Skills:** (markdown format)
        r'\* Skills:\s*(.+)',                       # Pattern 3: * Skills: (generic bullet format)
        r'Skills:\s*(.+)',                          # Pattern 4: Simple Skills: with no special chars
        r'Required Skills?:\s*(.+)',                # Pattern 5: Required Skills
        r'\*\*Skills\*\*:\s*(.+)',                  # Pattern 6: **Skills**: (JD format with colon)
        r'List of Skills:\s*(.+)',                  # Pattern 7: List of Skills
        r'Skills\s*:\s*([\w\s,/-]+)',               # Pattern 8: Any variant with slashes or commas
        r'\* Skills\s*:\s*(.+)',                    # Pattern 9: Bullet point Skills: format
        r'(?:Technical|Key|Core) Skills:\s*(.+)',   # Pattern 10: Variants with technical/core skills heading
    ]
    
    for pattern in patterns:
        skills_regex = re.search(pattern, text, re.IGNORECASE)
        if skills_regex:
            # Clean and normalize extracted skills
            return [clean_text(skill.strip().lower()) for skill in re.split(r',|/|-|\n', skills_regex.group(1))]
    
    return []  # Return empty list if no match

# Function to extract job titles from text using multiple regex patterns
def extract_job_titles(text, type="resume"):
    """Extracts job titles from the text using multiple patterns for flexibility."""
    patterns = [
        r'Job Titles?\s*(?:\(.+?\):)?\s*(.+)',       # Pattern 1: Job Titles (comma-separated)
        r'\*\*Job Titles?:\*\*\s*(.+)',             # Pattern 2: **Job Titles:** (markdown format)
        r'\* Job Titles?:\s*(.+)',                  # Pattern 3: * Job Titles: (bullet point format)
        r'Job Titles?:\s*(.+)',                     # Pattern 4: Simple Job Titles: with no special chars
        r'Position Titles?:\s*(.+)',                # Pattern 5: Position Titles (JD format)
        r'\*\*Job Title\*\*:\s*(.+)',               # Pattern 6: **Job Title:** (JD format)
        r'Position\s*(?:Held|Held\s*:\s*|Titles?)\s*:\s*(.+)', # Pattern 7: Variants for positions/titles
        r'\b(?:Work Experience|Employment History)\b\s*:\s*(.+)', # Pattern 8: Work Experience/Employment history
    ]
    
    for pattern in patterns:
        job_titles_regex = re.search(pattern, text, re.IGNORECASE)
        if job_titles_regex:
            # Clean and normalize extracted job titles
            return [clean_text(title.strip().lower()) for title in re.split(r',|/|-|\n', job_titles_regex.group(1))]
    
    return []  # Return empty list if no match

# Function to extract years of experience from text using multiple regex patterns
def extract_experience(text, type="resume"):
    """Extracts years of experience from the text using multiple patterns."""
    patterns = [
        r'Years of Experience:\s*(\d+)',               # Pattern 1: Years of Experience:
        r'\*\*Years of Experience:\*\*\s*(\d+)',       # Pattern 2: **Years of Experience:** (markdown format)
        r'Experience\s*(?:Required|Needed|Desired)?\s*:\s*(\d+)',  # Pattern 3: General Experience required
        r'\d+\+?\s*years? (?:of)? experience',         # Pattern 4: 10+ years experience
        r'\d+\s*years? experience(?: required)?',      # Pattern 5: Explicit experience required
        r'\b(?:Experience|Professional Experience)\b\s*:\s*(\d+)',  # Pattern 6: Variants with "Professional"
    ]
    
    for pattern in patterns:
        experience_regex = re.search(pattern, text, re.IGNORECASE)
        if experience_regex:
            return int(clean_text(experience_regex.group(1)))  # Clean and extract the number
    
    return 0  # Return 0 if no match


def calculate_resume_score(resume_text, jd_text):
    """Calculates a score for the resume based on the job description."""
    try:
        # Extract resume info
        resume_skills = extract_skills(resume_text, type="resume")
        resume_job_titles = extract_job_titles(resume_text, type="resume")
        resume_experience = extract_experience(resume_text, type="resume")
        
        # Extract JD info
        jd_skills = extract_skills(jd_text, type="jd")
        jd_job_titles = extract_job_titles(jd_text, type="jd")
        jd_experience_required = extract_experience(jd_text, type="jd")

        # Print extracted information for debugging
        print("Resume Skills:", resume_skills)
        print("Resume Job Titles:", resume_job_titles)
        print("Resume Experience (Years):", resume_experience)

        print("\nJD Skills:", jd_skills)
        print("JD Job Titles:", jd_job_titles)
        print("JD Experience Required (Years):", jd_experience_required)

        # Initialize score variables
        score = 0
        max_score = 100
        weights = {
            "skills": 0.6,  # 60% for skills
            "experience": 0.2,  # 20% for experience
            "job_title": 0.2  # 20% for job titles
        }

        # --- Skills Matching ---
        resume_skills_set = set(resume_skills)
        jd_skills_set = set(jd_skills)

        # Exact skill matches
        exact_skill_matches = resume_skills_set.intersection(jd_skills_set)
        partial_skill_matches = set()

        # Fuzzy matching for partial skill matches
        for resume_skill in resume_skills_set:
            for jd_skill in jd_skills_set:
                if fuzzy_match(resume_skill, jd_skill) > 0.7:  # Fuzzy match with a higher threshold
                    partial_skill_matches.add(resume_skill)

        # Combine exact and partial matches, avoiding double-counting
        total_skill_matches = exact_skill_matches.union(partial_skill_matches)
        skill_match_ratio = len(total_skill_matches) / len(jd_skills_set) if jd_skills_set else 0
        
        # Skills score is based on the match ratio, with more weight given to exact matches
        skill_score = skill_match_ratio * weights['skills'] * max_score
        score += skill_score

        print("\nExact Skill Matches:", exact_skill_matches)
        print("Partial Skill Matches:", partial_skill_matches)
        print("Skill Match Ratio:", skill_match_ratio)
        print("Skill Score:", skill_score)

        # --- Experience Matching ---
        experience_match_ratio = resume_experience / jd_experience_required if jd_experience_required else 0

        if resume_experience >= jd_experience_required:
            # Full marks for experience if the resume experience is greater than or equal to the JD requirement
            experience_score = weights['experience'] * max_score
        else:
            # Proportional score if resume experience is less than required
            experience_score = experience_match_ratio * weights['experience'] * max_score

        score += experience_score

        print("Experience Match Ratio:", experience_match_ratio)
        print("Experience Score:", experience_score)

        # --- Job Title Matching ---
        best_title_match = 0
        for resume_title in resume_job_titles:
            for jd_title in jd_job_titles:
                best_title_match = max(best_title_match, fuzzy_match(resume_title, jd_title))

        # Add job title score if a good match exists
        job_title_score = best_title_match * weights['job_title'] * max_score if best_title_match > 0.7 else 0
        score += job_title_score

        print("Best Job Title Match Score:", best_title_match)
        print("Job Title Score:", job_title_score)

        # Return final score rounded to 2 decimal places
        final_score = round(score, 2)
        print("\nFinal Resume Score:", final_score)
        return final_score

    except Exception as e:
        print(f"Error calculating resume score: {e}")
        return 0

# Function to read the contents of a file
def read_file(file_path):
    """Reads and returns the content of a file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

# File paths for resume and job description files
resume_file = 'extracted_resume_info.txt'  # Path to the resume text file
jd_file = 'extracted_jd_info.txt'

# Read resume and job description content
resume_text = read_file(resume_file)
jd_text = read_file(jd_file)

# If both files are successfully read, calculate and print the resume score
if resume_text and jd_text:
    score = calculate_resume_score(resume_text, jd_text)
    print(f"\nResume Score: {score}")
else:
    print("Error: One or both files could not be read.")


Resume Skills: [': python', 'c', 'sql', 'mysql', 'postgresql', 'redshift', 'aws', 'azure cloud', 'pytorch', 'scikit', 'learn', 'tensorflow', 'nlp', 'opencv', 'yolo', 'docker', 'vs code', 'dbeaver', 'jupyter notebook', 'git', 'problem', 'solving', 'analytical thinking', 'communication', 'team collaboration (comma', 'separated format)']
Resume Job Titles: [': machine learning engineer', 'data engineering intern', 'intern', 'intern (comma', 'separated format)']
Resume Experience (Years): 2

JD Skills: [': python', 'tensorflow', 'pytorch', 'scikit', 'learn', 'aws', 'gcp', 'azure', 'mlops', 'sql', 'nosql databases', 'git', 'ci', 'cd practices (comma', 'separated format)']
JD Job Titles: [': machine learning engineer', 'data scientist', 'software engineer', 'product manager (comma', 'separated format)']
JD Experience Required (Years): 2

Exact Skill Matches: {'scikit', ': python', 'learn', 'sql', 'git', 'separated format)', 'tensorflow', 'pytorch', 'aws'}
Partial Skill Matches: {'scikit', ':

In [None]:
import pymysql
import re
from difflib import SequenceMatcher
from PyPDF2 import PdfReader

# MySQL connection (without specifying a database)
def get_server_connection():
    connection = pymysql.connect(
        host="localhost",
        user="root",
        password="root",
        cursorclass=pymysql.cursors.DictCursor  # DictCursor to get results as dictionaries
    )
    return connection

# Helper function for fuzzy matching
def fuzzy_match(a, b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# Helper function to clean and normalize extracted text
def clean_text(text):
    text = re.sub(r'[\\]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Step 1: Extract text from PDF using PyPDF2
def read_pdf(file):
    """Reads the PDF and extracts text from it."""
    try:
        reader = PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ''

# Step 2: Function to extract information from resumes using LLaMA
def extract_resume_info(resume_text):
    """Extracts specific details from resume text using LLaMA."""
    prompt = f"""
    Extract the following information from this resume:
    - Name
    - Email
    - Phone Number
    - List of Job Titles (in a comma-separated format)
    - List of Skills (in a comma-separated format)
    - Years of Experience (in numbers, no text, just the number of years)
    - List of Companies worked with (in a comma-separated format)

    Here is the resume:
    {resume_text}
    """
    try:
        # Get the response from the LLaMA model
        response = model.invoke(prompt)
        return response.strip()  # Returning raw string response
    except Exception as e:
        print(f"Error in extracting resume info: {e}")
        return ''

# Step 3: Extract job description info using LLaMA
def extract_jd_info(jd_text):
    """Extracts specific details from a job description using LLaMA."""
    prompt = f"""
    Extract the following information from this job description:
    - Company Name
    - Email
    - Phone Number
    - Job Title (in a comma-separated format)
    - List of Required Skills (in a comma-separated format)
    - Years of Experience required (in numbers, no text, just the number of years)

    Here is the job description:
    {jd_text}
    """
    try:
        # Get the response from the LLaMA model
        response = model.invoke(prompt)
        return response.strip()  # Returning raw string response
    except Exception as e:
        print(f"Error in extracting JD info: {e}")
        return ''

# Step 4: Function to save extracted information to a .txt file
def save_extracted_info(info_text, output_file):
    """Saves the extracted information to a text file."""
    try:
        with open(output_file, 'w') as f:
            f.write(info_text)  # Save raw string info directly
        print(f"Information saved to {output_file}")
    except Exception as e:
        print(f"Error saving info: {e}")

# Step 5: Helper function to parse the extracted information from text to dictionary
def parse_extracted_info(text):
    """Parses the extracted information into a dictionary."""
    info = {}
    try:
        # Simple parsing by splitting lines and using key-value pairs
        lines = text.split('\n')
        for line in lines:
            if ':' in line:
                key, value = line.split(':', 1)
                info[key.strip()] = value.strip()
    except Exception as e:
        print(f"Error parsing extracted info: {e}")
    return info

# Function to create a database and information table
def create_company_db(company_name):
    conn = get_server_connection()
    cursor = conn.cursor()
    try:
        # Create the database if it doesn't exist
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{company_name}`")
        conn.commit()

        # Select the created database
        conn.select_db(company_name)

        # Create the information table if it doesn't exist
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS information (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255),
                phone_number VARCHAR(20),
                email VARCHAR(255),
                skills TEXT,
                score FLOAT
            )
        """)
        conn.commit()
    except Exception as e:
        print(f"Error creating database/table: {e}")
    finally:
        cursor.close()
        conn.close()

# Function to insert extracted resume info into the table
def insert_resume_info(company_name, resume_info, score):
    conn = get_server_connection()
    cursor = conn.cursor()
    try:
        # Select the database
        conn.select_db(company_name)

        # Ensure all required fields are present
        name = resume_info.get('Name')
        phone_number = resume_info.get('Phone Number')
        email = resume_info.get('Email')
        skills = resume_info.get('List of Skills')

        # Debugging output
        print(f"Inserting into DB - Name: {name}, Phone: {phone_number}, Email: {email}, Skills: {skills}, Score: {score}")

        # Insert into the database
        query = "INSERT INTO information (name, phone_number, email, skills, score) VALUES (%s, %s, %s, %s, %s)"
        cursor.execute(query, (name, phone_number, email, skills, score))
        conn.commit()

    except Exception as e:
        print(f"Error inserting resume info into the database: {e}")
    finally:
        cursor.close()
        conn.close()

# Function to extract skills from text using multiple regex patterns
def extract_skills(text, type="resume"):
    patterns = [
        r'Skills\s*\(comma-separated\):\s*(.+)',  
        r'\\*Skills:\\\s(.+)',                
        r'\* Skills:\s*(.+)',                    
        r'Skills:\s*(.+)',                       
        r'\\*Required Skills\\:\s(.+)',       
        r'Required Skills:\s*(.+)',              
        r'\\*Skills\\:\s(.+)',               
        r'List of Skills:\s*(.+)',               
        r'List of Skills: (.+)',                  
        r'Skills:\s*(.*)\n',                      
        r'\* Skills\s*:\s*(.+)',                  
        r'Skills\s*(?:\(.+?\):)?\s*(.+)',       
        r'\\*Skills:\\\s(.+)',              
        r'\* Skills:\s*(.+)',                    
        r'Skills:\s*(.+)',                       
        r'Required Skills?:\s*(.+)',            
        r'\\*Skills\\:\s(.+)',              
        r'List of Skills:\s*(.+)',              
        r'Skills\s*(?:\(.+?\):)?\s*(.+)',       
        r'\\*Skills:\\\s(.+)',              
        r'\* Skills:\s*(.+)',                    
        r'Skills:\s*(.+)',                       
        r'Required Skills?:\s*(.+)',            
        r'\\*Skills\\:\s(.+)',              
        r'Position Titles?:\s*(.+)',           
        r'\\*Job Title\\:\s(.+)',          
        r'Position\s*(?:Held|Held\s*:\s*|Titles?)\s*:\s*(.+)', 
        r'\b(?:Work Experience|Employment History)\b\s*:\s*(.+)',                       
    ]
    
    for pattern in patterns:
        skills_regex = re.search(pattern, text)
        if skills_regex:
            return [clean_text(skill.strip().lower()) for skill in skills_regex.group(1).split(',')]
    
    return []  

# Function to extract job titles from text using multiple regex patterns
def extract_job_titles(text, type="resume"):
    patterns = [
        r'Job Titles\s*\(comma-separated\):\s*(.+)',  
        r'\\*Job Titles:\\\s(.+)',               
        r'\* Job Titles:\s*(.+)',                    
        r'Job Titles:\s*(.+)',                       
        r'\\*Job Title\\:\s(.+)',              
        r'Job Title:\s*(.+)',                        
        r'\\*Job Titles\\:\s(.+)',              
        r'List of Job Titles:\s*(.+)',              
        r'List of Job Titles\s*:\s*(.+)',            
        r'Job Titles?\s*(?:\(.+?\):)?\s*(.+)',       
        r'\\*Job Titles:\\\s(.+)',              
        r'\* Job Titles?:\s*(.+)',                  
        r'Job Titles?:\s*(.+)',                     
        r'Position Titles?:\s*(.+)',               
        r'\\*Job Title\\:\s(.+)',              
        r'Position\s*(?:Held|Held\s*:\s*|Titles?)\s*:\s*(.+)', 
        r'\b(?:Work Experience|Employment History)\b\s*:\s*(.+)',                     
    ]
    
    for pattern in patterns:
        job_titles_regex = re.search(pattern, text)
        if job_titles_regex:
            return [clean_text(title.strip().lower()) for title in job_titles_regex.group(1).split(',')]
    
    return []  

# Function to extract years of experience from text using multiple regex patterns
def extract_experience(text, type="resume"):
    patterns = [
         r'Years of Experience:\s*(\d+)',               
        r'\\ Years of Experience:\\\s*(\ d+)',       
        r'\* Years of Experience:\s*(\d+)',            
        r'Experience:\s*(\d+)',                        
        r'Experience required:\s*(\d+)',              
        r'\\*Years of Experience required\\:\s(\d+)',  
        r'\\*Years of Experience\\:\s(\d+)\s*years?',  
        r'Years of Experience\s*:\s*(\d+)',            
        r'\* Years of Experience\s*:\s*(\d+)',         
        r'Years of Experience:\s*(\d+)',               
        r'\\*Years of Experience:\\\s(\d+)',       
        r'Experience\s*(?:Required|Needed|Desired)?\s*:\s*(\d+)',  
        r'\d+\+?\s*years? (?:of)? experience',        
        r'\d+\s*years? experience(?: required)?',      
        r'\b(?:Experience|Professional Experience)\b\s*:\s*(\d+)',               
    ]
    
    for pattern in patterns:
        experience_regex = re.search(pattern, text)
        if experience_regex:
            return int(clean_text(experience_regex.group(1)))  
    
    return 0  

# Function to calculate the resume score
def calculate_resume_score(resume_text, jd_text):
    try:
        # Extract resume info
        resume_skills = extract_skills(resume_text, type="resume")
        resume_job_titles = extract_job_titles(resume_text, type="resume")
        resume_experience = extract_experience(resume_text, type="resume")
        
        # Extract JD info
        jd_skills = extract_skills(jd_text, type="jd")
        jd_job_titles = extract_job_titles(jd_text, type="jd")
        jd_experience_required = extract_experience(jd_text, type="jd")

        # Print extracted information for debugging
        print("Resume Skills:", resume_skills)
        print("Resume Job Titles:", resume_job_titles)
        print("Resume Experience (Years):", resume_experience)

        print("\nJD Skills:", jd_skills)
        print("JD Job Titles:", jd_job_titles)
        print("JD Experience Required (Years):", jd_experience_required)

        # Initialize score variables
        score = 0
        max_score = 100
        weights = {
            "skills": 0.6,  # 60% for skills
            "experience": 0.2,  # 20% for experience
            "job_title": 0.2  # 20% for job titles
        }

        # --- Skills Matching ---
        resume_skills_set = set(resume_skills)
        jd_skills_set = set(jd_skills)

        # Exact skill matches
        exact_skill_matches = resume_skills_set.intersection(jd_skills_set)
        partial_skill_matches = set()

        # Fuzzy matching for partial skill matches
        for resume_skill in resume_skills_set:
            for jd_skill in jd_skills_set:
                if fuzzy_match(resume_skill, jd_skill) > 0.55:  # Fuzzy match with a higher threshold
                    partial_skill_matches.add(resume_skill)

        # Combine exact and partial matches, avoiding double-counting
        total_skill_matches = exact_skill_matches.union(partial_skill_matches)
        skill_match_ratio = len(total_skill_matches) / len(jd_skills_set) if jd_skills_set else 0
        
        # Skills score is based on the match ratio, with more weight given to exact matches
        skill_score = skill_match_ratio * weights['skills'] * max_score
        score += skill_score

        print("\nExact Skill Matches:", exact_skill_matches)
        print("Partial Skill Matches:", partial_skill_matches)
        print("Skill Match Ratio:", skill_match_ratio)
        print("Skill Score:", skill_score)

        # --- Experience Matching ---
        experience_match_ratio = resume_experience / jd_experience_required if jd_experience_required else 0

        if resume_experience >= jd_experience_required:
            # Full marks for experience if the resume experience is greater than or equal to the JD requirement
            experience_score = weights['experience'] * max_score
        else:
            # Proportional score if resume experience is less than required
            experience_score = experience_match_ratio * weights['experience'] * max_score

        score += experience_score

        print("Experience Match Ratio:", experience_match_ratio)
        print("Experience Score:", experience_score)

        # --- Job Title Matching ---
        best_title_match = 0
        for resume_title in resume_job_titles:
            for jd_title in jd_job_titles:
                best_title_match = max(best_title_match, fuzzy_match(resume_title, jd_title))

        # Add job title score if a good match exists
        job_title_score = best_title_match * weights['job_title'] * max_score if best_title_match > 0.7 else 0
        score += job_title_score

        print("Best Job Title Match Score:", best_title_match)
        print("Job Title Score:", job_title_score)

        # Return final score rounded to 2 decimal places
        final_score = round(score, 2)
        print("\nFinal Resume Score:", final_score)
        return final_score

    except Exception as e:
        print(f"Error calculating resume score: {e}")
        return 0

# Function to read the contents of a file
def read_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

# Step 6: Example usage of the functions
def process_resume():
    # Define the PDF file paths
    resume_pdf_path = 'resume4.pdf'
    jd_pdf_path = 'jd.pdf'

    # Extract text from the resume and job description PDFs
    resume_text = read_pdf(resume_pdf_path)
    jd_text = read_pdf(jd_pdf_path)

    if resume_text and jd_text:
        # Extract structured information from both the resume and the job description
        resume_info_raw = extract_resume_info(resume_text)
        jd_info_raw = extract_jd_info(jd_text)

        # Save extracted info to .txt files
        save_extracted_info(resume_info_raw, 'extracted_resume_info.txt')
        save_extracted_info(jd_info_raw, 'extracted_jd_info.txt')

        # Step 3: Read the extracted info from the .txt files
        resume_info_text = read_file('extracted_resume_info.txt')
        jd_info_text = read_file('extracted_jd_info.txt')

        # Parse extracted information
        resume_info = parse_extracted_info(resume_info_text)
        jd_info = parse_extracted_info(jd_info_text)

        # Debugging output
        print("Resume Info:", resume_info)
        print("JD Info:", jd_info)

        # Step 4: Create a database based on the company name
        company_name = jd_info.get('Company Name', 'default_company').replace(' ', '_')
        create_company_db(company_name)

        # Step 5: Calculate resume score
        score = calculate_resume_score(resume_info_text, jd_info_text)

        # Step 6: Insert resume info into the database
        insert_resume_info(company_name, resume_info, score)

        print(f"Resume processed and stored in database '{company_name}' with score {score}")
    else:
        print("Error: Failed to process resume or job description.")

# Run the processing function
if __name__ == '__main__':
    process_resume()


In [None]:
import pymysql
import re
from difflib import SequenceMatcher
from PyPDF2 import PdfReader

# MySQL connection (without specifying a database)
def get_server_connection():
    connection = pymysql.connect(
        host="localhost",
        user="root",
        password="root",
        cursorclass=pymysql.cursors.DictCursor  # DictCursor to get results as dictionaries
    )
    return connection

# Helper function for fuzzy matching
def fuzzy_match(a, b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# Helper function to clean and normalize extracted text
def clean_text(text):
    text = re.sub(r'[\\]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Step 1: Extract text from PDF using PyPDF2
def read_pdf(file):
    """Reads the PDF and extracts text from it."""
    try:
        reader = PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ''

# Step 2: Extract key information using regex from the resume and JD text
def extract_info_from_text(text, info_type="resume"):
    """Extracts specific details from the given text using regular expressions."""
    extracted_info = {}

    if info_type == "resume":
        # Extracting resume information using regex
        extracted_info['Name'] = re.search(r'Name:\s*(.*)', text).group(1) if re.search(r'Name:\s*(.*)', text) else None
        extracted_info['Email'] = re.search(r'Email:\s*(.*)', text).group(1) if re.search(r'Email:\s*(.*)', text) else None
        extracted_info['Phone Number'] = re.search(r'Phone Number:\s*(.*)', text).group(1) if re.search(r'Phone Number:\s*(.*)', text) else None
        extracted_info['Job Titles'] = re.search(r'List of Job Titles:\s*(.*)', text).group(1) if re.search(r'List of Job Titles:\s*(.*)', text) else None
        extracted_info['Skills'] = re.search(r'List of Skills:\s*(.*)', text).group(1) if re.search(r'List of Skills:\s*(.*)', text) else None
        extracted_info['Years of Experience'] = re.search(r'Years of Experience:\s*(\d+)', text).group(1) if re.search(r'Years of Experience:\s*(\d+)', text) else None
        extracted_info['Companies'] = re.search(r'List of Companies worked with:\s*(.*)', text).group(1) if re.search(r'List of Companies worked with:\s*(.*)', text) else None
    elif info_type == "jd":
        # Extracting job description information using regex
        extracted_info['Company Name'] = re.search(r'Company Name:\s*(.*)', text).group(1) if re.search(r'Company Name:\s*(.*)', text) else None
        extracted_info['Email'] = re.search(r'Email:\s*(.*)', text).group(1) if re.search(r'Email:\s*(.*)', text) else None
        extracted_info['Phone Number'] = re.search(r'Phone Number:\s*(.*)', text).group(1) if re.search(r'Phone Number:\s*(.*)', text) else None
        extracted_info['Job Titles'] = re.search(r'Job Title:\s*(.*)', text).group(1) if re.search(r'Job Title:\s*(.*)', text) else None
        extracted_info['Required Skills'] = re.search(r'Required Skills:\s*(.*)', text).group(1) if re.search(r'Required Skills:\s*(.*)', text) else None
        extracted_info['Years of Experience'] = re.search(r'Years of Experience required:\s*(\d+)', text).group(1) if re.search(r'Years of Experience required:\s*(\d+)', text) else None

    # Cleaning the extracted info
    for key, value in extracted_info.items():
        if value:
            extracted_info[key] = clean_text(value)

    return extracted_info

# Step 3: Function to create a database and information table
def create_company_db(company_name):
    conn = get_server_connection()
    cursor = conn.cursor()
    try:
        # Create the database if it doesn't exist
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{company_name}`")
        conn.commit()

        # Select the created database
        conn.select_db(company_name)

        # Create the information table if it doesn't exist
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS information (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255),
                phone_number VARCHAR(20),
                email VARCHAR(255),
                skills TEXT,
                score FLOAT
            )
        """)
        conn.commit()
    except Exception as e:
        print(f"Error creating database/table: {e}")
    finally:
        cursor.close()
        conn.close()

# Step 4: Function to insert extracted resume info into the table
def insert_resume_info(company_name, resume_info, score):
    conn = get_server_connection()
    cursor = conn.cursor()
    try:
        # Select the database
        conn.select_db(company_name)

        # Insert into the database
        query = "INSERT INTO information (name, phone_number, email, skills, score) VALUES (%s, %s, %s, %s, %s)"
        cursor.execute(query, (resume_info.get('Name'), resume_info.get('Phone Number'), resume_info.get('Email'), resume_info.get('Skills'), score))
        conn.commit()

    except Exception as e:
        print(f"Error inserting resume info into the database: {e}")
    finally:
        cursor.close()
        conn.close()

# Step 5: Function to calculate the resume score based on matching criteria
def calculate_resume_score(resume_info, jd_info):
    """Calculate the resume score based on skills, experience, and job title matches."""
    score = 0
    max_score = 100
    weights = {
        "skills": 0.6,
        "experience": 0.2,
        "job_title": 0.2
    }

    # Extracted details from the resume and job description
    resume_skills = set(resume_info.get('Skills', '').split(', '))
    jd_skills = set(jd_info.get('Required Skills', '').split(', '))

    # Skill matching (exact and partial match)
    exact_skill_matches = resume_skills.intersection(jd_skills)
    partial_skill_matches = set()

    # Fuzzy matching for partial skill matches
    for resume_skill in resume_skills:
        for jd_skill in jd_skills:
            if fuzzy_match(resume_skill, jd_skill) > 0.55:  # Fuzzy match threshold
                partial_skill_matches.add(resume_skill)

    # Combine exact and partial matches
    total_skill_matches = exact_skill_matches.union(partial_skill_matches)
    skill_match_ratio = len(total_skill_matches) / len(jd_skills) if jd_skills else 0
    skill_score = skill_match_ratio * weights['skills'] * max_score
    score += skill_score

    # Experience matching
    resume_experience = int(resume_info.get('Years of Experience', 0))
    jd_experience_required = int(jd_info.get('Years of Experience', 0))
    experience_match_ratio = resume_experience / jd_experience_required if jd_experience_required else 0
    experience_score = experience_match_ratio * weights['experience'] * max_score
    score += experience_score

    # Job title matching
    resume_job_titles = set(resume_info.get('Job Titles', '').split(', '))
    jd_job_titles = set(jd_info.get('Job Titles', '').split(', '))
    best_title_match = max([fuzzy_match(rjt, jjt) for rjt in resume_job_titles for jjt in jd_job_titles], default=0)
    job_title_score = best_title_match * weights['job_title'] * max_score if best_title_match > 0.7 else 0
    score += job_title_score

    return round(score, 2)

# Step 6: Example usage of the functions
def process_resume():
    # Define the file paths
    resume_info_file = 'extracted_resume_info.txt'
    jd_info_file = 'extracted_jd_info.txt'

    # Read the extracted text from the files
    resume_info_text = read_file(resume_info_file)
    jd_info_text = read_file(jd_info_file)

    if resume_info_text and jd_info_text:
        # Extract structured information from both the resume and the job description
        resume_info = extract_info_from_text(resume_info_text, info_type="resume")
        jd_info = extract_info_from_text(jd_info_text, info_type="jd")

        # Debugging output
        print("Extracted Resume Info:", resume_info)
        print("Extracted JD Info:", jd_info)

        # Create a database based on the company name
        company_name = jd_info.get('Company Name', 'default_company').replace(' ', '_')
        create_company_db(company_name)

        # Calculate resume score
        score = calculate_resume_score(resume_info, jd_info)

        # Insert resume info into the database
        insert_resume_info(company_name, resume_info, score)

        print(f"Resume processed and stored in database '{company_name}' with score {score}")
    else:
        print("Error: Failed to process resume or job description.")

# Function to read the contents of a file
def read_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

# Run the processing function
if __name__ == '__main__':
    process_resume()
