# New lemmatization

In [4]:
import os
import re
import PyPDF2

# Canonical skill name (key) -> list of alias spellings (value).
# custom_lemmatizer rewrites any alias it recognises to the canonical key, and
# process_resume compares the resulting keys against the requested job skills.
# Some aliases contain spaces or punctuation ("react.js", "react js", "c++");
# whether those can match depends on how custom_lemmatizer tokenizes its input.
# NOTE(review): "c programing" looks like a misspelling of "c programming" —
# confirm before relying on it.
skill_lemma_dict = {
    "reactjs": ["react.js", "react js", "react"],
    "python": ["python", "py"],
    "java": ["java"],
    "cpp": ["c++", "c programing", "c"],
    "sql": ["mysql", "sql", "postgresql"]
}

def custom_lemmatizer(text, lemma_dict):
    """Tokenize *text* and replace known skill aliases with their canonical key.

    Aliases are matched as whole phrases, case-insensitively and longest
    first, so entries that contain spaces or punctuation (for example
    "react.js", "react js" or "c++") are recognised instead of being split
    into single-word tokens that could never equal them.

    Args:
        text: The text to tokenize.
        lemma_dict: Mapping of canonical key -> iterable of alias strings.

    Returns:
        A list of tokens in order of appearance. A recognised alias is
        replaced by its canonical key; every other word token is kept
        exactly as it appeared in *text*.
    """
    # Reverse lookup alias -> key. setdefault keeps the first key that lists an
    # alias, mirroring a scan of lemma_dict in insertion order.
    alias_to_key = {}
    for key, aliases in lemma_dict.items():
        for alias in aliases:
            normalized_alias = " ".join(alias.lower().split())
            if normalized_alias:
                alias_to_key.setdefault(normalized_alias, key)

    # Longest aliases first so "c++" wins over "c" and "react js" over "react".
    alternatives = []
    for alias in sorted(alias_to_key, key=len, reverse=True):
        # Any run of whitespace in the text may stand for a space in the alias.
        phrase = r"\s+".join(re.escape(word) for word in alias.split(" "))
        # Lookarounds stop an alias from matching inside a longer word.
        alternatives.append(r"(?<!\w)" + phrase + r"(?!\w)")
    # Fallback: any plain word token, same tokenization as before.
    alternatives.append(r"\b\w+\b")
    token_pattern = re.compile("|".join(alternatives), re.IGNORECASE)

    lemmatized_tokens = []
    for match in token_pattern.finditer(text):
        token = match.group(0)
        lookup_key = " ".join(token.lower().split())
        lemmatized_tokens.append(alias_to_key.get(lookup_key, token))
    return lemmatized_tokens

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*.

    Pages are joined with a newline so that the last word of one page and
    the first word of the next are not fused into a single token.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages as one string.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` keeps the join safe if a page yields no text at all.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return '\n'.join(page_texts)

def clean_function(resumeText):
    """Strip URLs, RT/cc markers, hashtags, mentions and punctuation from text.

    The substitutions run in a fixed order: URLs, standalone "RT"/"cc"
    markers, hashtags, mentions, ASCII punctuation, non-ASCII characters,
    and finally every whitespace run is collapsed to one space. The result
    is not stripped, so it may start or end with a single space.

    Args:
        resumeText: The raw text to clean.

    Returns:
        The cleaned text.
    """
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Word boundaries keep "cc"/"RT" inside ordinary words ("success",
    # "account") from being cut out.
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # replace non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

def process_resume(resume_path, job_skills):
    """Score a PDF resume by the share of requested job skills it mentions.

    The resume text is extracted, lower-cased, cleaned and lemmatized with
    skill_lemma_dict; the score is the percentage of distinct entries of
    *job_skills* found among the resulting tokens.

    Args:
        resume_path: Path of the PDF resume.
        job_skills: Iterable of canonical skill names to look for.

    Returns:
        A float between 0 and 100. Returns 0.0 when *job_skills* is empty
        instead of dividing by zero.
    """
    # Deduplicate so a repeated skill cannot inflate the denominator.
    required_skills = set(job_skills)
    if not required_skills:
        return 0.0

    resume_text = extract_text_from_pdf(resume_path)
    cleaned_resume_text = clean_function(resume_text.lower())
    resume_tokens = set(custom_lemmatizer(cleaned_resume_text, skill_lemma_dict))

    matching_skills = resume_tokens & required_skills
    return (len(matching_skills) / len(required_skills)) * 100

def process_resumes_in_folder(resume_folder, job_skills, output_folder, threshold=50):
    """Score every PDF in *resume_folder* and save the ones that pass.

    For each PDF whose score from process_resume is at least *threshold*, a
    text file named "<stem>_score_<score>.txt" is written to
    *output_folder*, holding the score followed by the extracted resume
    text. The PDF is read a second time to obtain that text.

    Args:
        resume_folder: Folder containing the PDF resumes.
        job_skills: Skills passed through to process_resume.
        output_folder: Folder for the selected resumes; created if missing.
        threshold: Minimum score (in percent) for a resume to be selected.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(resume_folder):
        # splitext touches only the real extension, unlike str.replace, which
        # would rewrite every ".pdf" occurring anywhere in the name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".pdf":
            continue

        resume_path = os.path.join(resume_folder, filename)
        resume_score = process_resume(resume_path, job_skills)

        if resume_score >= threshold:
            selected_resume_path = os.path.join(output_folder, f"{stem}_score_{resume_score:.2f}.txt")
            with open(selected_resume_path, 'w', encoding='utf-8') as output_file:
                output_file.write(f"Resume Score: {resume_score:.2f}%\n\n")
                output_file.write(extract_text_from_pdf(resume_path))

    print("Resumes processed and selected ones saved in:", output_folder)

# Specify paths and job skills
# Folder scanned for PDF resumes, and folder that receives the selected ones.
resume_folder_path = "/content/my resume"
output_folder_path = "/content/Output"
# Skills each resume is scored against; process_resume intersects them with the
# resume's lemmatized tokens, so they should be spelled as canonical names.
# NOTE(review): "azur" looks like a typo for "azure" — confirm.
job_skills = ["reactjs", "python", "java", "sql", "azur", "aws"]

# Process resumes in the specified folder
# Runs immediately when this cell executes.
process_resumes_in_folder(resume_folder_path, job_skills, output_folder_path)


Resumes processed and selected ones saved in: /content/Output


# Cosine

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(job_description, resume):
    """Return the cosine similarity of two texts over bag-of-words counts.

    Args:
        job_description: Text of the job description.
        resume: Text of the resume.

    Returns:
        The similarity between the two token-count vectors.
    """
    # One shared vocabulary is fitted on both texts; row 0 is the job
    # description and row 1 is the resume.
    token_counts = CountVectorizer().fit_transform([job_description, resume])

    # Pairwise 2x2 similarity matrix; the off-diagonal entry is job vs resume.
    pairwise = cosine_similarity(token_counts)
    return pairwise[0, 1]

def process_resume_in_folder(resume_folder, job_description, output_folder, threshold=0.8):
    """Score every .txt resume by cosine similarity and save the ones that pass.

    For each text file whose similarity to *job_description* is at least
    *threshold*, a file named "<stem>_score_<score>.txt" is written to
    *output_folder*, holding the score followed by the resume text.

    Args:
        resume_folder: Folder containing the resumes as UTF-8 text files.
        job_description: Text every resume is compared against.
        output_folder: Folder for the selected resumes; created if missing.
        threshold: Minimum cosine similarity for a resume to be selected.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(resume_folder):
        # splitext touches only the real extension, unlike str.replace, which
        # would rewrite every ".txt" occurring anywhere in the name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".txt":  # Assuming resumes are in text files
            continue

        resume_path = os.path.join(resume_folder, filename)
        with open(resume_path, 'r', encoding='utf-8') as resume_file:
            resume_text = resume_file.read()

        resume_score = calculate_cosine_similarity(job_description, resume_text)

        if resume_score >= threshold:
            selected_resume_path = os.path.join(output_folder, f"{stem}_score_{resume_score:.2f}.txt")
            with open(selected_resume_path, 'w', encoding='utf-8') as output_file:
                output_file.write(f"Resume Score: {resume_score:.2f}\n\n")
                output_file.write(resume_text)

    print("Resumes processed and selected ones saved in:", output_folder)

# Specify paths and job description
# NOTE(review): both paths look like placeholders — replace them with real
# folders before running; os.listdir fails if the resume folder is missing.
resume_folder_path = "path/to/resumes/folder"
output_folder_path = "path/to/output/folder"
# Text that every resume is compared against.
job_description = "Looking for a software engineer proficient in Python and machine learning"

# Process resumes in the specified folder
# Runs immediately when this cell executes.
process_resume_in_folder(resume_folder_path, job_description, output_folder_path)


# Jaccard Similarity

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_similarity_score

def calculate_jaccard_similarity(job_description, resume):
    """Return the Jaccard similarity of the token sets of two texts.

    The score is the number of distinct tokens shared by both texts divided
    by the number of distinct tokens appearing in either of them.

    This is computed directly rather than with
    sklearn.metrics.jaccard_similarity_score: that function has been removed
    from scikit-learn, and on plain count vectors it behaved like
    element-wise accuracy rather than a set-based Jaccard index.
    NOTE(review): the cell's import of jaccard_similarity_score is no longer
    needed by this function and fails on scikit-learn releases that removed
    it — drop that import line.

    Args:
        job_description: Text of the job description.
        resume: Text of the resume.

    Returns:
        A float between 0.0 and 1.0.
    """
    documents = [job_description, resume]

    # binary=True records presence/absence of each token instead of counts.
    vectorizer = CountVectorizer(binary=True)
    presence = vectorizer.fit_transform(documents).toarray().astype(bool)

    shared_terms = (presence[0] & presence[1]).sum()
    # Every vocabulary term occurs in at least one of the two texts, so the
    # union is never empty once fit_transform has succeeded.
    all_terms = (presence[0] | presence[1]).sum()

    return float(shared_terms / all_terms)

def process_resume_in_folder(resume_folder, job_description, output_folder, threshold=0.5):
    """Score every .txt resume by Jaccard similarity and save the ones that pass.

    For each text file whose similarity to *job_description* is at least
    *threshold*, a file named "<stem>_score_<score>.txt" is written to
    *output_folder*, holding the score followed by the resume text.

    Args:
        resume_folder: Folder containing the resumes as UTF-8 text files.
        job_description: Text every resume is compared against.
        output_folder: Folder for the selected resumes; created if missing.
        threshold: Minimum Jaccard similarity for a resume to be selected.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(resume_folder):
        # splitext touches only the real extension, unlike str.replace, which
        # would rewrite every ".txt" occurring anywhere in the name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".txt":  # Assuming resumes are in text files
            continue

        resume_path = os.path.join(resume_folder, filename)
        with open(resume_path, 'r', encoding='utf-8') as resume_file:
            resume_text = resume_file.read()

        resume_score = calculate_jaccard_similarity(job_description, resume_text)

        if resume_score >= threshold:
            selected_resume_path = os.path.join(output_folder, f"{stem}_score_{resume_score:.2f}.txt")
            with open(selected_resume_path, 'w', encoding='utf-8') as output_file:
                output_file.write(f"Resume Score: {resume_score:.2f}\n\n")
                output_file.write(resume_text)

    print("Resumes processed and selected ones saved in:", output_folder)

# Specify paths and job description
# NOTE(review): both paths look like placeholders — replace them with real
# folders before running; os.listdir fails if the resume folder is missing.
resume_folder_path = "path/to/resumes/folder"
output_folder_path = "path/to/output/folder"
# Text that every resume is compared against.
job_description = "Looking for a software engineer proficient in Python and machine learning"

# Process resumes in the specified folder
# Runs immediately when this cell executes.
process_resume_in_folder(resume_folder_path, job_description, output_folder_path)


# Euclidean Distance

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

def calculate_euclidean_distance(job_description, resume):
    """Return the Euclidean distance between two texts' bag-of-words counts.

    Args:
        job_description: Text of the job description.
        resume: Text of the resume.

    Returns:
        The distance between the two token-count vectors; smaller values
        mean the count vectors are closer.
    """
    # One shared vocabulary is fitted on both texts; row 0 is the job
    # description and row 1 is the resume.
    token_counts = CountVectorizer().fit_transform([job_description, resume]).toarray()
    job_vector, resume_vector = token_counts[0], token_counts[1]

    # euclidean_distances expects 2-D inputs, so each vector is wrapped in a
    # list and the single entry of the 1x1 result is returned.
    distance_matrix = euclidean_distances([job_vector], [resume_vector])
    return distance_matrix[0][0]

def process_resume_in_folder(resume_folder, job_description, output_folder, threshold=100):
    """Score every .txt resume by Euclidean distance and save the ones that pass.

    For each text file whose distance to *job_description* is at most
    *threshold*, a file named "<stem>_score_<score>.txt" is written to
    *output_folder*, holding the score followed by the resume text.

    Args:
        resume_folder: Folder containing the resumes as UTF-8 text files.
        job_description: Text every resume is compared against.
        output_folder: Folder for the selected resumes; created if missing.
        threshold: Maximum distance for a resume to be selected.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(resume_folder):
        # splitext touches only the real extension, unlike str.replace, which
        # would rewrite every ".txt" occurring anywhere in the name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".txt":  # Assuming resumes are in text files
            continue

        resume_path = os.path.join(resume_folder, filename)
        with open(resume_path, 'r', encoding='utf-8') as resume_file:
            resume_text = resume_file.read()

        resume_score = calculate_euclidean_distance(job_description, resume_text)

        # Distance, not similarity: lower is better, hence "<=".
        if resume_score <= threshold:
            selected_resume_path = os.path.join(output_folder, f"{stem}_score_{resume_score:.2f}.txt")
            with open(selected_resume_path, 'w', encoding='utf-8') as output_file:
                output_file.write(f"Resume Score: {resume_score:.2f}\n\n")
                output_file.write(resume_text)

    print("Resumes processed and selected ones saved in:", output_folder)

# Specify paths and job description
# NOTE(review): both paths look like placeholders — replace them with real
# folders before running; os.listdir fails if the resume folder is missing.
resume_folder_path = "path/to/resumes/folder"
output_folder_path = "path/to/output/folder"
# Text that every resume is compared against.
job_description = "Looking for a software engineer proficient in Python and machine learning"

# Process resumes in the specified folder
# Runs immediately when this cell executes.
process_resume_in_folder(resume_folder_path, job_description, output_folder_path)


# TF-IDF

In [None]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_tfidf_similarity(job_description, resume):
    """Return the cosine similarity of two texts over TF-IDF vectors.

    Args:
        job_description: Text of the job description.
        resume: Text of the resume.

    Returns:
        The similarity between the two TF-IDF vectors.
    """
    # The vectorizer is fitted on just these two texts; row 0 is the job
    # description and row 1 is the resume.
    weighted_terms = TfidfVectorizer().fit_transform([job_description, resume])

    # Pairwise 2x2 similarity matrix; the off-diagonal entry is job vs resume.
    pairwise = cosine_similarity(weighted_terms)
    return pairwise[0, 1]

def process_resume_in_folder(resume_folder, job_description, output_folder, threshold=0.5):
    """Score every .txt resume by TF-IDF similarity and save the ones that pass.

    For each text file whose similarity to *job_description* is at least
    *threshold*, a file named "<stem>_score_<score>.txt" is written to
    *output_folder*, holding the score followed by the resume text.

    Args:
        resume_folder: Folder containing the resumes as UTF-8 text files.
        job_description: Text every resume is compared against.
        output_folder: Folder for the selected resumes; created if missing.
        threshold: Minimum TF-IDF cosine similarity for a resume to be
            selected.
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(resume_folder):
        # splitext touches only the real extension, unlike str.replace, which
        # would rewrite every ".txt" occurring anywhere in the name.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".txt":  # Assuming resumes are in text files
            continue

        resume_path = os.path.join(resume_folder, filename)
        with open(resume_path, 'r', encoding='utf-8') as resume_file:
            resume_text = resume_file.read()

        resume_score = calculate_tfidf_similarity(job_description, resume_text)

        if resume_score >= threshold:
            selected_resume_path = os.path.join(output_folder, f"{stem}_score_{resume_score:.2f}.txt")
            with open(selected_resume_path, 'w', encoding='utf-8') as output_file:
                output_file.write(f"Resume Score: {resume_score:.2f}\n\n")
                output_file.write(resume_text)

    print("Resumes processed and selected ones saved in:", output_folder)

# Specify paths and job description
# NOTE(review): both paths look like placeholders — replace them with real
# folders before running; os.listdir fails if the resume folder is missing.
resume_folder_path = "path/to/resumes/folder"
output_folder_path = "path/to/output/folder"
# Text that every resume is compared against.
job_description = "Looking for a software engineer proficient in Python and machine learning"

# Process resumes in the specified folder
# Runs immediately when this cell executes.
process_resume_in_folder(resume_folder_path, job_description, output_folder_path)
