In [18]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Synonym dictionary for skills standardization
synonym_dict = {
    'ML': 'machine learning',
    'AI': 'artificial intelligence',
    'k8s': 'kubernetes',
    'sql': 'database management',
}

# Function to replace synonyms
def replace_synonyms(skills):
    return [synonym_dict.get(skill.lower(), skill.lower()) for skill in skills]

# Function to preprocess text
def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [synonym_dict.get(token.lemma_, token.lemma_) for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

# Function to parse raw entries
def parse_raw_entry(raw_entry):
    try:
        # Attempt to parse as standard JSON
        parsed_entry = json.loads(raw_entry)
        return parsed_entry
    except (json.JSONDecodeError, TypeError):
        # Handle raw, malformed entries
        lines = raw_entry.split("\n")
        structured_entry = {"college name": None, "10th marks": None, "12th marks": None, "CGPA": None, "skills": []}

        for line in lines:
            line = line.strip()
            if line.startswith("* College Name:"):
                structured_entry["college name"] = line.replace("* College Name:", "").strip()
            elif line.startswith("* 10th Marks:"):
                structured_entry["10th marks"] = line.replace("* 10th Marks:", "").strip()
            elif line.startswith("* 12th Marks:"):
                structured_entry["12th marks"] = line.replace("* 12th Marks:", "").strip()
            elif line.startswith("* CGPA:"):
                structured_entry["CGPA"] = line.replace("* CGPA:", "").strip()
            elif line.startswith("* Skills:"):
                skills_part = line.replace("* Skills:", "").strip()
                skills = [skill.strip() for skill in skills_part.split(",") if skill.strip()]
                structured_entry["skills"].extend(skills)

        return structured_entry

# Function to clean raw extractions
def clean_extractions(data):
    for entry in data:
        if "error" in entry:
            # Skip entries with errors
            continue
        response = entry.get("response")
        if isinstance(response, str):  # Detect raw string format
            entry["response"] = parse_raw_entry(response)
    return data

# Extract valid resumes
def extract_valid_resumes(data):
    resumes = []
    for entry in data:
        response = entry.get("response")
        if response:
            skills = replace_synonyms(response.get("skills", []))
            resumes.append({
                "file_name": entry["file_name"],
                "skills": " ".join(skills),
                "10th_marks": response.get("10th marks"),
                "12th_marks": response.get("12th marks"),
                "CGPA": response.get("CGPA"),
            })
    return pd.DataFrame(resumes)

# Skip normalization for CGPA and marks
def normalize_features(df, columns):
    """
    Normalize numerical features (e.g., marks and CGPA) and handle non-numeric data.
    Skips normalization for non-numeric columns like CGPA and marks.
    """
    for col in columns:
        if col in ['10th_marks', '12th_marks', 'CGPA']:
            continue  # Skip these columns from normalization
        df[col] = df[col].fillna(0)  # Fill missing values with 0 (or use another strategy)
    
    return df


In [19]:
def compute_similarity(df, job_description):
    job_vector = nlp(preprocess_text(job_description)).vector

    if job_vector is None or len(job_vector) == 0:
        raise ValueError("Job description vector is invalid.")

    # Generate skill vectors for resumes, replacing empty ones with zeros
    df['Skill_Vector'] = df['skills'].apply(
        lambda x: nlp(x).vector if isinstance(x, str) and x.strip() else np.zeros(nlp.vocab.vectors_length)
    )

    # Remove rows with invalid skill vectors
    df = df[df['Skill_Vector'].apply(lambda x: np.any(x))]

    # Compute similarity scores based on skill vectors
    similarity_scores = [
        cosine_similarity(vec.reshape(1, -1), job_vector.reshape(1, -1))[0][0]
        for vec in df['Skill_Vector']
    ]

    df['Similarity_Score'] = similarity_scores

    # Sort by similarity score, take the top 10, and ensure it's a DataFrame
    top_10_resumes = df[['file_name', 'Similarity_Score', '10th_marks', '12th_marks', 'CGPA']].sort_values(
        by='Similarity_Score', ascending=False
    ).head(10)

    return pd.DataFrame(top_10_resumes)



# Main function
def rank_resumes(json_path, job_description):
    with open(json_path, 'r') as file:
        data = json.load(file)
    
    cleaned_data = clean_extractions(data)

    # Extract valid resumes and preprocess
    resumes_df = extract_valid_resumes(cleaned_data)
    resumes_df = normalize_features(resumes_df, ['10th_marks', '12th_marks', 'CGPA'])

    ranked_df = compute_similarity(resumes_df, job_description)
    return ranked_df.head(20)



In [20]:
if __name__ == "__main__":
    json_path = "bulk_responses.json"  # Replace with your JSON file path
    job_description_path = "job1.txt"  # Path to your job description file

    # Read job description from file
    with open(job_description_path, 'r', encoding='utf-8') as file:
        job_description = file.read().strip()  # Read and clean up whitespace

    # Rank resumes based on the job description
    ranked_resumes = rank_resumes(json_path, job_description)

    # Display the top 10 resumes
    print(ranked_resumes)

    # Save the ranked resumes to a JSON file
    json_output_path = "ranked_resumes.json"  # Path for the JSON output file
    ranked_resumes.to_json(json_output_path, orient="records", lines=False, indent=4)
    print(f"Ranked resumes have been saved to {json_output_path}")


                              file_name  Similarity_Score 10th_marks  \
61          22220131_ARYAN_LAD_VIIT.pdf          0.922561       None   
39  22110633_ATHARVA_NANDURKAR_VIIT.pdf          0.915797       None   
45    22110712_SANDESH_BUCHKUL_VIIT.pdf          0.900788       None   
44   22110709_PRASAD_KANAKGIRI_VIIT.pdf          0.900788       None   
43      22110705_PRAJWAL_PATIL_VIIT.pdf          0.900765       None   
47       22110724_GANESH_JOSHI_VIIT.pdf          0.894242       None   
32     22110511_APURVA_BELSARE_VIIT.pdf          0.885377       None   
76      22220287_MUSKAN_SHAIKH_VIIT.pdf          0.885334       None   
74     22220262_PUSHKAR_INGALE_VIIT.pdf          0.885190       None   
56        22110853_KUSHAL_MALU_VIIT.pdf          0.882646       None   

   12th_marks          CGPA  
61       None          None  
39       None          9.23  
45       None       9.03/10  
44       None     8.79/10.0  
43       None  8.95 / 10.00  
47       None          8.95

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Similarity_Score'] = similarity_scores
