# Job Recommendation System

Import required libraries and setup


In [1]:
import pandas as pd
import PyPDF2
import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os
from collections import Counter

ModuleNotFoundError: No module named 'PyPDF2'

## Step 1: Gemini API Setup


In [None]:
genai.configure(api_key="AIzaSyB7mPAywVlTSjT7HLTpDJhyI8_qdI0tnFA")  # Replace with your actual key
model = genai.GenerativeModel("gemini-2.0-flash-exp")

## Step 2: PDF Text Extraction


In [None]:
def extract_text_from_pdf(file_path):
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text

## Step 3: Skills Cache Management


In [None]:
SKILLS_CACHE_FILE = 'skills_cache.json'

def load_skills_cache():
    if os.path.exists(SKILLS_CACHE_FILE):
        with open(SKILLS_CACHE_FILE, 'r') as f:
            return json.load(f)
    return {}

def save_skills_cache(cache):
    with open(SKILLS_CACHE_FILE, 'w') as f:
        json.dump(cache, f)

## Step 4: Skills Extraction


In [None]:
def extract_skills(cv_text):
    cache_key = cv_text[:100]
    cache = load_skills_cache()

    if cache_key in cache:
        return cache[cache_key]

    prompt = f"""Extract only the technical and professional skills from this resume text:
    {cv_text}
    Return the skills in a comma-separated list. Focus on technical skills only."""
    
    response = model.generate_content(prompt)
    skills = [skill.strip().lower() for skill in response.text.split(",") if skill.strip()]
    
    cache[cache_key] = skills
    save_skills_cache(cache)
    
    return skills

## Step 5: Job Data Processing


In [None]:
def load_jobs(csv_path):
    df = pd.read_csv(csv_path)
    df.dropna(subset=["description"], inplace=True)
    
    tech_keywords = ['developer', 'engineer', 'programmer', 'software', 'data', 'python',
                    'java', 'web', 'full stack', 'backend', 'frontend', 'ai', 'ml']
    df['is_tech'] = df['title'].str.lower().str.contains('|'.join(tech_keywords))
    df = df[df['is_tech']]
    
    return df

## Step 6: Job Matching Functions


In [None]:
def calculate_skill_importance(skills):
    # Enhanced skill weighting with error handling
    if not skills:
        return {}
        
    # core_skills = {
    #     'python': 1.6, 'java': 1.4, 'javascript': 1.3,
    #     'machine learning': 1.5, 'artificial intelligence': 1.5,
    #     'data science': 1.4, 'tensorflow': 1.4, 'scikit-learn': 1.3,
    #     'opencv': 1.3, 'android': 1.3, 'nextjs': 1.2,
    #     'html': 1.1, 'css': 1.1, 'bootstrap': 1.1,
    #     'git': 1.2, 'pandas': 1.3, 'numpy': 1.3,
    #     'laravel': 1.2, 'plotly': 1.2
    # }
    
    technology_groups = {
        'frontend': ['html', 'css', 'javascript', 'bootstrap', 'nextjs'],
        'backend': ['python', 'java', 'laravel'],
        'data_science': ['pandas', 'numpy', 'scikit-learn', 'tensorflow', 'plotly'],
        'ai_ml': ['machine learning', 'artificial intelligence', 'opencv'],
        'mobile': ['android', 'android development'],
        'version_control': ['git'],
        'web_development': ['web', 'full stack', 'developer', 'engineer', 'programmer'],
        'general': ['software', 'data'],
        'digital_marketing': ['digital marketing', 'seo', 'sem'],
        'cloud_computing': ['aws', 'azure', 'google cloud', 'cloud computing'],
        'devops': ['docker', 'kubernetes', 'jenkins', 'ci/cd'],
        'cyber_security': ['cyber security', 'network security', 'penetration testing'],
        'blockchain': ['blockchain', 'cryptocurrency', 'ethereum'],
    }
    core_skills = {}
    
    try:
        skill_weights = {skill.lower(): core_skills.get(skill.lower(), 1.0) for skill in skills}
        
        # Boost weights for skill combinations
        for skill in skills:
            skill_lower = skill.lower()
            for group, group_skills in technology_groups.items():
                if skill_lower in [s.lower() for s in group_skills]:
                    related_skills = sum(1 for s in skills if s.lower() in [gs.lower() for gs in group_skills])
                    if related_skills > 1:
                        skill_weights[skill_lower] *= (1 + 0.1 * (related_skills - 1))
        
        return skill_weights
    except Exception as e:
        print(f"Error in calculate_skill_importance: {str(e)}")
        return {skill.lower(): 1.0 for skill in skills}  # Fallback to default weights

def recommend_jobs(skills, jobs_df):
    if not isinstance(skills, list) or not skills:
        return pd.DataFrame()  # Return empty DataFrame if skills is invalid
    
    if jobs_df.empty:
        return pd.DataFrame()  # Return empty DataFrame if no jobs
        
    try:
        skill_weights = calculate_skill_importance(skills)
        
        # Ensure description column exists and handle missing values
        if 'description' not in jobs_df.columns:
            print("Error: 'description' column not found in jobs dataframe")
            return pd.DataFrame()
            
        jobs_df['description'] = jobs_df['description'].fillna('')
        job_texts = jobs_df["description"].str.lower().tolist()
        user_profile = " ".join(skills)
        
        # Calculate TF-IDF similarity with error handling
        try:
            vectorizer = TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                max_features=10000
            )
            vectors = vectorizer.fit_transform([user_profile] + job_texts)
            tfidf_similarity = cosine_similarity(vectors[0:1], vectors[1:])[0]
        except Exception as e:
            print(f"Error in TF-IDF calculation: {str(e)}")
            tfidf_similarity = np.zeros(len(job_texts))
        
        # Enhanced skill matching with error handling
        skill_matches = []
        for desc in job_texts:
            try:
                skill_score = 0
                matched_skills = set()
                
                for skill in skills:
                    skill_lower = skill.lower()
                    if skill_lower in desc.lower():
                        matched_skills.add(skill_lower)
                        skill_score += skill_weights.get(skill_lower, 1.0)
                
                coverage_ratio = len(matched_skills) / len(skills)
                skill_score = (skill_score / len(skills)) * (1 + coverage_ratio)
                skill_matches.append(skill_score)
            except Exception as e:
                print(f"Error in skill matching: {str(e)}")
                skill_matches.append(0)
        
        # Combine scores safely
        combined_scores = [0.65 * sm + 0.35 * ts for sm, ts in zip(skill_matches, tfidf_similarity)]
        
        # Apply scores and create result DataFrame
        result_df = jobs_df.copy()
        result_df['score'] = combined_scores
        result_df['matched_skills_count'] = [sum(1 for skill in skills if skill.lower() in desc.lower()) 
                                         for desc in job_texts]
        
        # Filter and sort results
        qualified_jobs = result_df[
            (result_df['score'] > 0.3) &  # Minimum relevance score
            (result_df['matched_skills_count'] >= len(skills) * 0.3)  # At least 30% skill match
        ]
        
        if qualified_jobs.empty:
            return pd.DataFrame()  # Return empty DataFrame if no matches
        
        # Calculate final score and sort
        qualified_jobs['final_score'] = (qualified_jobs['score'] * 0.7 + 
                                     (qualified_jobs['matched_skills_count'] / len(skills)) * 0.3)
        
        # Ensure all required columns exist
        required_columns = ["title", "company", "location", "description", "final_score"]
        for col in required_columns:
            if col not in qualified_jobs.columns:
                qualified_jobs[col] = ""  # Add empty column if missing
        
        return qualified_jobs.sort_values(
            by='final_score', 
            ascending=False
        )[required_columns]
        
    except Exception as e:
        print(f"Error in job recommendation: {str(e)}")
        return pd.DataFrame()  # Return empty DataFrame on error

## Step 7: Run the System


In [None]:
# Extract skills from CV
cv_text = extract_text_from_pdf("Harsh_Jaiswal_updated.pdf")
skills = extract_skills(cv_text)
print("Extracted Skills:")
print(skills)

Extracted Skills:
['python', 'artificial intelligence', 'data science', 'machine learning', 'git', 'pandas', 'numpy', 'android', 'android development', 'bootstrap', 'java', 'laravel', 'html', 'css', 'javascript', 'plotly', 'tensorflow', 'scikit-learn', 'opencv', 'nextjs']


In [None]:
# Load and process jobs
jobs_df = load_jobs("jobs.csv")
print(f"Total jobs loaded: {len(jobs_df)}")

Total jobs loaded: 7543


In [None]:
# Get all matching jobs

skills = ["flutter","dart","Android"]


matching_jobs = recommend_jobs(skills, jobs_df)
print(f"\nTotal matching jobs found: {len(matching_jobs)}")
print("\nMatching Jobs (sorted by relevance):")


matching_jobs





Total matching jobs found: 241

Matching Jobs (sorted by relevance):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified_jobs['final_score'] = (qualified_jobs['score'] * 0.7 +


Unnamed: 0,title,company,location,description,final_score
345,Software Engineer (Android Experience Required),"NuStep, LLC","Ann Arbor, MI",Apply to: Careers@NuStep.com Basic Function Re...,0.796527
1565,Software Engineer,Skycart,"San Jose, CA",Skycart is currently looking for a Software De...,0.753048
6729,Senior Software Engineer - Android,Realtor.com,"Austin, TX","At realtor.com®, we believe that everyone dese...",0.735545
6722,Senior Software Engineer - Android,Realtor.com,"Morgantown, WV","At realtor.com®, we believe that everyone dese...",0.735545
6728,Senior Software Engineer - Android,Realtor.com,"Westlake Village, CA","At realtor.com®, we believe that everyone dese...",0.735545
...,...,...,...,...,...
7131,Junior Software Engineer - Location Flexible,Dropbox,"Seattle, WA",Company Description Dropbox is a leading globa...,0.310197
6251,Junior Software Engineer - Location Flexible,Dropbox,"Seattle, WA",Company Description Dropbox is a leading globa...,0.310197
6249,Junior Software Engineer - Location Flexible,Dropbox,"San Francisco, CA",Company Description Dropbox is a leading globa...,0.310197
6985,Software Engineer Intern,LinkedIn,"Sunnyvale, CA",Software Engineer Internship - Summer 2022 Lin...,0.310119


In [None]:
df = pd.read_csv("jobs.csv")
df.loc[345]

Employment type                                              Full-time
Industries           Electrical/Electronic Manufacturing, Consumer ...
Job function                    Engineering and Information Technology
Seniority level                                            Entry level
company                                                    NuStep, LLC
company_id                                                    130481.0
context                                                            NaN
date                                                               NaN
description          Apply to: Careers@NuStep.com Basic Function Re...
education                                                          NaN
location                                                 Ann Arbor, MI
months_experience                                                  NaN
post_id                                                     2645088621
post_url             https://www.linkedin.com/jobs/view/software-en...
sal_hi