In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
import re
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Loading dataset from Hugging Face...")
dataset = load_dataset("cnamuangtoun/resume-job-description-fit")
print(f"Dataset loaded. Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")

Loading dataset from Hugging Face...
Dataset loaded. Train size: 6241, Test size: 1759


In [3]:
class JobSearchRAG:
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.job_descriptions = []
        self.job_embeddings = None
    
    def clean_text(self, text: str) -> str:
        """Clean text by removing extra whitespace and normalizing newlines."""
        text = re.sub(r'\s+', ' ', text)
        text = text.replace('\\n', '\n').strip()
        return text
    
    def extract_job_info(self, text: str, fit_label: str = None) -> Dict[str, str]:
        """Extract structured information from job description text."""
        text = self.clean_text(text)
        
        # Extract job title/role
        role_patterns = [
            r"(?:Job Title|Position|Role):\s*([^\n]+)",
            r"^(?:Senior|Junior|Lead|Principal|Staff)?\s*([A-Za-z\s]+(?:Engineer|Developer|Analyst|Manager|Architect|Designer|Consultant|Director|Specialist))[^\n]*",
            r"([A-Za-z\s]+(?:Engineer|Developer|Analyst|Manager|Architect|Designer|Consultant|Director|Specialist))[^\n]*"
        ]
        
        role = "Role Not Specified"
        for pattern in role_patterns:
            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
            if match:
                role = match.group(1).strip()
                break
        
        # Extract location
        location_patterns = [
            r"Location:\s*([^\n]+)",
            r"(?:based in|located in|location):\s*([^\n]+)",
            r"(?:City|State|Country):\s*([^\n]+)"
        ]
        
        location = "Location Not Specified"
        for pattern in location_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                location = match.group(1).strip()
                break
        
        # Extract work mode
        work_mode_patterns = [
            r"(?:Work Mode|Work Type|Work Location):\s*([^\n]+)",
            r"(?:Remote|Hybrid|On-site|In-office|Virtual)(?:\s+work|\s+position)?"
        ]
        
        work_mode = "Not Specified"
        for pattern in work_mode_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                work_mode = match.group(1).strip() if match.groups() else match.group(0).strip()
                break
                
        # Create unique identifier for deduplication
        import hashlib
        content_hash = hashlib.md5(text.encode()).hexdigest()
        
        return {
            'role': role,
            'location': location,
            'work_mode': work_mode,
            'text': text,
            'fit_label': fit_label,
            'hash': content_hash
        }

    def process_dataset(self, dataset):
        """Process both train and test splits of the dataset with deduplication."""
        print("Processing dataset...")
        seen_hashes = set()
        all_jobs = []
        
        # Process both train and test sets
        for split in ['train', 'test']:
            for item in dataset[split]:
                job_text = item['job_description_text']
                fit_label = item['label']  # Get the fit label
                
                if isinstance(job_text, str) and len(job_text.strip()) > 0:
                    job_info = self.extract_job_info(job_text, fit_label)
                    
                    # Only add if we haven't seen this exact job before
                    if job_info['hash'] not in seen_hashes:
                        seen_hashes.add(job_info['hash'])
                        # Remove hash before adding to final list
                        del job_info['hash']
                        all_jobs.append(job_info)
        
        print(f"Found {len(all_jobs)} unique job descriptions")
        return all_jobs

    def add_job_descriptions(self, dataset):
        """Process and add job descriptions from the dataset."""
        self.job_descriptions = self.process_dataset(dataset)
        
        # Create embeddings for job descriptions
        print("Creating embeddings...")
        job_texts = [job['text'] for job in self.job_descriptions]
        self.job_embeddings = self.model.encode(job_texts, show_progress_bar=True)
        print("Embeddings created successfully!")
        
    def search_jobs(self, query: str, num_results: int = 5) -> List[Dict]:
        """Search for jobs and return results with fit labels."""
        # Encode query
        query_embedding = self.model.encode([query])[0]
        
        # Calculate similarities
        similarities = cosine_similarity([query_embedding], self.job_embeddings)[0]
        
        # Get top k matches
        top_indices = np.argsort(similarities)[-num_results:][::-1]
        
        results = []
        for idx in top_indices:
            job = self.job_descriptions[idx].copy()
            job['similarity_score'] = float(similarities[idx])
            results.append(job)
            
        return results

In [4]:
rag_system = JobSearchRAG()
rag_system.add_job_descriptions(dataset)

Processing dataset...
Found 351 unique job descriptions
Creating embeddings...


Batches: 100%|██████████| 11/11 [00:02<00:00,  4.09it/s]

Embeddings created successfully!





In [5]:
print("Saving processed data...")
np.save('job_embeddings.npy', rag_system.job_embeddings)
import pickle
with open('job_descriptions.pkl', 'wb') as f:
    pickle.dump(rag_system.job_descriptions, f)

Saving processed data...


In [6]:
print("\nTesting the system...")
test_query = "I am skilled in Python and SQL with 5 years of experience. Looking for a job in New York"
results = rag_system.search_jobs(test_query, num_results=5)

print("\nTest Results:")
for i, job in enumerate(results, 1):
    print(f"\n{i}. Role: {job['role']}")
    print(f"Location: {job['location']}")
    print(f"Work Mode: {job['work_mode']}")
    print(f"Fit Label: {job['fit_label']}")
    print(f"Similarity Score: {job['similarity_score']:.2f}")
    print("-" * 50)


Testing the system...

Test Results:

1. Role: and software test engineer
Location: Location Not Specified
Work Mode: Not Specified
Fit Label: No Fit
Similarity Score: 0.52
--------------------------------------------------

2. Role: Software Developer
Location: Location Not Specified
Work Mode: Hybrid
Fit Label: No Fit
Similarity Score: 0.51
--------------------------------------------------

3. Role: Software Data Engineer
Location: Location Not Specified
Work Mode: Hybrid
Fit Label: No Fit
Similarity Score: 0.50
--------------------------------------------------

4. Role: Data AnalystLocation: Plano, TX (Day 1 Hybrid)Duration: Long Term Contract Description: Must have skills SQL and Tableau 4 years of Data Analysis and business knowledge within a healthcare environment and testing arena3-5 years of Microsoft SQL experience specifically using SQL to build queries, reports.--
Location: Plano, TX (Day 1 Hybrid)Duration: Long Term Contract Description: Must have skills SQL and Tableau 