In [1]:
import csv
import json
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# --- Browser configuration -------------------------------------------------
chrome_options = Options()

# Command-line switches: a desktop Chrome user agent, and geolocation disabled.
for chrome_flag in (
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/126.0.6478.127 Safari/537.36',
    '--disable-geolocation',
):
    chrome_options.add_argument(chrome_flag)

# Profile preference for the geolocation prompt: 0 = Ask, 1 = Allow, 2 = Deny.
prefs = {}
prefs["profile.default_content_setting_values.geolocation"] = 2
chrome_options.add_experimental_option("prefs", prefs)

# --- Launch the browser ----------------------------------------------------
driver = webdriver.Chrome(options=chrome_options)

# Drop any cookies in the new session so no earlier permissions carry over.
driver.delete_all_cookies()

The chromedriver version (134.0.6998.165) detected in PATH at C:\chromedriver-win64\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (135.0.7049.96); currently, chromedriver 135.0.7049.95 is recommended for chrome 135.*, so it is advised to delete the driver in PATH and retry


In [2]:
# Search-results page for "Data Analytics" roles on the Walmart careers site.
url = "https://careers.walmart.com/results?q=Data%20Analytics&page=1&sort=rank&jobEmploymentType=0000015a-721c-dcbc-afda-779e92ad0000&expand=department,brand,type,rate&jobCareerArea=all"
driver.get(url)

# Debug: confirm which page the browser actually landed on.
print("Page title:", driver.title)
print("Page URL:", driver.current_url)

# Wait for job listings to load (up to 5 seconds).
try:
    WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.search-result.job-listing"))
    )
except TimeoutException:
    # Only a genuine wait timeout is handled here; any other failure propagates
    # with its own traceback instead of being reported as a timeout.
    print("Timed out waiting for job listings to load.")
    # Debug: show just the start of the page source so the cell output stays readable.
    print(driver.page_source[:2000])
    # Re-raise instead of calling exit(): the cell stops, but the kernel and the
    # browser session stay alive for inspection.
    raise

jobs = driver.find_elements(By.CSS_SELECTOR, "li.search-result.job-listing")
print(f"Found {len(jobs)} job elements.")

Page title: Results
Page URL: https://careers.walmart.com/results?q=Data%20Analytics&page=1&sort=rank&jobEmploymentType=0000015a-721c-dcbc-afda-779e92ad0000&expand=department,brand,type,rate&jobCareerArea=all
Found 25 job elements.


In [3]:
def _listing_text(card, selector):
    """Return the stripped text of the first element inside `card` matching `selector`."""
    return card.find_element(By.CSS_SELECTOR, selector).text.strip()


# Build one record per listing card; a card that cannot be read is reported and skipped.
job_data = []
for card in jobs:
    try:
        record = {
            # Title text comes from the <a> inside the h4 heading.
            "title": _listing_text(card, "h4.job-listing__title a"),
            "department": _listing_text(card, "span.job-listing__department"),
            "location": _listing_text(card, "span.job-listing__location"),
            "date": _listing_text(card, "span.job-listing__created"),
            # The same <a> carries the link to the job's detail page.
            "link": card.find_element(By.CSS_SELECTOR, "h4.job-listing__title a").get_attribute("href"),
            # Placeholder; filled in later from the detail page.
            "description": "",
        }
    except Exception as e:
        print(f"Error scraping job: {e}")
        continue
    job_data.append(record)
print(job_data)

[{'title': 'Senior Manager, Data Analytics', 'department': 'DATA ANALYTICS AND BUSINESS INTELLIGENCE', 'location': 'BENTONVILLE, AR', 'date': '03/17/25', 'link': 'https://careers.walmart.com/us/jobs/WD2114848-senior-manager-data-analytics', 'description': ''}, {'title': 'Senior Manager, Data Analytics - Product and Marketing Analytics', 'department': 'DATA ANALYTICS AND BUSINESS INTELLIGENCE', 'location': 'BENTONVILLE, AR', 'date': '03/11/25', 'link': 'https://careers.walmart.com/us/jobs/WD2091244-senior-manager-data-analytics-product-and-marketing-analytics', 'description': ''}, {'title': '(USA) Senior Manager, Data Analytics', 'department': 'DATA ANALYTICS AND BUSINESS INTELLIGENCE', 'location': 'BENTONVILLE, AR', 'date': '04/18/25', 'link': 'https://careers.walmart.com/us/jobs/WD2154982-usa-senior-manager-data-analytics', 'description': ''}, {'title': '(USA) Manager, Data Analytics', 'department': 'DATA ANALYTICS AND BUSINESS INTELLIGENCE', 'location': 'BENTONVILLE, AR', 'date': '04

In [4]:
def get_job_details(driver, job_link, timeout=5):
    """Open a job's detail page and read its requisition ID and description.

    Args:
        driver: Selenium WebDriver used to load the page.
        job_link (str): URL of the job detail page.
        timeout (int | float): Seconds to wait for the job-details section
            to appear. Defaults to 5.

    Returns:
        tuple[str, str]: ``(requisition_id, description)``. If the page cannot
        be loaded both values are ``"Error"``. If only one field is missing it
        is replaced by ``"Not found"`` / ``"Description not found"``.
    """
    # Step 1: load the page. A failure here makes both fields unavailable.
    try:
        driver.get(job_link)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "body > main > section.job-details"))
        )
    except Exception as e:
        print(f"Error loading job page {job_link}: {e}")
        return "Error", "Error"

    # Step 2: requisition ID. The selector picks the 6th item of the job-data
    # list, so it depends on the current page layout.
    try:
        requisition_id = driver.find_element(
            By.CSS_SELECTOR,
            "body > main > section.job-details > div > div.job-data > ul > li:nth-child(6) > span.job-data__value"
        ).text.strip()
    except Exception as e:
        print(f"Error finding requisition ID: {e}")
        requisition_id = "Not found"

    # Step 3: description. `except Exception` (not a bare `except`) so that
    # KeyboardInterrupt / SystemExit still stop a long scraping run.
    try:
        description = driver.find_element(
            By.CSS_SELECTOR,
            "body > main > section.job-details > div > div.job-description > div > div:nth-child(1)"
        ).text.strip()
    except Exception as e:
        print(f"Error finding description for {job_link}: {e}")
        description = "Description not found"

    return requisition_id, description

# Visit each listing's detail page and attach the scraped fields to its record.
for listing in job_data:
    print(f"Processing: {listing['title']}")
    listing['requisition_id'], listing['description'] = get_job_details(driver, listing['link'])
    time.sleep(2)  # Pause between requests to avoid overwhelming the server

# Short preview of every enriched record.
for listing in job_data:
    print(
        f"Title: {listing['title']}",
        f"Requisition ID: {listing['requisition_id']}",
        f"Description preview: {listing['description'][:100]}...",
        "-" * 50,
        sep="\n",
    )

Processing: Senior Manager, Data Analytics
Processing: Senior Manager, Data Analytics - Product and Marketing Analytics
Processing: (USA) Senior Manager, Data Analytics
Processing: (USA) Manager, Data Analytics
Processing: (USA) Director, Data Analytics
Processing: (USA) Senior Manager, Data Analytics
Processing: Director, Data Analytics – Contribution Profit Intelligence
Processing: Senior Director, Data Science – Walmart Connect – Head of Pricing & Yield Analytics
Processing: Senior Data Scientist, Member Engagement
Processing: Data Scientist III
Processing: Data Scientist, Sam’s Member Engagement
Processing: Data Scientist III
Processing: Senior Data Scientist
Processing: Data Scientist III
Processing: Senior Data Scientist
Processing: Senior Manager, Advanced Analytics
Processing: Senior Data Scientist
Processing: Senior Data Scientist
Processing: Data Scientist
Processing: Senior Data Scientist
Processing: (USA) Senior Manager, Advanced Analytics
Processing: (USA) Manager, Advance

In [5]:
# Optional: persist the scraped jobs as two CSV files, both keyed by requisition ID.
df = pd.DataFrame(job_data)
df_jd = df.loc[:, ['requisition_id', 'description']]
df_details = df.loc[:, ['requisition_id', 'title']]

for frame, csv_name in (
    (df_jd, 'walmart_jobs_id_desc.csv'),
    (df_details, 'walmart_jobs_id_title.csv'),
):
    frame.to_csv(csv_name, index=False)
    print(f"Data saved to {csv_name}")

Data saved to walmart_jobs_id_desc.csv
Data saved to walmart_jobs_id_title.csv


In [6]:
# Summary of the scraped job table: count / unique / top / freq for each column.
df.describe()

Unnamed: 0,title,department,location,date,link,description,requisition_id
count,25,25,25,25,25,25,25
unique,17,1,2,8,25,25,25
top,Senior Data Scientist,DATA ANALYTICS AND BUSINESS INTELLIGENCE,"BENTONVILLE, AR",04/14/25,https://careers.walmart.com/us/jobs/WD2114848-...,Position Summary...\nJoin the Walmart US (WMUS...,R-2114848
freq,6,25,24,15,1,1,1


In [7]:
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Folder that receives the generated job-description PDFs
# (despite the `resume_` prefix, it is only used for job descriptions below).
resume_folder = "JD_PDF"
def _wrap_to_width(line, max_width, measure):
    """Greedily split `line` on spaces so each piece measures at most `max_width`.

    `measure` is a callable returning the rendered width of a string. A line
    that already fits (or is empty) is returned unchanged as a one-item list.
    A single word wider than `max_width` is kept whole on its own piece.
    """
    if not line or measure(line) <= max_width:
        return [line]

    pieces = []
    current = ""
    for word in line.split(' '):
        candidate = word if not current else current + ' ' + word
        if not current or measure(candidate) <= max_width:
            current = candidate
        else:
            pieces.append(current)
            current = word
    pieces.append(current)
    return pieces


def create_pdf(text, output_path, margin=50, line_height=15):
    """Write plain text to a letter-sized PDF, one text line per drawn line.

    Args:
        text (str): Text to render; split on newline characters.
        output_path (str): Destination PDF path. Its parent directory is
            created if it does not exist yet.
        margin (int | float): Page margin in points on every side. Defaults to 50.
        line_height (int | float): Vertical distance between lines in points.
            Defaults to 15.

    Lines wider than the printable area are wrapped on spaces instead of
    running off the right edge of the page.
    """
    # The canvas cannot save into a directory that does not exist.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    c = canvas.Canvas(output_path, pagesize=letter)
    width, height = letter
    max_width = width - 2 * margin
    y = height - margin  # Start near the top

    for raw_line in text.split('\n'):
        # c.stringWidth measures with the canvas' current font.
        for line in _wrap_to_width(raw_line, max_width, c.stringWidth):
            if y < margin:  # Near the bottom: continue on a new page
                c.showPage()
                y = height - margin
            c.drawString(margin, y, line)
            y -= line_height  # Move down for the next line

    c.save()

# Render each scraped description to its own PDF, named after the requisition ID.
for description_value, requisition_value in zip(df['description'], df['requisition_id']):
    job_description = str(description_value)  # Ensure it is a string
    job_id = str(requisition_value)  # Ensure it is a string
    # Keep only alphanumeric characters so the ID is safe to use in a filename.
    safe_job_id = ''.join(filter(str.isalnum, job_id))
    job_pdf_path = os.path.join(resume_folder, "job_" + safe_job_id + ".pdf")

    create_pdf(job_description, job_pdf_path)
    print(f"Job {safe_job_id} saved as PDF at: {job_pdf_path}")

Job R2114848 saved as PDF at: JD_PDF\job_R2114848.pdf
Job R2091244 saved as PDF at: JD_PDF\job_R2091244.pdf
Job R2154982 saved as PDF at: JD_PDF\job_R2154982.pdf
Job R2155309 saved as PDF at: JD_PDF\job_R2155309.pdf
Job R2140275 saved as PDF at: JD_PDF\job_R2140275.pdf
Job R2124281 saved as PDF at: JD_PDF\job_R2124281.pdf
Job R2114193 saved as PDF at: JD_PDF\job_R2114193.pdf
Job R2126555 saved as PDF at: JD_PDF\job_R2126555.pdf
Job R2148780 saved as PDF at: JD_PDF\job_R2148780.pdf
Job R2149267 saved as PDF at: JD_PDF\job_R2149267.pdf
Job R2148534 saved as PDF at: JD_PDF\job_R2148534.pdf
Job R2149273 saved as PDF at: JD_PDF\job_R2149273.pdf
Job R2148535 saved as PDF at: JD_PDF\job_R2148535.pdf
Job R2149258 saved as PDF at: JD_PDF\job_R2149258.pdf
Job R2148518 saved as PDF at: JD_PDF\job_R2148518.pdf
Job R2148532 saved as PDF at: JD_PDF\job_R2148532.pdf
Job R2148517 saved as PDF at: JD_PDF\job_R2148517.pdf
Job R2148524 saved as PDF at: JD_PDF\job_R2148524.pdf
Job R2148513 saved as PDF at

In [21]:
import os
import re
import pandas as pd
import numpy as np
from pathlib import Path
import PyPDF2
import torch
from transformers import BertTokenizer, BertModel

class DirectResumeMatcherBERT:
    """Rank job postings against PDF resumes with BERT embeddings.

    Every job and every resume is reduced to the [CLS] embedding of
    ``bert-base-uncased``. Each resume is then scored against each job with
    cosine similarity. Inputs longer than ``MAX_TOKENS`` tokens are truncated
    by the tokenizer, so only the beginning of a long text is scored.
    """

    # Workspace used when no `base_dir` is given (kept for backward compatibility).
    DEFAULT_BASE_DIR = Path("C:\\Users\\suhas\\Job Scrapping\\DirectResumeMatcher")
    MODEL_NAME = 'bert-base-uncased'
    MAX_TOKENS = 512

    def __init__(self, job_data, base_dir=None):
        """Initialize the BERT-based resume matcher with Walmart job data.

        Args:
            job_data (list): List of dictionaries with job details
                           (title, department, location, requisition_id, description).
            base_dir (str | Path | None): Workspace holding the ``Resumes`` and
                ``Output`` sub-folders. Defaults to ``DEFAULT_BASE_DIR``.
        """
        self.job_data = job_data

        # Set up directories (created on demand).
        self.base_dir = Path(base_dir) if base_dir is not None else self.DEFAULT_BASE_DIR
        self.resume_dir = self.base_dir / "Resumes"
        self.output_dir = self.base_dir / "Output"

        self.resume_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Load BERT model and tokenizer; inference only, so switch to eval mode.
        self.tokenizer = BertTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = BertModel.from_pretrained(self.MODEL_NAME)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.model.eval()

        # Embed all job descriptions once up front.
        self.processed_jobs = self._prepare_job_descriptions()

    def _extract_text_from_pdf(self, pdf_path):
        """Extract text from a PDF resume using PyPDF2.

        Args:
            pdf_path (Path): Path to the PDF file.

        Returns:
            str: Extracted text with all whitespace runs collapsed to single
            spaces, or an empty string if extraction fails.
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # extract_text() may yield nothing for a page; treat that as "".
                page_texts = [page.extract_text() or "" for page in reader.pages]
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            return ""
        # r'\s+' already covers newlines and tabs, so one substitution suffices.
        return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()

    def _preprocess_text(self, text):
        """Lightly preprocess text for BERT (lowercase, normalize spaces).

        Args:
            text (str): Raw text.

        Returns:
            str: Preprocessed text.
        """
        return re.sub(r'\s+', ' ', text.lower().strip())

    def _get_bert_embedding(self, text):
        """Generate BERT embedding for text.

        Args:
            text (str): Input text; truncated to ``MAX_TOKENS`` tokens.

        Returns:
            torch.Tensor: BERT embedding (CLS token).
        """
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.MAX_TOKENS,
            truncation=True,
            padding=True
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)
            # Use CLS token embedding (first token)
            embedding = outputs.last_hidden_state[:, 0, :].squeeze()

        return embedding

    def _prepare_job_descriptions(self):
        """Prepare job descriptions with BERT embeddings.

        Returns:
            list: List of processed job dictionaries.
        """
        processed_jobs = []
        for job in self.job_data:
            # Title and department are embedded together with the description.
            job_text = f"{job['title']} {job['department']} {job['description']}"
            processed_text = self._preprocess_text(job_text)
            processed_jobs.append({
                'title': job['title'],
                'department': job['department'],
                'location': job['location'],
                'requisition_id': job['requisition_id'],
                'raw_text': job_text,
                'processed_text': processed_text,
                'embedding': self._get_bert_embedding(processed_text)
            })
        return processed_jobs

    def find_resumes(self):
        """Find PDF resumes in the resume directory.

        Returns:
            list: List of Path objects for PDF files.
        """
        print(f"Looking for resumes in {self.resume_dir}...")
        resume_files = list(self.resume_dir.glob("*.pdf"))
        if not resume_files:
            print("No PDF resumes found.")
            print(f"Please add PDF resumes to: {self.resume_dir}")
            return []
        print(f"Found {len(resume_files)} resume(s).")
        return resume_files

    def process_resumes(self, resume_files):
        """Process resume PDFs with BERT embeddings.

        Args:
            resume_files (list): List of Path objects for resume PDFs.

        Returns:
            list: List of processed resume dictionaries. Resumes that yield no
            text are skipped with a warning.
        """
        processed_resumes = []
        for resume_path in resume_files:
            print(f"Processing resume: {resume_path.name}")
            raw_text = self._extract_text_from_pdf(resume_path)
            if not raw_text:
                print(f"Warning: Could not extract text from {resume_path.name}")
                continue
            processed_text = self._preprocess_text(raw_text)
            processed_resumes.append({
                'filename': resume_path.name,
                'path': resume_path,
                'raw_text': raw_text,
                'processed_text': processed_text,
                'embedding': self._get_bert_embedding(processed_text)
            })
        return processed_resumes

    def match_resumes_to_jobs(self, processed_resumes):
        """Match resumes to jobs using BERT embeddings and cosine similarity.

        Args:
            processed_resumes (list): List of processed resume dictionaries.

        Returns:
            list: One dictionary per resume with its filename and all jobs,
            sorted by descending similarity score.
        """
        if not processed_resumes:
            print("No processed resumes to match.")
            return []

        results = []
        for resume in processed_resumes:
            resume_embedding = resume['embedding']
            job_matches = []

            for job in self.processed_jobs:
                # Compute cosine similarity using torch
                similarity = torch.cosine_similarity(
                    resume_embedding.unsqueeze(0),
                    job['embedding'].unsqueeze(0)
                ).item()

                job_matches.append({
                    'job_title': job['title'],
                    'requisition_id': job['requisition_id'],
                    'department': job['department'],
                    'location': job['location'],
                    'similarity_score': float(similarity)
                })

            job_matches.sort(key=lambda x: x['similarity_score'], reverse=True)
            results.append({
                'resume': resume['filename'],
                'matches': job_matches
            })

        return results

    def save_results(self, results):
        """Save matching results to CSV and JSON.

        Args:
            results (list): List of match results.

        Returns:
            pandas.DataFrame: DataFrame of results or None if no results.
        """
        if not results:
            print("No results to save.")
            return None

        # Flatten to one row per (resume, job) pair for the CSV.
        all_matches = [
            {
                'Resume': result['resume'],
                'Job Title': match['job_title'],
                'Requisition ID': match['requisition_id'],
                'Department': match['department'],
                'Location': match['location'],
                'Similarity Score': match['similarity_score']
            }
            for result in results
            for match in result['matches']
        ]

        df = pd.DataFrame(all_matches)
        csv_path = self.output_dir / "resume_job_matches.csv"
        df.to_csv(csv_path, index=False)
        print(f"Saved results to {csv_path}")

        # The JSON keeps the nested per-resume structure. Explicit encoding so
        # the file does not depend on the platform default.
        json_path = self.output_dir / "resume_job_matches.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"Saved detailed results to {json_path}")

        return df

    def display_results(self, results, top_n=5):
        """Display the top matching results for each resume.

        Args:
            results (list): List of match results.
            top_n (int): Number of best matches printed per resume. Defaults to 5.
        """
        if not results:
            print("No results to display.")
            return

        print("\n===== RESUME MATCHING RESULTS (BERT) =====\n")
        for result in results:
            print(f"\nRESUME: {result['resume']}")
            print("-" * 80)
            for rank, match in enumerate(result['matches'][:top_n], start=1):
                print(f"{rank}. {match['job_title']} (ID: {match['requisition_id']})")
                print(f"   Department: {match['department']}")
                print(f"   Location: {match['location']}")
                print(f"   Similarity Score: {match['similarity_score']:.2f}")
                print()

    def run(self):
        """Run the BERT-based resume matcher.

        Returns:
            pandas.DataFrame: DataFrame of results or None if no matches.
        """
        print("Starting Direct Resume Matcher with BERT...")
        resume_files = self.find_resumes()
        if not resume_files:
            print("\nPlease add resume PDF files to:")
            print(f"{self.resume_dir}\n")
            return None
        processed_resumes = self.process_resumes(resume_files)
        results = self.match_resumes_to_jobs(processed_resumes)
        self.display_results(results)
        df = self.save_results(results)
        print("\nDirect Resume Matcher with BERT completed!")
        return df

def run_direct_resume_matcher(job_data):
    """Build a DirectResumeMatcherBERT for `job_data` and run it.

    Args:
        job_data (list): List of job dictionaries.

    Returns:
        pandas.DataFrame: Whatever the matcher's ``run()`` returns: the
        results DataFrame, or None.
    """
    return DirectResumeMatcherBERT(job_data).run()

# When executed as the main module, match the scraped `job_data` against the
# resumes and keep the resulting DataFrame (or None) in `results_df`.
if __name__ == "__main__":
    results_df = run_direct_resume_matcher(job_data)

Starting Direct Resume Matcher with BERT...
Looking for resumes in C:\Users\suhas\Job Scrapping\DirectResumeMatcher\Resumes...
Found 3 resume(s).
Processing resume: Data Engineering Resume Suhas Ramesh.pdf
Processing resume: ML Resume Suhas Ramesh.pdf
Processing resume: SE Resume Suhas Ramesh.pdf

===== RESUME MATCHING RESULTS (BERT) =====


RESUME: Data Engineering Resume Suhas Ramesh.pdf
--------------------------------------------------------------------------------
1. (USA) Director, Data Analytics (ID: R-2140275)
   Department: DATA ANALYTICS AND BUSINESS INTELLIGENCE
   Location: BENTONVILLE, AR
   Similarity Score: 0.78

2. Data Scientist III (ID: R-2149267)
   Department: DATA ANALYTICS AND BUSINESS INTELLIGENCE
   Location: BENTONVILLE, AR
   Similarity Score: 0.77

3. Staff Data Analyst (ID: R-2147731)
   Department: DATA ANALYTICS AND BUSINESS INTELLIGENCE
   Location: BENTONVILLE, AR
   Similarity Score: 0.76

4. (USA) Manager, Advanced Analytics (ID: R-2155352)
   Departme

In [23]:
import os
import shutil
from pathlib import Path
import pandas as pd
from scripts.pdf_parser import parse_pdf  # Resume-Matcher: Parse PDFs
from scripts.text_processor import extract_keywords  # Resume-Matcher: Extract keywords
from scripts.vectorizer import compute_similarity  # Resume-Matcher: Compute similarity

def integrate_resume_matcher(job_data):
    """Integrate Resume-Matcher's pipeline to match resumes to Walmart job descriptions.

    Job-description PDFs found in ``JD_PDF/`` are moved into the Resume-Matcher
    data folder. Every resume PDF is scored against every job-description PDF
    that can be mapped back to an entry of `job_data`. The five best matches
    per resume are printed and written to a CSV file.

    Args:
        job_data (list): Job dictionaries with at least ``title``,
            ``department``, ``location`` and ``requisition_id``.

    Returns:
        pandas.DataFrame | None: One row per (resume, top-5 job) pair, or None
        when no resumes or no job descriptions are available.
    """
    def _file_safe_id(requisition_id):
        # Same sanitisation the PDF export uses for its filenames (keep
        # alphanumerics only), so filenames map back to their job record.
        return ''.join(ch for ch in str(requisition_id) if ch.isalnum())

    # Set up Resume-Matcher directories
    base_dir = Path('Resume-Matcher-main')
    resume_dir = base_dir / 'Data' / 'Resumes'
    jd_dir = base_dir / 'Data' / 'JobDescription'
    output_dir = base_dir / 'Output'

    for directory in (resume_dir, jd_dir, output_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Move job description PDFs from JD_PDF/ into the Resume-Matcher data folder.
    original_jd_dir = Path('JD_PDF')
    if original_jd_dir.exists():
        # Materialise the listing first: the directory is modified while moving.
        pdfs_to_move = list(original_jd_dir.glob('*.pdf'))
        for pdf_file in pdfs_to_move:
            shutil.move(str(pdf_file), str(jd_dir / pdf_file.name))
        if pdfs_to_move:  # Only report a move that actually happened
            print(f"Moved job description PDFs from {original_jd_dir} to {jd_dir}")

    # Find resumes and job descriptions
    resume_files = list(resume_dir.glob("*.pdf"))
    jd_files = list(jd_dir.glob("*.pdf"))

    if not resume_files or not jd_files:
        print("No resumes or job descriptions found.")
        print(f"Resumes: {resume_dir}")
        print(f"Job Descriptions: {jd_dir}")
        return None

    print(f"Found {len(resume_files)} resume(s) and {len(jd_files)} job description(s).")

    # Index jobs by their filename-safe ID; the first job wins on a collision.
    jobs_by_file_id = {}
    for job in job_data:
        jobs_by_file_id.setdefault(_file_safe_id(job['requisition_id']), job)

    # Parse each job description once, not once per resume. Files that cannot
    # be mapped to a job record are skipped, since they could never be reported.
    jd_entries = []
    for jd_path in jd_files:
        stem = jd_path.stem
        job_id = stem[len('job_'):] if stem.startswith('job_') else stem
        job = jobs_by_file_id.get(job_id)
        if job is None:
            continue
        jd_entries.append((job, extract_keywords(parse_pdf(jd_path))))

    # Process resumes and match to jobs
    results = []
    for resume_path in resume_files:
        print(f"Processing resume: {resume_path.name}")
        resume_keywords = extract_keywords(parse_pdf(resume_path))

        resume_matches = [
            {
                'job_title': job['title'],
                'requisition_id': job['requisition_id'],
                'department': job['department'],
                'location': job['location'],
                'similarity_score': compute_similarity(resume_keywords, jd_keywords)
            }
            for job, jd_keywords in jd_entries
        ]

        resume_matches.sort(key=lambda x: x['similarity_score'], reverse=True)
        results.append({
            'resume': resume_path.name,
            'matches': resume_matches[:5]  # Top 5 matches
        })

    # Display results
    print("\n===== RESUME MATCHING RESULTS (Resume-Matcher) =====\n")
    for result in results:
        print(f"\nRESUME: {result['resume']}")
        print("-" * 80)
        for i, match in enumerate(result['matches'], 1):
            print(f"{i}. {match['job_title']} (ID: {match['requisition_id']})")
            print(f"   Department: {match['department']}")
            print(f"   Location: {match['location']}")
            print(f"   Similarity Score: {match['similarity_score']:.2f}")
            print()

    # Save results: one CSV row per (resume, match) pair.
    all_matches = [
        {
            'Resume': result['resume'],
            'Job Title': match['job_title'],
            'Requisition ID': match['requisition_id'],
            'Department': match['department'],
            'Location': match['location'],
            'Similarity Score': match['similarity_score']
        }
        for result in results
        for match in result['matches']
    ]
    df_results = pd.DataFrame(all_matches)
    csv_path = output_dir / 'resume_job_matches.csv'
    df_results.to_csv(csv_path, index=False)
    print(f"Saved results to {csv_path}")

    # Optional: note for Streamlit visualization. The path is derived from
    # base_dir so the hint names the directory this function actually uses.
    print(f"\nTo visualize results, run: streamlit run {base_dir / 'streamlit_app.py'}")

    return df_results

# Run the Resume-Matcher integration on the scraped `job_data`.
# Note: this rebinds `results_df`, which the BERT matcher cell also assigns.
results_df = integrate_resume_matcher(job_data)

ModuleNotFoundError: No module named 'scripts.pdf_parser'