<a href="https://colab.research.google.com/github/rahulrainarr/mycode/blob/main/JD_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#!/usr/bin/env python3
"""
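Job Search Automation Script
Parses a cover letter for job titles, skills, and locations, then builds a ranked
report of job listings for several platforms. The platform searches in this file
are simulated placeholders; no live job data is fetched.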
"""

import re
import csv
import json
import time
import requests
from datetime import datetime, timedelta
from dataclasses import dataclass
from typing import List, Dict, Set, Optional
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from urllib.parse import urlencode, quote_plus
import logging

# Configure logging once at import time: INFO level, with timestamp and level
# name prefixed to every message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used throughout this script.
logger = logging.getLogger(__name__)

@dataclass
class JobListing:
    """One job posting as collected from a platform search.

    Instances are created by the ``JobSearcher.search_*`` methods and are
    later annotated in place by ``JobFilter.filter_and_rank``.
    """
    # Job title as shown in the listing.
    title: str
    # Name of the hiring company.
    company: str
    # Free-text location string (a place name or e.g. "Remote").
    location: str
    # Posting date; the searchers in this file format it as YYYY-MM-DD.
    date_posted: str
    # Listing description text (truncated to 200 characters in reports).
    description: str
    # URL of the listing.
    link: str
    # Name of the platform the listing came from, e.g. "LinkedIn".
    platform: str
    # Age of the posting in days; secondary sort key when ranking.
    days_old: int = 0
    # Location relevance score; filled in by JobFilter.filter_and_rank.
    location_score: int = 0

class CoverLetterParser:
    """Parse a cover letter to extract job-search parameters.

    The parser is heuristic: job titles and skills are matched against fixed
    vocabularies, and locations are recognised purely by capitalisation
    patterns, so some false positives (e.g. "Dear Hiring") are expected.
    """

    # (path probed with nltk.data.find, package name passed to nltk.download)
    _NLTK_RESOURCES = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
    )

    # Matched case-insensitively against the lower-cased letter.
    _JOB_TITLE_PATTERNS = (
        r'software\s+engineer', r'data\s+scientist', r'product\s+manager',
        r'marketing\s+manager', r'business\s+analyst', r'project\s+manager',
        r'full\s+stack', r'frontend', r'backend', r'devops', r'ui/ux',
        r'sales\s+manager', r'account\s+manager', r'consultant',
    )

    # Matched against the ORIGINAL-case letter: the place patterns rely on
    # capitalisation, so only the work-mode terms carry a scoped (?i:...) flag.
    # The most specific pattern comes first because callers treat the first
    # location found as the primary one.
    _LOCATION_PATTERNS = (
        r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*,\s*[A-Z]{2}\b',  # City, ST / Multi Word City, ST
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+',                   # Two capitalised words (noisy)
        r'(?i:remote)', r'(?i:hybrid)', r'(?i:work\s+from\s+home)',
    )

    # Lower-case skill vocabulary, matched as whole terms.
    _SKILL_KEYWORDS = (
        'python', 'java', 'javascript', 'react', 'angular', 'node.js',
        'sql', 'mongodb', 'aws', 'docker', 'kubernetes', 'machine learning',
        'data analysis', 'project management', 'agile', 'scrum',
        'marketing', 'sales', 'excel', 'powerbi', 'tableau',
    )

    def __init__(self):
        """Ensure the required NLTK data is present and set up the patterns."""
        # Probe each resource separately so that one missing package does not
        # trigger a re-download of the packages that are already installed.
        for resource_path, package in self._NLTK_RESOURCES:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package)

        self.stop_words = set(stopwords.words('english'))

        # Per-instance copies so callers may extend them without touching the
        # class-level defaults.
        self.job_title_patterns = list(self._JOB_TITLE_PATTERNS)
        self.location_patterns = list(self._LOCATION_PATTERNS)

    @staticmethod
    def _unique(items) -> List[str]:
        """Return *items* without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(items))

    def _extract_locations(self, text: str) -> List[str]:
        """Return location-like phrases found in the original-case *text*.

        Whitespace inside a match is collapsed to single spaces (a match may
        span a line break) and duplicates are dropped case-insensitively,
        keeping the first spelling seen.
        """
        found = {}
        for pattern in self.location_patterns:
            for match in re.findall(pattern, text):
                cleaned = ' '.join(match.split())
                found.setdefault(cleaned.lower(), cleaned)
        return list(found.values())

    def _extract_skills(self, lowered_text: str) -> List[str]:
        """Return the known skills that occur as whole terms in *lowered_text*.

        The lookarounds stop 'java' matching inside 'javascript', 'sql'
        inside 'mysql', 'excel' inside 'excellent', and so on.
        """
        return [
            skill for skill in self._SKILL_KEYWORDS
            if re.search(rf'(?<!\w){re.escape(skill)}(?!\w)', lowered_text)
        ]

    def extract_keywords(self, text: str) -> Dict[str, List[str]]:
        """Extract job-relevant keywords from a cover letter.

        Args:
            text: Full cover letter text in its original casing.

        Returns:
            A dict with four lists:
            ``job_titles`` - distinct title phrases matched (lower-case);
            ``locations`` - distinct location phrases, most specific first;
            ``skills`` - known skills mentioned in the letter;
            ``keywords`` - the first 20 alphabetic tokens longer than three
            characters that are not English stop words, in document order.
        """
        lowered = text.lower()

        job_titles = self._unique(
            match
            for pattern in self.job_title_patterns
            for match in re.findall(pattern, lowered, re.IGNORECASE)
        )

        # Locations must be searched in the original text: lower-casing it
        # (or ignoring case) would make every "word, xx" look like "City, ST".
        locations = self._extract_locations(text)

        skills = self._extract_skills(lowered)

        tokens = word_tokenize(lowered)
        keywords = [word for word in tokens if word.isalpha() and
                    len(word) > 3 and word not in self.stop_words]

        return {
            'job_titles': job_titles,
            'locations': locations,
            'skills': skills,
            'keywords': keywords[:20]  # First 20 qualifying tokens
        }

    def detect_work_preferences(self, text: str) -> Dict[str, bool]:
        """Detect work arrangement preferences mentioned in *text*.

        Matching is a case-insensitive substring test, so e.g. 'remotely'
        counts as 'remote'.

        Returns:
            A dict with boolean flags ``remote``, ``hybrid`` and ``onsite``.
        """
        text = text.lower()
        return {
            'remote': any(term in text for term in ['remote', 'work from home', 'telecommute']),
            'hybrid': 'hybrid' in text,
            'onsite': any(term in text for term in ['on-site', 'onsite', 'office', 'in-person'])
        }

class JobSearcher:
    """Produce job listings for several job platforms.

    NOTE: every ``search_*`` method currently returns *simulated* listings
    built from the search terms; no network request is made. Each docstring
    records the query a real implementation would be expected to issue.
    """

    def __init__(self):
        # HTTP session with a browser-like User-Agent, prepared for real
        # requests; the simulated searches below do not use it.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.job_listings = []
        self.seen_jobs = set()  # To avoid duplicates

    def _simulate_listings(self, platform: str, keywords: List[str], location: str,
                           limit: int, *, cap: int, day_offset: int, id_base: int,
                           title: str, company: str, fallback_location: str,
                           description: str, link: str) -> List[JobListing]:
        """Build placeholder listings for one platform.

        Args:
            platform: Platform name stored on each listing and used in logs.
            keywords: Search keywords; only the first three are used.
            location: Requested location; ``fallback_location`` is used when empty.
            limit: Maximum number of listings requested by the caller.
            cap: Upper bound on listings this platform simulates.
            day_offset: Age in days of the newest simulated listing.
            id_base: First numeric id used in generated links.
            title, description: ``str.format`` templates that may reference
                ``{terms}`` (raw search terms) and ``{title_terms}`` (title-cased).
            company: Template referencing ``{n}`` (1-based listing number).
            link: Template referencing ``{job_id}``.

        Returns:
            The listings built so far. Errors are logged rather than raised,
            so the result may be empty or partial.
        """
        jobs = []
        try:
            search_terms = ' '.join(keywords[:3])  # Use top 3 keywords
            logger.info(f"Searching {platform} for: {search_terms} in {location}")

            text_fields = {'terms': search_terms, 'title_terms': search_terms.title()}
            # Take the clock once so all listings are dated from the same instant.
            now = datetime.now()

            for i in range(min(limit, cap)):
                age = i + day_offset
                jobs.append(JobListing(
                    title=title.format(**text_fields),
                    company=company.format(n=i + 1),
                    location=location or fallback_location,
                    date_posted=(now - timedelta(days=age)).strftime("%Y-%m-%d"),
                    description=description.format(**text_fields),
                    link=link.format(job_id=id_base + i),
                    platform=platform,
                    days_old=age,
                ))
        except Exception as e:
            # Best effort: a failing platform must not abort the whole search.
            logger.error(f"Error searching {platform}: {e}")

        return jobs

    def search_linkedin_jobs(self, keywords: List[str], location: str = "", limit: int = 25) -> List[JobListing]:
        """Return up to ``min(limit, 10)`` simulated LinkedIn listings.

        LinkedIn has anti-scraping measures; a production version would use
        LinkedIn's API or specialised tools. The intended public query is
        https://www.linkedin.com/jobs/search with ``keywords``, ``location``,
        ``f_TPR=r604800`` (past week) and ``sortBy=DD`` (date descending).
        """
        return self._simulate_listings(
            "LinkedIn", keywords, location, limit,
            cap=10, day_offset=0, id_base=1000000,
            title="Software Engineer - {terms}",
            company="Tech Company {n}",
            fallback_location="Remote",
            description="Position involving {terms} and related technologies",
            link="https://linkedin.com/jobs/view/{job_id}",
        )

    def search_indeed_jobs(self, keywords: List[str], location: str = "", limit: int = 25) -> List[JobListing]:
        """Return up to ``min(limit, 15)`` simulated Indeed listings.

        The intended query is https://www.indeed.com/jobs with ``q``, ``l``,
        ``fromage=7`` (last 7 days) and ``sort=date``.
        """
        return self._simulate_listings(
            "Indeed", keywords, location, limit,
            cap=15, day_offset=0, id_base=2000000,
            title="{title_terms} Specialist",
            company="Indeed Company {n}",
            fallback_location="Various Locations",
            description="Looking for experienced professional in {terms}",
            link="https://indeed.com/viewjob?jk={job_id}",
        )

    def search_google_jobs(self, keywords: List[str], location: str = "", limit: int = 25) -> List[JobListing]:
        """Return up to ``min(limit, 10)`` simulated Google Jobs listings.

        A real Google Jobs search would require the Google Custom Search API.
        The newest simulated listing is one day old.
        """
        return self._simulate_listings(
            "Google Jobs", keywords, location, limit,
            cap=10, day_offset=1, id_base=3000000,
            title="Senior {title_terms} Role",
            company="Google Partner {n}",
            fallback_location="Multiple Locations",
            description="Exciting opportunity in {terms} field",
            link="https://careers.google.com/jobs/results/{job_id}",
        )

    def search_monster_jobs(self, keywords: List[str], location: str = "", limit: int = 25) -> List[JobListing]:
        """Return up to ``min(limit, 8)`` simulated Monster.com listings.

        The newest simulated listing is two days old.
        """
        return self._simulate_listings(
            "Monster", keywords, location, limit,
            cap=8, day_offset=2, id_base=4000000,
            title="{title_terms} Professional",
            company="Monster Corp {n}",
            fallback_location="Remote/Hybrid",
            description="Join our team working with {terms} technologies",
            link="https://monster.com/job-openings/{job_id}",
        )

class JobFilter:
    """Filter and rank job listings based on location criteria."""

    def __init__(self, preferred_locations: List[str], work_preferences: Dict[str, bool]):
        """Store the ranking criteria.

        Args:
            preferred_locations: Location phrases the candidate prefers. They
                are stripped, lower-cased and de-duplicated; blank entries are
                dropped because an empty string is a substring of every
                location and would award the bonus to every job.
            work_preferences: Flags such as ``{'remote': True, 'hybrid': False}``.
        """
        normalized = (loc.strip().lower() for loc in preferred_locations)
        self.preferred_locations = list(dict.fromkeys(loc for loc in normalized if loc))
        self.work_preferences = work_preferences

    def calculate_location_score(self, job_location: str) -> int:
        """Calculate a location relevance score for one job.

        Scoring: +10 when remote work is preferred and the location mentions
        'remote' or 'work from home'; +8 when hybrid is preferred and the
        location mentions 'hybrid'; +15 for each distinct preferred location
        that occurs as a substring of the job location.
        """
        location_lower = job_location.lower()
        score = 0

        # Work-arrangement bonuses
        if self.work_preferences.get('remote') and any(
                term in location_lower for term in ['remote', 'work from home']):
            score += 10
        if self.work_preferences.get('hybrid') and 'hybrid' in location_lower:
            score += 8

        # Preferred-location bonuses
        score += 15 * sum(1 for pref_loc in self.preferred_locations
                          if pref_loc in location_lower)

        return score

    def filter_and_rank(self, jobs: List["JobListing"]) -> List["JobListing"]:
        """Drop duplicate listings and rank the remainder.

        Two listings are duplicates when their title and company match
        case-insensitively; the first occurrence is kept. Each kept job has
        its ``location_score`` attribute updated in place.

        Returns:
            Jobs ordered by highest location score first, then by fewest
            ``days_old`` (most recent) among equal scores.
        """
        # A tuple key cannot collide the way a joined "title_company" string
        # does when either field itself contains an underscore.
        unique_jobs = {}
        for job in jobs:
            unique_jobs.setdefault((job.title.lower(), job.company.lower()), job)

        filtered_jobs = list(unique_jobs.values())

        for job in filtered_jobs:
            job.location_score = self.calculate_location_score(job.location)

        return sorted(filtered_jobs,
                      key=lambda job: (-job.location_score, job.days_old))

class JobSearchAutomation:
    """Main automation class that coordinates the entire process:
    parse the cover letter, search each platform, rank, and write a report."""

    # Column order shared by the CSV and Excel reports.
    _REPORT_FIELDS = ['Title', 'Company', 'Location', 'Date Posted',
                      'Platform', 'Days Old', 'Location Score', 'Link', 'Description']
    _SUPPORTED_FORMATS = ('csv', 'excel')
    # Descriptions longer than this are cut and suffixed with '...'.
    _MAX_DESCRIPTION_CHARS = 200

    def __init__(self):
        self.parser = CoverLetterParser()
        self.searcher = JobSearcher()

    @classmethod
    def _normalize_format(cls, output_format: str) -> str:
        """Return the lower-cased format name, or raise ValueError if unsupported."""
        normalized = output_format.lower()
        if normalized not in cls._SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported output format: {output_format!r} "
                f"(expected one of {', '.join(cls._SUPPORTED_FORMATS)})"
            )
        return normalized

    @classmethod
    def _job_to_row(cls, job) -> Dict:
        """Convert one job listing into a report row keyed by column name."""
        description = job.description
        if len(description) > cls._MAX_DESCRIPTION_CHARS:
            description = description[:cls._MAX_DESCRIPTION_CHARS] + '...'
        return {
            'Title': job.title,
            'Company': job.company,
            'Location': job.location,
            'Date Posted': job.date_posted,
            'Platform': job.platform,
            'Days Old': job.days_old,
            'Location Score': job.location_score,
            'Link': job.link,
            'Description': description,
        }

    def process_cover_letter(self, cover_letter_path: str) -> Dict:
        """Read a cover letter file and extract job search parameters.

        Returns:
            A dict with ``keywords``, ``work_preferences`` and ``raw_content``,
            or an empty dict when the file is missing, unreadable, or not
            valid UTF-8 (the problem is logged).
        """
        try:
            with open(cover_letter_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            logger.error(f"Cover letter file not found: {cover_letter_path}")
            return {}
        except (OSError, UnicodeDecodeError) as e:
            logger.error(f"Could not read cover letter {cover_letter_path}: {e}")
            return {}

        # Parsing happens outside the try block so that parser errors are not
        # mistaken for file problems.
        return {
            'keywords': self.parser.extract_keywords(content),
            'work_preferences': self.parser.detect_work_preferences(content),
            'raw_content': content
        }

    def search_all_platforms(self, search_params: Dict, delay_seconds: float = 2.0) -> List["JobListing"]:
        """Search every supported platform and pool the results.

        Args:
            search_params: Output of ``process_cover_letter``; its
                ``keywords`` entry supplies titles, skills and locations.
            delay_seconds: Pause inserted between consecutive platform calls
                as simple rate limiting (not applied before the first call
                or after the last one).

        Returns:
            All listings found, in platform order. A platform that raises is
            logged and skipped.
        """
        all_jobs = []
        keywords = search_params['keywords']

        # Use job titles and skills as search terms
        search_terms = keywords.get('job_titles', []) + keywords.get('skills', [])
        if not search_terms:
            logger.warning("No job titles or skills extracted; searching with empty terms")

        # The first extracted location, if any, is treated as the primary one.
        locations = keywords.get('locations') or ['']
        primary_location = locations[0]

        platforms = [
            self.searcher.search_linkedin_jobs,
            self.searcher.search_indeed_jobs,
            self.searcher.search_google_jobs,
            self.searcher.search_monster_jobs
        ]

        for index, search_func in enumerate(platforms):
            if index and delay_seconds > 0:
                time.sleep(delay_seconds)  # Rate limiting between platforms
            try:
                all_jobs.extend(search_func(search_terms, primary_location, limit=15))
            except Exception as e:
                logger.error(f"Error in {search_func.__name__}: {e}")

        return all_jobs

    def generate_report(self, jobs: List["JobListing"], output_format: str = 'csv') -> str:
        """Write the listings to a timestamped report file in the working directory.

        Args:
            jobs: Listings to write, in the order given.
            output_format: ``'csv'`` or ``'excel'`` (case-insensitive).

        Returns:
            The name of the file written.

        Raises:
            ValueError: If ``output_format`` is not supported.
        """
        # Validate before touching the filesystem; previously an unknown
        # format fell through both branches and hit an unbound ``filename``.
        normalized_format = self._normalize_format(output_format)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        rows = [self._job_to_row(job) for job in jobs]

        if normalized_format == 'csv':
            filename = f"job_search_results_{timestamp}.csv"
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=self._REPORT_FIELDS)
                writer.writeheader()
                writer.writerows(rows)
        else:
            filename = f"job_search_results_{timestamp}.xlsx"
            # Explicit columns keep the header row even when there are no jobs.
            pd.DataFrame(rows, columns=self._REPORT_FIELDS).to_excel(filename, index=False)

        return filename

    def run_automation(self, cover_letter_path: str, output_format: str = 'csv', max_results: int = 20) -> str:
        """Run the complete job search automation.

        Args:
            cover_letter_path: Path of the UTF-8 cover letter text file.
            output_format: ``'csv'`` or ``'excel'``.
            max_results: Maximum number of ranked listings to report.

        Returns:
            The report file name, or the string
            ``"Error: Could not process cover letter"`` when the letter
            cannot be read.

        Raises:
            ValueError: If ``output_format`` is not supported.
        """
        logger.info("Starting job search automation...")

        # Fail fast on a bad format instead of after all the searches have run.
        self._normalize_format(output_format)

        # Step 1: Process cover letter
        search_params = self.process_cover_letter(cover_letter_path)
        if not search_params:
            return "Error: Could not process cover letter"

        logger.info(f"Extracted keywords: {search_params['keywords']}")
        logger.info(f"Work preferences: {search_params['work_preferences']}")

        # Step 2: Search all platforms
        logger.info("Searching job platforms...")
        all_jobs = self.search_all_platforms(search_params)
        logger.info(f"Found {len(all_jobs)} total job listings")

        # Step 3: Filter and rank jobs
        job_filter = JobFilter(
            preferred_locations=search_params['keywords'].get('locations', []),
            work_preferences=search_params['work_preferences']
        )

        ranked_jobs = job_filter.filter_and_rank(all_jobs)
        top_jobs = ranked_jobs[:max_results]

        logger.info(f"Filtered to top {len(top_jobs)} relevant positions")

        # Step 4: Generate report
        report_file = self.generate_report(top_jobs, output_format)
        logger.info(f"Report generated: {report_file}")

        return report_file

# Example usage
def main():
    """Demonstrate the job search automation end to end.

    Writes a sample cover letter to ``Cover_letter.txt`` in the working
    directory unless a file of that name already exists (an existing letter
    is used as-is rather than overwritten), runs the automation with CSV
    output and at most 20 results, and prints the outcome.
    """
    automation = JobSearchAutomation()
    cover_letter_path = 'Cover_letter.txt'

    # For demonstration, create a sample cover letter
    sample_cover_letter = """
    Dear Hiring Manager,

    I am writing to express my interest in software engineering positions, particularly
    in Python development and machine learning roles. With 5 years of experience in
    full-stack development using Python, JavaScript, React, and SQL, I am seeking
    opportunities in San Francisco, CA or remote positions.

    My background includes working with AWS, Docker, and agile methodologies. I am
    particularly interested in data science roles and would prefer hybrid or remote
    work arrangements.

    Best regards,
    [Your Name]
    """

    # Mode 'x' creates the file only if it is absent, so a real cover letter
    # with this name is never clobbered. UTF-8 matches how the automation
    # reads the file back.
    try:
        with open(cover_letter_path, 'x', encoding='utf-8') as f:
            f.write(sample_cover_letter)
    except FileExistsError:
        logger.info(f"Using existing cover letter: {cover_letter_path}")

    # Run automation
    try:
        result_file = automation.run_automation(cover_letter_path, 'csv', 20)
    except Exception as e:
        print(f"Error running automation: {e}")
        return

    # run_automation reports an unreadable cover letter by returning an
    # "Error: ..." string instead of a file name; do not present it as success.
    if result_file.startswith("Error:"):
        print(result_file)
    else:
        print(f"Job search completed! Results saved to: {result_file}")

# Run the demo only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Job search completed! Results saved to: job_search_results_20250806_052043.csv
