In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.agents import Tool, AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
import os
import re
from typing import List, Dict, Set
from pydantic import BaseModel, Field
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK 'punkt' and 'stopwords' resources every time this module is
# imported (module-level side effect).
# NOTE(review): nothing in the visible code calls word_tokenize or stopwords —
# confirm these downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')

def load_skills_dictionary():
    """Build the skills catalog used for keyword matching.

    Returns:
        A dict mapping each category name to a set of lowercase skill names.
        Every category except 'soft_skills' holds technical skills.
    """
    categories = (
        # Technical Skills
        ('programming_languages', (
            'python', 'java', 'javascript', 'c++', 'c#', 'ruby', 'php',
            'swift', 'kotlin', 'golang', 'rust', 'typescript', 'scala',
            'perl', 'r', 'matlab',
        )),
        ('web_technologies', (
            'html', 'css', 'react', 'angular', 'vue.js', 'node.js',
            'express.js', 'django', 'flask', 'spring', 'asp.net', 'jquery',
            'bootstrap', 'sass', 'less',
        )),
        ('databases', (
            'sql', 'mysql', 'postgresql', 'mongodb', 'oracle', 'redis',
            'elasticsearch', 'cassandra', 'dynamodb', 'neo4j', 'graphql',
        )),
        ('cloud_platforms', (
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform',
            'jenkins', 'circleci', 'gitlab', 'heroku', 'digitalocean',
        )),
        ('ai_ml', (
            'machine learning', 'deep learning', 'tensorflow', 'pytorch',
            'keras', 'scikit-learn', 'nlp', 'computer vision',
            'neural networks', 'ai',
        )),
        # Soft Skills
        ('soft_skills', (
            'leadership', 'communication', 'teamwork', 'problem solving',
            'project management', 'time management', 'critical thinking',
            'adaptability', 'creativity', 'emotional intelligence',
        )),
    )
    return {category: set(names) for category, names in categories}

class CandidateInfo(BaseModel):
    # Structured record of the details extracted from one resume.
    # Every text field defaults to the sentinel string "Not found"; key_skills
    # defaults to a fresh empty list per instance (default_factory).
    # NOTE(review): the visible code never instantiates this model — the
    # extraction code builds a plain dict with differently named keys
    # ('Full Name', 'Email', ...). Confirm whether this class is still used.
    full_name: str = Field(default="Not found")
    email: str = Field(default="Not found")
    phone: str = Field(default="Not found")
    current_position: str = Field(default="Not found")
    years_experience: str = Field(default="Not found")
    key_skills: List[str] = Field(default_factory=list)
    education: str = Field(default="Not found")

class ResumeMatchingAgent:
    def __init__(self, groq_api_key: str):
        """Create the LLM client, embedding model, NLP pipeline and agent.

        Args:
            groq_api_key: API key passed to the ChatGroq client.

        Raises:
            subprocess.CalledProcessError: If the spaCy model is missing and
                downloading it fails.
        """
        # Imported locally: only needed for the one-off model download below.
        import subprocess
        import sys

        self.llm = ChatGroq(
            temperature=0,
            groq_api_key=groq_api_key,
            model_name="mixtral-8x7b-32768"
        )
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )
        self.text_splitter = CharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separator="\n"
        )
        self.vector_store = None
        self.resume_metadata = {}
        self.processed_resumes = set()
        self.skills_dict = load_skills_dictionary()
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spaCy raises OSError when the model package is not installed.
            # Download with the interpreter that is running this code (a bare
            # "python" may resolve to a different environment) and fail loudly
            # if the download does not succeed.
            subprocess.run(
                [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'],
                check=True,
            )
            self.nlp = spacy.load('en_core_web_sm')
        self._setup_agent()


    def _setup_agent(self):
        """Build the ReAct agent and store its executor on ``self.agent_executor``.

        A ``Tool`` hands its function exactly one string. The underlying
        methods take different argument lists, so each tool is wired through a
        small adapter that converts that single string into the arguments the
        method expects.
        """

        def _clean(value: str) -> str:
            # LLMs often wrap the action input in quotes or stray whitespace.
            return value.strip().strip('"\'').strip()

        def _search_resumes(tool_input: str):
            # The whole input is treated as the job description.
            return self.find_matching_resumes(tool_input, [])

        def _candidate_info(tool_input: str):
            return self.get_candidate_info(_clean(tool_input))

        def _skills_gap(tool_input: str):
            # Expected input: "<resume filename> | <job description>".
            filename, _, job_description = tool_input.partition('|')
            return self.analyze_skills_gap(_clean(filename), job_description.strip())

        tools = [
            Tool(
                name="Search Resumes",
                func=_search_resumes,
                description="Search for resumes matching a job description and requirements"
            ),
            Tool(
                name="Get Candidate Info",
                func=_candidate_info,
                description=(
                    "Get detailed information about a specific candidate. "
                    "Input: the resume filename."
                )
            ),
            Tool(
                name="Analyze Skills Gap",
                func=_skills_gap,
                description=(
                    "Analyze the skills gap between required skills and candidate skills. "
                    "Input: '<resume filename> | <job description>'."
                )
            )
        ]

        prompt = PromptTemplate.from_template("""
        You are a helpful HR assistant that helps analyze resumes and find the best candidates.
        Your primary task is to identify and display the candidate's name first, followed by other relevant details.
        You have access to the following tools:

        {tools}

        Use the following format:

        Question: the input question you must answer
        Thought: you should always think about what to do
        Action: the action to take, should be one of [{tool_names}]
        Action Input: the input to the action
        Observation: the result of the action
        ... (this Thought/Action/Action Input/Observation can repeat N times)
        Thought: I now know the final answer
        Final Answer: The candidate's name is [Candidate Name]. [Additional details based on the query].

        Begin!

        Question: {input}
        {agent_scratchpad}
        """)

        agent = create_react_agent(self.llm, tools, prompt)
        self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

    def load_resumes(self, resume_dir: str) -> List:
        if not os.path.exists(resume_dir):
            raise ValueError(f"Resume directory '{resume_dir}' does not exist.")

        documents = []
        unique_candidates = {}  # Track unique candidates by email/phone

        for filename in os.listdir(resume_dir):
            if filename.endswith(".pdf"):
                if filename in self.processed_resumes:
                    print(f"Skipping duplicate resume file: {filename}")
                    continue

                file_path = os.path.join(resume_dir, filename)
                try:
                    print(f"Loading resume: {filename}")
                    loader = PyPDFLoader(file_path)
                    docs = loader.load()
                    candidate_info = self._extract_candidate_info(docs)

                    # Create unique identifier for candidate
                    candidate_id = f"{candidate_info['Email']}_{candidate_info['Phone']}"

                    if candidate_id in unique_candidates:
                        print(f"Duplicate candidate found in {filename}. Using first occurrence.")
                        continue

                    # Store unique candidate
                    unique_candidates[candidate_id] = True
                    documents.extend(docs)
                    self.resume_metadata[filename] = {
                        'candidate_info': candidate_info
                    }
                    self.processed_resumes.add(filename)

                except Exception as e:
                    print(f"Error loading resume '{filename}': {str(e)}")

        if documents:
            self.create_vector_store(documents)

        return documents

    def _extract_candidate_info(self, docs: List) -> Dict:
        candidate_info = {
            'Full Name': 'Not found',
            'Email': 'Not found',
            'Phone': 'Not found',
            'Current Position': 'Not found',
            'Years of Experience': 'Not found',
            'Key Skills': [],
            'Education': 'Not found'
        }

        full_text = "\n".join([doc.page_content for doc in docs])

        # Improved name extraction
        # Common headers that might appear above or before names
        name_headers = ["name:", "candidate:", "profile:", "curriculum vitae:", "resume:", "cv:"]
        # Words that typically don't appear in names
        non_name_words = {
            "email", "phone", "skills", "experience", "education", "linkedin", "github", 
            "resume", "cv", "address", "summary", "objective", "profile", "contact",
            "professional", "career", "tel", "mobile", "www", "http", "https",
            "university", "college", "institute", "school", "academy"  # Added educational keywords
        }

        # First try: Look for name after common headers
        name_found = False
        lines = [line.strip() for line in full_text.split('\n') if line.strip()]

        for i, line in enumerate(lines[:10]):  # Check first 10 lines
            line_lower = line.lower()

            # Check if line contains a header
            if any(header in line_lower for header in name_headers):
                # Get the text after the header
                for header in name_headers:
                    if header in line_lower:
                        potential_name = line_lower.split(header)[1].strip()
                        if potential_name:
                            # Clean and verify the name
                            cleaned_name = ' '.join(word.strip() for word in potential_name.split())
                            if (len(cleaned_name.split()) >= 2 and  # At least two words
                                not any(word.lower() in non_name_words for word in cleaned_name.split()) and
                                cleaned_name[0].isalpha()):  # Starts with a letter
                                candidate_info['Full Name'] = cleaned_name.title()
                                name_found = True
                                break
                if name_found:
                    break

        # Second try: Look for a name pattern in the first few lines
        if not name_found:
            for line in lines[:5]:  # Check first 5 lines
                # Skip lines with common non-name indicators
                if any(word in line.lower() for word in non_name_words):
                    continue

                # Check if line matches name pattern:
                # - 2-4 words
                # - Each word starts with capital letter
                # - No special characters except hyphen and apostrophe
                words = line.split()
                if (2 <= len(words) <= 4 and
                    all(word[0].isupper() for word in words) and
                    all(word.replace('-', '').replace("'", '').isalnum() for word in words)):
                    candidate_info['Full Name'] = line
                    name_found = True
                    break

        # Third try: Use simple pattern matching for remaining cases
        if not name_found:
            name_pattern = r'^([A-Z][a-zA-Z\'-]+\s+[A-Z][a-zA-Z\'-]+(?:\s+[A-Z][a-zA-Z\'-]+)?(?:\s+[A-Z][a-zA-Z\'-]+)?)$'
            for line in lines[:10]:
                if re.match(name_pattern, line):
                    candidate_info['Full Name'] = line
                    break

        # Improved regex pattern for extracting emails
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'

        # Find all email matches in the text
        emails = re.findall(email_pattern, full_text)

        # Filter and select the most likely candidate email
        if emails:
            # Prioritize emails with common domains (e.g., gmail.com, yahoo.com, etc.)
            common_domains = ['gmail', 'yahoo', 'outlook', 'hotmail', 'icloud']
            for email in emails:
                domain = email.split('@')[1].split('.')[0]  # Extract domain part
                if domain.lower() in common_domains:
                    candidate_info['Email'] = email
                    break
            else:
                # If no common domain is found, use the first email
                candidate_info['Email'] = emails[0]

        # Extract phone
        phone_pattern = r'\b(?:\+?1[-.]?)?\s*(?:\([0-9]{3}\)|[0-9]{3})[-.]?[0-9]{3}[-.]?[0-9]{4}\b'
        phones = re.findall(phone_pattern, full_text)
        if phones:
            candidate_info['Phone'] = phones[0]

        # Extract skills
        skills_section = ''
        for header in ['Skills:', 'Technical Skills:', 'Core Competencies:', 'Key Skills:']:
            if header in full_text:
                try:
                    skills_section = full_text.split(header)[1].split('\n\n')[0]
                    break
                except IndexError:
                    continue

        if skills_section:
            skills = re.split(r'[,•|\n]', skills_section)
            candidate_info['Key Skills'] = [skill.strip() for skill in skills if skill.strip()]

        # Extract current position
        position_keywords = ["Current Position:", "Current Role:", "Present:", "Role:"]
        for keyword in position_keywords:
            if keyword in full_text:
                try:
                    position_section = full_text.split(keyword)[1].split('\n')[0].strip()
                    candidate_info['Current Position'] = position_section
                    break
                except IndexError:
                    continue

        # Extract years of experience
        experience_pattern = r'\b(\d+)\s*(years?|yrs?)\b'
        experience_match = re.search(experience_pattern, full_text, re.IGNORECASE)
        if experience_match:
            candidate_info['Years of Experience'] = f"{experience_match.group(1)} years"

        # Extract education
        education_keywords = ["Education:", "Academic Background:", "Degree:"]
        for keyword in education_keywords:
            if keyword in full_text:
                try:
                    education_section = full_text.split(keyword)[1].split('\n\n')[0].strip()
                    candidate_info['Education'] = education_section
                    break
                except IndexError:
                    continue

        return candidate_info

    def create_vector_store(self, documents: List):
        """Build a FAISS index from ``documents`` and keep it on the instance.

        Args:
            documents: Documents to embed with ``self.embeddings``.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if documents:
            index = FAISS.from_documents(
                embedding=self.embeddings,
                documents=documents,
            )
            self.vector_store = index
            return

        raise ValueError("No documents provided to create vector store.")

    def find_matching_resumes(self, job_description: str, requirements: List[str]) -> List[Dict]:
        if not self.vector_store:
            raise ValueError("Vector store not initialized. Please load resumes first.")

        # Extract required skills from job description
        required_skills = self._extract_skills_from_text(job_description)
        
        results = self.vector_store.similarity_search_with_score(job_description)
        matches = []
        seen_candidates = set()

        for doc, score in results:
            filename = os.path.basename(doc.metadata.get('source', ''))
            if filename in self.resume_metadata:
                candidate_info = self.resume_metadata[filename]['candidate_info']
                
                # Create unique identifier for candidate
                candidate_id = f"{candidate_info['Email']}_{candidate_info['Phone']}"
                
                if candidate_id in seen_candidates:
                    continue
                    
                seen_candidates.add(candidate_id)
                
                # Extract candidate skills with categories
                candidate_skills = self._extract_skills_from_text(doc.page_content)
                
                # Calculate matches and gaps for each skill category
                skill_matches = {
                    'technical': {
                        'matching': candidate_skills['technical_skills'].intersection(required_skills['technical_skills']),
                        'missing': required_skills['technical_skills'] - candidate_skills['technical_skills'],
                        'additional': candidate_skills['technical_skills'] - required_skills['technical_skills']
                    },
                    'soft': {
                        'matching': candidate_skills['soft_skills'].intersection(required_skills['soft_skills']),
                        'missing': required_skills['soft_skills'] - candidate_skills['soft_skills'],
                        'additional': candidate_skills['soft_skills'] - required_skills['soft_skills']
                    }
                }
                
                # Calculate match scores
                technical_match = (len(skill_matches['technical']['matching']) / 
                                 len(required_skills['technical_skills'])) * 100 if required_skills['technical_skills'] else 0
                soft_match = (len(skill_matches['soft']['matching']) / 
                            len(required_skills['soft_skills'])) * 100 if required_skills['soft_skills'] else 0
                
                # Overall match score (weighted)
                match_score = (technical_match * 0.7) + (soft_match * 0.3)
                
                matches.append({
                    'filename': filename,
                    'name': candidate_info['Full Name'],
                    'email': candidate_info['Email'],
                    'phone': candidate_info['Phone'],
                    'current_position': candidate_info['Current Position'],
                    'experience': candidate_info['Years of Experience'],
                    'education': candidate_info['Education'],
                    'technical_skills': {
                        'matching': list(skill_matches['technical']['matching']),
                        'missing': list(skill_matches['technical']['missing']),
                        'additional': list(skill_matches['technical']['additional'])
                    },
                    'soft_skills': {
                        'matching': list(skill_matches['soft']['matching']),
                        'missing': list(skill_matches['soft']['missing']),
                        'additional': list(skill_matches['soft']['additional'])
                    },
                    'match_score': round(match_score, 2),
                    'technical_match': round(technical_match, 2),
                    'soft_match': round(soft_match, 2)
                })

        matches.sort(key=lambda x: x['match_score'], reverse=True)
        return matches

    def _extract_skills_from_text(self, text: str) -> Dict[str, Set[str]]:
        """Extract skills from text with categorization"""
        text = text.lower()
        found_skills = {
            'technical_skills': set(),
            'soft_skills': set(),
            'domain_skills': set()
        }
        
        # Process text with spaCy
        doc = self.nlp(text)
        
        # Extract noun phrases and individual tokens
        potential_skills = set()
        for chunk in doc.noun_chunks:
            potential_skills.add(chunk.text.lower())
        for token in doc:
            if not token.is_stop and not token.is_punct:
                potential_skills.add(token.text.lower())
        
        # Match against skills dictionary
        for skill_type, skills in self.skills_dict.items():
            for skill in skills:
                # Check for exact matches
                if skill in text:
                    if skill_type == 'soft_skills':
                        found_skills['soft_skills'].add(skill)
                    else:
                        found_skills['technical_skills'].add(skill)
                
                # Check for variations (e.g., "Python programming", "Python developer")
                for potential in potential_skills:
                    if skill in potential and len(skill) > 2:  # Avoid matching too short strings
                        if skill_type == 'soft_skills':
                            found_skills['soft_skills'].add(skill)
                        else:
                            found_skills['technical_skills'].add(skill)
        
        return found_skills

    def _calculate_experience_score(self, experience: str) -> float:
        try:
            if isinstance(experience, str):
                years = float(re.findall(r'\d+', experience)[0])
            else:
                years = float(experience)
            return min(100, years * 20)
        except:
            return 0

    def get_candidate_info(self, filename: str) -> Dict:
        if filename not in self.resume_metadata:
            return {
                "error": f"No candidate found with filename: {filename}",
                "status": "error"
            }

        return {
            "status": "success",
            "candidate_info": self.resume_metadata[filename]['candidate_info']
        }

    def analyze_skills_gap(self, filename: str, job_description: str) -> Dict:
        if filename not in self.resume_metadata:
            return {
                "error": f"No candidate found with filename: {filename}",
                "status": "error"
            }

        candidate_info = self.resume_metadata[filename]['candidate_info']
        job_skills = self._extract_skills_from_text(job_description)
        candidate_skills = set(skill.lower() for skill in candidate_info['Key Skills'])

        matching_skills = candidate_skills.intersection(job_skills)
        missing_skills = job_skills - candidate_skills
        additional_skills = candidate_skills - job_skills

        match_percentage = (len(matching_skills) / len(job_skills) * 100) if job_skills else 0

        analysis = {
            "status": "success",
            "analysis": {
                "candidate_name": candidate_info['Full Name'],
                "match_percentage": round(match_percentage, 2),
                "matching_skills": list(matching_skills),
                "missing_skills": list(missing_skills),
                "additional_skills": list(additional_skills),
                "total_required_skills": len(job_skills),
                "total_matching_skills": len(matching_skills),
                "total_missing_skills": len(missing_skills)
            }
        }

        return analysis

    def run(self, query: str, max_retries: int = 3) -> str:
        """
        Execute a query with retry mechanism.

        Args:
            query (str): The user's query.
            max_retries (int): Maximum number of retries if the query fails.

        Returns:
            str: The final answer or an error message.
        """
        for attempt in range(max_retries):
            try:
                print(f"Attempt {attempt + 1} to process query: {query}")

                # Extract required skills from the query
                required_skills = self._extract_skills_from_text(query)
                if required_skills:
                    query = f"{query} (Required skills identified: {', '.join(required_skills)})"

                # Execute the agent
                response = self.agent_executor.invoke({"input": query})

                # Check if the response is satisfactory
                if self._is_response_satisfactory(response):
                    return response["output"] if "output" in response else str(response)
                else:
                    print("Response not satisfactory. Retrying...")

            except Exception as e:
                print(f"Error processing query: {str(e)}. Retrying...")

        # If all retries fail, return an error message
        return f"Failed to process query after {max_retries} attempts. Please try again later."

    def _is_response_satisfactory(self, response: dict) -> bool:
        """
        Check if the agent's response is satisfactory.

        Args:
            response (dict): The agent's response.

        Returns:
            bool: True if the response is satisfactory, False otherwise.
        """
        # Example: Check if the response contains valid data
        if "output" in response and response["output"]:
            return True
        return False