Updated version (#264)

* added data extraction files * additional script files * parsing and processing file run_first.py * main.py * removed cohere dependency , update codes . (#261) (cherry picked from commit a1cc170) * removed tkinter , removed cohere dependent code * added resumes , jd with new name format
srbhr · Jun 7, 2024 · 0b39f8c · 0b39f8c
1 parent a1cc170
commit 0b39f8c
Show file tree

Hide file tree

Showing 23 changed files with 1,078 additions and 2,579 deletions.
diff --git a/...n/JobDescription-job_desc_front_end_engineer.pdf9e508eff-a083-4e2d-8b1e-577cbc8f50fc.json b/...n/JobDescription-job_desc_front_end_engineer.pdf9e508eff-a083-4e2d-8b1e-577cbc8f50fc.json
diff --git a/.../JobDescription-job_desc_full_stack_engineer.pdfcccf72e0-5f20-4aa8-8679-d91b720b7247.json b/.../JobDescription-job_desc_full_stack_engineer.pdfcccf72e0-5f20-4aa8-8679-d91b720b7247.json
diff --git a/...ption/JobDescription-job_desc_java_developer.pdf1fb6435c-531a-4f04-84c2-c4e73e1f1a3f.json b/...ption/JobDescription-job_desc_java_developer.pdf1fb6435c-531a-4f04-84c2-c4e73e1f1a3f.json
diff --git a/...tion/JobDescription-job_desc_product_manager.pdf0b613898-1183-49c5-ad07-e03bd9af72e5.json b/...tion/JobDescription-job_desc_product_manager.pdf0b613898-1183-49c5-ad07-e03bd9af72e5.json
diff --git a/...rocessed/Resumes/Resume-alfred_pennyworth_pm.pdfb36fac00-1d60-49ba-a9e4-33477c928e98.json b/...rocessed/Resumes/Resume-alfred_pennyworth_pm.pdfb36fac00-1d60-49ba-a9e4-33477c928e98.json
diff --git a/Data/Processed/Resumes/Resume-barry_allen_fe.pdfbef9d9b1-fa14-4aa0-8111-6033cceacb5a.json b/Data/Processed/Resumes/Resume-barry_allen_fe.pdfbef9d9b1-fa14-4aa0-8111-6033cceacb5a.json
diff --git a/...ocessed/Resumes/Resume-bruce_wayne_fullstack.pdf04c228e3-04a4-4970-a149-a3cffc92aee8.json b/...ocessed/Resumes/Resume-bruce_wayne_fullstack.pdf04c228e3-04a4-4970-a149-a3cffc92aee8.json
diff --git a/Data/Processed/Resumes/Resume-harvey_dent_mle.pdf5dde9183-5af4-4107-91e1-7b2d66fec490.json b/Data/Processed/Resumes/Resume-harvey_dent_mle.pdf5dde9183-5af4-4107-91e1-7b2d66fec490.json
diff --git a/Data/Processed/Resumes/Resume-john_doe.pdf817fb796-14b9-4819-a224-7eed66b3ce04.json b/Data/Processed/Resumes/Resume-john_doe.pdf817fb796-14b9-4819-a224-7eed66b3ce04.json
diff --git a/resume_matcher/__init__.py b/resume_matcher/__init__.py
diff --git a/resume_matcher/dataextractor/DataExtractor.py b/resume_matcher/dataextractor/DataExtractor.py
@@ -0,0 +1,221 @@
+import re
+import urllib
+
+import spacy
+
+from resume_matcher.dataextractor.TextCleaner import TextCleaner
+
+# Load the spaCy "en_core_web_md" English pipeline once, at import time.
+# The resulting `nlp` object is shared by every DataExtractor instance, which
+# uses it to parse the cleaned text in __init__.
+# NOTE(review): presumably the model must already be installed for this
+# import to succeed - confirm against the project's setup instructions.
+nlp = spacy.load("en_core_web_md")
+
+# Headings that commonly introduce a section of a resume. The list is written
+# as three themed runs (general, skills, experience) that are concatenated
+# into one flat list; the resulting order is significant only to callers that
+# iterate it.
+RESUME_SECTIONS = (
+    # General sections
+    [
+        "Contact Information",
+        "Objective",
+        "Summary",
+        "Education",
+        "Experience",
+        "Skills",
+        "Projects",
+        "Certifications",
+        "Licenses",
+        "Awards",
+        "Honors",
+        "Publications",
+        "References",
+    ]
+    # Skill-related sections
+    + [
+        "Technical Skills",
+        "Computer Skills",
+        "Programming Languages",
+        "Software Skills",
+        "Soft Skills",
+        "Language Skills",
+        "Professional Skills",
+        "Transferable Skills",
+    ]
+    # Experience-related sections
+    + [
+        "Work Experience",
+        "Professional Experience",
+        "Employment History",
+        "Internship Experience",
+        "Volunteer Experience",
+        "Leadership Experience",
+        "Research Experience",
+        "Teaching Experience",
+    ]
+)
+
+
+class DataExtractor:
+    """
+    Extract links, names, e-mail addresses, phone numbers and other data
+    from a piece of text.
+
+    The regex-based extractors work on the raw text (``self.text``); the
+    spaCy-based extractors work on ``self.doc``, which is the parsed form of
+    the cleaned text.
+    """
+
+    def __init__(self, raw_text: str):
+        """
+        Initialize the DataExtractor object.
+
+        Args:
+            raw_text (str): The raw input text.
+        """
+
+        self.text = raw_text
+        self.clean_text = TextCleaner.clean_text(self.text)
+        self.doc = nlp(self.clean_text)
+
+    def extract_links(self):
+        """
+        Find ``http://``, ``https://`` and ``www.`` links in the raw text.
+
+        Returns:
+            list: A list containing all the found links.
+        """
+        link_pattern = r"\b(?:https?://|www\.)\S+\b"
+        return re.findall(link_pattern, self.text)
+
+    def extract_links_extended(self):
+        """
+        Treat the raw text as the URL of a webpage, download that page and
+        collect the ``href`` targets that look like links (HTTP, HTTPS, FTP,
+        mailto, www.linkedin.com, github.com/ and twitter.com).
+
+        This is best-effort: any failure (the text is not a URL, the page is
+        unreachable, the body is not UTF-8, ...) is printed and an empty or
+        partial list is returned instead of raising.
+
+        Returns:
+            list: A list containing all the extracted links.
+        """
+        # `import urllib` alone does not guarantee that the `request`
+        # submodule is loaded, so import it explicitly where it is used.
+        import urllib.request
+
+        accepted_prefixes = (
+            "http://",
+            "https://",
+            "ftp://",
+            "mailto:",
+            "www.linkedin.com",
+            "github.com/",
+            "twitter.com",
+        )
+        links = []
+        try:
+            # NOTE(review): self.text is passed to urlopen verbatim. urlopen
+            # also honours non-HTTP schemes such as file:, so do not call this
+            # method with untrusted text without validating the scheme first.
+            with urllib.request.urlopen(self.text) as response:
+                html_content = response.read().decode("utf-8")
+            pattern = r'href=[\'"]?([^\'" >]+)'
+            links = [
+                link
+                for link in re.findall(pattern, html_content)
+                if link.startswith(accepted_prefixes)
+            ]
+        except Exception as e:
+            print(f"Error extracting links: {str(e)}")
+        return links
+
+    def extract_names(self):
+        """
+        Return the person names found by spaCy's named entity recognition.
+
+        Returns:
+            list: The text of every entity labelled ``PERSON``.
+        """
+        return [ent.text for ent in self.doc.ents if ent.label_ == "PERSON"]
+
+    def extract_emails(self):
+        """
+        Extract email addresses from the raw text.
+
+        Returns:
+            list: A list containing all the extracted email addresses.
+        """
+        email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
+        return re.findall(email_pattern, self.text)
+
+    def extract_phone_numbers(self):
+        """
+        Extract phone numbers from the raw text.
+
+        A number is an optional ``+`` country code followed by 3-3-4 digit
+        groups, optionally separated by ``-``, ``.`` or whitespace, with
+        optional parentheses around the first group.
+
+        Returns:
+            list: The full text of every phone number found, in order.
+        """
+        # The pattern is deliberately NOT anchored with ^...$ (that would only
+        # match when the whole text is a single phone number) and uses a
+        # non-capturing group so findall returns the complete number rather
+        # than just the country code. The look-arounds keep it from matching
+        # inside a longer run of digits or word characters.
+        phone_number_pattern = (
+            r"(?<!\w)(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)"
+        )
+        return re.findall(phone_number_pattern, self.text)
+
+    def extract_experience(self):
+        """
+        Extract the "Experience" section from the parsed text.
+
+        Tokens are collected from an "Experience" heading (inclusive) up to
+        the next token that is itself a heading from RESUME_SECTIONS. Because
+        matching is done per token, only single-word headings can open or
+        close the section.
+
+        Returns:
+            str: The collected tokens joined by single spaces.
+        """
+        experience_section = []
+        in_experience_section = False
+
+        for token in self.doc:
+            heading = token.text
+            # Headings are often written in capitals ("EXPERIENCE"); map them
+            # onto the title-case names used in RESUME_SECTIONS. Lower-case
+            # words are left alone so ordinary prose ("years of experience")
+            # does not toggle the section.
+            if heading.isupper():
+                heading = heading.title()
+
+            if heading in RESUME_SECTIONS:
+                # A real comparison: the section is open only while the most
+                # recent heading is "Experience".
+                in_experience_section = heading == "Experience"
+
+            if in_experience_section:
+                experience_section.append(token.text)
+
+        return " ".join(experience_section)
+
+    def extract_position_year(self):
+        """
+        Extract "<two-word position>, <start year> - <end year>" entries from
+        the raw text. The end may be a four-digit year or the word "present"
+        in any letter case.
+
+        Returns:
+            list: A list of ``(position, start_year, end)`` tuples.
+        """
+        position_year_search_pattern = (
+            r"(\b\w+\b\s+\b\w+\b),\s+(\d{4})\s*-\s*(\d{4}|\bpresent\b)"
+        )
+        # IGNORECASE only affects the literal "present", so "Present" and
+        # "PRESENT" are recognised as well.
+        return re.findall(position_year_search_pattern, self.text, re.IGNORECASE)
+
+    def extract_particular_words(self):
+        """
+        Extract nouns and proper nouns from the parsed text.
+
+        Returns:
+            list: The text of every token tagged ``NOUN`` or ``PROPN``.
+        """
+        pos_tags = ["NOUN", "PROPN"]
+        return [token.text for token in self.doc if token.pos_ in pos_tags]
+
+    def extract_entities(self):
+        """
+        Extract named entities of types 'GPE' (geopolitical entity) and 'ORG'
+        (organization) from the parsed text.
+
+        Returns:
+            list: The unique entity texts, in order of first appearance.
+        """
+        entity_labels = ["GPE", "ORG"]
+        entities = [ent.text for ent in self.doc.ents if ent.label_ in entity_labels]
+        # dict.fromkeys de-duplicates while keeping a stable order.
+        return list(dict.fromkeys(entities))
diff --git a/resume_matcher/dataextractor/KeyTermExtractor.py b/resume_matcher/dataextractor/KeyTermExtractor.py
@@ -0,0 +1,154 @@
+import spacy
+import textacy
+from textacy import extract
+
+# Load the spaCy "en_core_web_md" English pipeline once, at import time.
+# NOTE(review): nothing in the visible part of this module references `nlp`
+# (KeytermExtractor builds its document through textacy instead) - confirm
+# whether other modules import it from here before treating it as unused.
+nlp = spacy.load("en_core_web_md")
+
+# Headings that commonly introduce a section of a resume. The list is written
+# as three themed runs (general, skills, experience) that are concatenated
+# into one flat list, preserving the original order.
+RESUME_SECTIONS = (
+    # General sections
+    [
+        "Contact Information",
+        "Objective",
+        "Summary",
+        "Education",
+        "Experience",
+        "Skills",
+        "Projects",
+        "Certifications",
+        "Licenses",
+        "Awards",
+        "Honors",
+        "Publications",
+        "References",
+    ]
+    # Skill-related sections
+    + [
+        "Technical Skills",
+        "Computer Skills",
+        "Programming Languages",
+        "Software Skills",
+        "Soft Skills",
+        "Language Skills",
+        "Professional Skills",
+        "Transferable Skills",
+    ]
+    # Experience-related sections
+    + [
+        "Work Experience",
+        "Professional Experience",
+        "Employment History",
+        "Internship Experience",
+        "Volunteer Experience",
+        "Leadership Experience",
+        "Research Experience",
+        "Teaching Experience",
+    ]
+)
+
+# Regular expressions keyed by what they match:
+#   email_pattern - local-part@domain.tld with a TLD of two or more letters
+#   phone_pattern - 3-3-4 digit groups, optional parentheses around the first
+#                   group and optional "-", "." or whitespace separators
+#   link_pattern  - text starting with http://, https:// or www.
+# NOTE(review): not referenced in the visible part of this module - presumably
+# consumed by other modules; verify before changing a pattern.
+REGEX_PATTERNS = {
+    "email_pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
+    "phone_pattern": r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
+    "link_pattern": r"\b(?:https?://|www\.)\S+\b",
+}
+
+# Relative directories for raw and processed resumes.
+# NOTE(review): the paths are relative, so they presumably resolve against the
+# process's working directory - confirm how callers use them.
+READ_RESUME_FROM = "Data/Resumes/"
+SAVE_DIRECTORY_RESUME = "Data/Processed/Resumes"
+
+# Relative directories for raw and processed job descriptions.
+READ_JOB_DESCRIPTION_FROM = "Data/JobDescription/"
+SAVE_DIRECTORY_JOB_DESCRIPTION = "Data/Processed/JobDescription"
+
+
+class KeytermExtractor:
+    """
+    Extract key terms and n-grams from a text using textacy.
+
+    Every key-term method runs a different textacy algorithm with the same
+    settings (lemma normalization, ``top_n_values`` results); the n-gram
+    methods differ only in the n-gram size.
+    """
+
+    def __init__(self, raw_text: str, top_n_values: int = 20):
+        """
+        Initialize the KeytermExtractor object.
+
+        Args:
+            raw_text (str): The raw input text.
+            top_n_values (int): The number of top keyterms to extract.
+        """
+        self.raw_text = raw_text
+        self.text_doc = textacy.make_spacy_doc(self.raw_text, lang="en_core_web_md")
+        self.top_n_values = top_n_values
+
+    def _keyterms(self, algorithm):
+        """Run one textacy key-term ``algorithm`` with the shared settings."""
+        return list(
+            algorithm(self.text_doc, normalize="lemma", topn=self.top_n_values)
+        )
+
+    def _ngrams(self, n):
+        """Return the ``n``-grams of the document, skipping stop words, numbers and punctuation."""
+        return list(
+            textacy.extract.basics.ngrams(
+                self.text_doc,
+                n=n,
+                filter_stops=True,
+                filter_nums=True,
+                filter_punct=True,
+            )
+        )
+
+    def get_keyterms_based_on_textrank(self):
+        """
+        Extract keyterms using the TextRank algorithm.
+
+        Returns:
+            list: The top keyterms as returned by textacy; per the textacy
+            documentation these are ``(term, score)`` pairs.
+        """
+        return self._keyterms(extract.keyterms.textrank)
+
+    def get_keyterms_based_on_sgrank(self):
+        """
+        Extract keyterms using the SGRank algorithm.
+
+        Returns:
+            list: The top keyterms as returned by textacy; per the textacy
+            documentation these are ``(term, score)`` pairs.
+        """
+        return self._keyterms(extract.keyterms.sgrank)
+
+    def get_keyterms_based_on_scake(self):
+        """
+        Extract keyterms using the sCAKE algorithm.
+
+        Returns:
+            list: The top keyterms as returned by textacy; per the textacy
+            documentation these are ``(term, score)`` pairs.
+        """
+        return self._keyterms(extract.keyterms.scake)
+
+    def get_keyterms_based_on_yake(self):
+        """
+        Extract keyterms using the YAKE algorithm.
+
+        Returns:
+            list: The top keyterms as returned by textacy; per the textacy
+            documentation these are ``(term, score)`` pairs.
+        """
+        return self._keyterms(extract.keyterms.yake)
+
+    def bi_gramchunker(self):
+        """
+        Chunk the text into bigrams.
+
+        Returns:
+            list: The bigrams as yielded by textacy (spaCy spans per the
+            textacy documentation, not plain strings).
+        """
+        return self._ngrams(2)
+
+    def tri_gramchunker(self):
+        """
+        Chunk the text into trigrams.
+
+        Returns:
+            list: The trigrams as yielded by textacy (spaCy spans per the
+            textacy documentation, not plain strings).
+        """
+        return self._ngrams(3)