In [None]:
!pip freeze > requirements_resume_parsing.txt

In [119]:
from pyresparser import ResumeParser
from pymongo import MongoClient
from PyPDF4 import PdfFileReader
import os
import re
from tqdm import tqdm
import json 
from tika import parser
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings

# Silence library warnings so they do not clutter the cell outputs
warnings.filterwarnings(action='ignore')

# Download the NLTK resources used by the text-cleaning step:
# 'punkt' for word_tokenize, 'wordnet' for WordNetLemmatizer and
# 'stopwords' for stopwords.words('english') (previously missing, which raises
# LookupError on a machine where the corpus was never downloaded).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...


True

In [3]:
# Per-category resume folders (relative to the notebook's working directory)
software_developer_resume_path = 'resume/software developer/'      # software developer resumes
product_manager_resume_path = 'resume/product manager/'            # product manager resumes
machine_learning_resume_path = 'resume/machine learning/'          # machine learning resumes
chartered_accountant_resume_path = 'resume/chartered accountant/'  # chartered accountant resumes

In [21]:
# Names of all entries found in the chartered accountant folder, plus their count
chartered_accountant_resume = os.listdir(chartered_accountant_resume_path)
print(f"Number of Chartered Accountant Resume: {len(chartered_accountant_resume)}")

Number of Chartered Accountant Resume: 5


In [18]:
# Names of all entries found in the machine learning folder, plus their count
machine_learning_resume = os.listdir(machine_learning_resume_path)
print(f"Number of Machine Learning Resume: {len(machine_learning_resume)}")

Number of Machine Learning Resume: 6


In [19]:
# Names of all entries found in the product manager folder, plus their count
product_manager_resume = os.listdir(product_manager_resume_path)
print(f"Number of Product Manager Resume: {len(product_manager_resume)}")

Number of Product Manager Resume: 1


In [20]:
# Names of all entries found in the software developer folder, plus their count
software_developer_resume = os.listdir(software_developer_resume_path)
print(f"Number of Software Developer Resume: {len(software_developer_resume)}")

Number of Software Developer Resume: 9


In [124]:
# Helper that deletes mail-provider / social-network residue from a text
def remove_words_matching_patterns(text):
    """Return ``text`` with every match of the noise patterns deleted.

    The patterns are the substrings 'outlook', 'gmail' and 'linkedin'
    (matched case-insensitively, anywhere in the text) and any whole word
    ending in 'com'. Matches are replaced by the empty string, so the
    surrounding whitespace is left as it was.
    """
    noise_fragments = ('outlook', 'gmail', 'linkedin', r'\b\w+com\b')
    noise_regex = re.compile('|'.join(noise_fragments), flags=re.IGNORECASE)
    return noise_regex.sub('', text)


def extract_and_clean_text_from_pdf(file_path):
    """Extract the text of a resume file with Tika and normalise it.

    Processing order:
      1. read the document through ``parser.from_file``;
      2. delete URLs, e-mail addresses and GitHub/Gmail handles *before*
         punctuation is stripped (once ':', '/', '.' and '@' are gone these
         patterns can no longer be recognised);
      3. drop every character that is not a letter, digit or whitespace and
         collapse whitespace runs;
      4. tokenize, remove English stop words and lemmatize;
      5. run ``remove_words_matching_patterns`` on the joined tokens and
         lower-case the result.

    Args:
        file_path: Path of the document handed to ``parser.from_file``.

    Returns:
        The cleaned, lower-cased text. An empty string is returned when the
        parser result carries no text content.
    """
    raw = parser.from_file(file_path)
    # Guard against a missing / None 'content' entry (no extractable text),
    # which would otherwise make re.sub raise a TypeError.
    text = raw.get('content') or ''

    # Remove URLs (with scheme or starting with "www.")
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)
    # Remove LinkedIn / GitHub links written without a scheme
    text = re.sub(r'(?:linkedin|github)\.com\S*', '', text, flags=re.IGNORECASE)
    # Remove e-mail addresses (any domain)
    text = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', '', text)
    # Remove remaining GitHub or Gmail handles
    text = re.sub(r'(github|gmail)[\w.]*', '', text, flags=re.IGNORECASE)

    # Remove special characters and extra whitespaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words (compared case-insensitively)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return remove_words_matching_patterns(' '.join(lemmatized_tokens)).lower()

# Function to parse and convert resume to JSON format
def parse_and_jsonize_resume(resume_path, resume_file):
    """Parse one resume with pyresparser and return the result as a JSON string.

    Args:
        resume_path: Directory that contains the resume. A trailing path
            separator is accepted but no longer required.
        resume_file: File name of the resume inside ``resume_path``.

    Returns:
        The dictionary produced by ``ResumeParser(...).get_extracted_data()``
        serialised with ``json.dumps`` (4-space indentation).
    """
    # os.path.join yields the same path as plain concatenation when
    # resume_path ends with a separator, and a correct one when it does not.
    full_path = os.path.join(resume_path, resume_file)
    parsed_data = ResumeParser(full_path).get_extracted_data()
    return json.dumps(parsed_data, indent=4)

In [127]:
# MongoDB handles: local server -> 'job-resume-db' database -> 'resume' collection
client = MongoClient('mongodb://localhost:27017/')
db = client.get_database('job-resume-db')
collection = db.get_collection('resume')

In [128]:
# Folder path containing the resume PDFs
resume_folder = 'resume'

# Parse every resume first and only then replace the collection contents, so a
# failure half-way through parsing no longer leaves the collection empty or
# partially filled.
resume_documents = []

# Iterate over each folder in the resume directory (sorted for a reproducible order)
for category in tqdm(sorted(os.listdir(resume_folder)), desc="Extracting, Parsing Resume and Storing in MongoDB: "):
    category_folder = os.path.join(resume_folder, category)
    if not os.path.isdir(category_folder):
        continue

    # Iterate over each resume file in the category folder
    for resume_file in sorted(os.listdir(category_folder)):
        resume_path = os.path.join(category_folder, resume_file)

        # Skip nested directories; only regular files can be parsed
        if not os.path.isfile(resume_path):
            continue

        # Extract and clean the text from the resume
        cleaned_text = extract_and_clean_text_from_pdf(resume_path)

        # Parse and convert resume to JSON format
        parsed_json = parse_and_jsonize_resume(category_folder + '/', resume_file)

        # Collect the document describing this resume
        resume_documents.append({
            'index': resume_file,
            'category': category,
            'text': cleaned_text,
            'parsed_resume': json.loads(parsed_json)
        })

# Delete all existing documents, then store the freshly parsed resumes
collection.delete_many({})
if resume_documents:  # insert_many does not accept an empty list
    collection.insert_many(resume_documents)

Extracting, Parsing Resume and Storing in MongoDB: 100%|██████████| 4/4 [00:28<00:00,  7.17s/it]
