## Import all the necessary libraries

In [1]:
import pdfplumber
import docx
from PIL import Image
import pytesseract
import re
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq
import os
import csv
import ast
import pandas as pd
import fitz

## Define a function to extract the text from a resume in any supported format

The `extract_text` function reads text from various file types (.pdf, .docx/.doc, image files, .txt) based on the file extension. It extracts text accordingly and returns it, printing a success message if extraction is completed or an error message if an exception occurs.

In [2]:
def extract_text(file_path):
    """Extract the text content of a resume file.

    The reader is selected from the file extension (case-insensitive):
    ``.pdf`` through ``fitz``, ``.docx``/``.doc`` through ``docx.Document``,
    ``.png``/``.jpg``/``.jpeg``/``.tiff``/``.bmp`` through
    ``pytesseract.image_to_string``, and ``.txt`` as UTF-8 text.

    Args:
        file_path: Path to the resume file.

    Returns:
        The extracted text. An empty string is returned when the extension is
        unsupported or when any step raises; the error is printed rather than
        propagated.
    """
    # Get the file extension in lowercase
    file_extension = os.path.splitext(file_path)[1].lower()

    try:
        print(f"Extracting from : {file_path}")

        if file_extension == '.pdf':
            doc = fitz.open(file_path)
            try:
                # Gather every page; returning from inside the loop would
                # silently drop everything after the first page.
                page_texts = [
                    doc.load_page(page_num).get_text("text")
                    for page_num in range(doc.page_count)
                ]
            finally:
                # Release the file handle even if a page fails to parse
                doc.close()
            # A document with no pages yields "" rather than None
            text = "\n".join(page_texts)

        elif file_extension in ['.docx', '.doc']:
            # NOTE(review): legacy binary .doc files may not be readable by
            # docx.Document; if it raises, the except branch below returns "".
            doc = docx.Document(file_path)
            text = "\n".join([para.text for para in doc.paragraphs])

        elif file_extension in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp']:
            # Context manager closes the image file once OCR has finished
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)

        elif file_extension == '.txt':
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

        # Report success only after the text has actually been produced
        print("\tExtraction completed")
        return text

    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return ""  # Return empty string on error

## Remove all special characters and retain relevant information

This function, `clean_resume_text`, removes emails, special characters, and newlines from a resume's text, making it easier to process. It then adds any extracted emails back at the end of the cleaned text for reference.

In [3]:
def clean_resume_text(text):
    """Strip emails, newlines and punctuation from resume text.

    Email addresses are pulled out first, every newline becomes a space, and
    each remaining character that is not a letter, digit or whitespace is
    replaced by a single space. The collected emails are then appended after
    a newline, one per line, so they survive the punctuation stripping.

    Args:
        text: Raw resume text.

    Returns:
        The cleaned text followed by ``"\\n"`` and the newline-joined emails
        (just a trailing ``"\\n"`` when no email was found).
    """
    email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    non_alnum_regex = re.compile(r'[^a-zA-Z0-9\s]')

    # Remember the addresses before they are cut out of the body
    found_emails = email_regex.findall(text)

    without_emails = email_regex.sub('', text)
    single_line = ' '.join(without_emails.split('\n'))
    body = non_alnum_regex.sub(' ', single_line)

    email_block = "\n".join(found_emails)
    return body + "\n" + email_block

## Extract relevant information from the resume in structured format

The function, `extract_resume_info_from_text`, leverages the LangChain framework to extract structured data from a resume's raw text using the ChatGroq API, specifically tuned to capture and format keywords in a dictionary for ease of analysis.

In [4]:
# Only fall back to the placeholder when no key is already set, so a real
# GROQ_API_KEY exported in the environment is never overwritten.
os.environ.setdefault('GROQ_API_KEY', "YOUR_API_KEY") # Use your ChatGroq API key

def extract_resume_info_from_text(preprocessed_text, model="llama3-70b-8192"):
    """Ask a ChatGroq model to pull structured fields out of resume text.

    Builds a two-message chat prompt (system instructions plus the resume as
    the user question), pipes it through ``ChatGroq`` and a
    ``StrOutputParser``, and returns the model's reply.

    Args:
        preprocessed_text: Resume text to analyse.
        model: Name of the Groq-hosted model to use. Defaults to the model
            this function originally hard-coded.

    Returns:
        The raw string produced by the chain. The system prompt requests a
        dictionary, but the reply is returned unparsed.
    """
    # Create the prompt template
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "I want you to go through a resume text and return the keywords for name ('name'), email ('email'), technical skills ('technical_skills'), soft skills ('soft_skills'), (give keywords based on the text), projects ('projects'), work experiences ('work_experience') (only the job role along with the company (Format: 'Job Role at Company' for each experience)) and total years of experience ('total_years_of_experience') of the person (only the total duration in 'm years n months' if more than a year else in 'n months'). Store the keywords only if present in the text. Let the extracted information be in dictionary format only."),
            # Deliberately NOT an f-string: {question} is a template variable
            # filled in by invoke(). Interpolating the resume here would make
            # any braces in the text be parsed as template placeholders.
            ("user", "Question: {question}")
        ]
    )

    # Initialize the groqllm with the desired model and temperature settings
    groqllm = ChatGroq(model=model, temperature=0)

    # Set up the output parser
    outputparser = StrOutputParser()

    # Create a chain to process the input through the API and parse the output
    chainSec = prompt | groqllm | outputparser

    # Run the chain with the preprocessed text
    extracted_info = chainSec.invoke({'question': preprocessed_text})

    return extracted_info

## Store the structured data in a CSV file

The `store_to_csv` function extracts a dictionary from a text, converts it, and appends it as a row to a specified CSV file. If the file doesn't exist, it writes headers based on dictionary keys before adding the data.

In [5]:
def store_to_csv(text_data, csv_filename):
    """Parse a dictionary out of free text and append it as one CSV row.

    The substring from the first ``{`` to the last ``}`` is parsed as a Python
    literal, falling back to JSON. If the CSV file is missing or empty, a
    header built from the dictionary keys is written first. If the file
    already has a header, the row is aligned to that header: missing fields
    are left blank and fields not in the header are skipped with a warning.

    Args:
        text_data: Text containing a dictionary literal (for example an LLM
            reply with surrounding commentary).
        csv_filename: Path of the CSV file to create or append to.

    Raises:
        ValueError: If no ``{...}`` span is found, it cannot be parsed, or it
            does not evaluate to a dictionary.
    """
    import json  # local import: only needed for the JSON fallback below

    # Extract the dictionary part from the text
    start_idx = text_data.find("{")
    end_idx = text_data.rfind("}")
    if start_idx == -1 or end_idx < start_idx:
        raise ValueError("No dictionary found in the provided text.")
    dict_text = text_data[start_idx:end_idx + 1]

    # Convert to a dictionary; literal_eval rejects JSON tokens such as
    # null/true/false, so retry with the JSON parser before giving up
    try:
        data = ast.literal_eval(dict_text)
    except (ValueError, SyntaxError, TypeError):
        try:
            data = json.loads(dict_text)
        except ValueError as exc:
            raise ValueError(f"Could not parse a dictionary from the text: {exc}") from exc
    if not isinstance(data, dict):
        raise ValueError("The extracted text did not evaluate to a dictionary.")

    # Reuse the header already in the file (if any) so appended rows stay
    # aligned even when this dictionary has different or reordered keys
    existing_header = None
    if os.path.isfile(csv_filename):
        with open(csv_filename, mode='r', newline='', encoding='utf-8') as file:
            existing_header = next(csv.reader(file), None)

    headers = existing_header if existing_header else list(data.keys())
    skipped_fields = [key for key in data if key not in headers]

    # Open the CSV file in append mode
    with open(csv_filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(
            file, fieldnames=headers, restval='', extrasaction='ignore'
        )

        # Write the header only if the file has none yet (missing or empty)
        if not existing_header:
            writer.writeheader()

        # Write the data row to the CSV file
        writer.writerow(data)

    if skipped_fields:
        print(f"Warning: fields not in the existing header were skipped: {skipped_fields}")
    print(f"Data stored successfully in {csv_filename}.")
## Working Example

#### 1. `resume_text` stores the text extracted from the resume

In [6]:
# Pull the raw text out of the sample PDF resume; the bare name on the last
# line makes the notebook display the extracted string as the cell output.
resume_text = extract_text("sample_resumes/Sample Resume 4.pdf")
resume_text

Extracting from : sample_resumes/Sample Resume 4.pdf
	Extraction completed


' \n AashokKumar\n aashokrekumar@gmail.com\n +91-0176884977\n Bangalore,India\n https://www.linkedin.com/in/techaieashok\n / h\n https://github.com/loavespacetec Profile\n Passionateaboutnewtechnologyhavingexperienceof2yearsinPython,DataScience,MachineLearning,SQL,\n Cloud,etc.IhaveworkedonvariousprojectsindifferentdomainsofAIandMLfieldtosolvechallengingbusiness\n problems.\n ProfessionalExperience\n Syste\n m    Engineer\n ,   TC  S  Oct2021–present\n |  Bangalore,\n India\n •\n Crastrategiestominimizerisksindecision-makingandenhanceprofitabilitythroughtheuseofdatascience\n •\n Workingwithinanagileenvironment,Ihaveeffectivelymanagedmultipletasksandprioritieswhilecollaborating\n   withcross-functionalteamstodeliverprojectsontimeandtoahighstandard.Ihaveparticipatedindailystand-\n upcallsandworkedwithinsprintcyclestoensurethetimelydeliveryofkeydeliverables.\n •\n Mystronganalyticalskills,combinedwithmyabilitytocommunicatecomplexfindingsinaclearandconcise\n   manner,haveallowedmetoeffecti

#### 2. `cleaned_text` contains the processed text after removal of special characters

In [7]:
# Strip emails, newlines and punctuation from the raw text (emails are
# re-appended at the end by clean_resume_text), then display the result.
cleaned_text = clean_resume_text(resume_text)
cleaned_text

'   AashokKumar     91 0176884977  Bangalore India  https   www linkedin com in techaieashok    h  https   github com loavespacetec Profile  Passionateaboutnewtechnologyhavingexperienceof2yearsinPython DataScience MachineLearning SQL   Cloud etc IhaveworkedonvariousprojectsindifferentdomainsofAIandMLfieldtosolvechallengingbusiness  problems   ProfessionalExperience  Syste  m    Engineer      TC  S  Oct2021 present     Bangalore   India     Crastrategiestominimizerisksindecision makingandenhanceprofitabilitythroughtheuseofdatascience     Workingwithinanagileenvironment Ihaveeffectivelymanagedmultipletasksandprioritieswhilecollaborating    withcross functionalteamstodeliverprojectsontimeandtoahighstandard Ihaveparticipatedindailystand   upcallsandworkedwithinsprintcyclestoensurethetimelydeliveryofkeydeliverables      Mystronganalyticalskills combinedwithmyabilitytocommunicatecomplexfindingsinaclearandconcise    manner haveallowedmetoeffectivelycontributetodecision makingprocessesanddrive

#### 3. The information about the candidate in the structured format is stored in `resume_info`

In [8]:
# Send the cleaned text through the ChatGroq chain and display the raw reply;
# the reply is a string that may wrap the dictionary in extra commentary.
resume_info = extract_resume_info_from_text(cleaned_text)
resume_info

"Here is the extracted information in dictionary format:\n\n```\n{\n    'name': 'Aashok Kumar',\n    'email': 'aashokrekumar@gmail.com',\n    'technical_skills': ['Python', 'Data Science', 'Machine Learning', 'SQL', 'Cloud', 'NumPy', 'Pandas', 'Matplotlib', 'Seaborn', 'Scikit learn', 'PySpark', 'SciPy', 'Tableau', 'Jupyter Notebook', 'TensorFlow', 'Anaconda', 'Git', 'Jira', 'Confluence', 'Docker', 'C', 'UFT Developer', 'Visual Studio'],\n    'soft_skills': ['Passionate', 'Strong analytical skills', 'Ability to communicate complex findings', 'Highly motivated', 'Effective collaboration', 'Time management', 'Prioritization', 'Decision making', 'Problem solving', 'Communication', 'Teamwork'],\n    'projects': ['Prediction of customer eligibility for credit using Machine Learning'],\n    'work_experience': ['System Engineer at TC S', 'Assistant System Engineer at TC S'],\n    'total_years_of_experience': '1 year 9 months'\n}\n```\n\nNote: The total years of experience is calculated based o

#### 4. `store_to_csv` function finally stores the candidate information in the CSV file

In [9]:
# Parse the dictionary out of the model reply and append it to sample_resume.csv
store_to_csv(resume_info, 'sample_resume.csv')

Data stored successfully in sample_resume.csv.


In [10]:
# Load the CSV back into a DataFrame to confirm the row was written
pd.read_csv('sample_resume.csv')

Unnamed: 0,name,email,technical_skills,soft_skills,projects,work_experience,total_years_of_experience
0,Aashok Kumar,aashokrekumar@gmail.com,"['Python', 'Data Science', 'Machine Learning',...","['Passionate', 'Strong analytical skills', 'Ab...",['Prediction of customer eligibility for credi...,"['System Engineer at TC S', 'Assistant System ...",1 year 9 months
