## Read resumes from Google Cloud Storage, parse them with pyresparser, extract the details, and save them to an Excel file

### Import python libraries required for processing

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from pyresparser import ResumeParser
import os
from docx import Document
import io
import uuid
import concurrent.futures
from pathlib import Path
from google.cloud import storage
import fitz  # PyMuPDF
from itertools import islice
import warnings
warnings.filterwarnings("ignore")

#### Commands to run, in this order, to install the required libraries
pip install python-docx

pip install docx

pip install nltk

pip install spacy==2.3.5

pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz

pip install pyresparser

pip install --upgrade google-cloud-storage

pip install pypdf

pip install PyMuPDF

import nltk

nltk.download('stopwords')

In [2]:
# Console URL of the bucket; the bucket name is whatever follows the final "/".
input_path = "https://console.cloud.google.com/storage/browser/hackathon1415"
bucket_name = input_path.rpartition("/")[2]
bucket_name

'hackathon1415'

In [3]:
def extract_text_from_pdf(pdf_bytes):
    """Return the text of every page of an in-memory PDF, in page order.

    Args:
        pdf_bytes: Raw PDF content, handed to ``fitz.open`` (PyMuPDF) as its
            second positional argument.

    Returns:
        The ``get_text()`` result of each page joined with no separator;
        an empty string when the document yields no pages.
    """
    # The document is used as a context manager so it is closed on exit.
    with fitz.open("pdf", pdf_bytes) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

In [4]:
def parse_resume(text):
    """Parse resume text with pyresparser via a throw-away .docx file.

    ``ResumeParser`` is given a file path, so the text is first written into
    a single-paragraph Word document. The document lives in a private
    temporary directory that is removed on every exit path, including when
    saving the document or parsing it raises. Nothing is written to the
    current working directory.

    Args:
        text: Resume text to place in the temporary document.

    Returns:
        Whatever ``ResumeParser(path).get_extracted_data()`` returns
        (the caller treats it as a dict with a ``'skills'`` entry).
    """
    import tempfile

    with tempfile.TemporaryDirectory() as work_dir:
        # Keep the .docx extension on the scratch file, as the original did.
        temp_file_path = os.path.join(work_dir, 'resume.docx')
        doc = Document()
        doc.add_paragraph(text)
        doc.save(temp_file_path)
        return ResumeParser(temp_file_path).get_extracted_data()

In [5]:
def process_blob(blob):
    """Download one resume blob, parse it, and summarise its skills.

    Args:
        blob: Object exposing ``download_as_bytes()`` and a ``name``
            attribute (in this notebook, items from ``bucket.list_blobs``).

    Returns:
        dict: ``{'name': blob.name, 'skills': '<skill>, <skill>, ...'}``.
        ``'skills'`` is an empty string when the parser reports no skills.
    """
    resume_bytes = blob.download_as_bytes()
    text = extract_text_from_pdf(resume_bytes)
    data = parse_resume(text) or {}
    # `.get('skills', [])` only covers a missing key; `or []` also covers a
    # key that is present but None, which would make join() raise TypeError.
    skills = ', '.join(data.get('skills') or [])
    return {
        'name': blob.name,
        'skills': skills,
    }

In [6]:
# NOTE(review): service-account key file read from the working directory;
# keep this file out of version control.
path_to_private_key = 'fifth-compass-415612-76f634511b19.json'
client = storage.Client.from_service_account_json(json_credentials_path=path_to_private_key)
bucket = client.bucket(bucket_name)

str_folder_name_on_gcs = 'RESUME/data/'

# Upper bound on the number of resumes processed in one run
max_resumes = 100

# Create the directory locally
Path(str_folder_name_on_gcs).mkdir(parents=True, exist_ok=True)

blobs = bucket.list_blobs(prefix=str_folder_name_on_gcs)

# A prefix listing can include objects that are not resumes (for example a
# zero-byte "folder" placeholder). One blob failing inside executor.map would
# abort the whole batch, so only names ending in .pdf are passed on.
pdf_blobs = (blob for blob in blobs if blob.name.lower().endswith('.pdf'))

# Limit to the first `max_resumes` PDF blobs
limited_blobs = islice(pdf_blobs, max_resumes)

# Use concurrent.futures for parallel processing; map() keeps input order
with concurrent.futures.ThreadPoolExecutor() as executor:
    data_dicts = list(executor.map(process_blob, limited_blobs))


In [7]:
# Collect the per-resume records into a table and export it as an Excel
# workbook named after the bucket. pandas needs an Excel writer engine
# (e.g. openpyxl) installed for .xlsx output.
excel_file = bucket_name + '_resume_data.xlsx'
df = pd.DataFrame(data=data_dicts)
df.to_excel(excel_writer=excel_file, index=False)
print(f"Data written to {excel_file}")
# Preview the first rows as the cell output
df.head()

Data written to hackathon1415_resume_data.xlsx


Unnamed: 0,name,skills
0,RESUME/data/10247517.pdf,"Analysis, Schedules, Requests, Programming, Wi..."
1,RESUME/data/10265057.pdf,"Requests, Programming, Python, Reports, Data c..."
2,RESUME/data/10399912.pdf,"Schedules, Correspondence, Reports, Customer s..."
3,RESUME/data/10549585.pdf,"Analysis, Retention, Windows, Analytical, Sche..."
4,RESUME/data/10553553.pdf,"Analysis, Windows, Security, Servers, Database..."
