In [1]:
import os
import pdfplumber
import json
from dotenv import load_dotenv

In [2]:
def get_files_list_in_folder(folder_path):
    file_list = [f"{folder_path}/{f}" for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    return file_list

In [3]:
resume_contents = []
resume_files = get_files_list_in_folder("Resumes")
for file in resume_files:
    with pdfplumber.open(file) as pdf:
        resume_contents.append(pdf.pages[0].extract_text())

In [4]:
full_text = "".join(resume_contents[0])
print(full_text[:100])

Lakshmi Himaja Amrutham
IL,USA • (312) 7742876 • alakshmihimaja@hawk.iit.edu • linkedin.com/in/laksh


In [5]:
# GET secret keys
load_dotenv()
antropic_claude_api_key = os.getenv("ANTHROPIC_CLAUDE_API_KEY")
weaviate_cluster_url = os.getenv("WEAVIATE_CLUSTER_URL")
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")
openai_api_key = os.getenv("OPEN_AI_API_KEY")

## Generating summaries for each resume

In [6]:
from langchain_anthropic import ChatAnthropic
from langchain.prompts import ChatPromptTemplate
from langchain.chains import SequentialChain
from langchain.chains import SimpleSequentialChain, LLMChain, SequentialChain

In [7]:
# Using Anthropic Claude model
claude = ChatAnthropic(temperature=0.9, 
                       api_key=antropic_claude_api_key, 
                       model_name="claude-3-5-sonnet-20241022")


#### Building sequential chains

In [27]:
gen_json_prompt = ChatPromptTemplate.from_template(
    "Structure the following resume content in JSON format"
    "\n\n{resume_content}")
# chain 1: input= resume_content and output= resume content in structured json format
chain_one = LLMChain(llm=claude, prompt=gen_json_prompt, output_key="resume_json")

summary_prompt = ChatPromptTemplate.from_template(
    "Generate a 300 words summary for the resume content in JSON format below." 
    "Highlight all the skills, education, certifications if any, responsibilities and publications." 
    "\n\n{resume_json}"
)
# chain 2: input= English_Review and output= summary
chain_two = LLMChain(llm=claude, prompt=summary_prompt, output_key="resume_summary")

# constructing the sequential chain
resume_summary_chain = SequentialChain(chains=[chain_one, chain_two],
                                       input_variables=["resume_content"],
                                       output_variables = ["resume_summary"],
                                       verbose=False)

In [28]:
resume_summaries = []
for r_content in resume_contents:
    res = resume_summary_chain(r_content)
    resume_summaries.append(res["resume_summary"])

In [44]:
# Storing the resume summaries for future use
for r_summary in resume_summaries:
    r_summary = r_summary[42:]
    name = "".join(r_summary.split()[:2])
    with open(f"summary/{name}_resume_summary.txt", "w") as file:
        file.write(r_summary)

## Saving the Resume summaries in Vector Data store Weaviate

In [6]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.util import generate_uuid5
from weaviate.classes.config import Configure, Property, DataType, Tokenization
from weaviate.classes.query import MetadataQuery

In [7]:
client.close()

  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


NameError: name 'client' is not defined

In [8]:
# Function to get the weaviate client
def get_weaviate_client():
    return weaviate.connect_to_weaviate_cloud(
        cluster_url= weaviate_cluster_url, # Replace with your Weaviate Cloud URL
        auth_credentials= Auth.api_key(weaviate_api_key), # Replace with your Weaviate Cloud key
        headers={"X-OpenAI-Api-Key": openai_api_key}
    )

In [9]:
# Function to create or get summary collection
def create_or_get_summary_collection(coll_name, properties, client):
    if client.collections.exists(coll_name):
        return client.collections.get(coll_name)
    else:
        client.collections.delete(coll_name)
        client.collections.create( 
            name = coll_name,
            vectorizer_config=[
                # Use the "text2vec-openai" vectorizer
                Configure.NamedVectors.text2vec_openai(  
                name=properties[1], source_properties=[properties[1]]         # Set the source property(ies)
                )],
            properties=[ 
                Property(name=properties[0], 
                         data_type=DataType.TEXT),
                Property(name=properties[1], 
                         data_type=DataType.TEXT, 
                         vectorize_property_name=True,
                         tokenization=Tokenization.LOWERCASE)
            ],
            # vectorizer_config=Configure.Vectorizer.text2vec_openai(),   # Configure the OpenAI embedding integration
            generative_config=Configure.Generative.openai()             # Configure the OpenAI generative AI integration
        )
        return client.collections.get(coll_name)

In [10]:
# Get client and check if its ready
client = get_weaviate_client()
print(client.is_ready())

True


In [11]:
# Deletes collections / do not use unless necessary
client.collections.delete("resume_summaries")

In [12]:
# CREATING the collection "resume_summary"
properties = ["candidate_summary_file", "summary"]
collection_name = "resume_summaries"
resume_summaries = create_or_get_summary_collection(collection_name, properties, client)

In [13]:
# Function to persist data objects to the collection
def persist_data_to_collection(resume_summaries, resume_summary_files):
    for summary_file in resume_summary_files:
        with open(summary_file, "r") as summary:
            data_object = {
                "candidate_summary_file": summary_file[16:-4],
                "summary": summary.read(),
            }
            resume_summaries.data.insert(
                properties=data_object,
                uuid=generate_uuid5(data_object),
            )

In [14]:
# Persisting resume summary data objects to the collection
resume_summary_files = get_files_list_in_folder("Resumes/summary")
persist_data_to_collection(resume_summaries, resume_summary_files)

In [None]:
# Reading all the objects inside a collection
for item in resume_summaries.iterator():
    print(item.uuid, item.properties)
    break # remove break to print all the data objects

In [140]:
job_description = """Experience with Large Language Model (LLM) APIs. 
Experience working with Machine Learning and Deep Learning Libraries. 
Proven track record of delivering complex, scalable, and high-performance software systems. 
Knowledge on SKlearn, Python, Neural Networks, Transformers, Keras, Tensorflow and Pytorch.   
"""

response = resume_summaries.query.near_text(
    query=job_description,
    limit=4,
    target_vector="summary",  # Specify the target vector for named vector collections
    return_metadata=MetadataQuery(distance=True)
)


In [143]:
# Iterate over the respose
for o in response.objects:
    print(o.properties["summary"])
    print(o.metadata.distance)
    print("\n\n")

Karthik Kaiplody is a Machine Learning Engineer pursuing a Master's in Data Science at Illinois Institute of Technology. He holds a Post Graduate Diploma in ML and AI from IIIT Bangalore and a Bachelor's in Electronics and Communication.

Skills Highlights:
- Technical: AWS, Python, Scikit-Learn, TensorFlow, Keras, PyTorch, NLTK, spaCy, Flask, Git, SQL
- Core Competencies: MLOps, Data Pipeline Development, NLP, Deep Learning, Computer Vision, CI/CD
- Functional: Teamwork, Client Management, Interpersonal Communication

Professional Experience:
Currently working as Data Science Engineer R&D Intern at CCC Intelligent Solutions, focusing on CI/CD pipeline development for AI products and AWS migration. Previously served as Research Assistant at Illinois Institute of Technology's SCS Laboratory, developing machine learning pipelines and nutritional data tools.

At Acuity Knowledge Partners, as Machine Learning Engineer III, he:
- Implemented transformer-based models using MLOps practices
- 