In [9]:
# Load environment variables from a local .env file (presumably the API keys
# for the extraction, parsing and OpenAI clients used below -- confirm).
from dotenv import load_dotenv
load_dotenv()

# Patch asyncio so an already-running event loop can be re-entered; presumably
# required because the notebook kernel owns a loop and the clients below use
# asyncio internally -- confirm.
import nest_asyncio
nest_asyncio.apply()
print('done')

done


In [10]:
from typing import List, Optional, Literal
from pydantic import BaseModel, Field

In [14]:
class ResumeMetadata(BaseModel):
    # Structured fields extracted from each resume. The same model drives the
    # extraction schema, the response model, and the retriever's metadata
    # descriptions further down in this notebook.
    #
    # NOTE(review): documentation is kept in comments on purpose -- a class
    # docstring would be copied by pydantic into the generated schema.
    # NOTE(review): every field except `years_of_experience` defaults to None
    # while being annotated as non-optional; switching to Optional[...] would
    # presumably change the generated schema, so confirm before tightening.

    years_of_experience: int = Field(
        ...,
        description="Number of years of work experience.",
    )
    state: str = Field(
        default=None,
        description="The candidate's current state where they live (2 letter state abbreviation e.g. CA, AZ etc.).",
    )
    highest_degree: Literal[
        "High School",
        "Bachelor's",
        "Master's",
        "Doctoral",
        "Professional",
    ] = Field(
        default=None,
        description="The highest degree attained by the candidate.",
    )
    institution: str = Field(
        default=None,
        description="The institution from which the degree was earned.",
    )
    expert_in: Literal[
        "Accounting",
        "Sales",
        "Payroll",
        "Inventory",
        "Customer Service",
        "Project Management",
        "Finance",
        "Bookkeeping",
    ] = Field(
        default=None,
        description="What the candidate is most experienced and skilled in.",
    )
    professional_summary: str = Field(
        default=None,
        description="A brief summary of the candidate's professional experience, background and career.",
    )

In [12]:
from llama_extract import LlamaExtract

# Client for the extraction service; constructed with no arguments, so any
# credentials presumably come from the environment loaded earlier -- confirm.
extractor = LlamaExtract()

In [None]:
# Register the ResumeMetadata JSON schema with the extraction service under
# the name "Resume Metadata". `model_json_schema()` is the pydantic v2
# replacement for the deprecated `.schema()` and yields the same schema.
# NOTE(review): presumably this creates a new remote schema on every run;
# skip this cell when reusing an existing schema -- confirm.
metadata_schema = extractor.create_schema("Resume Metadata", ResumeMetadata.model_json_schema())

In [13]:
# Fetch a previously registered schema by its ID. This rebinds
# `metadata_schema`, replacing whatever the create cell produced.
# NOTE(review): the ID is hardcoded and presumably specific to one account;
# consider moving it to configuration -- confirm.
metadata_schema = extractor.get_schema('2b36f3f2-757d-463c-8205-c47ebe49c294')

In [15]:
# Push the current ResumeMetadata definition to the existing remote schema so
# the service matches the model defined in this notebook.
# `model_json_schema()` is the pydantic v2 replacement for the deprecated
# `.schema()` and yields the same schema.
metadata_schema = extractor.update_schema(metadata_schema.id, ResumeMetadata.model_json_schema())

In [18]:
# Resume PDFs, given as paths relative to the notebook's working directory.
resume_files = ['resume1.pdf', 'resume2.pdf', 'resume3.pdf']
# Run extraction for every file against the registered schema. The call
# returns two values: `all_metadata` (each item exposes `.data`, used below)
# and `all_metadata_models` (presumably ResumeMetadata instances -- confirm).
# This is slow: the recorded run took roughly seven minutes for three files.
all_metadata, all_metadata_models = extractor.extract(metadata_schema.id, resume_files, response_model=ResumeMetadata)

Extracting files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [06:53<00:00, 137.89s/it]


In [19]:
from llama_parse import LlamaParse
# Parser for the resume files; split_by_page=False presumably yields one
# document per file rather than one per page (the recorded run produced
# 3 documents for 3 files) -- confirm.
parser = LlamaParse(split_by_page=False)

In [20]:
# Parse the same files that were sent to the extractor, so documents and
# extracted metadata can be paired by position later.
documents = parser.load_data(resume_files)

Parsing files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:25<00:00,  8.38s/it]


In [27]:
# Sanity check: expect one parsed document per resume file (3 in the recorded run).
len(documents)

3

In [None]:
# Attach each resume's extracted metadata to the document parsed from the same
# file. Pairing is by position; this assumes both `documents` and
# `all_metadata` keep the order of `resume_files` -- TODO confirm.
# zip() stops silently at the shorter input, which would leave some documents
# without metadata, so a length mismatch is reported explicitly instead.
if len(documents) != len(all_metadata):
    raise ValueError(
        f"Cannot pair documents with metadata: got {len(documents)} documents "
        f"but {len(all_metadata)} extraction results."
    )

for document, metadata in zip(documents, all_metadata):
    # NOTE(review): in the recorded output `years_of_experience` comes back as
    # a string (e.g. '3') although the model declares int; numeric metadata
    # filters may not match until this is coerced -- confirm.
    document.metadata = metadata.data

In [30]:
# Show the metadata now attached to each document, one dict per line.
# NOTE(review): this output contains personal details taken from the resumes;
# clear it before sharing the notebook.
for document in documents:
    print(document.metadata)

{'state': 'MD', 'expert_in': 'Finance', 'institution': 'University of Maryland', 'highest_degree': "Bachelor's", 'years_of_experience': '3', 'professional_summary': 'Experienced finance professional with a strong background in accounting functions, including accounts payable processing, general ledger entries, and financial reporting. Proven ability to supervise staff, maintain financial records, and provide excellent customer service in a banking environment.'}
{'state': 'TX', 'expert_in': 'Accounting', 'institution': 'University of Houston', 'highest_degree': "Bachelor's", 'years_of_experience': '10', 'professional_summary': 'Degreed accountant with more than 10 years of diversified accounting experience seeking accounting position at a well-established company in Houston.'}
{'state': 'TX', 'expert_in': 'Accounting', 'institution': 'Marshall University', 'highest_degree': "Bachelor's", 'years_of_experience': '0', 'professional_summary': 'Maintained a 4.0 GPA while completing 12 hours

In [32]:
import os
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

# Location and collection name of the persistent Chroma store.
chroma_path = './chroma_db'
chroma_collection_name = 'chrm'

# Global LlamaIndex defaults used for answer synthesis and embedding.
Settings.llm = OpenAI(model='gpt-4o-mini', temperature=0.1)
Settings.embed_model = OpenAIEmbedding()

# The client, collection and vector store are needed whether the index is
# loaded or built, so they are created once up front.
chroma_client = chromadb.PersistentClient(path=chroma_path)
chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Decide by collection contents rather than by directory existence: a store
# directory left behind by an interrupted run would otherwise be "loaded" as
# an empty index and every query would silently return nothing.
if chroma_collection.count() > 0:
    # NOTE(review): a populated collection is reused as-is; changes to
    # `documents` are not re-indexed unless the store is cleared.
    index = VectorStoreIndex.from_vector_store(vector_store)
    print('loaded')
else:
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    print('created')

created


In [34]:
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo

# Describe every ResumeMetadata field (name, type, description) so the
# auto-retriever can turn a question into metadata filters.
# `model_fields` is the pydantic v2 replacement for the deprecated
# `__fields__`; it provides the same `annotation` / `description` attributes.
# NOTE(review): `str(annotation)` yields values such as "<class 'int'>";
# a plainer type name may read better in the prompt -- confirm before changing.
all_metadata_info = [
    MetadataInfo(
        name=field_name,
        type=str(field_info.annotation),
        description=field_info.description,
    )
    for field_name, field_info in ResumeMetadata.model_fields.items()
]

In [35]:
# Display the generated field descriptions for inspection.
all_metadata_info

[MetadataInfo(name='years_of_experience', type="<class 'int'>", description='Number of years of work experience.'),
 MetadataInfo(name='state', type="<class 'str'>", description="The candidate's current state where they live (2 letter state abbreviation e.g. CA, AZ etc.)."),
 MetadataInfo(name='highest_degree', type='typing.Literal[\'High School\', "Bachelor\'s", "Master\'s", \'Doctoral\', \'Professional\']', description='The highest degree attained by the candidate.'),
 MetadataInfo(name='institution', type="<class 'str'>", description='The institution from which the degree was earned.'),
 MetadataInfo(name='expert_in', type="typing.Literal['Accounting', 'Sales', 'Payroll', 'Inventory', 'Customer Service', 'Project Management', 'Finance', 'Bookkeeping']", description='What the candidate is most experienced and skilled in.'),
 MetadataInfo(name='professional_summary', type="<class 'str'>", description="A brief summary of the candidate's professional experience, background and career.")

In [36]:
# Describe the store's contents and its metadata fields for the auto-retriever.
vector_store_info = VectorStoreInfo(
    content_info="list of resume of candidates",
    metadata_info=all_metadata_info,
)
# verbose=True prints the inferred query string and filters, as seen in the
# "Using query str" / "Using filters" lines of the recorded query output.
retriever = VectorIndexAutoRetriever(index, vector_store_info, verbose=True)

In [39]:
from llama_index.core.query_engine import RetrieverQueryEngine
# Wrap the auto-retriever in a query engine; streaming=True makes responses
# expose a token generator (consumed via `response_gen` in the query cell).
query_engine = RetrieverQueryEngine.from_args(retriever=retriever, streaming=True)

In [43]:
# Ask a question that needs a metadata filter; in the recorded run the
# retriever inferred state == 'MD' from "in maryland".
resp = query_engine.query('what degree does the candidate in maryland have and from what uni?')
# Print the streamed answer token by token, without inserting newlines.
for token in resp.response_gen:
    print(token, end="")

Using query str: degree and university of the candidate
Using filters: [('state', '==', 'MD')]
The candidate has a Bachelor of Science Degree in Finance from the University of Maryland.