In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os

# Setup RAG Index Input Corpus for Past Participant Data (DEPRECATED)
Note that this approach is now deprecated

In [None]:
os.makedirs(os.path.dirname("data/past_participants_split/"), exist_ok=True)
df = pd.read_csv('data/past_participant_info.csv')
df = df.dropna(subset=['VRF ID', 'Skillset', 'Person Id']).reset_index(drop=True)
train_ratio = 0.8
train_df, eval_df = train_test_split(df['Person Id'], test_size=(1 - train_ratio), random_state=42)
train_df.to_csv("data/past_participants_split/train_ids.csv", index=False)
eval_df.to_csv("data/past_participants_split/eval_ids.csv", index=False)

def create_applicant_info_corpus(applicant_info_csv: str, train_ids_csv: str, max_samples: int = -1) -> str:
    """Create a corpus of applicant information from a CSV file.

    Args:
        applicant_info_csv: Path to the CSV file containing applicant information.
        train_ids_csv: Path to the CSV file containing the train IDs.
        max_samples: Maximum number of samples to use. If -1, use all samples.

    Returns:
        A corpus of applicant information.
    """
    train_ids_df = pd.read_csv(train_ids_csv)
    max_samples = len(train_ids_df) if max_samples == -1 else min(max_samples, len(train_ids_df))
    train_ids_df = train_ids_df.sample(n=max_samples, random_state=42)
    job_assigns_df = pd.read_csv(applicant_info_csv)
    job_assigns_df = train_ids_df.merge(job_assigns_df, on='Person Id', how='inner')
    job_assigns_df['job'] = job_assigns_df['VRF ID'].apply(lambda x: x.split('-')[1])
    job_assigns_df['Skillset'] = job_assigns_df['Skillset'].apply(lambda x: x.replace('\n', ' '))
    job_assigns_df['summary'] = " Participant with skills: " + job_assigns_df['Skillset'] + " was assigned to job: " + job_assigns_df['job'] + ".\n"
    corpus = ''.join(job_assigns_df['summary'])
    return corpus

corpus = create_applicant_info_corpus('data/past_participant_info.csv', 'data/generated_participants_training/train_ids.csv', -1)
with open("data/past_participants_split/train_corpus.txt", "w") as file:
    file.write(corpus)
eval_ids_df = pd.read_csv('data/past_participants_split/eval_ids.csv')
eval_data = eval_ids_df.merge(df, on='Person Id', how='inner')
eval_data["eval_input"] =  " Participant " + eval_data["Person Id"].astype(str) + " has skills: " + eval_data['Skillset']

In [100]:
from utils import create_timestamped_results, get_latest_index_version
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever, VectorContextRetriever
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.llms.openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from llama_index.core import Settings


from dotenv import load_dotenv
# Load environment variables from a .env file
load_dotenv()

llm = OpenAI(temperature=0, model_name="gpt-4o", max_tokens=4000)
embeddings = OpenAIEmbeddings()
Settings.llm = llm
Settings.embed_model = embeddings


In [108]:

latest_vector_store_dir = get_latest_index_version("vector_store_versions/")
storage_context = StorageContext.from_defaults(persist_dir=latest_vector_store_dir)
vector_index = load_index_from_storage(storage_context)
vector_index_retriever = VectorIndexRetriever(index=vector_index)
# vector_context_retriever = VectorContextRetriever(vector_store=vector_index, top_k=5)

eval_df = pd.read_csv("data/past_participant_info.csv")
eval_df = eval_df[eval_df['Seva Allocation Accurate or not']==1].dropna(subset=['Person Id', 'Skillset', 'VRF ID', ]).sample(5)
eval_df[["Computer Skills", "Work Designation", "Education", "Education Specialization"]] = eval_df[["Computer Skills", "Work Designation", "Education", "Education Specialization"]].fillna("NA")
eval_df["eval_input"] =  " Participant " + eval_df["Person Id"].astype(str) + " has skills: " + eval_df['Skillset'] + \
                        "and specifically computer skills: " + eval_df["Computer Skills"] + \
                        ". The participant worked with designation: " + eval_df["Work Designation"] + \
                        "and has a " + eval_df["Education"] + "education specialized in " + eval_df["Education Specialization"]


query = \
"""I want you to provide job title recommendations for a set of participants based on their skills.
Each participant should get 5 unique job recommendations (please do not repeat jobs for the same person)
ranked by relevance to their skills based on the job descriptions and required skills that are available in the corpus.
Please give me the participant's number followed by the list of recommended job titles in this format
"{Participant Id}/-/{Job Title rank 1},{Job Title rank 2},{Job Title rank 3},{Job Title rank 4},{Job Title rank 5}\n".
Please do not add any other text to the response other than the id and titles.
Here is the information about the participants please do not skip a single of the 5 participants: \n"""

participants = eval_df["eval_input"].values
query += "\n".join(participants)

results = vector_index_retriever.retrieve(query)
# context = vector_context_retriever.retrieve(query)

print(query)
print(results[0].text)
print(results[1].text)
# print(context)


subdirs: ['index-20241123_135104', 'index-20241128_135403', 'index-20241130_145649', 'index-20241129_110402', 'index-20241123_134612', 'index-20241125_123955', 'index-20241127_151849', 'index-20241128_140642']
I want you to provide job title recommendations for a set of participants based on their skills.
Each participant should get 5 unique job recommendations (please do not repeat jobs for the same person)
ranked by relevance to their skills based on the job descriptions and required skills that are available in the corpus.
Please give me the participant's number followed by the list of recommended job titles in this format
"{Participant Id}/-/{Job Title rank 1},{Job Title rank 2},{Job Title rank 3},{Job Title rank 4},{Job Title rank 5}
".
Please do not add any other text to the response other than the id and titles.
Here is the information about the participants please do not skip a single of the 5 participants: 
 Participant 385890 has skills: Accounting / Basic Accountancyand spec

In [110]:
# vector_index
# vector_index_retriever

<llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever at 0x154c55a00>