In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


In [3]:
# Load the participant history and keep only the rows that carry every field
# the later cells rely on (job reference, skills and participant identifier).
df = pd.read_csv('data/past_participant_info.csv')
df = df.dropna(subset=['VRF ID', 'Skillset', 'Person Id']).reset_index(drop=True)

# Split on *unique* participant IDs rather than on rows. If a participant has
# several rows, a row-level split can place the same ID in both files (leaking
# evaluation participants into training) and writes repeated IDs that multiply
# rows when the ID files are merged back on 'Person Id'. When every ID occurs
# only once, this produces exactly the same split as splitting the raw column.
train_ratio = 0.8
person_ids = df['Person Id'].drop_duplicates()
train_df, eval_df = train_test_split(person_ids, test_size=(1 - train_ratio), random_state=42)

# Persist the two ID lists as single-column CSVs with a 'Person Id' header.
train_df.to_csv("data/train_ids.csv", index=False)
eval_df.to_csv("data/eval_ids.csv", index=False)

In [4]:
def create_applicant_info_corpus(applicant_info_csv: str, train_ids_csv: str, max_samples: int = -1) -> str:
    """Create a text corpus of past job assignments for the training participants.

    One line is produced per matching row of the applicant CSV, in the form
    `` Participant with skills: <skills> was assigned to job: <job>.``, where
    ``<job>`` is the second ``-``-separated field of the row's ``VRF ID`` and
    newlines inside ``Skillset`` are replaced by spaces.

    Args:
        applicant_info_csv: Path to the CSV file containing applicant information.
            Must provide the columns ``Person Id``, ``VRF ID`` and ``Skillset``.
        train_ids_csv: Path to the CSV file containing the train IDs
            (a ``Person Id`` column).
        max_samples: Maximum number of train IDs to use. If -1, use all of them.
            Values larger than the number of available IDs are clamped.

    Returns:
        The concatenated summary lines (each terminated by a newline), or an
        empty string when no row matches.

    Raises:
        IndexError: If a ``VRF ID`` contains no ``-`` separator.
    """
    train_ids_df = pd.read_csv(train_ids_csv)
    n_ids = len(train_ids_df)
    n_samples = n_ids if max_samples == -1 else min(max_samples, n_ids)
    # Fixed seed so the same subset (and row order) is drawn on every run.
    sampled_ids_df = train_ids_df.sample(n=n_samples, random_state=42)

    job_assigns_df = sampled_ids_df.merge(pd.read_csv(applicant_info_csv), on='Person Id', how='inner')
    # The raw CSV is read unfiltered, so a selected participant may still have
    # rows without a job reference or skills. Such rows cannot be summarised
    # (NaN is a float and has no string methods), so skip them instead of crashing.
    job_assigns_df = job_assigns_df.dropna(subset=['VRF ID', 'Skillset'])

    # The job name is the second '-'-separated field of the VRF ID. A plain
    # Python split is kept so that a malformed ID still raises IndexError.
    jobs = job_assigns_df['VRF ID'].astype(str).map(lambda vrf_id: vrf_id.split('-')[1])
    # Flatten multi-line skill descriptions so each summary stays on one line.
    skills = job_assigns_df['Skillset'].astype(str).str.replace('\n', ' ', regex=False)

    summaries = " Participant with skills: " + skills + " was assigned to job: " + jobs + ".\n"
    return ''.join(summaries)

In [5]:
# Build the training corpus from every train ID (-1 means "no sample limit").
corpus = create_applicant_info_corpus('data/past_participant_info.csv', 'data/train_ids.csv', -1)

# Write with an explicit encoding: the platform default is locale-dependent and
# can fail (or produce a differently-encoded file) on non-ASCII skill text.
with open("data/train_corpus.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(corpus)

In [13]:
# Attach the participant rows to the held-out evaluation IDs.
eval_ids_df = pd.read_csv('data/eval_ids.csv')
eval_data = eval_ids_df.merge(df, on='Person Id', how='inner')

# Flatten multi-line skill descriptions, as create_applicant_info_corpus does
# for the training text, so that every evaluation prompt stays on one line.
eval_skills = eval_data['Skillset'].str.replace('\n', ' ', regex=False)
eval_data["eval_input"] = " Participant " + eval_data["Person Id"].astype(str) + " has skills: " + eval_skills

In [None]:
# Group the evaluation prompts into newline-joined batches of `chunk_size`.
# The range is bounded by the length of `eval_data` (the frame being sliced);
# iterating over the longer `df` would append an empty string for every step
# past the end of the evaluation rows. A comprehension is used so that no
# helper variable leaks out and overwrites the training `corpus`.
chunk_size = 3
chunks = [
    "\n".join(eval_data["eval_input"].iloc[start:start + chunk_size].tolist())
    for start in range(0, len(eval_data), chunk_size)
]

 Participant 317235 has skills: Basic Computer Skills / Basic Computer (MS Office and Email) Skills
 Participant 412869 has skills: Basic Computer Skills / Basic Computer (MS Office and Email) Skills
 Participant 426916 has skills: Soft Skills / Articulate in communication

 Participant 404805 has skills: Corporate / HR Manager
 Participant 356168 has skills: Soft Skills / Articulate in communication
 Participant 428648 has skills: Medical / Psychologist

 Participant 425076 has skills: IT / IT - Others
 Participant 368192 has skills: Basic Computer Skills / Basic Computer (MS Office and Email) Skills
 Participant 398213 has skills: Soft Skills / Enthusiastic

 Participant 350966 has skills: Basic Computer Skills / Basic Computer (MS Office and Email) Skills
 Participant 394568 has skills: Soft Skills / Articulate in communication
 Participant 396920 has skills: Accounting / Basic Accountancy

 Participant 362198 has skills: Soft Skills / Articulate in communication
 Participant 382173