In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


In [3]:
# Load the participant records and keep only the rows where all three of the
# columns listed below are present.
required_columns = ['VRF ID', 'Skillset', 'Person Id']
df = (
    pd.read_csv('data/past_participant_info.csv')
    .dropna(subset=required_columns)
    .reset_index(drop=True)
)

# Hold out (1 - train_ratio) of the 'Person Id' values for evaluation. The
# fixed random_state makes the split repeatable for the same input data.
# NOTE(review): the split is over row values of 'Person Id'; if an id can occur
# on more than one row it may end up in both files - confirm ids are unique.
train_ratio = 0.8
train_df, eval_df = train_test_split(
    df['Person Id'],
    test_size=(1 - train_ratio),
    random_state=42,
)

# Both results are Series of ids; write them without the index column so each
# file holds a single 'Person Id' column.
train_df.to_csv("data/train_ids.csv", index=False)
eval_df.to_csv("data/eval_ids.csv", index=False)

In [4]:
def create_applicant_info_corpus(applicant_info_csv: str, train_ids_csv: str, max_samples: int = -1) -> str:
    """Create a text corpus of applicant information from a CSV file.

    Each retained row becomes one line of the form
    `` Participant with skills: <Skillset> was assigned to job: <job>.``
    where ``<job>`` is the second hyphen-separated segment of ``VRF ID``
    (whitespace-trimmed), or the whole ``VRF ID`` when it contains no hyphen.

    Args:
        applicant_info_csv: Path to the CSV file containing applicant
            information. Must have 'Person Id', 'VRF ID' and 'Skillset' columns.
        train_ids_csv: Path to the CSV file containing the train IDs
            (a 'Person Id' column).
        max_samples: Maximum number of train IDs to use. Any negative value
            (conventionally -1) means use all of them; values larger than the
            number of IDs are capped.

    Returns:
        The concatenated summary lines, each terminated by a newline. Empty
        string when no rows match.
    """
    train_ids_df = pd.read_csv(train_ids_csv)
    # Negative means "no cap"; also never ask sample() for more rows than exist.
    if max_samples < 0 or max_samples > len(train_ids_df):
        max_samples = len(train_ids_df)
    train_ids_df = train_ids_df.sample(n=max_samples, random_state=42)

    job_assigns_df = pd.read_csv(applicant_info_csv)
    job_assigns_df = train_ids_df.merge(job_assigns_df, on='Person Id', how='inner')
    # This function reads the raw CSV itself, so rows with a missing assignment
    # or skillset can still be present here; they cannot produce a summary.
    job_assigns_df = job_assigns_df.dropna(subset=['VRF ID', 'Skillset'])

    vrf_ids = job_assigns_df['VRF ID'].astype(str)
    # Second '-'-separated segment is the job; .str[1] yields NaN when there is
    # no hyphen, in which case fall back to the full identifier.
    # NOTE(review): IDs with extra hyphens (e.g. "A - B - C - 123") still map to
    # segment "B"; confirm that is the intended job label.
    jobs = vrf_ids.str.split('-').str[1].fillna(vrf_ids).str.strip()
    # Keep every summary on a single line.
    skills = job_assigns_df['Skillset'].astype(str).str.replace('\n', ' ', regex=False)

    summaries = " Participant with skills: " + skills + " was assigned to job: " + jobs + ".\n"
    return ''.join(summaries)

In [5]:
# Build the training corpus from every id in the train split (-1 = no cap).
corpus = create_applicant_info_corpus('data/past_participant_info.csv', 'data/train_ids.csv', -1)
# Pin the encoding: without it open() uses the platform locale, which makes the
# file differ between machines and can raise on non-ASCII skill text.
with open("data/train_corpus.txt", "w", encoding="utf-8") as file:
    file.write(corpus)

In [13]:
# Attach the full (NaN-filtered) participant rows to the held-out ids.
eval_ids_df = pd.read_csv('data/eval_ids.csv')
eval_data = eval_ids_df.merge(df, on='Person Id', how='inner')
# Flatten embedded newlines in the skill text, as the training corpus does, so
# every eval_input is a single line and stays intact when records are joined
# with "\n" further down.
eval_skills = eval_data['Skillset'].str.replace('\n', ' ', regex=False)
eval_data["eval_input"] = (
    " Participant " + eval_data["Person Id"].astype(str) + " has skills: " + eval_skills
)

In [20]:
# Group the evaluation inputs into newline-joined batches of `chunk_size`.
chunk_size = 3
eval_inputs = eval_data["eval_input"].tolist()
# Step over the evaluation rows themselves. Stepping over len(df) (the whole
# dataset) ran past the end of eval_data and appended an empty-string chunk for
# every out-of-range start index.
chunks = [
    "\n".join(eval_inputs[start:start + chunk_size])
    for start in range(0, len(eval_inputs), chunk_size)
]

In [27]:
# Re-read both files from disk and join the held-out ids to their participant
# rows. The participant CSV is used as read here, with no NaN filtering.
eval_ids_df = pd.read_csv('data/eval_ids.csv')
past_participant_df = pd.read_csv('data/past_participant_info.csv')
eval_df = pd.merge(
    eval_ids_df,
    past_participant_df,
    on='Person Id',
    how='inner',
)

In [31]:
# Show the evaluation frame (held-out ids joined to their participant rows) as
# the cell output; the rendered table below elides the middle rows.
eval_df

Unnamed: 0,Person Id,Computer Skills,Additional Skills,Skillset,Work Experience,Work Designation,Education,Education Specialization,VRF ID,Seva Dept,Job Description,Skills/Keyword,Seva Allocation Accurate or not,Not understood/little confusion/some doubt,Comment
0,317235,Basics,cottage accommodation reception,Basic Computer Skills / Basic Computer (MS Off...,Isha foundation,Receptionist,Bachelor of Arts (B.A.),Economics,Cottage Accomodation - Front Office Activities...,Cottage Accomodation,"Reception, Handling queries, Check-ins, Check-...",General / Hospitality/ Hotel Manager,1.0,,
1,412869,"Basic computer skills (excel, word, PPT, email)",,Basic Computer Skills / Basic Computer (MS Off...,Aman's Atlantic pvt LTD,Merchandise,HSC/12,BA,Dhyanalinga - Guiding Visitors - 2373,Dhyanalinga,"Guiding devotees to Suryakund, assisting them ...",Soft Skills / Articulate in communication,1.0,,
2,426916,Professional & expert in general uses.,"Power plant mechanical engineering, System eng...",Soft Skills / Articulate in communication,Hindalco Industries limited,Deputy Manager,Bachelor of Technology (B.Tech),Mechanical Engineering,IPC - Project Manager - 2690,IPC,Central (VnC) - Manage various short term proj...,Soft Skills / Strong Interpersonal skills (Coo...,1.0,1.0,I think this is a right allocation. But could ...
3,404805,"I possess advanced computer skills, including ...",,Corporate / HR Manager,EV Logistics,HR,Diploma,Business Administration,Indian Language Publications - Punjabi Transla...,Indian Language Publications,"Translation of Sadhguru videos, articles, and ...",Media and Communication / Copywriter,0.0,,Language requirement.
4,356168,Diploma in computer application,"Can be supportive in impressions team, seen he...",Soft Skills / Articulate in communication,Guwahati High Court,Stenographer,Bachelor of Arts (B.A.),English honors,Isha Impressions - Illustrator - 2407,Isha Impressions,· Proven work experience as an Illustrator\n· ...,Arts and Crafts / Illustrator,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,392830,Full stack developer,,IT / Developer - full stack,Accenture,Custom software analyst,Bachelor of Technology (B.Tech),ECE,Sadhanapada - Ashram Support - 2787,Sadhanapada,Ashram Support volunteers,Soft Skills / Fit for Physical Seva,0.0,,
197,429637,Basic,,Soft Skills / Fit for Physical Seva,,,,,Global Languages Publications - Arabic Transla...,Global Languages Publications,1 support coordinator: Assigns tasks to volunt...,Soft Skills / Articulate in communication,1.0,,
198,397896,Basic computer skills,Learning software development,Media and Communication / BPO,Concentrix,Support Executive,HSC/12,Na,Dhyanalinga - Guiding Visitors - 2376,Dhyanalinga,"Enter visitor details, give and receive tokens...",Media and Communication / Communication Trainer,1.0,,
199,402151,"MS office suite, Power BI",,Soft Skills / Articulate in communication,Tata consultancy services,Business Analyst,Master of Business Administration (M.B.A.),Finance Marketing,IT - Applications - Business Analyst (BA) - 2613,IT / IT - Applications,"Business Analyst, Scrum Master, Systems Analys...",IT / Business Analyst,1.0,,
