In [1]:
import pandas as pd
from openai import OpenAI
from participant_data import create_participant_db_df
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()
# Shared OpenAI client used by every API call below; constructed with no arguments,
# so the SDK is expected to pick up its API key from the environment (OPENAI_API_KEY).
client = OpenAI()

In [None]:

# Read the extracted task entities and keep only the first 20 rows so the
# per-row API calls in the normalization loop stay cheap while iterating.
task_entities_df = (
    pd.read_csv('data/entities_Work_Experience-Tasks.csv')
    .head(20)
)




In [11]:

# Prompt template for normalizing a single extracted task entity.
# The `{extracted_text}` placeholder is filled with str.format() once per row in
# the normalization loop.
# NOTE: the name `user_prompt` is assigned again further down in this notebook
# for the skill-inference prompt, so this cell must be re-run before the
# normalization loop if that later cell has been executed.
user_prompt = """
You are an AI normalizing or standardizing text of job tasks extracted from
an applicants experience descriptions. Consider if applicants made wordy or uncommon descriptions of their
tasks and reduce or simplify them to a more concise phrase that can be used in structured data.

Please try to reduce each entity into maximum 4 words. Entities that are not common descriptions
please completely change them to a synonymous task that is commonly understood in the work world.
You should remove any super specific words for the applicants company or specific situation.

Entity to normalize:
{extracted_text}
"""


In [12]:

# Normalize every extracted task entity with the chat model and print each
# original/normalized pair for manual inspection. One API call is made per row.
for _, row in task_entities_df.iterrows():
    messages = [
        {
            "role": "system",
            "content": "You transform extracted entities into more normalized/ standardize words or phrases of the entity, no extra text."
        },
        {
            "role": "user",
            "content": user_prompt.format(extracted_text=row['extracted_text'])
        }
    ]

    # temperature=0 keeps the replies as repeatable as the API allows.
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=messages,
        temperature=0
    )
    # `message.content` is nullable in the Chat Completions API; fall back to ""
    # so a missing reply prints as empty instead of raising AttributeError on
    # .strip() and aborting the remaining rows.
    gpt_reply = (response.choices[0].message.content or "").strip()
    print("Extracted Text: ", row['extracted_text'])
    print("Normalized Text: ", gpt_reply)
    print()

Extracted Text:  3d modelling
Normalized Text:  3D modeling

Extracted Text:  2d drawings
Normalized Text:  2D design

Extracted Text:  client meetings
Normalized Text:  client consultations

Extracted Text:  site visits
Normalized Text:  field inspections

Extracted Text:  making presentations
Normalized Text:  creating presentations

Extracted Text:  2D drawings check
Normalized Text:  Review 2D drawings

Extracted Text:  Preparing & monitoring L3 Baseline Schedule
Normalized Text:  Schedule preparation and monitoring

Extracted Text:  Progress monitoring & control
Normalized Text:  Performance tracking

Extracted Text:  Preparing of Progress reports & lookahead schedules
Normalized Text:  Progress reporting and scheduling

Extracted Text:  Coordination with the Engineering Team
Normalized Text:  Team collaboration with engineers

Extracted Text:  Coordination with the sub-contractors & vendors for the delivery & execution of work at site
Normalized Text:  Vendor and subcontractor co

In [None]:
"""
Next steps

Maybe group entities that are close in embedding space (cosine similarity above
some threshold) and ask the LLM to form clusters from each group.
"""

In [5]:
# Load data
# Columns passed to create_participant_db_df when building the participant frame
# from the raw export.
target_columns = ['SP ID', 'Work Experience/Company', 'Work Experience/Designation',
'Work Experience/Tasks', 'Work Experience/Industry',
'Education/Qualifications', 'Education/Specialization',
'Any Additional Skills', 'Computer Skills', 'Skills',
'Languages', 'Gender', 'Age', 'Work Experience/From Date', 'Work Experience/To Date']
participant_info_raw_df = pd.read_csv('../../data/input_participant_info_raw.csv')
participant_db_df = create_participant_db_df(participant_info_raw_df, target_columns)

# Free-text columns that are kept alongside the participant id.
extraction_columns = [
    "Work Experience/Designation",
    "Work Experience/Tasks",
    "Any Additional Skills",
    ]
# Keep the id plus the extraction columns for the first 10 participants only.
# The explicit .copy() makes this an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning; taking head(10)
# first means only the retained rows are converted.
participant_db_df = participant_db_df[['SP ID'] + extraction_columns].head(10).copy()
# Columns converted to their string representation
# (presumably list-valued after create_participant_db_df; TODO confirm).
type_list_cols = [
    "Work Experience/Designation",
    "Work Experience/Tasks",
    ]
for col in type_list_cols:
    participant_db_df[col] = participant_db_df[col].apply(str)

In [7]:
# Prompt template for inferring skills from an applicant's task descriptions.
# The `{tasks}` placeholder is filled with str.format() once per participant in
# the loop below.
# NOTE: this rebinds `user_prompt`, replacing the entity-normalization template
# defined earlier in the notebook. Re-running the normalization loop after this
# cell would raise KeyError, because that loop formats with `extracted_text=`
# while this template expects `tasks`.
user_prompt = """
Take the given text describing an applicant's tasks they have done at their job and provide a comma separated list
of the skills these tasks imply the applicant has. Do not return anything other than the list of skills separated
by commas. Here is the text of applicant tasks...
{tasks}
"""

In [9]:

# For each participant, send the stringified task list to the chat model and
# print the task text followed by the comma-separated skills it returns.
for _, row in participant_db_df.iterrows():
    tasks_str = str(row["Work Experience/Tasks"])
    # "['NA']" is the string form of a task list holding only the 'NA' marker;
    # such rows are skipped without calling the API.
    if tasks_str == "['NA']":
        continue
    print(tasks_str)
    messages = [
        {
            "role": "system",
            "content": "You extract and infer skills from a corpus of text on work tasks of a job applicant, no extra text."
        },
        {
            "role": "user",
            "content": user_prompt.format(tasks=tasks_str)
        }
    ]

    # temperature=0 keeps the replies as repeatable as the API allows.
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=messages,
        temperature=0
    )
    # `message.content` is nullable in the Chat Completions API; fall back to ""
    # so a missing reply prints as empty instead of raising AttributeError.
    gpt_reply = (response.choices[0].message.content or "").strip()
    # Print inside the loop so every participant's reply follows its task text
    # (the original line had a stray leading "=" that made the cell a syntax error).
    print(gpt_reply)

['3d modelling, 2d drawings, client meetings and site visits', '3d modelling, making presentations , 2D drawings check']
3D modelling, 2D drawings, client meetings, site visits, making presentations, drawing checks
['1) Preparing & monitoring L3 Baseline Schedule (2) Progress monitoring & control (3) Preparing of Progress reports & lookahead schedules. (4) Coordination with the Engineering Team.', '1) Coordination with the sub-contractors & vendors for the delivery & execution of work at site. (2) Ordering & Delivery of Structural Steel as per the scope. (3) Performing Technical Bid Evaluation for the identified subcontractors. (4) Reconciliation of construction materials. (5) Traking of overall procurement status, from dispatch to billing at site. (6) Raising Claims (7) Preparing of L3 Baseline Schedule. (8) Preparing & Monitroing Weekly & Monthly Progress Reports']
Project management, scheduling, progress monitoring, report preparation, coordination, vendor management, procurement, t