In [4]:
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import json

In [5]:
# Module-level OpenAI client shared by the helpers below; constructed with no
# explicit arguments, i.e. library defaults.
# NOTE(review): presumably picks up credentials from the environment
# (e.g. OPENAI_API_KEY) — confirm before running on a fresh machine.
client = OpenAI()

def get_prompt(resume):
    """Build the user prompt asking for a job description for ``resume``.

    The resume text is appended, wrapped in double quotes, after a fixed
    instruction sentence. The resulting string is returned unchanged.
    """
    instruction = (
        "generate a 200 word job description for the following resumes "
        "without proper nouns"
    )
    return '{} "{}"'.format(instruction, resume)

def get_job_description(resume):
    """Request a job description for ``resume`` from the chat model.

    A single user message built by ``get_prompt`` is sent to ``gpt-3.5-turbo``
    through the module-level ``client``; the message content of the first
    choice in the response is returned.
    """
    user_message = {"role": "user", "content": get_prompt(resume)}
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
# Load resumes and their category labels from a folder of paired files:
# every "<name>.txt" resume has its labels in "<name>.lab", one label per line.
import os
import codecs

# Number of directory entries to skip before loading begins.
START_INDEX = 10000

# Define the directory
dir_path = "./data/resumes_corpus"

# Get a list of all files in the directory.
# NOTE(review): os.listdir returns entries in arbitrary order, and the listing
# is expected to hold the .lab files as well as the .txt files, so which
# resumes this slice selects depends on the filesystem. It is left unsorted on
# purpose: sorting here could misalign this slice with any other code that
# slices the same listing — confirm before changing.
files = os.listdir(dir_path)[START_INDEX:]

# One record per usable resume. The DataFrame is built once after the loop
# instead of being re-copied by pd.concat on every iteration.
records = []

# Iterate over each file
for file in files:
    if not file.endswith(".txt"):
        continue

    # Construct full file paths; swap only the trailing extension so that a
    # ".txt" appearing elsewhere in the path is left alone.
    file_path = os.path.join(dir_path, file)
    file_path_label = os.path.splitext(file_path)[0] + ".lab"

    # Read the resume text, silently dropping bytes that are not valid UTF-8.
    with codecs.open(file_path, "r", encoding="utf8", errors="ignore") as f:
        resume = f.read()

    # Read the label file: one category per line.
    with codecs.open(file_path_label, "r", encoding="utf8", errors="ignore") as f:
        raw_labels = f.read().split("\n")

    # Strip stray whitespace / carriage returns and drop *every* blank line,
    # so an empty string can never end up as a category.
    categories = [label.strip() for label in raw_labels if label.strip()]

    # Resumes without any label are skipped.
    if not categories:
        continue

    records.append({"resume": resume, "categories": categories})

# Explicit columns keep the frame's schema stable even when nothing was loaded.
df = pd.DataFrame(records, columns=["resume", "categories"])

In [7]:
# Build the validation set: 10 rounds over every label, each round collecting
# 5 resumes that carry the label ("pos") followed by 15 that do not ("neg").
unique_labels = df["categories"].explode().unique()

# A single row iterator is shared by every draw below (and by any later cell
# that keeps calling next() on it). Rows that do not match the current request
# are consumed and discarded, so no DataFrame row is used twice.
df_generator = df.iterrows()

valid_data = []
# DataFrame index of every row that was kept, in the order it was drawn.
indices = []

for _ in range(10):
    for u_label in unique_labels:
        data = {"label": u_label, "pos": [], "neg": []}
        try:
            # Positives: rows whose categories include the label.
            while len(data["pos"]) < 5:
                index, row = next(df_generator)
                if u_label in row["categories"]:
                    data["pos"].append(row["resume"])
                    indices.append(index)
            # Negatives: rows whose categories do not include the label.
            while len(data["neg"]) < 15:
                index, row = next(df_generator)
                if u_label not in row["categories"]:
                    indices.append(index)
                    data["neg"].append(row["resume"])
        except StopIteration:
            # Without this the cell dies with a bare StopIteration that gives
            # no hint about which label exhausted the data.
            raise RuntimeError(
                f"df ran out of rows while sampling label {u_label!r} for valid_data"
            ) from None
        valid_data.append(data)

In [8]:
# Build the test set with the same recipe as the validation set: 10 rounds over
# every label, 5 resumes carrying the label ("pos") and 15 without it ("neg").
# Draws continue from the existing df_generator, so only rows that have not
# been consumed yet can be picked here.
test_data = []

for _ in range(10):
    for u_label in unique_labels:
        data = {"label": u_label, "pos": [], "neg": []}
        try:
            # Positives: rows whose categories include the label.
            while len(data["pos"]) < 5:
                index, row = next(df_generator)
                if u_label in row["categories"]:
                    data["pos"].append(row["resume"])
                    indices.append(index)
            # Negatives: rows whose categories do not include the label.
            while len(data["neg"]) < 15:
                index, row = next(df_generator)
                if u_label not in row["categories"]:
                    indices.append(index)
                    data["neg"].append(row["resume"])
        except StopIteration:
            # Without this the cell dies with a bare StopIteration that gives
            # no hint about which label exhausted the data.
            raise RuntimeError(
                f"df ran out of rows while sampling label {u_label!r} for test_data"
            ) from None
        test_data.append(data)

In [11]:
# Attach a generated job description to every entry of both splits.
# Only the first three positive resumes of an entry are sent to the model,
# joined by a dashed separator line.
resume_separator = "---------------------------------\n"
for split in (valid_data, test_data):
    for data in tqdm(split):
        resumes = resume_separator.join(data["pos"][0:3])
        data["description"] = get_job_description(resumes)

100%|██████████| 100/100 [08:33<00:00,  5.13s/it]


In [12]:
# Persist both splits as indented JSON files under ./data.
for out_path, payload in (
    ("./data/valid_data.json", valid_data),
    ("./data/test_data.json", test_data),
):
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=4)