In [None]:
import requests
import pandas as pd
import os
from tqdm.auto import tqdm
import json
import math

In [None]:
# Locations of the input dataset and of the evaluation outputs, plus the
# version tags that are embedded in column names and file names below.
SOURCE_BASE_DIR = "./datasets"
# os.path.join inserts exactly one separator; concatenating os.sep with
# "/evaluation" produced a doubled separator ("./datasets//evaluation").
EVALUATION_BASE_DIR = os.path.join(SOURCE_BASE_DIR, "evaluation")
VERSION_MODEL = "0.8.0"
VERSION_DATA = "_001"
FILE_NAME = f"latam{VERSION_DATA}.csv"
FILE_PATH = os.path.join(SOURCE_BASE_DIR, FILE_NAME)

In [None]:
# Use the same sample: read it from FILE_PATH and keep a single row for each
# combination of description and form-provided salary fields.
dedup_columns = [
    "description",
    "salary_min_form",
    "salary_max_form",
    "currency_form",
    "time_lapse_form",
]
jobs_df = pd.read_csv(FILE_PATH).drop_duplicates(subset=dedup_columns)

In [None]:
# (rows, columns) of the sample after duplicates were dropped
jobs_df.shape

In [None]:
# Number of rows per country, largest group first
jobs_df.groupby(by=["country"]).size().sort_values(ascending=False)

In [None]:
def extract_salary(description, job_id, timeout=120):
    """Request a salary prediction for one job description from the local service.

    Args:
        description: Text of the job description to send.
        job_id: Identifier forwarded to the service under the "jobid" key.
        timeout: Seconds to wait for the service before ``requests`` gives up.
            Without a timeout a stalled service would block the run forever.

    Returns:
        The decoded JSON body when the service answers with HTTP 200.
        Otherwise a fallback dict with ``min`` and ``max`` set to 0 and the
        received status code under ``error_status_code``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, e.g. on a connection failure or when ``timeout`` expires.
    """
    response = requests.post(
        'http://localhost:3000/predict',
        data=json.dumps({"description": description, "jobid": job_id}),
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )

    if response.status_code == 200:
        return response.json()

    return {'max': 0, 'min': 0, 'error_status_code': response.status_code}

In [None]:
# Query the prediction service for every job and collect one result tuple per
# successful call, while tracking the running token and cost totals.
salary_info = []
total_tokens = 0
accumulate_cost = 0
# A detailed sample is printed once every `progress_steps` rows (~15% of the data).
progress_steps = math.ceil(jobs_df.shape[0]*0.15)

# enumerate() supplies the row position: the index labels are no longer
# consecutive after drop_duplicates, so they cannot pace the progress report.
for position, (index, row) in enumerate(tqdm(jobs_df.iterrows(), total=jobs_df.shape[0])):
    try:
        salary_output = extract_salary(row.description, row.job_id)

        # extract_salary returns a fallback dict for non-200 replies; report the
        # status explicitly instead of failing later with an opaque KeyError.
        if "error_status_code" in salary_output:
            raise RuntimeError(
                f"job_id {row.job_id}: prediction service returned HTTP {salary_output['error_status_code']}"
            )

        sent_tokens = salary_output["token_count_sent_chat_gpt"]
        total_tokens += sent_tokens

        cost = float(salary_output["cost"])
        accumulate_cost += cost

        # Token counts are approximated by splitting on single spaces.
        orignal_description_tokens = len(row.description.split(" "))
        reduce_description_len = len(salary_output["reduce_description"].split(" "))
        ratio_sent_original_description = reduce_description_len/orignal_description_tokens

        # Field order must match the column list used to build salary_df.
        salary_info.append((
            row.job_id,
            salary_output["reduce_description"],
            salary_output["min"],
            salary_output["max"],
            salary_output["currency"],
            salary_output["time_lapse"],
            salary_output["source"],
            sent_tokens,
            orignal_description_tokens,
            ratio_sent_original_description,
            cost,
        ))

        if (position%progress_steps)==0:
            print(f"[output salary: {salary_output}]")
            print("[------- Full Description -------]")
            print(row.description)
            print("[------- Reduce description -------]")
            print(salary_output["reduce_description"])
            print(f"==== Tokens sent for this JD:{sent_tokens} - Total sent tokens: {total_tokens} - tokens in complete description:{orignal_description_tokens} = Ratio sent/original description so far:{ratio_sent_original_description} | Cost for this JD:{cost} - Total cost so far: {accumulate_cost} ====")

    except Exception as e:
        # Best effort: a failing job is reported and skipped so that one bad
        # row does not abort the whole run.
        print(f"error: {e}")

print(f"estimated_cost: {accumulate_cost} per: {jobs_df.shape[0]} jobs description | Total sent tokens: {total_tokens}")

In [None]:
# Column labels follow the field order of the tuples stored in salary_info;
# model-dependent columns carry the model version as a suffix.
model_suffix = f"_{VERSION_MODEL}"
salary_columns = [
    "job_id",
    "reduce_description",
    "salary_min_gpt" + model_suffix,
    "salary_max_gpt" + model_suffix,
    "currency_gpt" + model_suffix,
    "time_lapse_gpt" + model_suffix,
    "source" + model_suffix,
    "token_count_sent_chat_gpt" + model_suffix,
    "token_count_original_description",
    "token_rate_sent_original_description",
    "gpt_cost" + model_suffix,
]
salary_df = pd.DataFrame(salary_info, columns=salary_columns)

In [None]:
# Peek at up to 10 random predictions. Capping n at the row count avoids the
# ValueError that sample() raises when asked for more rows than exist.
salary_df.sample(n=min(10, len(salary_df)))

# Add missing fields

In [None]:
# Left join on job_id: every job is kept, and jobs without a prediction get
# NaN in the prediction columns.
jobs_with_salary = jobs_df.merge(salary_df, how="left", on="job_id")

# How many jobs with a salary did we get

In [None]:
# Count the jobs for which the model returned a positive minimum salary.
salary_min_column = f"salary_min_gpt_{VERSION_MODEL}"
all_jobs_count = jobs_with_salary.shape[0]
jobs_with_salary_count = int((jobs_with_salary[salary_min_column] > 0).sum())

# The figure is printed with a "%" sign, so scale the fraction to a real
# percentage; an empty frame yields 0.0 instead of a division by zero.
salary_share_pct = 100 * jobs_with_salary_count / all_jobs_count if all_jobs_count else 0.0

print(f"Salary jobs count: {all_jobs_count} | {jobs_with_salary_count} - jobs with salary {round(salary_share_pct, 1)}%")

In [None]:
# Total rows per country, largest first, as a two-column frame
# ("country", "count_all").
all_jobs_by_country = (
    jobs_df
    .groupby(by=["country"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count_all")
)

all_jobs_by_country

In [None]:
# By country: count only the jobs whose predicted minimum salary is positive.
has_salary = jobs_with_salary[f"salary_min_gpt_{VERSION_MODEL}"] > 0
salary_count_by_country_df = (
    jobs_with_salary[has_salary]
    .groupby(by=["country"])
    .size()
    .sort_values(ascending=False)
    .reset_index(name="count_with_salary")
)

In [None]:
# Display the per-country count of jobs with a positive predicted salary
salary_count_by_country_df

In [None]:
# Start from the per-country totals so that countries where no salary was
# extracted stay in the report with a count of 0 instead of disappearing.
all_jobs_count_df = pd.merge(all_jobs_by_country, salary_count_by_country_df, on="country", how="left")
all_jobs_count_df["count_with_salary"] = all_jobs_count_df["count_with_salary"].fillna(0).astype(int)
# Keep the column order the table had before.
all_jobs_count_df = all_jobs_count_df[["country", "count_with_salary", "count_all"]]

In [None]:
# Share of jobs with an extracted salary per country, stored as a real
# percentage to match the "%" column name. Column-wise division replaces the
# row-wise apply() and also handles an empty frame.
all_jobs_count_df["%"] = (
    all_jobs_count_df["count_with_salary"] / all_jobs_count_df["count_all"] * 100
).round(1)

In [None]:
# Show the countries ordered by the "%" column, highest first
all_jobs_count_df.sort_values(by="%", ascending=False)

In [None]:
# Persist the jobs together with their predictions. The target directory is
# created first because to_csv() does not create missing folders.
os.makedirs(EVALUATION_BASE_DIR, exist_ok=True)
jobs_with_salary.to_csv(EVALUATION_BASE_DIR+os.sep+f"jobs_with_salaries_v_{VERSION_MODEL}_val_set_v_{VERSION_DATA}.csv")