In [34]:
import pandas as pd
import json

## Helper Functions

In [35]:
def check_json(x):
  try:
    json.loads(x)
    return True
  except:
    return False

In [36]:
def check_json_adv(x):
    s_pos = None
    for i in range(len(x)):
        if x[i] == "[":
            s_pos = i
    e_pos = None
    for i in range(len(x)):
        if x[i] == "]":
            e_pos = i
    if s_pos and e_pos and s_pos < e_pos:
        try:
            json.loads(x[s_pos:e_pos+1])
            return True
        except:
            return False
    return False

In [37]:
def return_adv_json(x):
    s_pos = None
    for i in range(len(x)):
        if x[i] == "[":
            s_pos = i
    e_pos = None
    for i in range(len(x)):
        if x[i] == "]":
            e_pos = i
    if s_pos and e_pos and s_pos < e_pos:
        try:
            json.loads(x[s_pos:e_pos+1])
            return x[s_pos:e_pos+1]
        except:
            return "[]"
    return "[]"

In [38]:
# Returns the Exhaustive Count from a language model's output. Checkout subsequest cells to understand usage.

def get_exhaustive_count(json_extracted_llama:pd.DataFrame, model_output_col_name:str, json_extracted_openai:pd.DataFrame, openai_col_name:str):
  links_openai = {}
  for _, r in json_extracted_openai.iterrows():
    data_points = json.loads(r[openai_col_name])
    for dp in data_points:
      if "Job Link" in dp:
        links_openai[dp["Job Link"]] = None
  exhaustive_count = 0
  for _, r in json_extracted_llama.iterrows():
    json_list = json.loads(r[model_output_col_name])
    for json_dp in json_list:
      if "Job Link" in json_dp and json_dp["Job Link"] in links_openai:
        exhaustive_count += 1
  return exhaustive_count

In [40]:
# Checkout subsequest cells to understand usage.
def get_links_in_4o(json_extracted_llama:pd.DataFrame, json_extracted_openai:pd.DataFrame, model_output_col_name:str, openai_col_name:str):
  links_llama = {}
  for _, r in json_extracted_llama.iterrows():
    data_points = json.loads(r[model_output_col_name])
    for dp in data_points:
      if "Job Link" in dp:
        links_llama[dp["Job Link"]] = None
  links_openai = {}
  for _, r in json_extracted_openai.iterrows():
    data_points = json.loads(r[openai_col_name])
    for dp in data_points:
      if "Job Link" in dp:
        links_openai[dp["Job Link"]] = None
  return (
      len([l for l in links_llama if l not in links_openai]),
      len([l for l in links_llama if l in links_openai]), # Returns the Precise Count from a language model's output.
      len([l for l in links_openai]) # Returns the Precise Count for GPT-4o
  )

## Fine Tuned Llama 3.2 1B

In [41]:
extracted_data_llama = pd.read_parquet("../results/Llama-3.2-1B_unseen_test_dataset_output.parquet")
extracted_data_llama.head(10)

Unnamed: 0,model_output,html,openai_extracted
0,I am providing you with an HTML. I want you to...,"ifgzpr-0 ebeCrQ"">Learn more</button></div></a>...","[{""Job Title"":""Data Engineer"",""Job Location"":""..."
1,I am providing you with an HTML. I want you to...,"eey"">Fraud and Risk Analyst, Identity</div><di...","[{""Job Title"":""Fraud and Risk Analyst, Identit..."
2,I am providing you with an HTML. I want you to...,"StyledButton-sc-ifgzpr-0 ebeCrQ"">Learn more</b...","[{""Job Title"":""Senior iOS Engineer"",""Job Locat..."
3,I am providing you with an HTML. I want you to...,"PageRow__Subtitle-sc-v4dlgb-4 cfQbFR"">New York...","[{""Job Title"":""Software Engineer, Matching"",""J..."
4,I am providing you with an HTML. I want you to...,"class=""JobPageRow__LeftItem-sc-v4dlgb-1 izfJqe...","[{""Job Title"":""Senior Product Manager, Airport..."
5,I am providing you with an HTML. I want you to...,""">Data Scientist, Decisions - AdTech</div><div...","[{""Job Title"":""Data Scientist, Decisions - AdT..."
6,I am providing you with an HTML. I want you to...,"geRow__RightItem-sc-v4dlgb-5 czphV""><button ta...","[{""Job Title"":""Software Engineer Intern, Mobil..."
7,I am providing you with an HTML. I want you to...,"<!DOCTYPE html><html lang=""en""><head><meta cha...",[]
8,I am providing you with an HTML. I want you to...,"Management</div></div></div><div class=""JobPag...","[{""Job Title"":""Marketing Operations Manager"",""..."
9,I am providing you with an HTML. I want you to...,", San Francisco Office - Legal</div></div></di...","[{""Job Title"":""Senior Specialist, Regulatory C..."


In [52]:
ex_links_in_4o = get_exhaustive_count(extracted_data_llama, "openai_extracted", extracted_data_llama, "openai_extracted")

In [45]:
inference_data_llama = extracted_data_llama[extracted_data_llama["openai_extracted"].apply(lambda x: len(x)>2)].reset_index(drop=True)
inference_data_llama["model_response"] = inference_data_llama["model_output"].apply(lambda x: x.split("Response:")[-1] if x is not None else "")
inference_data_llama = inference_data_llama[["openai_extracted", "model_response"]]
inference_data_llama.head()

Unnamed: 0,openai_extracted,model_response
0,"[{""Job Title"":""Data Engineer"",""Job Location"":""...","\n[]]"">### Instruction:\nI am providing you wi..."
1,"[{""Job Title"":""Fraud and Risk Analyst, Identit...",\n[]]
2,"[{""Job Title"":""Senior iOS Engineer"",""Job Locat...","\n[]]"">1</button></div><div class=""JobPageRow_..."
3,"[{""Job Title"":""Software Engineer, Matching"",""J...","\n[]]"">Software Engineer, Data Platform Comput..."
4,"[{""Job Title"":""Senior Product Manager, Airport...","\n[]]"">[{""Job Title"":""Senior Product Manager, ..."


In [46]:
non_empty_llama = inference_data_llama[inference_data_llama["model_response"].apply(lambda x:len(x)>10 and "I am providing" not in x)].reset_index(drop=True)
non_empty_llama["model_response"] = non_empty_llama["model_response"].apply(lambda x:x.replace('\n[]]">', ""))
json_extracted_llama = non_empty_llama[non_empty_llama["model_response"].apply(check_json_adv)].reset_index(drop=True)
json_extracted_llama["model_response"] = json_extracted_llama["model_response"].apply(lambda x:return_adv_json(x))
json_extracted_llama = json_extracted_llama[json_extracted_llama["model_response"].apply(lambda x: len(x)>10)].reset_index(drop=True)
json_extracted_llama.head()

Unnamed: 0,openai_extracted,model_response
0,"[{""Job Title"":""Reference Data Analyst"",""Job Lo...","[{""Job Title"":""Senior Data Scientist"",""Job Loc..."
1,"[{""Job Title"":""Technician, Data Center Operati...","[{""Job Title"":""Senior Data Engineer, Data and ..."
2,"[{""Job Title"":""Analyst, EUC"",""Job Location"":""M...","[{""Job Title"":""Software Engineer III"",""Job Loc..."
3,"[{""Job Title"":""Reference Data Analyst"",""Job Lo...","[{""Job Title"":""Senior Data Scientist"",""Job Loc..."
4,"[{""Job Title"":""Lead Business Analyst, Product ...","[{""Job Title"":""Senior Director, Data Science a..."


In [49]:
exhaustive_count_llama = get_exhaustive_count(json_extracted_llama, "model_response", inference_data_llama, "openai_extracted")
print("Exhaustive Count for fine tuned Llama 3.2 1B: ", exhaustive_count_llama)

Exhaustive Count for fine tuned Llama 3.2 1B:  7


In [53]:
llama_links_not_in_4o, llama_links_in_4o, links_in_4o = get_links_in_4o(json_extracted_llama, inference_data_llama, "model_response", "openai_extracted")
print("Count of links in fine tuned Llama 3.2 1B but not in OpenAI's 4o: ", llama_links_not_in_4o)
print("Precise Count for fine tuned Llama 3.2 1B: ", llama_links_in_4o)
print("Precise Count of links in OpenAI's 4o: ", links_in_4o)
print("Exhaustive Count of links in OpenAI's 4o: ", ex_links_in_4o)

Count of links in fine tuned Llama 3.2 1B but not in OpenAI's 4o:  81
Precise Count for fine tuned Llama 3.2 1B:  7
Precise Count of links in OpenAI's 4o:  572
Exhaustive Count of links in OpenAI's 4o:  749


## Fine Tuned ReaderLM-v2 1.5B

In [54]:
extracted_data_readerlm = pd.read_parquet("../results/ReaderLM-v2_unseen_test_dataset_output.parquet")
extracted_data_readerlm.head()

Unnamed: 0,model_output,html,openai_extracted
0,I am providing you with an HTML. I want you to...,"ifgzpr-0 ebeCrQ"">Learn more</button></div></a>...","[{""Job Title"":""Data Engineer"",""Job Location"":""..."
1,I am providing you with an HTML. I want you to...,"eey"">Fraud and Risk Analyst, Identity</div><di...","[{""Job Title"":""Fraud and Risk Analyst, Identit..."
2,I am providing you with an HTML. I want you to...,"StyledButton-sc-ifgzpr-0 ebeCrQ"">Learn more</b...","[{""Job Title"":""Senior iOS Engineer"",""Job Locat..."
3,I am providing you with an HTML. I want you to...,"PageRow__Subtitle-sc-v4dlgb-4 cfQbFR"">New York...","[{""Job Title"":""Software Engineer, Matching"",""J..."
4,I am providing you with an HTML. I want you to...,"class=""JobPageRow__LeftItem-sc-v4dlgb-1 izfJqe...","[{""Job Title"":""Senior Product Manager, Airport..."


In [55]:
inference_data_readerlm = extracted_data_readerlm[extracted_data_readerlm["openai_extracted"].apply(lambda x: len(x)>2)].reset_index(drop=True)
inference_data_readerlm["model_response"] = inference_data_readerlm["model_output"].apply(lambda x: x.split("Response:")[-1])
inference_data_readerlm = inference_data_readerlm[["openai_extracted", "model_response"]]
inference_data_readerlm.head()

Unnamed: 0,openai_extracted,model_response
0,"[{""Job Title"":""Data Engineer"",""Job Location"":""...","\n[{""Job Title"":""Data Engineer"",""Job Location""..."
1,"[{""Job Title"":""Fraud and Risk Analyst, Identit...",I am providing you with an HTML. I want you to...
2,"[{""Job Title"":""Senior iOS Engineer"",""Job Locat...",I am providing you with an HTML. I want you to...
3,"[{""Job Title"":""Software Engineer, Matching"",""J...","\n[{""Job Title"":""Software Engineer, Ads Automa..."
4,"[{""Job Title"":""Senior Product Manager, Airport...",\n[]


In [56]:
non_empty_readerlm = inference_data_readerlm[inference_data_readerlm["model_response"].apply(lambda x:len(x)>10 and "I am providing" not in x)].reset_index(drop=True)
json_extracted_readerlm = non_empty_readerlm[non_empty_readerlm["model_response"].apply(check_json)].reset_index(drop=True)
json_extracted_readerlm.head()

Unnamed: 0,openai_extracted,model_response
0,"[{""Job Title"":""Data Engineer"",""Job Location"":""...","\n[{""Job Title"":""Data Engineer"",""Job Location""..."
1,"[{""Job Title"":""Software Engineer, Matching"",""J...","\n[{""Job Title"":""Software Engineer, Ads Automa..."
2,"[{""Job Title"":""Data Scientist, Decisions - AdT...","\n[{""Job Title"":""Data Scientist, Rider Segment..."
3,"[{""Job Title"":""Senior Specialist, Regulatory C...","\n[{""Job Title"":""Data Science Intern, Decision..."
4,"[{""Job Title"":""Data Science, Decisions - Airpo...","\n[{""Job Title"":""Data Scientist, Algorithms, D..."


In [57]:
exhaustive_count_readerlm = get_exhaustive_count(json_extracted_readerlm, "model_response", inference_data_readerlm, "openai_extracted")
print("Exhaustive Count for fine tuned ReaderLM 3.2 1B: ", exhaustive_count_readerlm)

Exhaustive Count for fine tuned ReaderLM 3.2 1B:  155


In [59]:
readerlm_links_not_in_4o, readerlm_links_in_4o, links_in_4o = get_links_in_4o(json_extracted_readerlm, inference_data_readerlm, "model_response", "openai_extracted")
print("Count of links in fine tuned ReaderLM-v2 1.5B but not in OpenAI's 4o: ", readerlm_links_not_in_4o)
print("Precise Count for fine tuned readerlm 3.2 1B:: ", readerlm_links_in_4o)
print("Precise Count of links in OpenAI's 4o: ", links_in_4o)
print("Exhaustive Count of links in OpenAI's 4o: ", ex_links_in_4o)

Count of links in fine tuned ReaderLM-v2 1.5B but not in OpenAI's 4o:  258
Precise Count for fine tuned readerlm 3.2 1B::  106
Precise Count of links in OpenAI's 4o:  572
Exhaustive Count of links in OpenAI's 4o:  749
