# Use BERTScore and ROUGE for Measurement.

Import packages.

In [10]:
import pandas as pd
import numpy as np

from bert_score import BERTScorer
from json import dumps
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Load the LangGraph outputs and the original advertisements.

In [11]:
# CSV holding the structured fields produced by the LangGraph run.
LLM_OUTPUTS_CSV = "job-description-logs/loop_test_full_20251025_201551.csv"
# CSV holding the original advertisements the LLM outputs are compared with.
ORIGINAL_ADS_CSV = "job-description-data/sample_df.csv"

llm_outputs_df = pd.read_csv(LLM_OUTPUTS_CSV)
original_df = pd.read_csv(ORIGINAL_ADS_CSV)

In [12]:
# Columns of the LLM output that take part in the comparison.
llm_columns = [
    'job_title', 'skills', 'responsibilities', 'requirements',
    'classification', 'salary', 'location', 'workType',
]

# Serialize each row to a single JSON string so that one record can be
# scored as one piece of text.
llm_json = (
    llm_outputs_df[llm_columns]
    .apply(lambda record: dumps(record.to_dict()), axis=1)
    .tolist()
)
llm_json

['{"job_title": "[\'Business Services Senior Accountant\']", "skills": "[\'Preparation of financial statements\', \'tax returns\', \'BAS\', \'FBT\', \'Payroll tax\', \'Journal entries\', \'reconciling accounts\', \'liaising with ATO and ASIC\', \'ad hoc advisory\', \'research work\']", "responsibilities": "[\'Preparation of financial statements, tax returns, BAS, FBT, and Payroll tax\', \'Journal entries and reconciling accounts\', \'Liaising with the ATO, ASIC, and other bodies\', \'Ad hoc advisory and research work\', \'Mentor juniors and grads\']", "requirements": "[\'CA Qualified (or near completion)\', \'4 - 5 years experience\', \'Ability to monitor WIP and billings\', \'Solid understanding of Tax fundamentals\', \'Excellent written and verbal communication\', \'Enthusiastic and self-starter attitude\']", "classification": "[\'Accounting\', \'Business Services & Corporate Advisory\']", "salary": "[]", "location": "[\'CBD, Inner West & Eastern Suburbs\', \'Sydney\']", "workType": 

In [13]:
# Columns of the original advertisement that take part in the comparison.
original_columns = [
    'title', 'job_description_clean', 'classification',
    'subClassification', 'area', 'location', 'suburb', 'workType',
]

# Serialize each advertisement row to a single JSON string, mirroring the
# treatment of the LLM outputs.
original_json = (
    original_df[original_columns]
    .apply(lambda record: dumps(record.to_dict()), axis=1)
    .tolist()
)
original_json

['{"title": "Business Services Senior Accountant", "job_description_clean": "This Senior Accountant role is with a national firm who have been a mainstay of the AFR Top 50 list over the past decade and they are looking for a recently qualified accountant to join their ever - growing firm. Working directly with partners and directors, you will get hands on experience with some of the highest profile clients that they have on their books while also getting the opportunity to mentor juniors and grads as they some up through the ranks.\\nThere\\u2019s a high priority on a work life balance as well as ongoing professional development to make sure that you can be the best version of you that you can possibly be. This is a perfect opportunity for someone who is looking for a high class, corporate firm who promote on performance not tenure. The Role Preparation of financial statements, tax returns, BAS, FBT and Payroll tax Journal entries and reconciling accounts Liaising with the ATO, ASIC an

Compute BERTScore.

In [14]:
# Score each LLM record (candidate) against the original advertisement at the
# same position (reference).
# NOTE(review): bert-base-uncased has a 512-token input limit, so long
# advertisements may be truncated before scoring — confirm this is acceptable.
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score(llm_json, original_json)
print(
    f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}"
)

# Keep the per-record scores, one column per metric.
bert_df = pd.DataFrame(
    {
        "bertscore_precision": P.tolist(),
        "bertscore_recall": R.tolist(),
        "bertscore_f1": F1.tolist(),
    }
)

BERTScore Precision: 0.7315, Recall: 0.5903, F1: 0.6528


Compute ROUGE scores.

In [15]:
# Records are paired by position. zip() stops at the shorter list, so a
# row-count mismatch would silently drop records instead of failing; check
# the lengths explicitly before scoring.
if len(llm_json) != len(original_json):
    raise ValueError(
        f"Cannot pair records: {len(llm_json)} LLM outputs vs "
        f"{len(original_json)} original advertisements"
    )

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Per-record F-measures for unigram, bigram and longest-common-subsequence overlap.
rouge1_f, rouge2_f, rougel_f = [], [], []
for cand, ref in zip(llm_json, original_json):
    # RougeScorer.score takes (target, prediction): the original advertisement
    # is the target and the LLM record is the prediction.
    s = scorer.score(ref, cand)
    rouge1_f.append(s['rouge1'].fmeasure)
    rouge2_f.append(s['rouge2'].fmeasure)
    rougel_f.append(s['rougeL'].fmeasure)

# Build the frame in one step rather than adding columns to an empty frame.
rouge_df = pd.DataFrame(
    {
        "rouge1_f1": rouge1_f,
        "rouge2_f1": rouge2_f,
        "rougeL_f1": rougel_f,
    }
)


In [16]:
# Put the BERTScore and ROUGE columns side by side; rows are aligned on the index.
measure_df = pd.concat(objs=[bert_df, rouge_df], axis="columns")
measure_df

Unnamed: 0,bertscore_precision,bertscore_recall,bertscore_f1,rouge1_f1,rouge2_f1,rougeL_f1
0,0.734463,0.582393,0.649648,0.453865,0.380952,0.42394
1,0.751452,0.62898,0.684783,0.585227,0.451429,0.4375
2,0.74392,0.528512,0.617983,0.354286,0.229885,0.24
3,0.70608,0.57138,0.631628,0.399274,0.302368,0.373866
4,0.675221,0.507401,0.579404,0.226974,0.155116,0.184211
5,0.758698,0.662294,0.707226,0.605932,0.510638,0.563559
6,0.727079,0.585343,0.648558,0.439189,0.360544,0.405405
7,0.718849,0.592076,0.649333,0.446634,0.375618,0.436782
8,0.770313,0.684498,0.724874,0.69375,0.540881,0.65
9,0.684678,0.603354,0.641449,0.51419,0.361809,0.434057


Combine the BERTScore and ROUGE-L measurements into a single score.

In [18]:
# Per-record combined score: the unweighted average of BERTScore F1 and ROUGE-L F1.
measure_df["combined_f1"] = (
    measure_df["bertscore_f1"].add(measure_df["rougeL_f1"]).div(2)
)

# Averages over all records, computed up front so the report below only formats.
bert_f1_mean = measure_df["bertscore_f1"].mean()
rougel_f1_mean = measure_df["rougeL_f1"].mean()
combined_f1_mean = measure_df["combined_f1"].mean()

print("\n=== Average Scores ===")
print(f"BERTScore F1: {bert_f1_mean:.4f}")
print(f"ROUGE-L F1  : {rougel_f1_mean:.4f}")
print(f"Combined F1 : {combined_f1_mean:.4f}")


=== Average Scores ===
BERTScore F1: 0.6528
ROUGE-L F1  : 0.3948
Combined F1 : 0.5238
