In [1]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
import pandas as pd
from enum import Enum

df = pd.read_csv('salary_labelled_development_set.csv', encoding='utf-8')

row = 15

class Freq(Enum):
    HOURLY = 1
    DAILY = 2
    WEEKLY = 3
    MONTHLY = 4
    ANNUAL = 5

# cleaning
def cleaning(df):
    def extract_salary_fields(y_true):
        return y_true.split('-')    

    df[['salary_min', 'salary_max', 'currency', 'period']] = df['y_true'].apply(
        lambda x: pd.Series(extract_salary_fields(x))
    )
    df.replace("None", None, inplace=True)
    df = df.dropna(subset=['salary_min', 'salary_max', 'currency', 'period'])

    df['salary_min'] = pd.to_numeric(df['salary_min'], errors='coerce')
    df['salary_max'] = pd.to_numeric(df['salary_max'], errors='coerce')

    df['period'] = df['period'].apply(lambda x: Freq[x])

    return df

df = cleaning(df)

#extraction
print(df.head())

     job_id                                          job_title  \
0  72000415  Financial Account - Call Center Agent - Up to 34k   
1  69481519  Aspiring Call Center Agents - Work from Home -...   
5  71611666                        Sanrio Gift Gate 兼職店務員(馬鞍山)   
7  72301597  Customer Service Agent With 1 month Call Cente...   
8  71038710                      ASAP - HR AND ADMIN ASSISTANT   

                                      job_ad_details nation_short_desc  \
0  <div><div><div>\n \n Job Opening \n \n <p>\n F...                PH   
1  <div><div>\n <div>\n <p><b>Job Opening</b></p>...                PH   
5  <div><div>\n<strong>Sanrio Gift Gate 兼職店務員(馬鞍山...                HK   
7  <div><div>\n \n <div>\n <div>\n \n Job Opening...                PH   
8  <div><div>\n <div>\n <p><b>Job Opening</b></p>...                PH   

  salary_additional_text                   y_true  salary_min  salary_max  \
0                    NaN  17500-17500-PHP-MONTHLY       17500       17500   
1   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['salary_min'] = pd.to_numeric(df['salary_min'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['salary_max'] = pd.to_numeric(df['salary_max'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['period'] = df['period'].apply(lambda x: Freq[x])


In [None]:
from bs4 import BeautifulSoup

raw_context = df.iloc[0]['job_ad_details']
context = BeautifulSoup(raw_context, "html.parser").get_text(separator=" ")

questions = {
    "salary minimum": "What is the lowest salary offered?",
    "salary maximum": "What is the highest salary offered?",
    "salary currency": "What is the currency of the salary?",
    "salary frequency": "Is the salary paid monthly, annually, or per hour?"
}

for label, question in questions.items():
    result = qa_pipeline(question=question, context=context)
    print(f"{label}: {result['answer']}")

salary minimum: ₱15,000 - ₱20,000
salary maximum: ₱15,000 - ₱20,000
salary currency: ₱15,000 - ₱20,000
salary frequency: ₱15,000 - ₱20,000


In [11]:
from datasets import Dataset

questions = {
    "salary_min": "What is the lowest salary offered?",
    "salary_max": "What is the highest salary offered?",
    "currency": "What is the currency of the salary?",
    "period": "Is the salary paid monthly, annually, or per hour?"
}

qa_rows = []
for _, row in df.iterrows():
    context = row["job_ad_details"]
    for label, question in questions.items():
        qa_rows.append({
            "context": context,
            "question": question,
            "answers": {
                "text": [str(row[label])],
                "answer_start": [context.find(str(row[label]))] if str(row[label]) in context else [0]
            },
            "id": f"{row['job_id']}_{label}"
        })

qa_dataset = Dataset.from_pandas(pd.DataFrame(qa_rows))

print(qa_dataset[0])

{'context': '<div><div><div>\n \n Job Opening \n \n <p>\n Financial Account - Call Center Agent - Up to 34k\n </p>\n </div></div></div>\n<div><div></div></div>\n<div><div><div>\n \n Job Industry\n \n \n <p>\n Telecommunications </p>\n </div></div></div>\n<div><div></div></div>\n<div><div><div>\n \n Job Type \n \n <p>\n Full-Time </p>\n </div></div></div>\n<div><div></div></div>\n<div><div><div>\n \n Experience Level\n \n \n <p>\n Entry Level </p>\n </div></div></div>\n<div><div></div></div>\n<div><div><div>\n \n Date Posted \n \n <p>\n 2022-10-27 </p>\n </div></div></div>\n<div><div></div></div>\n<div><div><div>\n \n Job Location \n \n <p>\n Pasig BlvdPasig1000NCRPhilippines </p>\n </div></div></div>\n<div><div></div></div>\n<div><div><div>\n \n Company Information \n \n <p>\n Sapient\n \n Pasig Blvd \n Cebu, Central Visayas \n 6019 \n Sapient is Philippine-based BPO that provides a range of outsourcing services from consulting services, IT-enabled services, and call center services pr

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def prepare_train_features(example):
    encoding = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )

    offsets = encoding.pop("offset_mapping")

    start_char = example["answers"]["answer_start"][0]
    end_char = start_char + len(example["answers"]["text"][0])

    sequence_ids = encoding.sequence_ids()

    token_start_index = sequence_ids.index(1)
    token_end_index = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    start_token = end_token = 0
    for idx in range(token_start_index, token_end_index + 1):
        start, end = offsets[idx]
        if start <= start_char < end:
            start_token = idx
        if start < end_char <= end:
            end_token = idx

    encoding["start_positions"] = start_token
    encoding["end_positions"] = end_token

    return encoding

tokenized_dataset = qa_dataset.map(prepare_train_features, batched=False, remove_columns=qa_dataset.column_names)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Map: 100%|██████████| 5268/5268 [00:11<00:00, 440.40 examples/s]


In [16]:
training_args = TrainingArguments(
    output_dir="./finetuned-qa-model",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="no",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(
  0%|          | 0/1977 [05:19<?, ?it/s]
 25%|██▌       | 500/1977 [09:01<27:18,  1.11s/it]
 25%|██▌       | 500/1977 [09:01<27:18,  1.11s/it]

{'loss': 1.0253, 'grad_norm': 9.790283203125, 'learning_rate': 3.735457764289327e-05, 'epoch': 0.76}


 51%|█████     | 1000/1977 [17:58<17:57,  1.10s/it] 
 51%|█████     | 1000/1977 [17:58<17:57,  1.10s/it]

{'loss': 0.5762, 'grad_norm': 5.8679304122924805, 'learning_rate': 2.470915528578655e-05, 'epoch': 1.52}


 76%|███████▌  | 1500/1977 [27:11<08:47,  1.11s/it]
 76%|███████▌  | 1500/1977 [27:11<08:47,  1.11s/it]

{'loss': 0.4772, 'grad_norm': 2.8641157150268555, 'learning_rate': 1.2063732928679819e-05, 'epoch': 2.28}


100%|██████████| 1977/1977 [35:48<00:00,  1.07it/s]
100%|██████████| 1977/1977 [36:05<00:00,  1.10s/it]

{'train_runtime': 2165.1461, 'train_samples_per_second': 7.299, 'train_steps_per_second': 0.913, 'train_loss': 0.5884200015572146, 'epoch': 3.0}





TrainOutput(global_step=1977, training_loss=0.5884200015572146, metrics={'train_runtime': 2165.1461, 'train_samples_per_second': 7.299, 'train_steps_per_second': 0.913, 'total_flos': 4129533943455744.0, 'train_loss': 0.5884200015572146, 'epoch': 3.0})

In [20]:
test_df = pd.read_csv('salary_labelled_test_set.csv', encoding='utf-8')

test_df = cleaning(test_df)

test_rows = []
for _, row in test_df.iterrows():
    context = row["job_ad_details"]
    for label, question in questions.items():
        test_rows.append({
            "context": context,
            "question": question,
            "answers": {
                "text": [str(row[label])],
                "answer_start": [context.find(str(row[label]))] if str(row[label]) in context else [0]
            },
            "id": f"{row['job_id']}_{label}",
            "label": label,
            "ground_truth": str(row[label])
        })

test_dataset = Dataset.from_pandas(pd.DataFrame(test_rows))

# model - untrained
pretrained_model_name = "bert-base-uncased"
pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
pretrained_model = AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name)
pretrained_qa_pipeline = pipeline("question-answering", model=pretrained_model, tokenizer=pretrained_tokenizer)

# model - finetuned
model_path = "./finetuned-qa-model/checkpoint-1977"
finetuned_model = AutoModelForQuestionAnswering.from_pretrained(model_path)
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)
qa_pipeline = pipeline("question-answering", model=finetuned_model, tokenizer=finetuned_tokenizer)

for i in range(50):
    sample = test_dataset[i]
    question = sample["question"]
    context = sample["context"]

    ground_truth = sample["ground_truth"]
    label = sample["label"]

    out_of_box_result = pretrained_qa_pipeline(question=question, context=context)
    fine_tuned_result = qa_pipeline(question=question, context=context)

    print(f"Label         : {label}")
    print(f"Question      : {question}")
    print(f"Ground Truth  : {ground_truth}")
    print(f"Pretrained    : {out_of_box_result['answer']}")
    print(f"Fine-Tuned    : {fine_tuned_result['answer']}")
    print("-" * 60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['salary_min'] = pd.to_numeric(df['salary_min'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['salary_max'] = pd.to_numeric(df['salary_max'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['period'] = df['period'].apply(lambda x: Freq[x])
Some weights of 

Label         : salary_min
Question      : What is the lowest salary offered?
Ground Truth  : 1500
Pretrained    : semua rekod mengenai
Fine-Tuned    : 1500
------------------------------------------------------------
Label         : salary_max
Question      : What is the highest salary offered?
Ground Truth  : 1800
Pretrained    : semua rekod mengenai
Fine-Tuned    : 1800
------------------------------------------------------------
Label         : currency
Question      : What is the currency of the salary?
Ground Truth  : MYR
Pretrained    : >Keperluan</strong></p
Fine-Tuned    : <ul
------------------------------------------------------------
Label         : period
Question      : Is the salary paid monthly, annually, or per hour?
Ground Truth  : Freq.MONTHLY
Pretrained    : >Keperluan</strong></p
Fine-Tuned    : <ul><li>Bertanggungjawab
------------------------------------------------------------
Label         : salary_min
Question      : What is the lowest salary offered?
Ground T