# Baseline 2: Vanilla-BERT implementation for expert ranking

**Note**

As my laptop does not have sufficient compute, this notebook was executed on kaggle and added to the repo so that the outputs are available right away.

In [3]:
import pandas as pd
import json
from transformers import BertTokenizer
import pickle
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizerFast, AdamW
import sklearn
from sklearn.metrics import average_precision_score
import numpy as np
import torch.nn.functional as F
import warnings

warnings.simplefilter("ignore", category=RuntimeWarning)
warnings.simplefilter("ignore", category=FutureWarning)

# Set to true to load the saved BERT-tokens and save time
load_tokens = False
# Set to true to avoid creating the datasets anew if already available
save_datasets = False

## 1. Initial data screening

Let's take a look at the data that is available in the GitHub repo accompanying the paper.

In [2]:
# Relative file paths
queries_file = "../data/queries_bankruptcy.csv"
labels_file = "../data/labels.qrel"
lawyerid_to_url_file = "../data/lawyerid_to_lawyerurl.json"

# Load datasets
queries_df = pd.read_csv(queries_file, header=None, names=["queryid", "query"])
labels_df = pd.read_csv(labels_file, sep=" ", header=None, names=["queryid", "iteration", "lawyerid", "label"])
with open(lawyerid_to_url_file, "r") as f:
    lawyerid_to_url = json.load(f)

In [3]:
queries_df.head()

Unnamed: 0,queryid,query
0,0,chapter 13 bankruptcy reorganization plan
1,1,nondischargeable debt and student loans
2,2,employment as an independent contractor
3,3,debt collection and debt settlement
4,4,chapter 7 bankruptcy for businesses


In [4]:
labels_df.head()

Unnamed: 0,queryid,iteration,lawyerid,label
0,1,0,3,1
1,2,0,3,1
2,3,0,3,1
3,4,0,3,1
4,5,0,3,1


In [5]:
print(labels_df["label"].value_counts())

label
1    1576
Name: count, dtype: int64


In [6]:
print(labels_df["lawyerid"].nunique())

51


Right off the bat, we run into a major problem: The labels file provided by the authors is not complete. Firstly, there are uniquely relevant lawyers in the file, even though relevant and irrelevant ones are necessary for training. One could assume that this means all the other lawyers are irrelevant for the given queries. This is contradicted by the fact that there are only 51 unique relevant lawyers in the file, while the authors outlined finding 61.

The best way to deal with this is to follow the process of determining relevant lawyers ourselves and using that for training. **Note however**, that this will inevidably lead to differing results, since the questions and answers needed to be scraped manually and there was a significant number of changes to the webpages over time (mainly pages that are not available anymore.)

## 2. Create our own dataset since the one in GitHub is not usable

In [7]:
answers_file = "../data/all_questions_and_answer_new.parquet"

answers_df = pd.read_parquet(answers_file)
answers_df.head()

Unnamed: 0,number,url,title,question,question_tags,answers,lawyers,posted_times,answer_card_text,stars,reviews,rating,helpful,lawyers_agree,best_answer
0,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"Bankruptcy,Debt,Bankruptcy and debt",Definitely get out of this arrangement immedia...,https://www.avvo.com/attorneys/750961.html,2021-07-18,Answer\nLarry R. Maitland II\nSocial Security ...,5.0,20.0,9.5,1.0,1.0,False
1,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"Bankruptcy,Debt,Bankruptcy and debt",Sounds to me like you are getting scammed. Was...,https://www.avvo.com/attorneys/370602.html,2021-07-16,Answer\nStuart Gregory Steingraber\nBankruptcy...,4.994382,178.0,9.8,1.0,1.0,False
2,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"Bankruptcy,Debt,Bankruptcy and debt",Why not pay bills yourself? You are in control...,https://www.avvo.com/attorneys/16108.html,2021-07-16,Answer\nRichard D. Granvold\nChapter 7 Bankrup...,4.680556,72.0,,0.0,1.0,False
3,1,https://www.avvo.com/legal-answers/high-credit...,High credit card balance consolidation offer.,Can I be held accountable for my late fathers ...,"Bankruptcy,Credit,Debt,Debt settlement,Debt ne...",Were you on the credit card along with your fa...,https://www.avvo.com/attorneys/383564.html,2021-07-15,Answer\nHarlene Miller\nBankruptcy Attorney in...,5.0,10.0,9.0,1.0,1.0,False
4,1,https://www.avvo.com/legal-answers/high-credit...,High credit card balance consolidation offer.,Can I be held accountable for my late fathers ...,"Bankruptcy,Credit,Debt,Debt settlement,Debt ne...",Probate may be required.\nCreditor may be requ...,https://www.avvo.com/attorneys/312867.html,2021-07-15,Answer\nJames Charles Shields\nBankruptcy Atto...,4.625,24.0,9.7,0.0,1.0,False


For many questions, the links were not available anymore, resulting in empty rows. We remove those.

In [8]:
# Count the number of None or NaN values in each column
nan_counts = answers_df.isna().sum()
print(nan_counts)

number                 0
url                    0
title                  0
question               0
question_tags       6867
answers             6867
lawyers             6867
posted_times        6867
answer_card_text    6867
stars               7275
reviews             7620
rating              8413
helpful             6867
lawyers_agree       9743
best_answer         6867
dtype: int64


In [9]:
# Drop rows where 'answers' column is NaN or None
answers_df = answers_df.dropna(subset=['answers'])
print(answers_df.shape)

(11223, 15)


### Create our own lawyerid_to_lawyerurl and queries_bankruptcy file

These will then be used to create our own labels file.

In [10]:
# Extract unique lawyer URLs
unique_lawyers = answers_df['lawyers'].unique()

# Create a mapping of lawyer IDs to lawyer URLs
lawyer_mapping = {'lawyer_id': [], 'lawyer_url': []}
for idx, url in enumerate(unique_lawyers, start=1):
    lawyer_mapping['lawyer_id'].append(idx)
    lawyer_mapping['lawyer_url'].append(url)

lawyer_mapping_df = pd.DataFrame(lawyer_mapping)

if save_datasets:
    lawyer_mapping_df.to_csv('../data/own_files/lawyerid_to_lawyerurl_own.csv', index=False)

In [11]:
# Extract all queries from the "question_tags" column
all_queries = answers_df['question_tags'].str.split(',').explode().str.strip()

# Count the frequency of each query
query_counts = all_queries.value_counts()

# Select the top 20% most frequent queries according to paper methodology
top_20_percent_queries = query_counts.head(int(len(query_counts) * 0.2)).index

# Create a mapping of query IDs to these queries
query_mapping = {'query_id': [], 'query': []}
for idx, query in enumerate(top_20_percent_queries, start=1):
    query_mapping['query_id'].append(idx)
    query_mapping['query'].append(query)

query_mapping_df = pd.DataFrame(query_mapping)
query_mapping_df.shape

if save_datasets:
    query_mapping_df.to_csv('../data/own_files/queries_bankruptcy_own.csv', index=False)

In [12]:
query_mapping_df.head()

Unnamed: 0,query_id,query
0,1,Bankruptcy
1,2,Debt
2,3,Bankruptcy and debt
3,4,Credit
4,5,Chapter 7 bankruptcy


Create lawyer_id and query_id column in the answers df to enable us to make the calculations for the lawyer-expert criteria.

In [13]:
# Create the lawyer_id column in answers_df
answers_df = answers_df.merge(lawyer_mapping_df, left_on='lawyers', right_on='lawyer_url', how='left')
answers_df = answers_df.rename(columns={'lawyer_id': 'lawyer_id'})

# Convert the question_tags column to a list format
answers_df['question_tags'] = answers_df['question_tags'].apply(lambda x: [tag.strip() for tag in x.split(',')])

# Create the query_id column
def get_query_ids(tags, query_mapping_df):
    tag_to_id = dict(zip(query_mapping_df['query'], query_mapping_df['query_id']))
    return [tag_to_id[tag] for tag in tags if tag in tag_to_id]

answers_df['query_id_list'] = answers_df['question_tags'].apply(lambda tags: get_query_ids(tags, query_mapping_df))
answers_df.rename(columns={"best_answer": "user_accepted"}, inplace=True)
answers_df.head()

Unnamed: 0,number,url,title,question,question_tags,answers,lawyers,posted_times,answer_card_text,stars,reviews,rating,helpful,lawyers_agree,user_accepted,lawyer_id,lawyer_url,query_id_list
0,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Definitely get out of this arrangement immedia...,https://www.avvo.com/attorneys/750961.html,2021-07-18,Answer\nLarry R. Maitland II\nSocial Security ...,5.0,20.0,9.5,1.0,1.0,False,1,https://www.avvo.com/attorneys/750961.html,"[1, 2, 3]"
1,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Sounds to me like you are getting scammed. Was...,https://www.avvo.com/attorneys/370602.html,2021-07-16,Answer\nStuart Gregory Steingraber\nBankruptcy...,4.994382,178.0,9.8,1.0,1.0,False,2,https://www.avvo.com/attorneys/370602.html,"[1, 2, 3]"
2,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Why not pay bills yourself? You are in control...,https://www.avvo.com/attorneys/16108.html,2021-07-16,Answer\nRichard D. Granvold\nChapter 7 Bankrup...,4.680556,72.0,,0.0,1.0,False,3,https://www.avvo.com/attorneys/16108.html,"[1, 2, 3]"
3,1,https://www.avvo.com/legal-answers/high-credit...,High credit card balance consolidation offer.,Can I be held accountable for my late fathers ...,"[Bankruptcy, Credit, Debt, Debt settlement, De...",Were you on the credit card along with your fa...,https://www.avvo.com/attorneys/383564.html,2021-07-15,Answer\nHarlene Miller\nBankruptcy Attorney in...,5.0,10.0,9.0,1.0,1.0,False,4,https://www.avvo.com/attorneys/383564.html,"[1, 4, 2, 37, 108, 3, 42, 34]"
4,1,https://www.avvo.com/legal-answers/high-credit...,High credit card balance consolidation offer.,Can I be held accountable for my late fathers ...,"[Bankruptcy, Credit, Debt, Debt settlement, De...",Probate may be required.\nCreditor may be requ...,https://www.avvo.com/attorneys/312867.html,2021-07-15,Answer\nJames Charles Shields\nBankruptcy Atto...,4.625,24.0,9.7,0.0,1.0,False,5,https://www.avvo.com/attorneys/312867.html,"[1, 4, 2, 37, 108, 3, 42, 34]"


In [14]:
print(answers_df["helpful"].value_counts())

helpful
0.0     8777
1.0     2193
2.0      159
3.0       44
4.0       16
5.0       10
6.0        8
8.0        5
7.0        5
9.0        3
14.0       2
20.0       1
Name: count, dtype: int64


In [15]:
long_answers_df = answers_df.explode('query_id_list')

# Rename the exploded column to query_id
long_answers_df = long_answers_df.rename(columns={'query_id_list': 'query_id'})

long_answers_df.head()

Unnamed: 0,number,url,title,question,question_tags,answers,lawyers,posted_times,answer_card_text,stars,reviews,rating,helpful,lawyers_agree,user_accepted,lawyer_id,lawyer_url,query_id
0,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Definitely get out of this arrangement immedia...,https://www.avvo.com/attorneys/750961.html,2021-07-18,Answer\nLarry R. Maitland II\nSocial Security ...,5.0,20.0,9.5,1.0,1.0,False,1,https://www.avvo.com/attorneys/750961.html,1
0,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Definitely get out of this arrangement immedia...,https://www.avvo.com/attorneys/750961.html,2021-07-18,Answer\nLarry R. Maitland II\nSocial Security ...,5.0,20.0,9.5,1.0,1.0,False,1,https://www.avvo.com/attorneys/750961.html,2
0,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Definitely get out of this arrangement immedia...,https://www.avvo.com/attorneys/750961.html,2021-07-18,Answer\nLarry R. Maitland II\nSocial Security ...,5.0,20.0,9.5,1.0,1.0,False,1,https://www.avvo.com/attorneys/750961.html,3
1,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Sounds to me like you are getting scammed. Was...,https://www.avvo.com/attorneys/370602.html,2021-07-16,Answer\nStuart Gregory Steingraber\nBankruptcy...,4.994382,178.0,9.8,1.0,1.0,False,2,https://www.avvo.com/attorneys/370602.html,1
1,0,https://www.avvo.com/legal-answers/a-company-a...,A company assigned by SSA to pay my bills.,I have an organization assigned by SSA to pay ...,"[Bankruptcy, Debt, Bankruptcy and debt]",Sounds to me like you are getting scammed. Was...,https://www.avvo.com/attorneys/370602.html,2021-07-16,Answer\nStuart Gregory Steingraber\nBankruptcy...,4.994382,178.0,9.8,1.0,1.0,False,2,https://www.avvo.com/attorneys/370602.html,2


### Implement expert-lawyer criteria

Lawyer is an expert in a query/tag if:
- has 10 or more answers in bankruptcy accepted by the asker (col "user_accepted") (represented by the bankruptcy questions in our entire dataset)
- more than average number of best answers within query (best answer is either col "user_accepted" is True  by asker OR more than 3 lawyers found answer useful (col "lawyers_agree"))
AND
- count of best answers/count of answers higher average in query category 

In [16]:
# Define a function to determine expert lawyers
def identify_expert_lawyers(answers_df, query_mapping_df):
    # Initialize an empty list to store expert labels
    expert_labels = []

    # Calculate global user_accepted counts for each lawyer
    global_user_accepted_counts = answers_df[
        answers_df['user_accepted'] == True
    ].groupby('lawyer_id').size()

    # Loop through each query in the mapping df
    for _, query_row in query_mapping_df.iterrows():
        query_id = query_row['query_id']
        query_name = query_row['query']

        # Filter the answers DataFrame for rows related to the current query
        query_answers = answers_df[answers_df['query_id_list'].apply(lambda x: query_id in x)]

        if query_answers.empty:
            continue

        # Calculate metrics
        lawyer_answer_counts = query_answers.groupby('lawyer_id').size()
        lawyer_best_answer_counts = query_answers[
            (query_answers['user_accepted'] == True) | (query_answers['lawyers_agree'] > 3)
        ].groupby('lawyer_id').size()

        # Average metrics
        avg_best_answers_per_query = lawyer_best_answer_counts.sum() / len(lawyer_answer_counts)
        avg_best_answer_ratio = (lawyer_best_answer_counts / lawyer_answer_counts).mean()

        # Identify experts and create labels
        for lawyer_id in lawyer_answer_counts.index:
            is_expert = int(
                global_user_accepted_counts.get(lawyer_id, 0) >= 10 and \
                lawyer_best_answer_counts.get(lawyer_id, 0) > avg_best_answers_per_query and \
                (lawyer_best_answer_counts.get(lawyer_id, 0) / lawyer_answer_counts[lawyer_id]) > avg_best_answer_ratio
            )
            expert_labels.append({'query_id': query_id, 'lawyer_id': lawyer_id, 'label': is_expert})

    return pd.DataFrame(expert_labels)

expert_lawyers_df = identify_expert_lawyers(answers_df, query_mapping_df)

print(expert_lawyers_df)


       query_id  lawyer_id  label
0             1          1      0
1             1          2      0
2             1          3      0
3             1          4      0
4             1          5      0
...         ...        ...    ...
30071       131       1796      0
30072       131       1797      0
30073       131       1800      0
30074       131       1801      0
30075       131       1888      0

[30076 rows x 3 columns]


In [17]:
# Count unique expert lawyers
unique_expert_lawyers = expert_lawyers_df[expert_lawyers_df['label'] == 1]['lawyer_id'].nunique()

print(f"Number of unique expert lawyers: {unique_expert_lawyers}")

Number of unique expert lawyers: 3


We run into a problem using the original expert criteria from the paper. Due to the differing and smaller dataset, only 3 expert lawyers result, which is too little representation for experts compared to the 61 in the paper. We will therefore adjust the expert criteria slightly. Instead of using the condition of "user_accepted", which occurs very rarely, we will use "helpful", which denotes how many other platform users (non-lawyers) found the answer useful instead of being question-asker accepted. We will also adjust the limit of across-dataset count of useful answers from 10 to 8.

In [18]:
# Adjusted function to determine expert lawyers
def identify_expert_lawyers(answers_df, query_mapping_df):
    # Initialize an empty list to store expert labels
    expert_labels = []

    # Calculate global helpful answer counts for each lawyer
    global_helpful_counts = answers_df[answers_df['helpful'] >= 1].groupby('lawyer_id').size()

    # Loop through each query in the mapping df
    for _, query_row in query_mapping_df.iterrows():
        query_id = query_row['query_id']
        query_name = query_row['query']

        # Filter the answers DataFrame for rows related to the current query
        query_answers = answers_df[answers_df['query_id_list'].apply(lambda x: query_id in x)]

        if query_answers.empty:
            continue

        # Calculate metrics
        lawyer_answer_counts = query_answers.groupby('lawyer_id').size()
        lawyer_best_answer_counts = query_answers[
            (query_answers['helpful'] >= 1) | (query_answers['lawyers_agree'] > 3)
        ].groupby('lawyer_id').size()

        # Average metrics
        avg_best_answers_per_query = lawyer_best_answer_counts.sum() / len(lawyer_answer_counts)
        avg_best_answer_ratio = (lawyer_best_answer_counts / lawyer_answer_counts).mean()

        # Identify experts and create labels
        for lawyer_id in lawyer_answer_counts.index:
            is_expert = int(
                global_helpful_counts.get(lawyer_id, 0) >= 8 and \
                lawyer_best_answer_counts.get(lawyer_id, 0) > avg_best_answers_per_query and \
                (lawyer_best_answer_counts.get(lawyer_id, 0) / lawyer_answer_counts[lawyer_id]) > avg_best_answer_ratio
            )
            expert_labels.append({'query_id': query_id, 'lawyer_id': lawyer_id, 'label': is_expert})

    return pd.DataFrame(expert_labels)

expert_lawyers_df = identify_expert_lawyers(answers_df, query_mapping_df)

expert_lawyers_df.head()


Unnamed: 0,query_id,lawyer_id,label
0,1,1,0
1,1,2,0
2,1,3,0
3,1,4,0
4,1,5,0


In [19]:
# Count unique expert lawyers
unique_expert_lawyers = expert_lawyers_df[expert_lawyers_df['label'] == 1]['lawyer_id'].nunique()

print(f"Number of unique expert lawyers: {unique_expert_lawyers}")

Number of unique expert lawyers: 53


With this, we have a dataframe with every unique lawyer-query combination, denoting whether a lawyer is an expert for that query via the "label" column. We also have 53 expert lawyers, which is close enough to the paper methodology.

As per the paper methodology, only queries are retained that have at least two expert lawyers.

In [20]:
# Filter queries that have at least two ones in the label column
query_label_counts = expert_lawyers_df[expert_lawyers_df['label'] == 1].groupby('query_id').size()
queries_with_at_least_two_ones = query_label_counts[query_label_counts >= 2].index

# Retain only the unique queries in expert_lawyers_df that meet the criteria
filtered_expert_lawyers_df = expert_lawyers_df[expert_lawyers_df['query_id'].isin(queries_with_at_least_two_ones)]

In [21]:
print(filtered_expert_lawyers_df.shape)

(29810, 3)


In [22]:
filtered_expert_lawyers_df.head()

Unnamed: 0,query_id,lawyer_id,label
0,1,1,0
1,1,2,0
2,1,3,0
3,1,4,0
4,1,5,0


In [23]:
if save_datasets:
    filtered_expert_lawyers_df.to_csv('../data/own_files/labels_own.csv', index=False, header=True)

BERT is trained using query-answer pairs as well as a label whether the answer was written by a lawyer relevant to the query. To achieve this, we need to merge the answers_df for the lawyer answers and the filtered_expert_lawyers_df for the labels on unique combinations of [lawyer_id, query_id].

In [24]:
merged_df = pd.merge(filtered_expert_lawyers_df, query_mapping_df, on="query_id", how="inner")
label_answer_df = pd.merge(merged_df, long_answers_df, on=["lawyer_id", "query_id"], how="inner")
# Select the relevant columns for the final output
final_df = label_answer_df[["query_id", "lawyer_id", "query", "answers" , "label"]]
final_df.head(20)

Unnamed: 0,query_id,lawyer_id,query,answers,label
0,1,1,Bankruptcy,Definitely get out of this arrangement immedia...,0
1,1,2,Bankruptcy,Sounds to me like you are getting scammed. Was...,0
2,1,2,Bankruptcy,The debtor's lawyer cannot knowingly submit in...,0
3,1,2,Bankruptcy,"Chances are you can't remove the lien, but may...",0
4,1,2,Bankruptcy,What does your BK lawyer say? No lawyer? For s...,0
5,1,2,Bankruptcy,"With the new equity exemption amounts, your BK...",0
6,1,2,Bankruptcy,My colleagues have given you good advice with ...,0
7,1,2,Bankruptcy,"Probably not. To be certain, ask your BK lawye...",0
8,1,2,Bankruptcy,What does your lawyer say? No lawyer? Remember...,0
9,1,2,Bankruptcy,"Your question has several ""moving parts"" invol...",0


In [25]:
print(final_df.shape)

(84932, 5)


Even though we do not have the exact same data, we can still apply the same data splitting methodology as outlined in the paper, ensuring an equal number of expert lawyers in the train/validation and test set, and all non-relevant lawyers are in train, val AND test. HOWEVER, this was tried and it leads to a large dataset, making model training even slower. With the available GPU resources, an adjustment was necessary. Therefore we implement an approach that samples a portion of non-relevant lawyers and adds them to the train, validation and test set.

In [26]:
# Define the percentage of non-relevant lawyers to sample for each set
non_relevant_sample_percentage = 0.5 

# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Identify relevant and non-relevant lawyers
# Identify relevant lawyers
relevant_lawyers = final_df[final_df['label'] == 1]['lawyer_id'].unique()

# Shuffle relevant lawyers reproducibly
np.random.shuffle(relevant_lawyers)
non_relevant_lawyers = final_df[~final_df['lawyer_id'].isin(relevant_lawyers)]['lawyer_id'].unique()

# Ensure equal split of relevant lawyers; USE THIS if number of expert lawyers is not exactly 53
#num_relevant = len(relevant_lawyers)
#split_size = num_relevant // 3

train_relevant = relevant_lawyers[:18]
val_relevant = relevant_lawyers[18:36]
test_relevant = relevant_lawyers[36:]

# Create subsets for relevant lawyers
train_set = final_df[(final_df['lawyer_id'].isin(train_relevant)) & (final_df['label'] == 1)]
val_set = final_df[(final_df['lawyer_id'].isin(val_relevant)) & (final_df['label'] == 1)]
test_set = final_df[(final_df['lawyer_id'].isin(test_relevant)) & (final_df['label'] == 1)]
#print(train_set.shape, val_set.shape, test_set.shape)


# Split non-relevant lawyers into three roughly equal groups
np.random.shuffle(non_relevant_lawyers)  # Shuffle to randomize splitting
num_non_relevant = len(non_relevant_lawyers)
split_size_non_relevant = num_non_relevant // 3

train_non_relevant_group = non_relevant_lawyers[:split_size_non_relevant]
val_non_relevant_group = non_relevant_lawyers[split_size_non_relevant:2*split_size_non_relevant]
test_non_relevant_group = non_relevant_lawyers[2*split_size_non_relevant:]

# Sample a percentage from each non-relevant group
train_non_relevant_sample = np.random.choice(train_non_relevant_group, 
                                             int(len(train_non_relevant_group) * non_relevant_sample_percentage),
                                             replace=False)
val_non_relevant_sample = np.random.choice(val_non_relevant_group, 
                                           int(len(val_non_relevant_group) * non_relevant_sample_percentage),
                                           replace=False)
test_non_relevant_sample = np.random.choice(test_non_relevant_group, 
                                            int(len(test_non_relevant_group) * non_relevant_sample_percentage),
                                            replace=False)

# Add sampled non-relevant lawyers to respective sets
train_set = pd.concat([train_set, final_df[final_df['lawyer_id'].isin(train_non_relevant_sample)]])
val_set = pd.concat([val_set, final_df[final_df['lawyer_id'].isin(val_non_relevant_sample)]])
test_set = pd.concat([test_set, final_df[final_df['lawyer_id'].isin(test_non_relevant_sample)]])

# Reset index
train_set.reset_index(drop=True, inplace=True)
val_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

print(train_set.shape, val_set.shape, test_set.shape)

train_set.head()

(8836, 5) (8949, 5) (8085, 5)


Unnamed: 0,query_id,lawyer_id,query,answers,label
0,1,81,Bankruptcy,"Yes, some calculations might show that you can...",1
1,1,81,Bankruptcy,"Your question does not make sense but, no, you...",1
2,1,81,Bankruptcy,"You did not tell us who is garnishing you, muc...",1
3,1,81,Bankruptcy,It isn't the lawyer's responsibility so they d...,1
4,1,81,Bankruptcy,The automatic stay does not require that anyon...,1


In [27]:
train_set["label"].value_counts()

label
0    7460
1    1376
Name: count, dtype: int64

The chosen approach leads to our data having a smaller relative portion of non-relevant lawyers compared to relevant lawyers. 

Tokenize the data while retaining additional information about the tokenized instances for interpretation of the results and saving the results so the process does not need to be repeated every time.

In [28]:
# Function to tokenize the query-answer pairs
def tokenize_query_answer_with_metadata(row):
    # Combine query and answer in BERT's required format
    combined_text = f"[CLS] {row['query']} [SEP] {row['answers']} [SEP]"
    
    # Tokenize the combined text
    tokenized = tokenizer(combined_text, padding='max_length', truncation=True, max_length=512)
    
    # Add original row metadata to the tokenized result
    tokenized_with_metadata = {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "original_index": row.name  # Original row index
    }
    return tokenized_with_metadata

# Function to tokenize an entire dataframe
def tokenize_dataframe_with_metadata(dataframe):
    # Apply the tokenization function to each row
    tokenized_data = dataframe.apply(tokenize_query_answer_with_metadata, axis=1)
    tokenized_df = pd.DataFrame(tokenized_data.tolist())
    return tokenized_df

# Define a function to save tokenized data
def save_tokenized_data(dataframe, filename):
    path = f'../data/own_files/{filename}.pkl'
    with open(path, 'wb') as f:
        pickle.dump(dataframe, f)
    print(f"Tokenized data saved to {path}")

if not load_tokens:
    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    # Tokenize train, val, and test sets with metadata
    tokenized_train_set = tokenize_dataframe_with_metadata(train_set)
    tokenized_val_set = tokenize_dataframe_with_metadata(val_set)
    tokenized_test_set = tokenize_dataframe_with_metadata(test_set)

    # Save the tokenized dataframes
    save_tokenized_data(tokenized_train_set, 'train_set_tokenized')
    save_tokenized_data(tokenized_val_set, 'val_set_tokenized')
    save_tokenized_data(tokenized_test_set, 'test_set_tokenized')
if load_tokens:
    # Load the tokenized data from files
    with open('../data/own_files/train_set_tokenized.pkl', 'rb') as f:
        tokenized_train_set = pickle.load(f)
    with open('../data/own_files/val_set_tokenized.pkl', 'rb') as f:
        tokenized_val_set = pickle.load(f)
    with open('../data/own_files/test_set_tokenized.pkl', 'rb') as f:
        tokenized_test_set = pickle.load(f)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokenized data saved to /kaggle/working/train_set_tokenized.pkl
Tokenized data saved to /kaggle/working/val_set_tokenized.pkl
Tokenized data saved to /kaggle/working/test_set_tokenized.pkl


In [29]:
print(tokenized_train_set.shape, tokenized_val_set.shape, tokenized_test_set.shape)

(8836, 3) (8949, 3) (8085, 3)


In [30]:
tokenized_train_set.head()

Unnamed: 0,input_ids,attention_mask,original_index
0,"[101, 101, 10528, 102, 2748, 1010, 2070, 16268...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"[101, 101, 10528, 102, 2115, 3160, 2515, 2025,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 101, 10528, 102, 2017, 2106, 2025, 2425,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 101, 10528, 102, 2009, 3475, 1005, 1056,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3
4,"[101, 101, 10528, 102, 1996, 6882, 2994, 2515,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4


In [31]:
# Combine tokens for training with original information like query_id etc.
train_set_reset = train_set.reset_index()
input_label = pd.merge(tokenized_train_set, train_set_reset, left_on='original_index', right_on='index')
input_label = input_label.drop(columns=['index', 'lawyer_id', 'answers', 'query_id', 'query'])
input_label.head()

Unnamed: 0,input_ids,attention_mask,original_index,label
0,"[101, 101, 10528, 102, 2748, 1010, 2070, 16268...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,1
1,"[101, 101, 10528, 102, 2115, 3160, 2515, 2025,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1
2,"[101, 101, 10528, 102, 2017, 2106, 2025, 2425,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,1
3,"[101, 101, 10528, 102, 2009, 3475, 1005, 1056,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3,1
4,"[101, 101, 10528, 102, 1996, 6882, 2994, 2515,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,1


In [32]:
tokenized_test_set.head()

Unnamed: 0,input_ids,attention_mask,original_index
0,"[101, 101, 10528, 102, 1045, 5993, 2007, 2026,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"[101, 101, 10528, 102, 2644, 2522, 1011, 6608,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 101, 10528, 102, 2065, 2017, 2123, 1005,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 101, 10528, 102, 2028, 5724, 2025, 6936,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3
4,"[101, 101, 10528, 102, 2720, 1012, 14233, 1764...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4


In [33]:
# Combine tokens for training with original information like query_id etc.
test_set_reset = test_set.reset_index()
test_set_final = pd.merge(tokenized_test_set, test_set_reset, left_on='original_index', right_on='index')
test_set_final = test_set_final.drop(columns=['index'])
test_set_final.head()

Unnamed: 0,input_ids,attention_mask,original_index,query_id,lawyer_id,query,answers,label
0,"[101, 101, 10528, 102, 1045, 5993, 2007, 2026,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,1,50,Bankruptcy,I agree with my esteemed colleagues who practi...,1
1,"[101, 101, 10528, 102, 2644, 2522, 1011, 6608,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1,50,Bankruptcy,Stop co-signing for other people's debts. Go c...,1
2,"[101, 101, 10528, 102, 2065, 2017, 2123, 1005,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,1,50,Bankruptcy,If you don't want to keep the same attorney to...,1
3,"[101, 101, 10528, 102, 2028, 5724, 2025, 6936,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3,1,50,Bankruptcy,One option not discussed by you or by my colle...,1
4,"[101, 101, 10528, 102, 2720, 1012, 14233, 1764...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,1,50,Bankruptcy,Mr. Steingraber is correct. Your fact scenario...,1


For the pairwise cross entropy loss, we need pairs of expert and non-expert examples.

In [34]:
# Set a fixed random seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Separate positive and negative examples
positive_df = input_label[input_label['label'] == 1]
negative_df = input_label[input_label['label'] == 0]

pairs = []

# Number of negative samples to pair with each positive sample
num_negative_samples = 1  

# Generate pairs
for _, pos_row in positive_df.iterrows():
    for _ in range(num_negative_samples):
        # Randomly select a negative sample
        neg_row = negative_df.sample(n=1).iloc[0]
        
        pairs.append({
            'positive_input_ids': pos_row['input_ids'],
            'positive_attention_mask': pos_row['attention_mask'],
            'negative_input_ids': neg_row['input_ids'],
            'negative_attention_mask': neg_row['attention_mask']
        })

# Convert the pairs to a DataFrame
pairs_df = pd.DataFrame(pairs)

# Create dataset for training
class PairwiseDataset(Dataset):
    def __init__(self, pairs_df):
        self.pairs_df = pairs_df
    
    def __len__(self):
        return len(self.pairs_df)
    
    def __getitem__(self, idx):
        row = self.pairs_df.iloc[idx]
        
        return {
            'positive_input_ids': torch.tensor(row['positive_input_ids'], dtype=torch.long),
            'positive_attention_mask': torch.tensor(row['positive_attention_mask'], dtype=torch.long),
            'negative_input_ids': torch.tensor(row['negative_input_ids'], dtype=torch.long),
            'negative_attention_mask': torch.tensor(row['negative_attention_mask'], dtype=torch.long)
        }


## 3. Modeling

The implementation follows the specification from the paper, with the exception of using 20 epochs instead of 100 due to computation limitations.

In [35]:
# Set the seed for reproducibility
SEED = 42

def set_seed(seed=SEED):
    """Fix all sources of randomness for full reproducibility."""
    torch.manual_seed(seed) 
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Call seed function
set_seed()

# Pairwise Cross-Entropy Loss
def pairwise_cross_entropy_loss(positive_scores, negative_scores):
    return torch.mean(-F.logsigmoid(positive_scores - negative_scores))

# Define BERT Ranker Model
class BertRanker(torch.nn.Module):
    def __init__(self):
        super(BertRanker, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token output
        scores = self.classifier(cls_output)
        return scores

# Training Function
def train_bert_ranker(model, dataloader, num_epochs=100, gradient_accumulation_steps=1):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Define Optimizer
    optimizer = AdamW([
        {'params': model.bert.parameters(), 'lr': 2e-5},  # BERT layers
        {'params': model.classifier.parameters(), 'lr': 0.001}  # Classifier layer
    ])

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        optimizer.zero_grad()

        for step, batch in enumerate(dataloader):
            # Move data to device
            positive_input_ids = batch['positive_input_ids'].to(device)
            positive_attention_mask = batch['positive_attention_mask'].to(device)
            negative_input_ids = batch['negative_input_ids'].to(device)
            negative_attention_mask = batch['negative_attention_mask'].to(device)

            # Forward pass
            positive_scores = model(positive_input_ids, positive_attention_mask)
            negative_scores = model(negative_input_ids, negative_attention_mask)

            # Compute loss
            loss = pairwise_cross_entropy_loss(positive_scores, negative_scores)
            epoch_loss += loss.item()

            # Backward pass
            loss.backward()

            # Gradient accumulation
            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(dataloader):
                optimizer.step()
                optimizer.zero_grad()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(dataloader):.4f}")

    # Save the trained model
    torch.save(model.state_dict(), "./bert_ranker.pth")
    print("Saved to bert_ranker.pth")

# Ensure DataLoader uses a fixed seed
g = torch.Generator()
g.manual_seed(SEED)

dataloader = DataLoader(PairwiseDataset(pairs_df), batch_size=16, shuffle=True)

# Initialize Model
bert_ranker = BertRanker()

# Train Model
train_bert_ranker(bert_ranker, dataloader, num_epochs=20, gradient_accumulation_steps=1)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/20, Loss: 0.2730
Epoch 2/20, Loss: 0.0228
Epoch 3/20, Loss: 0.0056
Epoch 4/20, Loss: 0.0111
Epoch 5/20, Loss: 0.0119
Epoch 6/20, Loss: 0.0012
Epoch 7/20, Loss: 0.0004
Epoch 8/20, Loss: 0.0001
Epoch 9/20, Loss: 0.0001
Epoch 10/20, Loss: 0.0000
Epoch 11/20, Loss: 0.0003
Epoch 12/20, Loss: 0.0001
Epoch 13/20, Loss: 0.0000
Epoch 14/20, Loss: 0.0000
Epoch 15/20, Loss: 0.0000
Epoch 16/20, Loss: 0.0000
Epoch 17/20, Loss: 0.0000
Epoch 18/20, Loss: 0.0000
Epoch 19/20, Loss: 0.0000
Epoch 20/20, Loss: 0.0000
/kaggle/working/bert_ranker.pth


## 4. Evaluation

In [36]:
# Load trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertRanker()
model.load_state_dict(torch.load("./bert_ranker.pth", map_location=device))
model.to(device)
model.eval()  # Set to evaluation mode

BertRanker(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [37]:
# Custom dataset class for test data
class TestDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        return {
            "query_id": row["query_id"],
            "lawyer_id": row["lawyer_id"],
            "input_ids": torch.tensor(row["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(row["attention_mask"], dtype=torch.long),
        }

# Create test dataset and dataloader
test_dataset = TestDataset(test_set_final)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [38]:
# Store predictions
predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        scores = model(input_ids, attention_mask).squeeze(-1)  # Get relevance scores
        
        # Convert tensor to plain integer
        for qid, lid, score in zip(batch["query_id"], batch["lawyer_id"], scores.cpu().numpy()):
            predictions.append((qid.item(), lid.item(), score))  # Use .item() to extract int values

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions, columns=["query_id", "lawyer_id", "score"])


In [39]:
predictions_df.head()

Unnamed: 0,query_id,lawyer_id,score
0,1,50,-8.265645
1,1,50,-6.304363
2,1,50,2.398583
3,1,50,10.358211
4,1,50,-1.832613


In [40]:
def compute_metrics(predictions_df, test_set_final, k_values=[1, 2, 5]):
    map_scores = []
    mrr_scores = []
    precision_at_k = {k: [] for k in k_values}

    for qid, group in predictions_df.groupby("query_id"):
        # Sort by predicted score (higher is better)
        group = group.sort_values(by="score", ascending=False)

        # Get ground truth relevant experts
        relevant_experts = set(test_set_final[(test_set_final["query_id"] == qid) & 
                                              (test_set_final["label"] == 1)]["lawyer_id"])

        # Compute binary relevance list
        relevance = [1 if lid in relevant_experts else 0 for lid in group["lawyer_id"]]

        # MAP Score
        if any(relevance):  # If there are relevant results
            map_scores.append(average_precision_score(relevance, group["score"].values))

        # MRR Score (First relevant expert found)
        for rank, rel in enumerate(relevance, start=1):
            if rel == 1:
                mrr_scores.append(1 / rank)
                break

        # Precision@K
        for k in k_values:
            precision_at_k[k].append(sum(relevance[:k]) / k)

    # Aggregate results
    map_score = np.mean(map_scores) if map_scores else 0
    mrr_score = np.mean(mrr_scores) if mrr_scores else 0
    precision_k_scores = {k: np.mean(v) for k, v in precision_at_k.items()}

    return map_score, mrr_score, precision_k_scores

# Compute metrics using the test set
map_score, mrr_score, precision_k_scores = compute_metrics(predictions_df, test_set_final)

# Print results
print(f"MAP: {map_score:.7f}")
print(f"MRR: {mrr_score:.7f}")
for k, v in precision_k_scores.items():
    print(f"P@{k}: {v:.7f}")


MAP: 0.3123552
MRR: 0.3905762
P@1: 0.2047244
P@2: 0.1850394
P@5: 0.1811024
