# LLM Judgment - Human Judgment as a Target

This experiment uses the signed differences between LLM labels and human labels as the target for analysing the features.

In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Data

In [None]:
# Paths of all qrels files. glob returns matches in arbitrary order, so the
# list is sorted to keep the order in which files are loaded reproducible.
qrels = sorted(glob.glob('./qrels/*.txt'))

In [None]:
# Read every qrels file into one long-format frame with the columns
# qid, docid, score and judged_by.
qrels_df_list = []

for infile in qrels:
    # The judge is the 4th dot-separated field of the file name. Working on
    # the basename keeps this independent of directory depth and of the
    # path separator used by the platform.
    judger = os.path.basename(infile).split('.')[3]

    result_df = pd.read_csv(
        infile,
        sep=' ',
        header=None,
        names=['qid', 'Q0', 'docid', 'score'],
    )
    # The 'Q0' column is not used anywhere below.
    result_df = result_df.drop(columns=['Q0'])
    result_df['judged_by'] = judger
    # Fails loudly if a file contains non-integer query ids or scores.
    result_df = result_df.astype({'qid': int, 'score': int})
    qrels_df_list.append(result_df)

# ignore_index=True gives the combined frame one unique 0..n-1 index instead
# of repeating each file's own row numbers.
qrels_df = pd.concat(qrels_df_list, ignore_index=True)
qrels_df

In [None]:
# Load the tab-separated per-query info table and attach its columns to every
# judgment row, matching on `qid`. The join is an inner join, so judgment rows
# whose qid is absent from the info table are dropped.
qid_to_info = pd.read_csv("infos/query_to_info.txt", sep='\t')
qrels_df = qrels_df.merge(qid_to_info, how='inner', on='qid')

In [None]:
# Replace the judge identifiers taken from the file names with display names.
judge_display_names = {'withDupes': 'Human', 'gpt4': 'GPT-4'}
qrels_df['judged_by'] = qrels_df['judged_by'].replace(judge_display_names)

In [None]:
# Derive the query-type column 'QT' from the two flag columns:
#   Synthetic == 0 and isGPT4 == 0  -> "Real"
#   Synthetic == 1 and isGPT4 == 0  -> "T5"
#   every other combination         -> "GPT-4"
not_gpt4 = qrels_df['isGPT4'] == 0
qrels_df['QT'] = np.select(
    [
        not_gpt4 & (qrels_df['Synthetic'] == 0),
        not_gpt4 & (qrels_df['Synthetic'] == 1),
    ],
    ['Real', 'T5'],
    default='GPT-4',
)

In [None]:
# Bar plots of the label distribution per judge, one facet per query type,
# saved to figs/label_barplots.pdf.

# .copy() makes `df` an independent frame, so the dtype change below does not
# write into a slice of `qrels_df` (avoids pandas' SettingWithCopyWarning).
df = qrels_df[['score', 'judged_by', 'QW', 'QT']].copy()
df['score'] = df['score'].astype('category')

sns.set_style("ticks")  # Options: white, dark, whitegrid, darkgrid, ticks
sns.set_context("poster", font_scale =0.8)     # Options: paper, notebook, talk, poster
sns.set_palette("bright")   # You can also use: deep, muted, bright, dark, colorblind, or a custom list of colors

# The facet order is pinned so that it always lines up with `titles`, which
# are assigned positionally below. Without col_order the facets follow the
# order in which the QT values first appear in the data.
facet_order = ['Real', 'T5', 'GPT-4']
titles = ['Labels for Human Queries', 'Labels for T5 Queries', 'Labels for GPT-4 Queries']

# Create a FacetGrid for the label type
g = sns.FacetGrid(df, col="QT", col_order=facet_order, hue="judged_by", height=5)

# Add a histogram to the FacetGrid
# NOTE(review): `hue` and `data=df` are forwarded to histplot in addition to
# the grid-level hue; presumably this is what yields the dodged per-judge
# bars -- confirm the rendered figure before changing this call.
g.map(sns.histplot, 'score', stat="proportion", hue = 'judged_by', multiple='dodge', shrink = .8, common_norm=False, data = df)

# Adjust the titles and labels
g.add_legend()
g.set_axis_labels("", "Proportion", fontsize=22)
g.set_titles(col_template="{col_name}", fontsize=22)

# Overwrite the template titles with the descriptive ones.
for ax, title in zip(g.axes.flatten(), titles):
    ax.set_title(title)

sns.move_legend(g, ncol=3, loc='upper center', title = '')

# Adjust layout
plt.subplots_adjust(top=0.8)
# Set x-ticks to 0, 1, 2, 3
plt.xticks([0, 1, 2, 3])
plt.savefig("figs/label_barplots.pdf", bbox_inches="tight")