In [1]:
import glob
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm

In [2]:
## Function to check model assumptions 

def plot_residuals(result):
    """Show three residual diagnostics for a fitted model.

    Each diagnostic is drawn in its own 8x6 figure and displayed with
    ``plt.show()``:

    1. a normal Q-Q plot of the residuals,
    2. a 30-bin histogram of the residuals,
    3. a scatter of residuals against fitted values, with a dashed red
       reference line at zero.

    Parameters
    ----------
    result
        Fitted-model results object exposing ``resid`` and ``fittedvalues``.

    Returns
    -------
    None
    """
    figure_size = (8, 6)

    # Read both attributes up front so a results object missing either one
    # fails before any figure is opened.
    resid = result.resid
    fitted = result.fittedvalues

    # 1) Normal Q-Q plot: probplot draws onto the current pyplot figure.
    plt.figure(figsize=figure_size)
    stats.probplot(resid, dist="norm", plot=plt)
    plt.title('Q-Q Plot')
    plt.show()

    # 2) Distribution of the residuals.
    plt.figure(figsize=figure_size)
    plt.hist(resid, bins=30, edgecolor='k')
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title('Histogram of Residuals')
    plt.show()

    # 3) Residuals against fitted values, with the zero line for reference.
    plt.figure(figsize=figure_size)
    plt.scatter(fitted, resid, alpha=0.5)
    plt.axhline(0, color='red', linestyle='--')
    plt.title('Residuals vs Fitted Values')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.show()

In [3]:
# Measure to analyse and the extension of the result files that contain it.
# The loading cell globs `*.{result_format}` and keeps only rows whose
# 'metric' column equals `metric`. This notebook pairs "map" with "treceval"
# and "ndcg_cut_10" with "ndcgeval".
metric = "map" # alternative used later in this notebook: "ndcg_cut_10"
result_format = "treceval" # alternative used later in this notebook: "ndcgeval"

In [4]:
# Load the per-query scores for `metric` from every `*.{result_format}` file
# and tag each row with the judge encoded in the file name.

# Sorted, because glob returns matches in arbitrary filesystem order and the
# row order of `results_dfs` would otherwise change from run to run.
result_files = sorted(glob.glob(f'./results/TRECDL2023/*.{result_format}'))
if not result_files:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(f"no '*.{result_format}' files found under ./results/TRECDL2023/")

results_df_list = []
for infile in result_files:
    # The judge label is the third dot-separated token of the file name.
    # Using the base name keeps this independent of the path separator and of
    # how deep the results directory is.
    judger = Path(infile).name.split('.')[2]
    raw_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])

    # Keep per-query rows of the selected metric: drop the 'all' summary rows
    # and compare metric names with trailing whitespace stripped.
    keep = (raw_df['qid'] != 'all') & (raw_df['metric'].str.rstrip() == metric)

    # Build a new frame in one chain rather than assigning columns on a
    # filtered slice, which triggers SettingWithCopyWarning.
    result_df = (
        raw_df.loc[keep, ['run_id', 'qid', 'score']]
        .assign(judged_by=judger)
        .astype({'qid': int, 'score': float})
    )
    results_df_list.append(result_df)

results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,judged_by
3,naverloo_bm25_splades_RR,2001010,0.3698,gpt4
30,naverloo_bm25_splades_RR,2001459,0.2113,gpt4
57,naverloo_bm25_splades_RR,2001575,0.3611,gpt4
84,naverloo_bm25_splades_RR,2002075,0.2001,gpt4
111,naverloo_bm25_splades_RR,2002168,0.1848,gpt4
...,...,...,...,...
263914,uogtr_b_grf_e_gb,3100825,0.0958,nist
264005,uogtr_b_grf_e_gb,3100833,0.3054,nist
264096,uogtr_b_grf_e_gb,3100909,0.1652,nist
264187,uogtr_b_grf_e_gb,3100918,0.0807,nist


In [5]:
# Partition the judged query ids into the real / t5 / gpt4 buckets by numeric
# range. Lower bounds are inclusive so that an id equal to 3_000_000 or
# 3_100_000 lands in a bucket instead of silently falling out of all three.
# Each bucket is sorted so its order does not depend on set iteration order.
queries_judged = set(results_dfs['qid'])
real_queries_judged = sorted(x for x in queries_judged if x < 3_000_000)
t5_queries_judged = sorted(x for x in queries_judged if 3_000_000 <= x < 3_100_000)
gpt4_queries_judged = sorted(x for x in queries_judged if x >= 3_100_000)

In [6]:
# Tab-separated lookup tables (read_table uses '\t' as its default separator).
# qid_to_info and model_to_info are merged onto the scores on 'qid' and
# 'run_id' respectively; doc_to_info is loaded but its merge is commented out.
qid_to_info = pd.read_table("infos/query_to_info.txt")
doc_to_info = pd.read_table("infos/pass_to_info.txt")
model_to_info = pd.read_table("infos/model_to_info.txt")

In [7]:
# Attach the query-level and run-level columns to every score row.
# how='inner' spells out the default join. validate='many_to_one' makes pandas
# raise a MergeError if a lookup table holds a duplicated key, which would
# otherwise silently duplicate score rows and inflate the sample that is
# passed to the model.
data = pd.merge(results_dfs, qid_to_info, on='qid', how='inner', validate='many_to_one')
# NOTE(review): the passage-level merge below is disabled, so `doc_to_info`
# is not used in this cell.
# data = pd.merge(data, doc_to_info, on='qid')
data = pd.merge(data, model_to_info, on='run_id', how='inner', validate='many_to_one')

In [8]:
# A cell renders only its last expression, so the describe() summary has to be
# passed to display() explicitly or it is computed and thrown away.
# `display` is provided by the IPython kernel without an import.
display(data.describe(include='object'))
data


Unnamed: 0,run_id,qid,score,judged_by,QL,QDR,QDS,QW,DL,Synthetic,isGPT4,ST,isLLM,MN
0,naverloo_bm25_splades_RR,2001010,0.3698,gpt4,0,0.7840,0.7023,6,72.5291,0,0,T5,1,8
1,naverloo_bm25_splades_RR,2001459,0.2113,gpt4,0,2.6311,6.7917,4,57.2620,0,0,T5,1,8
2,naverloo_bm25_splades_RR,2001575,0.3611,gpt4,0,0.1040,0.7523,4,440.2367,0,0,T5,1,8
3,naverloo_bm25_splades_RR,2002075,0.2001,gpt4,0,1.8125,5.2791,7,1005.6332,0,0,T5,1,8
4,naverloo_bm25_splades_RR,2002168,0.1848,gpt4,0,1.3110,4.3667,7,56.8986,0,0,T5,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5735,uogtr_b_grf_e_gb,3100825,0.0958,nist,1,0.7835,4.0882,11,760.7758,1,1,T5,1,4
5736,uogtr_b_grf_e_gb,3100833,0.3054,nist,1,0.1144,0.6350,13,702.7082,1,1,T5,1,4
5737,uogtr_b_grf_e_gb,3100909,0.1652,nist,1,1.1883,2.7444,10,995.8028,1,1,T5,1,4
5738,uogtr_b_grf_e_gb,3100918,0.0807,nist,1,0.4276,2.0563,10,143.9186,1,1,T5,1,4


In [21]:
# Model formula: `score` explained by the judge (reference level 'nist')
# crossed with QDR, QW, DL, isGPT4, ST (reference level 'Other') and MN.
# `a * (b + c)` expands to the main effects plus the a:b and a:c interactions.
mixed_model_int = "score ~ C(judged_by, Treatment(reference='nist')) * (QDR + QW + DL + isGPT4 + C(ST, Treatment(reference='Other')) + MN) "

In [22]:
# Linear mixed-effects model of `score`, with observations grouped by run.
# No re_formula is given, so statsmodels' default random-effects structure for
# the groups applies. reml=True is the default estimator, spelled out here.
model_int = sm.MixedLM.from_formula(
    formula=mixed_model_int,
    data=data,
    groups=data["run_id"],
)
result_int = model_int.fit(reml=True)
result_int.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0225
Min. group size:,164,Log-Likelihood:,2622.3148
Max. group size:,164,Converged:,Yes
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.121,0.016,7.433,0.000,0.089,0.152
"C(judged_by, Treatment(reference='nist'))[T.gpt4]",0.069,0.016,4.453,0.000,0.039,0.100
"C(ST, Treatment(reference='Other'))[T.GPT]",-0.051,0.019,-2.661,0.008,-0.088,-0.013
"C(ST, Treatment(reference='Other'))[T.T5]",-0.001,0.023,-0.045,0.964,-0.046,0.044
"C(ST, Treatment(reference='Other'))[T.T5+GPT]",0.092,0.018,5.265,0.000,0.058,0.127
"C(judged_by, Treatment(reference='nist'))[T.gpt4]:C(ST, Treatment(reference='Other'))[T.GPT]",0.032,0.012,2.743,0.006,0.009,0.055
"C(judged_by, Treatment(reference='nist'))[T.gpt4]:C(ST, Treatment(reference='Other'))[T.T5]",-0.001,0.014,-0.091,0.927,-0.029,0.026
"C(judged_by, Treatment(reference='nist'))[T.gpt4]:C(ST, Treatment(reference='Other'))[T.T5+GPT]",-0.027,0.011,-2.467,0.014,-0.048,-0.005
QDR,-0.014,0.002,-5.996,0.000,-0.018,-0.009


# NDCG@10: the same mixed-effects analysis, repeated on `ndcg_cut_10` scores

In [24]:
# Switch the analysis to NDCG@10: the loading cell below globs
# `*.{result_format}` files and keeps only rows whose 'metric' column equals
# `metric`.
metric = 'ndcg_cut_10'
result_format = 'ndcgeval'

In [25]:
# Load the per-query scores for `metric` from every `*.{result_format}` file
# and tag each row with the judge encoded in the file name.
# NOTE(review): this repeats the loading logic used for the MAP scores earlier
# in the notebook; a shared helper taking (result_format, metric) would remove
# the duplication.

# Sorted, because glob returns matches in arbitrary filesystem order and the
# row order of `results_dfs` would otherwise change from run to run.
result_files = sorted(glob.glob(f'./results/TRECDL2023/*.{result_format}'))
if not result_files:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(f"no '*.{result_format}' files found under ./results/TRECDL2023/")

results_df_list = []
for infile in result_files:
    # The judge label is the third dot-separated token of the file name.
    # Using the base name keeps this independent of the path separator and of
    # how deep the results directory is.
    judger = Path(infile).name.split('.')[2]
    raw_df = pd.read_csv(infile, sep='\t', header=None, names=['run_id', 'metric', 'qid', 'score'])

    # Keep per-query rows of the selected metric: drop the 'all' summary rows
    # and compare metric names with trailing whitespace stripped.
    keep = (raw_df['qid'] != 'all') & (raw_df['metric'].str.rstrip() == metric)

    # Build a new frame in one chain rather than assigning columns on a
    # filtered slice, which triggers SettingWithCopyWarning.
    result_df = (
        raw_df.loc[keep, ['run_id', 'qid', 'score']]
        .assign(judged_by=judger)
        .astype({'qid': int, 'score': float})
    )
    results_df_list.append(result_df)

results_dfs = pd.concat(results_df_list)
results_dfs

Unnamed: 0,run_id,qid,score,judged_by
2,cip_run_7,2001010,0.8563,nist
12,cip_run_7,2001459,0.6551,nist
22,cip_run_7,2001575,0.3321,nist
32,cip_run_7,2002075,0.8087,nist
42,cip_run_7,2002168,0.6691,nist
...,...,...,...,...
28992,naverloo_bm25_splades_RR,3100825,0.7797,gpt4
29002,naverloo_bm25_splades_RR,3100833,0.8526,gpt4
29012,naverloo_bm25_splades_RR,3100909,0.9373,gpt4
29022,naverloo_bm25_splades_RR,3100918,0.9402,gpt4


In [26]:
# Tab-separated lookup tables (read_table uses '\t' as its default separator).
# qid_to_info and model_to_info are merged onto the scores on 'qid' and
# 'run_id' respectively; doc_to_info is loaded but its merge is commented out.
qid_to_info = pd.read_table("infos/query_to_info.txt")
doc_to_info = pd.read_table("infos/pass_to_info.txt")
model_to_info = pd.read_table("infos/model_to_info.txt")

In [27]:
# Partition the judged query ids into the real / t5 / gpt4 buckets by numeric
# range. Lower bounds are inclusive so that an id equal to 3_000_000 or
# 3_100_000 lands in a bucket instead of silently falling out of all three.
# Each bucket is sorted so its order does not depend on set iteration order.
queries_judged = set(results_dfs['qid'])
real_queries_judged = sorted(x for x in queries_judged if x < 3_000_000)
t5_queries_judged = sorted(x for x in queries_judged if 3_000_000 <= x < 3_100_000)
gpt4_queries_judged = sorted(x for x in queries_judged if x >= 3_100_000)

In [28]:
# Attach the query-level and run-level columns to every score row.
# how='inner' spells out the default join. validate='many_to_one' makes pandas
# raise a MergeError if a lookup table holds a duplicated key, which would
# otherwise silently duplicate score rows and inflate the sample that is
# passed to the model.
data = pd.merge(results_dfs, qid_to_info, on='qid', how='inner', validate='many_to_one')
# NOTE(review): the passage-level merge below is disabled, so `doc_to_info`
# is not used in this cell.
# data = pd.merge(data, doc_to_info, on='qid')
data = pd.merge(data, model_to_info, on='run_id', how='inner', validate='many_to_one')

In [29]:
# Show the merged frame of scores plus query and run columns that the model
# below is fitted on.
data

Unnamed: 0,run_id,qid,score,judged_by,QL,QDR,QDS,QW,DL,Synthetic,isGPT4,ST,isLLM,MN
0,cip_run_7,2001010,0.8563,nist,0,0.7840,0.7023,6,72.5291,0,0,GPT,1,2
1,cip_run_7,2001459,0.6551,nist,0,2.6311,6.7917,4,57.2620,0,0,GPT,1,2
2,cip_run_7,2001575,0.3321,nist,0,0.1040,0.7523,4,440.2367,0,0,GPT,1,2
3,cip_run_7,2002075,0.8087,nist,0,1.8125,5.2791,7,1005.6332,0,0,GPT,1,2
4,cip_run_7,2002168,0.6691,nist,0,1.3110,4.3667,7,56.8986,0,0,GPT,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5735,naverloo_bm25_splades_RR,3100825,0.7797,gpt4,1,0.7835,4.0882,11,760.7758,1,1,T5,1,8
5736,naverloo_bm25_splades_RR,3100833,0.8526,gpt4,1,0.1144,0.6350,13,702.7082,1,1,T5,1,8
5737,naverloo_bm25_splades_RR,3100909,0.9373,gpt4,1,1.1883,2.7444,10,995.8028,1,1,T5,1,8
5738,naverloo_bm25_splades_RR,3100918,0.9402,gpt4,1,0.4276,2.0563,10,143.9186,1,1,T5,1,8


In [30]:
# Model formula: `score` explained by the judge (reference level 'nist')
# crossed with QDR, QW, DL, isGPT4, ST (reference level 'Other') and MN.
# `a * (b + c)` expands to the main effects plus the a:b and a:c interactions.
mixed_model_int = "score ~ C(judged_by, Treatment(reference='nist')) * (QDR + QW + DL + isGPT4 + C(ST, Treatment(reference='Other')) + MN) "

In [31]:
# Linear mixed-effects model of `score`, with observations grouped by run.
# No re_formula is given, so statsmodels' default random-effects structure for
# the groups applies. reml=True is the default estimator, spelled out here.
model_int = sm.MixedLM.from_formula(
    formula=mixed_model_int,
    data=data,
    groups=data["run_id"],
)
result_int = model_int.fit(reml=True)
result_int.summary()



0,1,2,3
Model:,MixedLM,Dependent Variable:,score
No. Observations:,5740,Method:,REML
No. Groups:,35,Scale:,0.0419
Min. group size:,164,Log-Likelihood:,827.2732
Max. group size:,164,Converged:,Yes
Mean group size:,164.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.412,0.030,13.648,0.000,0.353,0.472
"C(judged_by, Treatment(reference='nist'))[T.gpt4]",0.207,0.021,9.747,0.000,0.165,0.249
"C(ST, Treatment(reference='Other'))[T.GPT]",-0.076,0.039,-1.924,0.054,-0.153,0.001
"C(ST, Treatment(reference='Other'))[T.T5]",0.034,0.047,0.729,0.466,-0.058,0.127
"C(ST, Treatment(reference='Other'))[T.T5+GPT]",0.166,0.036,4.579,0.000,0.095,0.237
"C(judged_by, Treatment(reference='nist'))[T.gpt4]:C(ST, Treatment(reference='Other'))[T.GPT]",0.018,0.016,1.140,0.254,-0.013,0.049
"C(judged_by, Treatment(reference='nist'))[T.gpt4]:C(ST, Treatment(reference='Other'))[T.T5]",0.008,0.019,0.402,0.688,-0.030,0.045
"C(judged_by, Treatment(reference='nist'))[T.gpt4]:C(ST, Treatment(reference='Other'))[T.T5+GPT]",0.003,0.015,0.219,0.827,-0.026,0.032
QDR,0.033,0.003,10.498,0.000,0.026,0.039
