In [4]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
from torch.nn import functional as F
import os



In [6]:
os.chdir('export')

## EVOUNA

In [7]:
models = ['fid','gpt35','chatgpt','gpt4','newbing']
dfs = []
for m in models:
    dfs.append(pd.read_csv(f'EVOUNA/{m}.csv'))
    dfs[-1].dropna(inplace=True)
    dfs[-1]['judge'].astype('int')
    dfs[-1].drop('Unnamed: 0',axis=1,inplace=True)    

### Our model

In [8]:
from joblib import load

lm = load('logr500mv3.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
from utils.metrics import evaluator

r = []
for i in range(len(dfs)):
    pred = evaluator(dfs[i][['question','golden_answer','answer']],lm)
    r.append(pred)


In [18]:
dfs[0].head()

Unnamed: 0,question,golden_answer,answer,judge,model
0,who got the first nobel prize in physics,Wilhelm Conrad Röntgen,Wilhelm Röntgen,True,fid
1,which mode is used for short wave broadcast se...,Olivia,AM,False,fid
2,what does hp mean in war and order,hit points or health points,Health Points,True,fid
3,who wrote the first declaration of human rights,Cyrus,John Humphrey,False,fid
4,who is the owner of reading football club,Xiu Li Dai,Renhe Sports Management Ltd,False,fid


In [19]:
from sklearn.metrics import matthews_corrcoef,confusion_matrix
corrs = []
for i in range(len(dfs)):
    cor = matthews_corrcoef(r[i],dfs[i]['judge'].astype('int'))
    corrs.append(cor)

In [20]:
corrs

[0.8192311366274059,
 0.6710243582779716,
 0.6052006275053852,
 0.5590805437431997,
 0.5692999407562559]

In [30]:
confusion_matrix(dfs[1]['judge'].astype('int'),r[1])

array([[1021,   21],
       [ 543, 1435]])

### Bertscore

In [5]:
from bert_score import score

r = []
for i in range(len(dfs)):
    P,R,F = score(list(dfs[i][f'answer'].astype(str)),list(dfs[i]['golden_answer'].astype(str)),lang='en', rescale_with_baseline=True)
    F = (F - min(F))/ (max(F)-min(F))
    r.append(np.where(F > 0.5 ,1 ,0))

r = np.array(r)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [8]:
from sklearn.metrics import matthews_corrcoef,confusion_matrix
corrs = []
for i in range(len(dfs)):
    cor = matthews_corrcoef(np.where(np.array(r[i]) > 0.5 ,1,0),np.where(dfs[i]['judge'] == True, 1,0))
    corrs.append(cor)


In [9]:
corrs

[0.027114861277846698,
 0.262341318229872,
 0.22780347260106842,
 0.3131563256472302,
 0.13290023540831086]

### NLI

In [3]:
from utils.metrics import initialize_model

model,tokenizer = initialize_model()

In [7]:
from utils.metrics import get_semantic_similarity


r = []
for i in range(len(dfs)):
    pred = dfs[i].apply(lambda row: get_semantic_similarity(row['question'],row['golden_answer'],row['answer'],model,tokenizer),axis=1)
    r.append(pred)


In [9]:
from sklearn.metrics import matthews_corrcoef,confusion_matrix
corrs = []
for i in range(len(dfs)):
    cor = matthews_corrcoef(np.where(r[i] > 0.5,1,0),dfs[i]['judge'].astype('int'))
    corrs.append(cor)
corrs

[0.8157245935322697,
 0.6618356962910176,
 0.5916800893580882,
 0.5424632444492016,
 0.5580415277939176]

### BEM

In [1]:
!pip install tf-text

[31mERROR: Could not find a version that satisfies the requirement tf-text (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tf-text[0m[31m
[0m

### BLEURT

### F-1 Score

## DIVER-QA

In [11]:
# Load Validation set
df = pd.read_csv('combinedv3.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

# Load Human_eval set
df1 = pd.read_csv('cleaned_dataset.csv')
df1.drop(df1.columns[0],axis=1,inplace=True)





In [13]:
df.head()

Unnamed: 0,questions,answers,dataset,prediction,eval,model
0,Where is the Hoppings funfair held?,Town Moor,aqa,"According to the passage, the Hoppings funfair...",1,claude
1,Who shares a name with an older type of transp...,Stagecoach North East,aqa,"Based on the information provided, the company...",1,claude
2,What would be done for people who need more in...,link the local networks to national networks,aqa,"Based on the given context, there isn't specif...",0,claude
3,What was affected by the refurbishment?,cinema,aqa,"Based on the given context, the Pilgrim Street...",1,claude
4,"Of the successful state schools listed, this s...",Heaton Manor School,aqa,"To answer this question, I'll list the success...",1,claude


In [7]:
df1.columns = ['claude_eval','mixtral_eval','ll70b_eval','ll8b_eval','phi_eval']

In [8]:
df1.head()

Unnamed: 0,claude_eval,mixtral_eval,ll70b_eval,ll8b_eval,phi_eval
0,1,1,1,1,1
1,1,1,1,0,1
2,0,0,0,0,0
3,1,1,1,1,1
4,1,0,1,0,0


In [9]:
df2 = pd.concat([df,df1],axis=1)

In [12]:
df2.columns

Index(['answers', 'claude', 'mixtral', 'll70b', 'll8b', 'phi', 'questions',
       'claude_eval', 'mixtral_eval', 'll70b_eval', 'll8b_eval', 'phi_eval'],
      dtype='object')

In [4]:
# List of model names
models = ['claude', 'mixtral', 'll70b', 'll8b', 'phi']

# List of subdataset names
subdatasets = ['aqa', 'squad', 'medqa', 'hotpotqa', 'triviaqa']

# Melt the dataframe to transform model columns into rows
melted_df = pd.melt(df, 
                    id_vars=['questions', 'answers'], 
                    value_vars=models,
                    var_name='model', 
                    value_name='predictions')

# Create a dataset column
melted_df['dataset'] = np.repeat(subdatasets, 120 * len(models))

# Sort the dataframe to ensure correct order
melted_df = melted_df.sort_values(['dataset', 'questions', 'model']).reset_index(drop=True)

# Reorder columns
final_df = melted_df[['questions', 'answers', 'predictions', 'model', 'dataset']]

In [10]:
import pandas as pd
import numpy as np

# List of model names
models = ['claude', 'mixtral', 'll70b', 'll8b', 'phi']

# List of subdataset names
subdatasets = ['aqa', 'squad', 'medqa', 'hotpotqa', 'triviaqa']

# Melt the dataframe for predictions
melted_predictions = pd.melt(df2, 
                             id_vars=['questions', 'answers'], 
                             value_vars=models,
                             var_name='model', 
                             value_name='predictions')

# Melt the dataframe for evaluations
melted_evaluations = pd.melt(df2, 
                             id_vars=['questions', 'answers'], 
                             value_vars=[f'{model}_eval' for model in models],
                             var_name='model', 
                             value_name='evaluation')

# Remove '_eval' suffix from the model column in melted_evaluations
melted_evaluations['model'] = melted_evaluations['model'].str.replace('_eval', '')

# Merge the melted predictions and evaluations
melted_df = pd.merge(melted_predictions, melted_evaluations, 
                     on=['questions', 'answers', 'model'])

# Create a dataset column
melted_df['dataset'] = np.repeat(subdatasets, 120 * len(models))

# Sort the dataframe to ensure correct order
melted_df = melted_df.sort_values(['dataset', 'questions', 'model']).reset_index(drop=True)

# Reorder columns
final_df = melted_df[['questions', 'answers', 'predictions', 'evaluation', 'model', 'dataset']]

In [11]:
len(final_df)

3000

In [23]:
final_df.iloc[600]

questions      "An Account of Further Discoveries in Air" was...
answers                                         Joseph Priestley
predictions    According to the context, "An Account of Furth...
model                                                       ll8b
dataset                                                 hotpotqa
Name: 600, dtype: object

In [13]:
from joblib import load
lm = load('models/logr500mv3.joblib')

In [14]:
from utils.metrics import evaluator

r = []
for d in subdatasets:
    sub = final_df[final_df['dataset'] == d]
    pred = evaluator(sub[['questions','answers','predictions']],lm)
    r.append(pred)

In [15]:
r = np.array(r)

In [38]:
gpt = pd.read_csv('eval_data/gpt_evaluated.csv')

In [40]:
gpt.drop('Unnamed: 0',inplace=True,axis=1)
gpt.columns = df1.columns

In [42]:
# Melt the dataframe to transform model columns into rows
melted_eval_df = pd.melt(gpt, 
                         value_vars=models,
                         var_name='model', 
                         value_name='eval')

# Create a dataset column
melted_eval_df['dataset'] = np.repeat(subdatasets, 120 * len(models))

# Sort the dataframe to ensure correct order
melted_eval_df = melted_eval_df.sort_values(['dataset', 'model']).reset_index(drop=True)

# Reorder columns
gpt4_df = melted_eval_df[['eval', 'model', 'dataset']]

In [43]:
gpt4_df.head()

Unnamed: 0,eval,model,dataset
0,1,claude,aqa
1,1,claude,aqa
2,0,claude,aqa
3,1,claude,aqa
4,1,claude,aqa


In [18]:
from sklearn.metrics import matthews_corrcoef
d_eval= []
for i in range(len(subdatasets)):
    d_eval.append(matthews_corrcoef(r[i],final_df[final_df['dataset']==subdatasets[i]]['evaluation']))
    

In [17]:
final_df.head()

Unnamed: 0,questions,answers,predictions,evaluation,model,dataset
0,"""An Account of Further Discoveries in Air"" was...",Joseph Priestley,"According to the information provided, ""An Acc...",1,claude,aqa
1,"""The names of how many states of the USA start...",3,"To answer this question, let's go through the ...",0,claude,aqa
2,"""What was special about """"The Daily Courant"""" ...",First daily newspaper,"""The Daily Courant"" was significant in the ear...",1,claude,aqa
3,"""Which bass guitarist, songwriter, singer, and...",John Entwistle's,"The bass guitarist, songwriter, singer, and ho...",1,claude,aqa
4,"""Which country launched the space station """"Sk...",The United States of America,"The United States launched the space station ""...",1,claude,aqa


In [30]:
d_e

[-0.012289200239332837,
 0.07033404116150534,
 0.03978473466339667,
 0.04512654702114134,
 0.022023037791621264]

In [49]:
from sklearn.metrics import matthews_corrcoef
d_eval= []
for i in range(len(subdatasets)):
    d_eval.append(matthews_corrcoef(gpt4_df[gpt4_df['dataset']==subdatasets[i]]['eval'],final_eval_df[final_eval_df['dataset']==subdatasets[i]]['eval']))
    

In [21]:
final_df.to_csv('eval_data/combinedv3.csv',index=False)

In [19]:
d_eval

[0.6695088952305026,
 0.6757920712461999,
 0.6654317717087476,
 0.6861900242568284,
 0.6837815239372683]

In [46]:
d_eval

[0.719231779434366,
 0.6606556204940824,
 0.6553162779841664,
 0.70239465192972,
 0.6327552621360103]

## DIVERQA-2

In [22]:
# Load Validation set
df = pd.read_csv('combinedv3.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

# Load Human_eval set
df1 = pd.read_csv('cleaned_dataset.csv')

In [23]:
df.head()

Unnamed: 0,questions,answers,dataset,prediction,eval,model
0,Where is the Hoppings funfair held?,Town Moor,aqa,"According to the passage, the Hoppings funfair...",1,claude
1,Who shares a name with an older type of transp...,Stagecoach North East,aqa,"Based on the information provided, the company...",1,claude
2,What would be done for people who need more in...,link the local networks to national networks,aqa,"Based on the given context, there isn't specif...",0,claude
3,What was affected by the refurbishment?,cinema,aqa,"Based on the given context, the Pilgrim Street...",1,claude
4,"Of the successful state schools listed, this s...",Heaton Manor School,aqa,"To answer this question, I'll list the success...",1,claude


In [24]:
df.drop('eval',axis=1,inplace=True)

In [69]:
l = df1['claude-3.5-sonnet']
l = pd.DataFrame(l)
l.columns = ['eval']

In [70]:
# Get the series
l = df1['claude-3.5-sonnet']
l = pd.DataFrame(l)
l.columns = ['eval']



In [75]:
x = df1['phi-3-mini']
x = pd.DataFrame(x)
x.columns = ['eval']  # Keep the same column name

# Concatenate vertically (axis=0 is default)
l = pd.concat([l, x], ignore_index=True)

In [76]:
df.head()

Unnamed: 0,questions,answers,dataset,prediction,model
0,Where is the Hoppings funfair held?,Town Moor,aqa,"According to the passage, the Hoppings funfair...",claude
1,Who shares a name with an older type of transp...,Stagecoach North East,aqa,"Based on the information provided, the company...",claude
2,What would be done for people who need more in...,link the local networks to national networks,aqa,"Based on the given context, there isn't specif...",claude
3,What was affected by the refurbishment?,cinema,aqa,"Based on the given context, the Pilgrim Street...",claude
4,"Of the successful state schools listed, this s...",Heaton Manor School,aqa,"To answer this question, I'll list the success...",claude


In [77]:
df['eval'] = l

In [78]:
df.head()

Unnamed: 0,questions,answers,dataset,prediction,model,eval
0,Where is the Hoppings funfair held?,Town Moor,aqa,"According to the passage, the Hoppings funfair...",claude,1
1,Who shares a name with an older type of transp...,Stagecoach North East,aqa,"Based on the information provided, the company...",claude,1
2,What would be done for people who need more in...,link the local networks to national networks,aqa,"Based on the given context, there isn't specif...",claude,0
3,What was affected by the refurbishment?,cinema,aqa,"Based on the given context, the Pilgrim Street...",claude,1
4,"Of the successful state schools listed, this s...",Heaton Manor School,aqa,"To answer this question, I'll list the success...",claude,1


In [79]:
df.to_csv('combinedv4.csv',index=False)

In [93]:
df['dataset'].iloc[0]

'aqa'

## OUR MODEL

In [None]:
def process_input(question, answer, prediction):
    premise = 'question: '+question+' '+'answer: '+answer
    hypothesis = 'question: '+question+' '+'answer: '+prediction

    return premise,hypothesis

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

def get_semantic_similarity(question,answer,prediction):
    premise,hypothesis = process_input(question,answer,prediction)
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))  # device = "cuda:0" or "cpu"
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

def get_lexical_similarity(answer, prediction):
    answer_list = answer.split(" ")
    count = 0
    for word in prediction.split(" "):
        if word in answer_list:
            count += 1
    return count/len(answer_list)


def evaluator(X,log_regr_model,device='cuda'):
    lex_eval = X.apply(lambda x: get_lexical_similarity(x[X.columns[1]],x[X.columns[2]]), axis=1)
    sem_eval = X.apply(lambda x: get_semantic_similarity(x[X.columns[0]],x[X.columns[1]],x[X.columns[2]],model,tokenizer), axis=1)
    x = pd.DataFrame({'lex_eval':lex_eval,'sem_eval': sem_eval})
    pred = log_regr_model.predict(x)
    return pred
