In [1]:
print(1)


1


In [2]:
import ir_datasets
import email
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import gc
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from config import *
import re
import numpy as np
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import sys
sys.path.append("../scripts/")

In [4]:
from dataset import load_sara
from eval import jupyter_evaluation
from model import llm_experiment, post_process_split_docs
from models import get_model_version
from preprocess_sara import proccutit


#s = load_sara()
#proc = proccutit(s)
#tokenizer, model = get_model_version('get_mistral', "mistralai/Mistral-7B-Instruct-v0.2", 'main', 'auto')
#jupyter_evaluation(labels, preds)
#llm_experiment(dataset, prompt_strategy, model, tokenizer, end_prompt=None):
#post_process_split_docs(mr, fpr, pre, df)

#### Reading results

In [5]:
def get_results_json(file_name):
    current_directory = os.getcwd()
    parent_directory = os.path.dirname(current_directory)
    target_directory = os.path.join(parent_directory, 'scripts/results')
    #print("Path to results", target_directory)

    #file_name = 'test.json'
    file_path = os.path.join(target_directory, file_name)
    #print(file_path)

    with open(file_path) as json_file:
        data = json.load(json_file)

    df = pd.DataFrame(data)
    #print(df)
    return df

In [6]:
all_results_df = get_results_json('clean_model_responses.json')

#### Prompt Effectiveness

Prompt performance: How models perform across different prompts

In [7]:
from sklearn.metrics import balanced_accuracy_score, f1_score

def calculate_accuracy(group):
    correct_predictions = (group['prediction'] == group['ground_truth']).sum()
    total_predictions = len(group)
    accuracy = correct_predictions / total_predictions
    return accuracy

def calculate_balanced_accuracy(group):
    return balanced_accuracy_score(group['ground_truth'], group['prediction'])

def calculate_f1(group):
    return f1_score(group['ground_truth'], group['prediction'], average='weighted')

def prompt_performance(df):
    #accuracy_df = results_df.groupby(['model', 'prompt']).apply(lambda x: (x['prediction'] == x['ground_truth']).mean()).reset_index(name='accuracy')
    # Group by model and prompt, then apply the calculation for each metric
    grouped = df.groupby(['model', 'prompt'])
    accuracy_df = grouped.apply(calculate_accuracy).reset_index(name='accuracy')
    balanced_accuracy_df = grouped.apply(calculate_balanced_accuracy).reset_index(name='balanced_accuracy')
    f1_score_df = grouped.apply(calculate_f1).reset_index(name='f1_score')
    # Merge results into a single DataFrame - easy comparison
    performance_df = pd.merge(accuracy_df, balanced_accuracy_df, on=['model', 'prompt'])
    performance_df = pd.merge(performance_df, f1_score_df, on=['model', 'prompt'])
    #print(performance_df)
    return performance_df

prompt_performance_df = prompt_performance(all_results_df)

In [8]:
prompt_performance_df

Unnamed: 0,model,prompt,accuracy,balanced_accuracy,f1_score
0,l27b-meta,b1,0.786608,0.490123,0.782466
1,l27b-meta,b1_2,0.7597,0.485606,0.768147
2,l27b-meta,b1_2sys,0.692115,0.468716,0.727621
3,l27b-meta,b1sys,0.7597,0.474766,0.765976
4,l27b-meta,b2,0.645585,0.433642,0.69525
5,l27b-meta,b2_2,0.5598,0.432694,0.634359
6,l27b-meta,b2_2sys,0.41891,0.433031,0.506877
7,l27b-meta,b2sys,0.604258,0.440604,0.667776
8,l27b-meta,itspersonal,0.815394,0.571595,0.814788
9,l27b-meta,itspersonal_2,0.82791,0.539711,0.814621


#### Document Difficulty

Identify difficult documents to classify. We calculate the misclassification rate for each document across all models and prompts.

We find documents that no model-prompt combination could predict this correctly.

In [9]:
def get_most_difficult_docs(df):
    misclassification_df = df.groupby('doc_id').apply(lambda x: (x['prediction'] != x['ground_truth']).mean()).reset_index(name='misclassification_rate')
    # Sorting finds the documents with the highest misclassification rates
    return misclassification_df.sort_values(by='misclassification_rate', ascending=False)

def get_all_wrong():
    # Documents entirely misclassified
    return most_difficult_docs[most_difficult_docs.misclassification_rate == 1]

most_difficult_docs = get_most_difficult_docs(all_results_df)
all_wrong = get_all_wrong()


Next, we analyse these errors. We will manually review the documents, their predicted and ground truths.

In [10]:
def get_n_most_difficult_docs(most_difficult_docs, n=10):
    difficult_doc_ids = most_difficult_docs['doc_id'].head(n)
    return difficult_doc_ids

difficult_doc_ids = get_n_most_difficult_docs(most_difficult_docs)

In [11]:
s = load_sara()
diff_id = list(difficult_doc_ids)

id_index = []
for idd in diff_id:
    find = s[s.doc_id == idd]
    id_index.append(find.index[0])

print(id_index)
x = proccutit(s.loc[id_index])
for hard in x.iterrows():
    print(hard[1].text)
    break

[1563, 1176, 235, 203, 229, 923, 230, 882, 906, 924]
 This is an early draft document and should be revised, but does explain the spread option structure.
--Stinson


#### Model Comparison Analysis

In [37]:
from statsmodels.stats.contingency_tables import mcnemar

def run_mcnemar(table):
    result = mcnemar(table, exact=True, correction=True)
    print('statistic=%.3f, p-value=%.3f' % (result.statistic, result.pvalue))
    alpha = 0.05
    if result.pvalue > alpha:
        print('Same proportions of errors (fail to reject H0)')
    else:
        print('Different proportions of errors (reject H0)')


def mcnemar_table(control, change):
    all_ids = control.doc_id.tolist()
    control = control[control['prediction'] == control['ground_truth']].doc_id.to_list()
    change = change[change['prediction'] == change['ground_truth']].doc_id.to_list()

    seen_id = []
    both_correct = 0
    control_cor = 0
    change_cor = 0
    both_incor = 0

    for val in control:
        if val in change:
            both_correct += 1
        else:
            control_cor += 1
        seen_id.append(val)

    for val in change:
        if val in seen_id:
            continue
        else:
            change_cor += 1
        seen_id.append(val)

    
    both_incor = len(all_ids) - (both_correct + control_cor + change_cor)
    table = [[both_correct, control_cor], [change_cor, both_incor]]
    return table


In [38]:
######################
          #
#   BOTH   #   ORIG
          #
######################
          #
#   NEW    #   WRONG BOTH
          #
######################

In [39]:
orig = all_results_df[(all_results_df.prompt == 'b1') & (all_results_df.model == 'l27b-meta')]
new = all_results_df[(all_results_df.prompt == 'b2') & (all_results_df.model == 'l27b-meta')]
table = mcnemar_table(orig, new)
run_mcnemar(table)
table

statistic=45.000, p-value=0.000
Different proportions of errors (reject H0)


[[986, 271], [45, 296]]

### Main Fine Inspection

In [40]:
all_results_df = get_results_json('clean_model_responses.json')

In [41]:
def mc_eval_util_prompt(results_df, prompt_name, prompt_name2):
    orig = results_df[results_df.prompt == prompt_name]
    pure = results_df[results_df.prompt == prompt_name2]
    table = mcnemar_table(orig, pure)
    return table

def mc_eval_util_model(results_df, model_name, model_name2):
    orig = results_df[results_df.model == model_name]
    pure = results_df[results_df.model == model_name2]
    table = mcnemar_table(orig, pure)
    return table

def mcnemar_eval(model_name, model_name2, prompt_name, prompt_name2):
    if (model_name != model_name2) and (prompt_name != prompt_name2):
        print('Multiple difference')
        return []
    
    if prompt_name != prompt_name2:
        results_df = all_results_df[(all_results_df.model == model_name)]
        overall_table = mc_eval_util_prompt(results_df, prompt_name, prompt_name2)
        non_results_df = results_df[results_df['ground_truth'] == 0]
        non_table = mc_eval_util_prompt(non_results_df, prompt_name, prompt_name2)
        sen_results_df = results_df[results_df['ground_truth'] == 1]
        sen_table = mc_eval_util_prompt(sen_results_df, prompt_name, prompt_name2)
    else:
        results_df = all_results_df[(all_results_df.prompt == prompt_name)]
        overall_table = mc_eval_util_model(results_df, model_name, model_name2)
        non_results_df = results_df[results_df['ground_truth'] == 0]
        non_table = mc_eval_util_model(non_results_df, model_name, model_name2)
        sen_results_df = results_df[results_df['ground_truth'] == 1]
        sen_table = mc_eval_util_model(sen_results_df, model_name, model_name2)

    print('Overall\t\t Non-sensitive only\t Sensitive only')
    #print(np.matrix(overall_table), end='')
    #print(np.matrix(non_table), end='')

    print(np.matrix(overall_table[0]), '\t', np.matrix(non_table[0]), '\t\t', np.matrix(sen_table[0]))
    print(np.matrix(overall_table[1]), '\t', np.matrix(non_table[1]), '\t\t', np.matrix(sen_table[1]))
    return overall_table


model_name = 'mist7b-mist'
model_name2 = 'mist7b-mist'

compare = [('itspersonal','itspersonalpurely'), ('b1', 'itspersonal')]
for v in compare:
    print('Control:', v[0], '\tChange:', v[1])
    prompt_name = v[0]
    prompt_name2 = v[1]
    overall = mcnemar_eval(model_name, model_name2, prompt_name, prompt_name2)
    run_mcnemar(overall)


Control: itspersonal 	Change: itspersonalpurely
Overall		 Non-sensitive only	 Sensitive only
[[1078   65]] 	 [[1023   31]] 		 [[55 34]]
[[151 303]] 	 [[148 197]] 		 [[  3 106]]
statistic=65.000, p-value=0.000
Different proportions of errors (reject H0)
Control: b1 	Change: itspersonal
Overall		 Non-sensitive only	 Sensitive only
[[1011  324]] 	 [[1003  319]] 		 [[8 5]]
[[132 131]] 	 [[51 27]] 		 [[ 81 104]]
statistic=132.000, p-value=0.000
Different proportions of errors (reject H0)


In [42]:
model_name = 'mist7b-mist'
model_name2 = 'mixt-4bit'

compare = [('itspersonal','itspersonal'), ('b1', 'itspersonal')]
for v in compare:
    print('Control:', v[0], '\tChange:', v[1])
    prompt_name = v[0]
    prompt_name2 = v[1]
    overall = mcnemar_eval(model_name, model_name2, prompt_name, prompt_name2)
    #run_mcnemar(overall)

Control: itspersonal 	Change: itspersonal
Overall		 Non-sensitive only	 Sensitive only
[[1092   51]] 	 [[1046    8]] 		 [[46 43]]
[[312 142]] 	 [[298  47]] 		 [[14 95]]
Control: b1 	Change: itspersonal
Multiple difference


In [44]:
model_name = 'mixt-4bit'
model_name2 = 'mixt-4bit'

compare = [('itspersonalfewshot','itspersonal'), ('b1', 'itspersonal')]
for v in compare:
    print('Control:', v[0], '\tChange:', v[1])
    prompt_name = v[0]
    prompt_name2 = v[1]
    overall = mcnemar_eval(model_name, model_name2, prompt_name, prompt_name2)
    #run_mcnemar(overall)

Control: itspersonalfewshot 	Change: itspersonal
Overall		 Non-sensitive only	 Sensitive only
[[1004   72]] 	 [[952   3]] 		 [[52 69]]
[[400 122]] 	 [[392  53]] 		 [[ 8 69]]
Control: b1 	Change: itspersonal
Overall		 Non-sensitive only	 Sensitive only
[[0 0]] 	 [[0 0]] 		 [[0 0]]
[[ 1404 -1404]] 	 [[ 1344 -1344]] 		 [[ 60 -60]]


In [46]:
# Eval config
model_name = 'mixt-4bit'
prompt_name_control = 'itspersonal'
prompt_name_change = 'itspersonalfewshot'
truth_lab = 0
model_guess = 1

rdf = all_results_df
rdf_truth_select = rdf[rdf.ground_truth == truth_lab]
df = rdf_truth_select[(rdf_truth_select.model == model_name) & ((rdf_truth_select.prompt == prompt_name_control) |  (rdf_truth_select.prompt == prompt_name_change))]
guess_df = df[df.prediction == model_guess] #.doc_id.to_list()
diff_guess_df = guess_df.groupby('doc_id').filter(lambda x: x['prompt'].nunique() == 1)
id_diff_guesses = diff_guess_df.doc_id.to_list()


gen_resp_df = get_results_json('all_model_responses.json')
gendf = gen_resp_df[gen_resp_df.ground_truth == truth_lab]
genx = gendf[(gendf.model == model_name) & ((gendf.prompt == prompt_name_control) | (gendf.prompt == prompt_name_change) )]

genverbose = gendf[(gendf.model == model_name) & (gendf.prompt == 'itspersonalverbose')]

s = load_sara()
filtered_df = s[s['doc_id'].isin(id_diff_guesses)]
filtered_df = proccutit(filtered_df)


for i, tex in enumerate(filtered_df.iterrows()):
    idd = tex[1].doc_id
    text = tex[1].text
    sens = tex[1].sensitivity
    try:
        # Get most recent row of doc prediction - that is most likely the one I want after rerunning experiments.
        no_preds = len(genx[genx.doc_id == idd])
        all_rows_control = genx[(genx.doc_id == idd) & (genx.prompt==prompt_name_control)]
        all_rows_change = genx[(genx.doc_id == idd) & (genx.prompt==prompt_name_change)]
        gen_row_control = all_rows_control.iloc[len(all_rows_control)-1]
        gen_ans_control = gen_row_control.generated_response
        gen_pred_control = gen_row_control.prediction
        gen_row_change = all_rows_change.iloc[len(all_rows_change)-1]
        gen_ans_change = gen_row_change.generated_response
        gen_pred_change = gen_row_change.prediction
    except:
        gen_ans_control = 'Nothing'
        gen_pred_control = None
        gen_ans_change = 'Nothing'
        gen_pred_change = None

    try:
        no_preds = len(genverbose[genverbose.doc_id == idd])
        gen_row = genverbose[genverbose.doc_id == idd].iloc[no_preds-1]
        verb = gen_row.generated_response
    except:
        verb = None

    # Only show agreement/disagreement documents
    #if gen_pred_change == truth_lab:
    if gen_pred_change == gen_pred_control:
        continue

    print('DOCUMENT VIEW')
    print('Document', idd, 'Class', sens, '\nMessage:')
    print(text)
    print('End Message.')
    print('\nCONTROL: Prediction:', gen_pred_control, '. Response:', gen_ans_control)
    print('\nCHANGE: Prediction:', gen_pred_change, '. Response:', gen_ans_change)
    print('Verbose desc:', verb)
    print('\n\n')
    
    #if i == 4:
    #    break



DOCUMENT VIEW
Document 114715_0 Class 0 
Message:
Though I had a somewhat different notion when I initially raised the idea of co-sponsorship, I agree with Lee's observations and think that we should proceed the way he suggests.
End Message.

CONTROL: Prediction: 0.0 . Response:  
 The text does not contain purely personal information or personal information in a professional context. The message only indicates agreement with a co-worker's opinion regarding an unspecified idea or proposal. It does not contain any personal information such as names of individuals, their health status, financial status, personal relationships, or any other information that could be considered private or sensitive. The message is purely professional in nature.</s>

CHANGE: Prediction: 1.0 . Response: [/INST] 
Response: The text does not contain purely personal information, but it does contain
Verbose desc: None



DOCUMENT VIEW
Document 114715_2 Class 0 
Message:
 Rob, I'd send this to the whole group but