In [None]:
from iesta.llms.generate import Generator
import argparse
from dotenv import load_dotenv, find_dotenv

import iesta
import iesta.llms
import iesta.llms.models
from iesta.llms.models import LlamaV2, ChatGpt

import importlib
importlib.reload(iesta)
import pandas as pd

In [None]:
 

from datasets import load_dataset


lib_ds = load_dataset("notaphoenix/debateorg_w_effect_for_liberal")


from transformers import LlamaTokenizer
import numpy as np

# Load LLAMA tokenizer
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

# Example text field
text_field = lib_ds["training"]["text"]

# Tokenize the text field
tokenized_texts = [tokenizer.tokenize(sentence) for sentence in text_field]

# Count the number of tokens in each instance
token_counts = [len(tokens) for tokens in tokenized_texts]

# Calculate the median token count
median_token_count = np.median(token_counts)

print("Median token count:", median_token_count)

In [None]:
np.histogram(token_counts)

In [None]:
df = pd.read_parquet("data/liberal_debate_arguments_w_effect.parquet")

In [None]:
df = df[df["round"] == 0]

In [None]:
import ast


In [None]:
def apply_add_cols(row):
    effect_dict_col = row["effect_count"]
    effect_dict = ast.literal_eval(ast.literal_eval(str(effect_dict_col)).decode('utf-8'))
    total = 0
    for eff, count in effect_dict.items():
        row[eff] = count
        total = total +count
    row["total_votes"] =    total 
    row["total_votes_up5"] =    total > 5

    return row
df = df.apply(apply_add_cols, axis=1)

In [None]:
df

In [None]:
df[["effective", "ineffective", "provocative", "okay"]].fillna(0, inplace=True)

In [None]:
df_count = df[["effective", "ineffective", "provocative", "okay", "total_votes", "total_votes_up5"]]
df_count = df_count.fillna(0)

In [None]:
df_count

In [None]:
at_least_1 = len(df_count[df_count['effective'] >= 1])/len(df_count)
at_least_1_in = len(df_count[df_count['ineffective'] >= 1])/len(df_count)
print("# of debates: ", len(df_count))
print("at_least_1:", round( at_least_1*100, 1))
print("at_least_1 INEFF:", round( at_least_1_in*100, 1))

In [None]:
from math import floor
def apply_has_majority(row):
    majority = floor(row['total_votes'] / 2 ) +1
    row["effective_majority"] = row['effective'] >= majority
    row["ineffective_majority"] = row['ineffective'] >= majority
    return row

df_count = df_count.apply(apply_has_majority, axis = 1)

majority_total = len(df_count[df_count['effective_majority']]) /len(df_count)
majority_total_in = len(df_count[df_count['ineffective_majority']])/len(df_count)
print("# of debates: ", len(df_count))
print("majority:", round( majority_total*100, 1))
print("majority INEFF:", round( majority_total_in*100, 1))

In [None]:
from math import floor

def get_stats_per_effect(votes, df_votes, effect_val="effective"):
    vote_stat = {}
    vote_stat["# of debates"] = len(df_votes)
    vote_stat["votes"] = votes
    majority = floor(votes / 2 ) +1
    print(f"Majority is {majority} for {votes}")
    for effect in range(1, min((majority+1), 4)):
        vote_stat[f"@{effect}"] = len(df_votes[(df_votes[effect_val] >= effect )])
        vote_stat[f"{effect_val}_% @{effect}"] =round((vote_stat[f"@{effect}"] / vote_stat["# of debates"])* 100, 2)
    vote_stat[f">=Majority"] = len(df_votes[(df_votes[effect_val] >= majority )])
    vote_stat[f"{effect_val}_% >=Majority"] = round((vote_stat[f">=Majority"]  / vote_stat["# of debates"])* 100, 2)
    return vote_stat

def get_stats_per_effect_votes_r_bigger(votes, df_votes, effect_val="effective"):
    vote_stat = {}
    vote_stat["# of debates"]  = 0
    vote_stat["votes"] = votes
    for v, df_ in df_votes.groupby("total_votes"): 
        dict_ = get_stats_per_effect(v, df_, effect_val)
        if "# of debates" not in vote_stat.keys(): vote_stat["# of debates"] =0
        vote_stat["# of debates"] = vote_stat["# of debates"] + dict_["# of debates"]
        majority = floor(v / 2 ) +1
        for effect in range(1, min((majority+1), 4)):
            if f"@{effect}" not in vote_stat.keys(): vote_stat[f"@{effect}"] =0
            vote_stat[f"@{effect}"] = vote_stat[f"@{effect}"] + dict_[f"@{effect}"] 

        if ">=Majority" not in vote_stat.keys(): vote_stat[">=Majority"] =0
        vote_stat[f">=Majority"] = vote_stat[f">=Majority"]  + dict_[f">=Majority"]
    for effect in range(1, min((majority+1), 4)):
        vote_stat[f"{effect_val}_% @{effect}"] =round((vote_stat[f"@{effect}"] / vote_stat["# of debates"])* 100, 2)
    vote_stat[f"{effect_val}_% >=Majority"] = round((vote_stat[f">=Majority"]  / vote_stat["# of debates"])* 100, 2)
    return vote_stat



results_effective = []

for votes, df_votes in df_count.groupby("total_votes"):
    results_effective.append(get_stats_per_effect(votes, df_votes, "effective"))

results_ineffective = []

for votes, df_votes in df_count.groupby("total_votes"):
    results_ineffective.append(get_stats_per_effect(votes, df_votes, "ineffective"))
stats_df = pd.DataFrame(results_effective)
stats_df.fillna(0.0, inplace=True)

stats_ineffective_df = pd.DataFrame(results_ineffective)
stats_ineffective_df.fillna(0.0, inplace=True)

In [None]:
stats_ineffective_df = stats_ineffective_df[["# of debates", "votes", "ineffective_% @1", "ineffective_% >=Majority"]]
stats_ineffective_df.set_index(["votes", "# of debates"], inplace=True)
stats_df = stats_df[["# of debates", "votes", "effective_% @1", "effective_% >=Majority"]]
stats_df.set_index(["votes", "# of debates"], inplace=True)


In [None]:
stats_liberal = stats_ineffective_df.merge(stats_df, how='inner', left_index=True, right_index=True)

In [None]:
stats_liberal = stats_liberal[["ineffective_% @1", "effective_% @1", "ineffective_% >=Majority", "effective_% >=Majority"]]

In [None]:
stats_liberal

In [None]:
effective_up5 = get_stats_per_effect_votes_r_bigger(6, df_count[df_count["total_votes_up5"]], effect_val="effective")
ineffective_up5 = get_stats_per_effect_votes_r_bigger(6, df_count[df_count["total_votes_up5"]], effect_val="ineffective")

In [None]:
effective_up5

In [None]:
ineffective_up5

In [2]:
import pandas as pd

In [3]:

human_evaluation = pd.read_csv("data/human_evaluation/annotations.csv")# (f"data/human_evaluation/{ideology}_annotations.csv")

def apply_abstract(row):
    for n in range(1,4):
        for col in [f"effectiveness_arg{n}", f"clarity_arg{n}", f"consistency_arg{n}"]:
            if col == "consistency_arg1":
                continue
            score_type = col.split("_")[0]
            row[f"is_{col}"] = row[col] >=3
            row[f"is_highly_{col}"] = row[col] >=4
    return row


human_evaluation = human_evaluation.apply(apply_abstract, axis=1)

In [4]:
human_evaluation.head()

Unnamed: 0,a_id,user_id,arg_tuple_id,annotation_date,effectiveness_arg1,effectiveness_arg2,effectiveness_arg3,clarity_arg1,clarity_arg2,clarity_arg3,...,is_clarity_arg2,is_highly_clarity_arg2,is_consistency_arg2,is_highly_consistency_arg2,is_effectiveness_arg3,is_highly_effectiveness_arg3,is_clarity_arg3,is_highly_clarity_arg3,is_consistency_arg3,is_highly_consistency_arg3
0,1,4,0,2024-02-09 22:37:59.284006+00:00,1,1,3,1,1,3,...,False,False,False,False,True,False,True,False,True,False
1,2,5,50,2024-02-09 22:38:31.381697+00:00,5,5,1,5,4,2,...,True,True,True,False,False,False,False,False,True,False
2,3,6,0,2024-02-09 23:13:31.852903+00:00,3,4,4,4,2,3,...,False,False,True,True,True,True,True,False,True,True
3,4,11,0,2024-02-11 15:46:02.001212+00:00,3,3,4,2,4,5,...,True,True,False,False,True,True,True,True,True,True
4,5,11,1,2024-02-11 15:49:06.851714+00:00,3,2,4,4,4,4,...,True,True,False,False,True,True,True,True,True,True


In [5]:
conservative_ids = [13, 17]
liberal_ids = [11, 15, 16]

In [149]:
liberal = human_evaluation[human_evaluation["user_id"].isin(liberal_ids)]
conservative = human_evaluation[human_evaluation["user_id"].isin(conservative_ids)]

In [155]:
liberal['favorite']

def apply_fav(row):
    row[f"fav_arg{row['favorite']}"] = 1
    for i in range(1,4):
        if i != row['favorite']:
            row[f"fav_arg{i}"] = 0

    return row
liberal = liberal.apply(apply_fav, axis=1)

In [156]:
conservative['favorite']

conservative = conservative.apply(apply_fav, axis=1)

In [179]:
conservative.columns

Index(['a_id', 'annotation_date', 'arg_tuple_id', 'clarity_arg1',
       'clarity_arg2', 'clarity_arg3', 'consistency_arg2', 'consistency_arg3',
       'effectiveness_arg1', 'effectiveness_arg2', 'effectiveness_arg3',
       'fav_arg1', 'fav_arg2', 'fav_arg3', 'favorite', 'is_clarity_arg1',
       'is_clarity_arg2', 'is_clarity_arg3', 'is_consistency_arg2',
       'is_consistency_arg3', 'is_effectiveness_arg1', 'is_effectiveness_arg2',
       'is_effectiveness_arg3', 'is_highly_clarity_arg1',
       'is_highly_clarity_arg2', 'is_highly_clarity_arg3',
       'is_highly_consistency_arg2', 'is_highly_consistency_arg3',
       'is_highly_effectiveness_arg1', 'is_highly_effectiveness_arg2',
       'is_highly_effectiveness_arg3', 'user_id'],
      dtype='object')

In [183]:
df_ = conservative['arg_tuple_id'].value_counts().rename_axis('unique_values').reset_index(name='count')
arg_tuple_id_lst = df_[df_["count"]>1]["unique_values"].to_list()

In [186]:
conservative = conservative[conservative["arg_tuple_id"].isin(arg_tuple_id_lst)]
len(conservative)

80

In [162]:

def get_1annotator_per_row(df):
    criteria_dict = {}
    for user_id, df_ in df.sort_values(by=["user_id", "arg_tuple_id"]).groupby(["user_id"]):
        print(user_id)
        for c in ['effectiveness_arg1', 'effectiveness_arg2', 'effectiveness_arg3',
        'clarity_arg1', 'clarity_arg2', 'clarity_arg3', 'consistency_arg2',
        'consistency_arg3', 'fav_arg1', 'fav_arg2', 'fav_arg3']:
            if c  not in criteria_dict.keys(): criteria_dict[c]=  []
            criteria_dict[c].append(df_[c].values.tolist())
    return criteria_dict

# a numpy array with rows as items and columns as raters

# used for cohen kappa
def get_rows_as_items_and_cols_as_raters(df, normalize = False):
    criteria_dict = {}
    for c in ['effectiveness_arg1', 'effectiveness_arg2', 'effectiveness_arg3',
            'clarity_arg1', 'clarity_arg2', 'clarity_arg3', 'consistency_arg2',
            'consistency_arg3', 'fav_arg1', 'fav_arg2', 'fav_arg3']:
        df_ = df[[c,"arg_tuple_id", "user_id" ]].sort_values(["arg_tuple_id", "user_id"])
        def do_normalize(row):
            if row[c] <3:
                row[c] = 0
            else: row[c] = 1
            return row
        if normalize and not c.startswith("fav_"):
            df_ = df_.apply(do_normalize, axis = 1)
        pivot_df = df_.pivot(index='arg_tuple_id', columns='user_id', values=c)
        ratings_array = pivot_df.to_numpy()

        criteria_dict[c] =  ratings_array
    return criteria_dict


def get_rows_as_items_and_cols_as_raters(df, normalize = False):
    criteria_dict = {}
    for c in ['effectiveness_arg1', 'effectiveness_arg2', 'effectiveness_arg3',
            'clarity_arg1', 'clarity_arg2', 'clarity_arg3', 'consistency_arg2',
            'consistency_arg3',  'fav_arg1', 'fav_arg2', 'fav_arg3']:
        df_ = df[[c,"arg_tuple_id", "user_id" ]].sort_values(["arg_tuple_id", "user_id"])
        def do_normalize(row):
            if row[c] <3:
                row[c] = 0
            else: row[c] = 1
            return row
        if normalize and not c.startswith("fav_"):
            df_ = df_.apply(do_normalize, axis = 1)
        pivot_df = df_.pivot(index='arg_tuple_id', columns='user_id', values=c)
        ratings_array = pivot_df.to_numpy()

        criteria_dict[c] =  ratings_array
    return criteria_dict

            
    
def getrow_as_item_rater_df(df, normalize = False):
    criteria_dict = {}
    for c in ['effectiveness_arg1', 'effectiveness_arg2', 'effectiveness_arg3',
            'clarity_arg1', 'clarity_arg2', 'clarity_arg3', 'consistency_arg2',
            'consistency_arg3', 'fav_arg1', 'fav_arg2', 'fav_arg3']:
        df_ = df[[c,"arg_tuple_id", "user_id" ]].sort_values(["arg_tuple_id", "user_id"])


        
        def do_normalize(row):
            
            if row[c] <3:
                row[c] = 0
            #elif row[c] >3: row[c] = 3
            else: row[c] = 1
            return row
        if normalize and not c.startswith("fav_"):
            df_ = df_.apply(do_normalize, axis = 1)

        res_arr = []
        for _, row in df_.iterrows():
            res_arr.append({
                "Item": row["arg_tuple_id"],
                "Rater": row["user_id"],
                "Rating": row[c] 
            }
            )
        criteria_dict[c] =  pd.DataFrame(res_arr)
    return criteria_dict

            


In [163]:
#!pip  -q install pingouin

In [165]:
# COHEN's KAPPA



from sklearn.metrics import cohen_kappa_score
import itertools
from statsmodels.stats.inter_rater import fleiss_kappa
import pandas as pd
import pingouin as pg
import numpy as np
from scipy.stats import kendalltau
import pandas as pd

def get_avg_pairwise_cohen_kappa(ratings):

    raters_pair = list(itertools.combinations(range(0, len(ratings.T)), 2))

    # Assuming 'ratings' is a numpy array with rows as items and columns as raters
    kappas = 0.0
    pari = 0
    for pair in raters_pair:
        kappas = kappas + cohens_kappa(ratings[:, pair[0]], ratings[:, pair[1]]).kappa
        pari = pari +1
    assert pari == 1
    average_kappa = kappas / pari
    return average_kappa




def get_kappa_per_feature(df, normalize=False):
    ratings_per_criteria = get_rows_as_items_and_cols_as_raters(df, normalize)
    results = {}
    for criteria, ratings in ratings_per_criteria.items():
        res = get_avg_pairwise_cohen_kappa(ratings)
        print(f"cohen KAPPA {criteria}: \t {res}")
        results[criteria] = res
    return results

get_kappa_per_feature(conservative, True)
#get_kappa_per_feature(conservative)


def get_fleiss_kappa_score(ratings):
    rating_counts = np.apply_along_axis(lambda x: np.bincount(x, minlength=2), axis=1, arr=ratings)
    return fleiss_kappa(rating_counts)

def get_fleiss_kappa_per_feature(df, normalize=False):
    ratings_per_criteria = get_rows_as_items_and_cols_as_raters(df, normalize)
    results = {}
    for criteria, ratings in ratings_per_criteria.items():
        res = get_fleiss_kappa_score(ratings)
        print(f"{criteria}: \t {res}")
        results[criteria] = res
    return results
print("fleiss kappa")
get_fleiss_kappa_per_feature(liberal, True)
#get_kappa_per_feature(conservative)



def get_icc(ratings_df, c):
    # Convert numpy array 'ratings' to pandas DataFrame
    # ratings_df = pd.DataFrame(ratings)
     # Optionally, add an 'Item' column if your dataset doesn't have one
    # ratings_df['Item'] = range(1, len(ratings_df) + 1)
    # ratings_df.set_index('Item', inplace=True)
    # ratings_df = pd.melt(ratings_df.reset_index(), id_vars='Item', var_name='Rater', value_name='Rating')

    print(ratings_df.columns)
    # Calculate ICC
    icc = pg.intraclass_corr(data=ratings_df, targets='Item', raters='Rater', ratings="Rating").round(3)
    return icc.at[2, 'ICC']  # Using ICC(3, k) for consistency/agreement

def get_icc_per_feature(df, normalize=False):
    ratings_per_criteria = getrow_as_item_rater_df(df, normalize)
    results = {}
    for criteria, ratings in ratings_per_criteria.items():
        res = get_icc(ratings, criteria)
        print(f"{criteria}: \t {res}")
        results[criteria] = res
    return results

print(

)
print("icc")
get_icc_per_feature(liberal, True)
# get_icc_per_feature(conservative)
# get_rows_as_items_and_cols_as_raters(liberal, False)

def get_kendall_tau(ratings_df):
    

    # Convert numpy array 'ratings' to pandas DataFrame and rank the ratings
    # ratings_df = pd.DataFrame(ratings)
    # Optionally, add an 'Item' column if your dataset doesn't have one
    # ratings_df['Item'] = range(1, len(ratings_df) + 1)
    # ratings_df.set_index('Item', inplace=True)

    ratings_df = ratings_df.rank()
    # Calculate Kendall's W
    num_items = len(ratings_df)
    num_raters = len(ratings_df.columns)
    rank_sum_squared = sum(ratings_df.sum() ** 2)
    return (12 * rank_sum_squared - 3 * num_items**2 * (num_raters+1)**2) / (num_items**2 * (num_raters**3 - num_raters))

def get_kendallstau_per_feature(df, normalize=False):
    ratings_per_criteria = getrow_as_item_rater_df(df, normalize)
    results = {}
    for criteria, ratings in ratings_per_criteria.items():
        res = get_kendall_tau(ratings)
        print(f"{criteria}: \t {res}")
        results[criteria] = res
    return results

print(

)
#print("KANDALL TAU")
#get_kendallstau_per_feature(liberal, True)



def get_avg_pairwise_tau(ratings):

    raters_pair = list(itertools.combinations(range(0, len(ratings.T)), 2))

    # Assuming 'ratings' is a numpy array with rows as items and columns as raters
    tau = 0.0
    for pair in raters_pair:
        tau = tau + kendalltau(ratings[:, pair[0]], ratings[:, pair[1]])[0]

    average_tau = tau / len(ratings)
    return average_tau




def get_avgtau_per_feature(df, normalize=False):
    ratings_per_criteria = get_rows_as_items_and_cols_as_raters(df, normalize)
    results = {}
    for criteria, ratings in ratings_per_criteria.items():
        res = get_avg_pairwise_tau(ratings)
        print(f"{criteria}: \t {res}")
        results[criteria] = res
    return results
print()
print("tau")
get_avgtau_per_feature(liberal, True)

chen KAPPA effectiveness_arg1: 	 0.2631046019976209
chen KAPPA effectiveness_arg2: 	 0.11890838206627681
chen KAPPA effectiveness_arg3: 	 -0.04401154401154398
chen KAPPA clarity_arg1: 	 0.10823090353806819
chen KAPPA clarity_arg2: 	 0.07210698117771856
chen KAPPA clarity_arg3: 	 0.08515130190007043
chen KAPPA consistency_arg2: 	 0.1257705073494547
chen KAPPA consistency_arg3: 	 -0.020776874435411097
chen KAPPA fav_arg1: 	 0.0357142857142857
chen KAPPA fav_arg2: 	 0.1263398892310064
chen KAPPA fav_arg3: 	 0.047615382831141084
fleiss kappa
effectiveness_arg1: 	 0.25280199252801977
effectiveness_arg2: 	 0.11616161616161595
effectiveness_arg3: 	 -0.061477222467934606
clarity_arg1: 	 0.06524926686216993
clarity_arg2: 	 0.07355418304323424
clarity_arg3: 	 0.1008991008991018
consistency_arg2: 	 0.08133971291866034
consistency_arg3: 	 -0.024159663865545734
fav_arg1: 	 0.0036231884057967025
fav_arg2: 	 0.12424740010946904
fav_arg3: 	 0.040000000000000036

icc
Index(['Item', 'Rater', 'Rating'], 

{'effectiveness_arg1': 0.016783566595034546,
 'effectiveness_arg2': 0.007313972994395378,
 'effectiveness_arg3': -0.003402069087198859,
 'clarity_arg1': 0.007349818951341622,
 'clarity_arg2': 0.004832377973423638,
 'clarity_arg3': 0.007210884353741497,
 'consistency_arg2': 0.010142542485649064,
 'consistency_arg3': -0.0012392758478562413,
 'fav_arg1': nan,
 'fav_arg2': 0.007696229660520904,
 'fav_arg3': 0.002824503597043532}

In [224]:
import numpy as np

# Example NumPy array
array = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [13, 14, 15, 16]
])

# Specify the row indices and column indices you want to select
row_indices = range(len(array)) # Selecting the 1st and 3rd rows (0-indexed)
col_indices = [1, 3]  # Selecting the 2nd and 4th columns (0-indexed)
pair = (0,1)
# Using advanced indexing to select specific rows and columns

print(selected_elements)


[[ 1  2]
 [ 5  6]
 [ 9 10]
 [13 14]]


In [10]:
%pip install krippendorff

Collecting krippendorff
  Downloading krippendorff-0.6.1-py3-none-any.whl.metadata (2.8 kB)
Downloading krippendorff-0.6.1-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.6.1
Note: you may need to restart the kernel to use updated packages.


In [67]:
import numpy as np
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa, cohens_kappa, to_table
import krippendorff
from sklearn.metrics import cohen_kappa_score
import itertools
from statsmodels.stats.inter_rater import fleiss_kappa
import pandas as pd


import numpy as np
from scipy.stats import kendalltau
import pandas as pd

def format_fk(ratings):
    
    agg = aggregate_raters(ratings, )


    return agg, fleiss_kappa(agg, method='fleiss')

# Example annotation data for a binary rating task with three annotators
# Each row is an item, and each column is an annotator's binary decision (0 or 1)
annotations = [
    [1, 1, 1],
    [1, 1, 1],
    [0, 1, 1],
    [0, 0, 0],
    [1, 0, 1]
]

def get_rows_as_items_and_cols_as_raters(df, normalize = False):
    criteria_dict = {}
   
    for cr in ['original_effectiveness', 'rewrite1_effectiveness', 'original_effectiveness', 
               'original_clarity', 'rewrite1_clarity', 'original_clarity',
                'original_consistency', 'rewrite1_consistency', 'original_consistency','fav'] :
    
        df_ = df[[cr,"id", "iteration" ]].sort_values(["id", "iteration"])
        def do_normalize(row):
            if row[cr] <3:
                row[cr] = 0
            else: row[cr] = 1
            return row
        if normalize and not cr.startswith("fav_"):
            df_ = df_.apply(do_normalize, axis = 1)
        pivot_df = df_.pivot(index='id', columns='iteration', values=cr)
        ratings_array = pivot_df.to_numpy()

        criteria_dict[cr] =  ratings_array
    return criteria_dict
    
def calc_majority(r):
    total = len(r)
    maj = 0.0
    for item in r:
        is_full = 1 if len(list(set(item))) == 1 else 0
        maj = maj + is_full
    return maj/total

def get_binary_kappa(df, normalize=False):
    ratings_per_criteria = get_rows_as_items_and_cols_as_raters(df, normalize)
    results = {}
    majority= {}
    for criteria, ratings in ratings_per_criteria.items():
    
        # print(f"{criteria}: \t {fleiss_kappa_score}")

        #results[criteria] = round(fleiss_kappa(aggregate_raters(ratings)[0],  method='fleiss'), 2)
        # results[criteria]  = round( krippendorff.alpha(reliability_data=ratings, level_of_measurement="nominal"), 2)
        raters_pair = list(itertools.combinations(range(0, len(ratings[0])), 2))

        # Assuming 'ratings' is a numpy array with rows as items and columns as raters
        kappas = 0.0
        pari = 0
        for pair in raters_pair:

            selected_elements = ratings[range(len(ratings)), :][:, list(pair)]
            #assert len(selected_elements) == 2
            kappas = kappas + cohens_kappa(to_table(selected_elements)[0]).kappa
            pari = pari +1
            
        results[criteria] = kappas / pari# cohens_kappa(to_table(ratings)[0]).kappa


        majority[criteria] = calc_majority(ratings)

    results_g  = {}
    majority_g = {}
    ratings_per_all_models = {"effectiveness": [],
                              "clarity": [],
                              "consistency": [],
                              "fav": []
                              }
    for crit in ['effectiveness', 'clarity', 'consistency' , "fav"]:
        for criteria, ratings in ratings_per_criteria.items():
            
            if criteria.endswith(crit):
                ratings_per_all_models[crit].extend(ratings)
               

    for criteria, ratings in ratings_per_all_models.items():
        ratings = np.array(ratings)
        raters_pair = list(itertools.combinations(range(1, len(ratings[0])), 2))

        # Assuming 'ratings' is a numpy array with rows as items and columns as raters
        kappas = 0.0
        pari = 0
        for pair in raters_pair:
            
            selected_elements = ratings[range(len(ratings)), :][:, list(pair)]

            kappas = kappas + cohens_kappa(to_table(selected_elements)[0]).kappa
            pari = pari +1
            
        results_g[criteria] = kappas / pari# cohens_kappa(to_table(ratings)[0]).kappa

        # print(f"{criteria}: \t {fleiss_kappa_score}")
        #results_g[criteria] = cohens_kappa(to_table(ratings)[0])#round(fleiss_kappa(aggregate_raters(ratings)[0],  method='fleiss'), 2)
        # results_g[criteria]  = round( krippendorff.alpha(reliability_data=ratings, level_of_measurement="nominal"), 2)

        majority_g[criteria] = calc_majority(ratings)

    return results, majority, ratings, results_g, majority_g

for kk in dfssss.keys():
    kappa, majority, r, results_g, majority_g = get_binary_kappa(dfssss[kk]
                                                                , True)
    print(f"{kk} KAPPA:{results_g}, FULL:{majority_g} ")
    print()



  self['z_value'] = self['kappa'] / self['std_kappa0']
  kappa = (agree / nobs - agree_exp) / (1 - agree_exp)
  var_kappa0 /= (1 - agree_exp)**2 * nobs
  kappa_max = (np.minimum(freq_row, freq_col).sum() - agree_exp) / \
  self['std_kappa'] = np.sqrt(self['var_kappa'])
  self['z_value'] = self['kappa'] / self['std_kappa0']
  kappa = (agree / nobs - agree_exp) / (1 - agree_exp)
  var_kappa0 /= (1 - agree_exp)**2 * nobs
  kappa_max = (np.minimum(freq_row, freq_col).sum() - agree_exp) / \
  self['z_value'] = self['kappa'] / self['std_kappa0']
  kappa = (agree / nobs - agree_exp) / (1 - agree_exp)
  var_kappa0 /= (1 - agree_exp)**2 * nobs
  kappa_max = (np.minimum(freq_row, freq_col).sum() - agree_exp) / \


Mixtral8x7B_Conservative KAPPA:{'effectiveness': 0.7488792221838781, 'clarity': 0.4537629581658334, 'consistency': 0.5621204838743589, 'fav': 0.4814325824941164}, FULL:{'effectiveness': 0.79, 'clarity': 0.82, 'consistency': 0.86, 'fav': 0.52} 

Mixtral8x7B_Liberal KAPPA:{'effectiveness': 0.6480084237531528, 'clarity': 0.5779263725692295, 'consistency': 0.5425588416913013, 'fav': 0.2663502612312645}, FULL:{'effectiveness': 0.75, 'clarity': 0.83, 'consistency': 0.86, 'fav': 0.34} 

GPT4_Liberal KAPPA:{'effectiveness': 0.6612294508225687, 'clarity': 0.5188257058969797, 'consistency': 0.8777967618582703, 'fav': 0.5261943102522812}, FULL:{'effectiveness': 0.69, 'clarity': 0.73, 'consistency': 0.91, 'fav': 0.52} 

GPT4_Conservative KAPPA:{'effectiveness': 0.6028897878695607, 'clarity': 0.5036028875441055, 'consistency': 0.8921541011545728, 'fav': 0.5061513687600645}, FULL:{'effectiveness': 0.7, 'clarity': 0.72, 'consistency': 0.95, 'fav': 0.46} 



  kappa = (agree / nobs - agree_exp) / (1 - agree_exp)
  var_kappa0 /= (1 - agree_exp)**2 * nobs
  kappa_max = (np.minimum(freq_row, freq_col).sum() - agree_exp) / \


In [38]:
dfssss["Mixtral8x7B_Conservative"]

Unnamed: 0,iteration,batch,favorite,rewrite1_effectiveness,rewrite2_clarity,rewrite2_feedback,ideology,prompt,original_feedback,id,rewrite2_consistency,original_consistency,rewrite1_consistency,rewrite2_effectiveness,original_effectiveness,rewrite1_clarity,original_clarity,rewrite1_feedback
0,1,conservative,rewrite2,2.0,5.0,the rewritten argument is clear and structured...,Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument is not clear and lacks s...,50,3.0,5.0,5.0,3.0,2.0,4.0,2.0,the rewritten argument is clearer and more str...
1,1,conservative,rewrite1,4.0,4.0,rewrite 2 is similar to the original argument ...,Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument is somewhat effective in...,51,5.0,5.0,5.0,4.0,3.0,5.0,3.0,"rewrite 1 is clear and concise, effectively co..."
2,1,conservative,rewrite2,1.0,5.0,this argument is clear and to the point. it ef...,Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument is ineffective as it lac...,52,3.0,5.0,1.0,2.0,1.0,5.0,3.0,"this argument is clear and structured, but it ..."
3,1,conservative,rewrite1,4.0,5.0,"rewrite 2 is clear, consistent, and effectivel...",Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument is somewhat effective in...,53,5.0,5.0,5.0,4.0,3.0,5.0,3.0,"rewrite 1 is clear, consistent, and presents t..."
4,1,conservative,rewrite1,4.0,3.0,Rewrite 2 is consistent with the original argu...,Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",The original argument effectively breaks down ...,54,5.0,5.0,5.0,4.0,4.0,5.0,3.0,Rewrite 1 captures the original argument's mai...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,5,conservative,original,3.0,5.0,this rewrite is a more formal version of the o...,Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument is a simple statement ac...,95,5.0,5.0,5.0,3.0,3.0,5.0,5.0,the rewrite is similar in meaning to the origi...
246,5,conservative,rewrite2,4.0,5.0,the rewrite is fully effective in presenting t...,Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument is fairly effective in p...,96,5.0,,3.0,5.0,3.0,5.0,4.0,the rewrite is mostly effective in presenting ...
247,5,conservative,original,4.0,5.0,the rewrite argues that planets are indeed isl...,Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument clearly lays out the def...,97,1.0,5.0,2.0,2.0,3.0,5.0,5.0,the rewrite argues that planets are not island...
248,5,conservative,rewrite2,3.0,5.0,"rewrite 2 is clear, well-structured, and more ...",Flag and Faith Conservative ideology,"Human: From now on, you are an American with a...",the original argument is somewhat effective du...,98,5.0,5.0,5.0,4.0,3.0,4.0,3.0,rewrite 1 is clearer and more structured than ...


In [34]:
all_llm_based_df.head()

Unnamed: 0,effectiveness_rewrite1,model,ideology,iteration,id,effectiveness_rewrite2,effectiveness_original,clarity_rewrite2,clarity_rewrite1,clarity_original,consistency_rewrite2,consistency_original,consistency_rewrite1
0,2.0,Mixtral8x7B,Conservative,1,50,,,,,,,,
1,,Mixtral8x7B,Conservative,1,50,3.0,,,,,,,
2,,Mixtral8x7B,Conservative,1,50,,2.0,,,,,,
3,4.0,Mixtral8x7B,Conservative,1,51,,,,,,,,
4,,Mixtral8x7B,Conservative,1,51,4.0,,,,,,,


In [62]:
from glob import glob
import pandas as pd
eval_files = glob("data/llms_out/llm_evaluation/*.*")
criterion = ["effectiveness", "clarity", "consistency", "favorite"]
#all_eval_per_criteria = {k:[] for k in criterion}
all_llm_eval = []
model = "GPT4"

dfssss ={}
for file in eval_files:
    print(f"processing {file}")
    
    if file.endswith("csv"):
        df_ = pd.read_csv(file)
    else: 
        df_ = pd.read_json(file, lines=True)
    def _appp (row):
        if row["favorite"] == "rewrite1":
            row["fav"] = 2
        elif row["favorite"] == "rewrite2":
            row["fav"] = 3
        else:
            row["fav"] = 1
        return row
    

    df_ = df_[df_["iteration"].isin(range(1,6))].apply(_appp, axis=1)
  

    model_name_ = "GPT4" if file.find("gpt-4")>-1 else "Mixtral8x7B"
    ideology =  "Liberal" if file.find("liberal")>-1 else "Conservative"
    dfssss[f"{model_name_}_{ideology}"] = df_
    print(f"{model_name_}_{ideology}")
    





processing data/llms_out/llm_evaluation/conservative_mixtral_processed.csv
Mixtral8x7B_Conservative
processing data/llms_out/llm_evaluation/liberal_mixtral_processed.csv
Mixtral8x7B_Liberal
processing data/llms_out/llm_evaluation/liberal_gpt-4.jsonl
GPT4_Liberal
processing data/llms_out/llm_evaluation/conservative_gpt-4_based_eval1-5.jsonl
GPT4_Conservative


In [97]:
def print_stat(df):
    model_dict = {1: "ineffective", 2: "llama2", 3: "chatgpt"}
    for scoret in [f"effectiveness", f"clarity", f"consistency"]:
        for n in range(1,4):
            col = f"{scoret}_arg{n}"
            if col == "consistency_arg1":
                continue
            print(f"{model_dict[n]} {scoret} : {round(len(df[df[f'is_{col}']])/len(df)*100, 1)}\%"
                  f"({round(len(df[df[f'is_highly_{col}']])/len(df)*100, 1)}\%)")

            print(f"\t mean: {round(df[f'{col}'].mean(),2)} \t {round(df[f'{col}'].std(),2)} \n")
print_stat(liberal)

ineffective effectiveness : 48.7\%(12.7\%)
	 mean: 2.49 	 0.87 

llama2 effectiveness : 88.0\%(41.3\%)
	 mean: 3.32 	 0.8 

chatgpt effectiveness : 88.7\%(56.7\%)
	 mean: 3.51 	 0.84 

ineffective clarity : 58.7\%(24.7\%)
	 mean: 2.79 	 1.0 

llama2 clarity : 91.3\%(56.7\%)
	 mean: 3.53 	 0.78 

chatgpt clarity : 95.3\%(65.3\%)
	 mean: 3.68 	 0.71 

llama2 consistency : 63.3\%(40.0\%)
	 mean: 2.97 	 1.23 

chatgpt consistency : 90.7\%(67.3\%)
	 mean: 3.8 	 0.96 



In [99]:
liberal["favorite"].value_counts(normalize=True)

3    0.50
2    0.42
1    0.08
Name: favorite, dtype: float64

In [100]:
conservative = human_evaluation[human_evaluation["user_id"].isin(conservative_ids)]

In [101]:
print_stat(conservative)

ineffective effectiveness : 32.2\%(8.9\%)
	 mean: 2.2 	 0.91 

llama2 effectiveness : 93.3\%(68.9\%)
	 mean: 3.69 	 0.79 

chatgpt effectiveness : 78.9\%(52.2\%)
	 mean: 3.36 	 1.04 

ineffective clarity : 38.9\%(11.1\%)
	 mean: 2.31 	 0.91 

llama2 clarity : 93.3\%(68.9\%)
	 mean: 3.7 	 0.71 

chatgpt clarity : 84.4\%(53.3\%)
	 mean: 3.43 	 0.86 

llama2 consistency : 96.7\%(64.4\%)
	 mean: 3.66 	 0.62 

chatgpt consistency : 86.7\%(46.7\%)
	 mean: 3.38 	 0.82 



In [102]:
conservative["favorite"].value_counts(normalize=True)

2    0.577778
3    0.388889
1    0.033333
Name: favorite, dtype: float64