In [36]:
import pandas as pd
import numpy as np
import json
from config import LABEL_SET, PROMPT_SET, LABEL_TO_ID, DATA_PATH, MODEL_SET

def compute_metric(pred_label, true_label, task):
    """Compute accuracy and attack success rate (ASR) for one set of predictions.

    Args:
        pred_label: Array-like of predicted label IDs. The value ``-1`` marks a
            prediction that could not be mapped to a known label.
        true_label: Array-like of ground-truth label IDs, aligned element-wise
            with ``pred_label``.
        task: Task name. Not used by the computation; kept so existing callers
            keep working.

    Returns:
        dict with three entries:
            ``num_examples``: total number of predictions, valid or not.
            ``acc``: percentage of *all* predictions equal to the true label.
            ``asr``: percentage of *valid* (non ``-1``) predictions that differ
                from the true label; ``0.0`` when no prediction is valid.
        For empty input all three values are zero (instead of ``nan``).
    """
    # Accept plain lists as well as arrays: boolean-mask indexing needs ndarrays.
    pred_label = np.asarray(pred_label)
    true_label = np.asarray(true_label)

    num_examples = len(pred_label)
    if num_examples == 0:
        # np.mean of an empty array would give nan and emit a RuntimeWarning.
        return {'num_examples': 0, 'acc': 0.0, 'asr': 0.0}

    is_valid = pred_label != -1
    is_correct = pred_label == true_label

    # Accuracy is taken over every example, including unmappable predictions.
    acc = np.mean(is_correct) * 100.0

    # ASR only looks at predictions that were mapped to a real label.
    if is_valid.any():
        asr = (1 - np.mean(is_correct[is_valid])) * 100.0
    else:
        asr = 0.0

    return {
        'num_examples': num_examples,
        'acc': acc,
        'asr': asr
    }

# Parsed 'sst2' section of the AdvGLUE file, keyed by the path it was read from.
_ADVGLUE_SST2_CACHE = {}


def _load_advglue_sst2():
    """Return the 'sst2' section of the AdvGLUE JSON file, parsing it at most once per path."""
    path = DATA_PATH["advglue"]
    if path not in _ADVGLUE_SST2_CACHE:
        # `with` guarantees the file handle is closed after parsing.
        with open(path, 'r') as handle:
            _ADVGLUE_SST2_CACHE[path] = json.load(handle)['sst2']
    return _ADVGLUE_SST2_CACHE[path]


def get_data_construction(idx):
    """Look up the ``data_construction`` field of one AdvGLUE SST-2 example.

    Args:
        idx: Key used to index the 'sst2' section of the AdvGLUE file
            (presumably the example's position in that list; verify against
            the dataset file).

    Returns:
        The value stored under ``'data_construction'`` for that example.

    Note:
        The parsed file is cached in memory for the lifetime of the kernel, so
        this function can be called once per row without re-reading the JSON.
        Restart the kernel (or clear ``_ADVGLUE_SST2_CACHE``) if the file
        changes on disk.
    """
    return _load_advglue_sst2()[idx]['data_construction']


def extract_stats(filePath, mask_rate, denoise=False, data_construction_method = "word"):
    """Compute and display per-attack-method metrics for one result CSV.

    The CSV is restricted to rows whose AdvGLUE ``data_construction`` equals
    ``data_construction_method``, grouped by its ``method`` column, and
    ``compute_metric`` is evaluated for every model prediction column in
    each group.

    Args:
        filePath: Path to a result CSV. It must contain the columns ``idx``,
            ``method``, ``true_label`` and one ``pred-<model>`` column per
            entry of ``MODEL_SET['hug_gen']``.
        mask_rate: Mask rate to record in the output rows and the printed header.
        denoise: Only affects the printed header ("Self-denoised ..." or not).
        data_construction_method: Value of ``data_construction`` to keep
            (e.g. "word" or "sentence").

    Returns:
        pandas.DataFrame with one row per (method, model) pair and the columns
        ``num_examples, acc, asr, method, model, mask_rate, task,
        data_construction``. Empty if no row matches the filter.

    Raises:
        KeyError: if ``true_label`` or a prediction column is missing.
    """
    df = pd.read_csv(filePath)

    # Keep only the rows built with the requested data-construction method.
    df['data_construction'] = df['idx'].apply(get_data_construction)
    filtered_df = df[df['data_construction'] == data_construction_method]

    # One prediction column per model; dict.fromkeys drops duplicates while
    # preserving order.
    pred_columns = list(dict.fromkeys(
        'pred-' + model.replace('/', '_') for model in MODEL_SET['hug_gen']
    ))
    missing = [col for col in ['true_label', *pred_columns] if col not in filtered_df.columns]
    if missing:
        raise KeyError(f"{filePath} is missing expected column(s): {missing}")

    label_to_id = LABEL_TO_ID["sst2"]

    # Collect one record per (method, model) and build the frame once, instead
    # of re-allocating a DataFrame with pd.concat on every iteration.
    records = []
    for method, group in filtered_df.groupby('method'):
        group_true_labels = group['true_label'].to_numpy()

        for column in pred_columns:
            # Map raw predictions to label IDs; unknown strings become -1.
            pred_label = np.array(
                [label_to_id.get(label, -1) for label in group[column].to_numpy()]
            )

            metric_dict = compute_metric(pred_label, group_true_labels, "sst2")
            metric_dict.update({
                'method': method,
                'model': column,
                'mask_rate': mask_rate,
                'task': 'sst2',
                # Constant within one call because of the filter above.
                'data_construction': data_construction_method,
            })
            records.append(metric_dict)

    method_metrics_df = pd.DataFrame(records)

    # Display the resulting metrics DataFrame in Jupyter Notebook
    from IPython.display import display
    if denoise:
        print("Self-denoised with Mask rate of "+ str(mask_rate))
    else:
        print("mask rate of " + str(mask_rate))
    display(method_metrics_df)
    print("\n")

    return method_metrics_df

In [None]:

def print_stats(denoise=False, data_construction_method="word", mask_rate=0.1):
    """Display per-attack-method metrics for a masked (optionally self-denoised) run.

    Args:
        denoise: If True, read the ``..._self-denoise.csv`` result file;
            otherwise read the plain masked result file.
        data_construction_method: Data-construction filter forwarded to
            ``extract_stats`` (e.g. "word" or "sentence").
        mask_rate: Mask rate of the run to load. It is interpolated into the
            file name as-is, so pass a value whose ``str()`` matches the file
            name (e.g. ``0.1`` or ``'0.1'``). Defaults to ``0.1``, the
            previously hard-coded value.

    Returns:
        None. The metrics table is displayed by ``extract_stats``; nothing is
        returned so a trailing call in a cell does not render the table twice.
    """
    suffix = '_self-denoise' if denoise else ''
    file_path = f'result/merge_advglue_sst2_hug_gen_maskrate_{mask_rate}{suffix}.csv'
    extract_stats(file_path, mask_rate, denoise, data_construction_method=data_construction_method)

def print_standard_stats(data_construction_method = "word"):
    """Display per-attack-method metrics for the unmasked run (recorded as mask rate 0).

    Args:
        data_construction_method: Data-construction filter forwarded to
            ``extract_stats`` (e.g. "word" or "sentence").
    """
    baseline_csv = 'result/merge_advglue_sst2_hug_gen.csv'
    extract_stats(
        baseline_csv,
        0,
        data_construction_method=data_construction_method,
    )

# Per-attack metrics for the self-denoised run (denoise=True), sentence-level attacks only.
print("Construction method: Sentence")
print_stats(True,"sentence")

# Word-level counterpart of the report above; currently disabled.
# print("Construction method: word")
# print_stats(True,"word")

Construction method: Sentence
Self-denoised with Mask rate of 0.1


Unnamed: 0,num_examples,acc,asr,method,model,mask_rate,task,data_construction
0,11,54.545455,45.454545,CheckList,pred-gemini-1.5-flash,0.1,sst2,sentence
1,11,9.090909,90.909091,CheckList,pred-gemini-1.5-flash-8b,0.1,sst2,sentence
2,4,50.0,50.0,SCPN,pred-gemini-1.5-flash,0.1,sst2,sentence
3,4,75.0,25.0,SCPN,pred-gemini-1.5-flash-8b,0.1,sst2,sentence
4,19,52.631579,47.368421,StressTest,pred-gemini-1.5-flash,0.1,sst2,sentence
5,19,26.315789,73.684211,StressTest,pred-gemini-1.5-flash-8b,0.1,sst2,sentence
6,17,47.058824,52.941176,T3,pred-gemini-1.5-flash,0.1,sst2,sentence
7,17,58.823529,41.176471,T3,pred-gemini-1.5-flash-8b,0.1,sst2,sentence






In [53]:
# Per-attack metrics for the masked run without self-denoising (denoise=False),
# first for sentence-level attacks, then for word-level attacks.
print("Construction method: Sentence")
print_stats(False,"sentence")

print("Construction method: word")
print_stats(False,"word")

Construction method: Sentence
mask rate of 0.1


Unnamed: 0,num_examples,acc,asr,method,model,mask_rate,task,data_construction
0,11,90.909091,9.090909,CheckList,pred-gemini-1.5-flash,0.1,sst2,sentence
1,11,45.454545,54.545455,CheckList,pred-gemini-1.5-flash-8b,0.1,sst2,sentence
2,4,25.0,75.0,SCPN,pred-gemini-1.5-flash,0.1,sst2,sentence
3,4,50.0,50.0,SCPN,pred-gemini-1.5-flash-8b,0.1,sst2,sentence
4,19,47.368421,52.631579,StressTest,pred-gemini-1.5-flash,0.1,sst2,sentence
5,19,42.105263,57.894737,StressTest,pred-gemini-1.5-flash-8b,0.1,sst2,sentence
6,17,41.176471,58.823529,T3,pred-gemini-1.5-flash,0.1,sst2,sentence
7,17,47.058824,52.941176,T3,pred-gemini-1.5-flash-8b,0.1,sst2,sentence




Construction method: word
mask rate of 0.1


Unnamed: 0,num_examples,acc,asr,method,model,mask_rate,task,data_construction
0,8,75.0,25.0,BERT-ATTACK,pred-gemini-1.5-flash,0.1,sst2,word
1,8,50.0,50.0,BERT-ATTACK,pred-gemini-1.5-flash-8b,0.1,sst2,word
2,9,77.777778,22.222222,SemAttack,pred-gemini-1.5-flash,0.1,sst2,word
3,9,77.777778,22.222222,SemAttack,pred-gemini-1.5-flash-8b,0.1,sst2,word
4,28,64.285714,35.714286,SememePSO,pred-gemini-1.5-flash,0.1,sst2,word
5,28,60.714286,39.285714,SememePSO,pred-gemini-1.5-flash-8b,0.1,sst2,word
6,19,89.473684,10.526316,TextBugger,pred-gemini-1.5-flash,0.1,sst2,word
7,19,89.473684,10.526316,TextBugger,pred-gemini-1.5-flash-8b,0.1,sst2,word
8,16,56.25,43.75,TextFooler,pred-gemini-1.5-flash,0.1,sst2,word
9,16,56.25,43.75,TextFooler,pred-gemini-1.5-flash-8b,0.1,sst2,word






In [51]:
# Per-attack metrics for the unmasked result file (recorded as mask rate 0),
# sentence-level attacks only.
print("Construction method: Sentence")
print_standard_stats(data_construction_method="sentence")

# Word-level counterpart of the report above; currently disabled.
# print("Construction method: Word")
# print_standard_stats(data_construction_method="word")

Construction method: Sentence
mask rate of 0


Unnamed: 0,num_examples,acc,asr,method,model,mask_rate,task,data_construction
0,11,81.818182,18.181818,CheckList,pred-gemini-1.5-flash,0,sst2,sentence
1,11,0.0,100.0,CheckList,pred-gemini-1.5-flash-8b,0,sst2,sentence
2,4,25.0,75.0,SCPN,pred-gemini-1.5-flash,0,sst2,sentence
3,4,75.0,25.0,SCPN,pred-gemini-1.5-flash-8b,0,sst2,sentence
4,19,47.368421,52.631579,StressTest,pred-gemini-1.5-flash,0,sst2,sentence
5,19,52.631579,47.368421,StressTest,pred-gemini-1.5-flash-8b,0,sst2,sentence
6,17,64.705882,35.294118,T3,pred-gemini-1.5-flash,0,sst2,sentence
7,17,47.058824,52.941176,T3,pred-gemini-1.5-flash-8b,0,sst2,sentence






## Thoughts on data construction: 

### Word

1. Self-denoised smoothing seems to reduce ASR (i.e., improve robustness) for character-level and word-level perturbations, which are classified under data_construction "word". The attack types in this category are "BERT-ATTACK", "SemAttack", "SememePSO", "TextBugger", and "TextFooler".

    More specifically, this only holds at a mask rate of 0.1, which seems to give the lowest ASR. For instance, gemini 1.5 flash 8b improves on BERT-ATTACK from 50% down to 37.5% ASR, and on TextFooler from 25% down to 18.75%. Note, however, that 1.5 flash 8b's performance on SememePSO worsened, with ASR rising from 50% to 53%.

    Meanwhile, gemini 1.5 flash generally improves: on SemAttack from 22.22% down to 11.11%, on TextFooler from 50% down to 43.75%, and on SememePSO from 35.71% down to 28.57%.

    This aligns with the paper, since RanMask was designed to deal with word-substitution attacks and character-level perturbations by leveraging the ability of masked language models (e.g. BERT) to predict [MASK] tokens. However, RanMask stops working as the mask rate increases, and this is where our results differ from the paper. We believe this is because the number of votes we collect for each sample, denoted n, is only 10, whereas the paper sets n to 100.

    Another explanation for the differences between the paper's results and ours is that the paper focuses on only 2 methods, BERT-ATTACK and TextFooler, on the original SST-2 dataset, while we use the AdvGLUE SST-2 dataset, which covers a wider variety of methods with fewer samples per method.

2. RanMask offers mixed performance. It exceeds the model's base performance at a mask rate of 0.1, except for the TextBugger method, where the self-denoised method for gemini 1.5 flash had an ASR of 10.53% while the base model had an ASR of 5.26%. However, its results pale in comparison to the self-denoised smoothing method. For instance, gemini 1.5 flash 8b has an increased ASR under RanMask on TextFooler and BERT-ATTACK, while gemini 1.5 flash has an increased ASR on SemAttack, SememePSO and TextBugger.



3. Self-denoised smoothing performs relatively worse than RanMask as its mask rate goes up.

    This can be attributed to the fact that as more tokens are masked, the meaning of the sentence can be lost, leading to lower accuracy and a higher attack success rate. Therefore, the mask rate for self-denoised smoothing should be kept low.

### Sentence

1. Self-denoised smoothing seems to work extremely well at consistently reducing the ASR of CheckList and SCPN attacks, even as the mask rate grows, but fares poorly on the rest.

2. RanMask offers mixed and inconsistent performances.

## Overall data


In [32]:
from IPython.display import display, HTML

# Accumulates the metrics of every file passed to extract_combined_metrics.
global_combined_metrics_df = pd.DataFrame()

def extract_combined_metrics(filePath, mask_rate=0, self_denoise=False):
    """Compute whole-file metrics per model and append them to the global table.

    Unlike the per-method statistics, no filtering or grouping is applied:
    each model's predictions over the entire CSV are scored once.

    Args:
        filePath: Path to a result CSV containing ``true_label`` and one
            ``pred-<model>`` column per entry of ``MODEL_SET['hug_gen']``.
        mask_rate: Mask rate to record in the output rows.
        self_denoise: Flag to record in the output rows.

    Returns:
        None. One row per model (columns ``num_examples, acc, asr, model,
        mask_rate, self_denoise, task``) is appended to the module-level
        ``global_combined_metrics_df``.

    Raises:
        KeyError: if ``true_label`` or a prediction column is missing.
    """
    global global_combined_metrics_df  # Reference the global DataFrame

    df = pd.read_csv(filePath)
    true_labels = df['true_label'].to_numpy()
    label_to_id = LABEL_TO_ID["sst2"]

    # One prediction column per model; MODEL_SET is assumed to be predefined.
    # dict.fromkeys drops duplicates while preserving order.
    pred_columns = list(dict.fromkeys(
        'pred-' + model.replace('/', '_') for model in MODEL_SET['hug_gen']
    ))

    # Collect records and build the frame once rather than concatenating
    # a one-row DataFrame per model.
    records = []
    for column in pred_columns:
        # Map predictions to label IDs; unknown strings become -1.
        pred_label = np.array(
            [label_to_id.get(label, -1) for label in df[column].to_numpy()]
        )

        metric_dict = compute_metric(pred_label, true_labels, "sst2")
        metric_dict.update({
            'model': column,
            'mask_rate': mask_rate,
            'self_denoise': self_denoise,
            'task': "sst2",
        })
        records.append(metric_dict)

    # Append the current file's metrics to the global DataFrame
    global_combined_metrics_df = pd.concat(
        [global_combined_metrics_df, pd.DataFrame(records)],
        ignore_index=True
    )

# Populate global_combined_metrics_df: first the unmasked result file
# (recorded as mask_rate 0), then, for each mask rate 0.1 .. 0.9, the masked
# result file and its self-denoised counterpart.

file_path = f'result/merge_advglue_sst2_hug_gen.csv'
extract_combined_metrics(file_path, mask_rate = 0)
for i in range(1, 10):
    # Kept as a string so the file name reads exactly '0.1', '0.2', ...
    mask_rate = f'0.{i}'
    file_path_mask = f'result/merge_advglue_sst2_hug_gen_maskrate_{mask_rate}.csv'

    extract_combined_metrics(file_path_mask, mask_rate=float(mask_rate))

    file_path_mask = f'result/merge_advglue_sst2_hug_gen_maskrate_{mask_rate}_self-denoise.csv'
    extract_combined_metrics(file_path_mask, mask_rate=float(mask_rate), self_denoise=True)

# Split the accumulated table into one view per model.
filtered_df_flash = global_combined_metrics_df.loc[global_combined_metrics_df['model'] == 'pred-gemini-1.5-flash']
filtered_df_flash_8b = global_combined_metrics_df.loc[global_combined_metrics_df['model'] == 'pred-gemini-1.5-flash-8b']

# Render each per-model table under its own HTML heading.
display(HTML('<h3>Flash Metrics DataFrame</h3>'))
display(filtered_df_flash.style.set_table_attributes('style="width: 100%; font-size: 12px;"'))

display(HTML('<h3>Flash 8B Metrics DataFrame</h3>'))
display(filtered_df_flash_8b.style.set_table_attributes('style="width: 100%; font-size: 12px;"'))




Unnamed: 0,num_examples,acc,asr,model,mask_rate,self_denoise,task
0,148,67.567568,32.432432,pred-gemini-1.5-flash,0.0,False,sst2
2,148,61.486486,38.513514,pred-gemini-1.5-flash,0.1,False,sst2
4,148,63.513514,36.486486,pred-gemini-1.5-flash,0.1,True,sst2
6,148,60.135135,39.864865,pred-gemini-1.5-flash,0.2,False,sst2
8,148,59.459459,40.540541,pred-gemini-1.5-flash,0.2,True,sst2
10,148,52.027027,47.972973,pred-gemini-1.5-flash,0.3,False,sst2
12,148,51.351351,48.648649,pred-gemini-1.5-flash,0.3,True,sst2
14,148,54.054054,45.945946,pred-gemini-1.5-flash,0.4,False,sst2
16,148,52.702703,47.297297,pred-gemini-1.5-flash,0.4,True,sst2
18,148,48.648649,51.351351,pred-gemini-1.5-flash,0.5,False,sst2


Unnamed: 0,num_examples,acc,asr,model,mask_rate,self_denoise,task
1,148,60.810811,39.189189,pred-gemini-1.5-flash-8b,0.0,False,sst2
3,148,58.108108,41.891892,pred-gemini-1.5-flash-8b,0.1,False,sst2
5,148,56.081081,43.918919,pred-gemini-1.5-flash-8b,0.1,True,sst2
7,148,52.702703,47.297297,pred-gemini-1.5-flash-8b,0.2,False,sst2
9,148,50.675676,49.324324,pred-gemini-1.5-flash-8b,0.2,True,sst2
11,148,48.648649,51.351351,pred-gemini-1.5-flash-8b,0.3,False,sst2
13,148,48.648649,51.351351,pred-gemini-1.5-flash-8b,0.3,True,sst2
15,148,50.0,50.0,pred-gemini-1.5-flash-8b,0.4,False,sst2
17,148,45.27027,54.72973,pred-gemini-1.5-flash-8b,0.4,True,sst2
19,148,46.621622,53.378378,pred-gemini-1.5-flash-8b,0.5,False,sst2


In [33]:

# Accumulates the metrics of every call to extract_combined_metrics_by_data_construction.
data_construction_df = pd.DataFrame()

def extract_combined_metrics_by_data_construction(filePath, mask_rate=0, self_denoise=False, data_construction_method = "word"):
    """Compute per-model metrics for one data-construction subset of a result CSV.

    The CSV is restricted to rows whose AdvGLUE ``data_construction`` equals
    ``data_construction_method``; each model's predictions over that subset
    are scored once and appended to the module-level ``data_construction_df``.

    Args:
        filePath: Path to a result CSV containing ``true_label`` and one
            ``pred-<model>`` column per entry of ``MODEL_SET['hug_gen']``.
        mask_rate: Mask rate to record in the output rows.
        self_denoise: Flag to record in the output rows.
        data_construction_method: Value of ``data_construction`` to keep
            (e.g. "word" or "sentence").

    Returns:
        None. One row per model (columns ``num_examples, acc, asr, model,
        mask_rate, self_denoise, task, data_construction``) is appended to
        ``data_construction_df``.

    Raises:
        KeyError: if ``true_label`` or a prediction column is missing.
    """
    global data_construction_df  # Reference the global DataFrame

    df = pd.read_csv(filePath)

    # Look examples up by their dataset 'idx' column, as extract_stats does,
    # so a re-ordered or filtered CSV still maps to the right example. Fall
    # back to the row position when the file has no 'idx' column.
    lookup_keys = df['idx'] if 'idx' in df.columns else df.index.to_series()
    df['data_construction'] = lookup_keys.map(get_data_construction)
    filtered_df = df[df['data_construction'] == data_construction_method]

    true_labels = filtered_df['true_label'].to_numpy()
    label_to_id = LABEL_TO_ID["sst2"]

    # One prediction column per model; dict.fromkeys drops duplicates while
    # preserving order.
    pred_columns = list(dict.fromkeys(
        'pred-' + model.replace('/', '_') for model in MODEL_SET['hug_gen']
    ))

    # Collect records and build the frame once rather than concatenating
    # a one-row DataFrame per model.
    records = []
    for column in pred_columns:
        # Map predictions to label IDs; unknown strings become -1.
        pred_label = np.array(
            [label_to_id.get(label, -1) for label in filtered_df[column].to_numpy()]
        )

        metric_dict = compute_metric(pred_label, true_labels, "sst2")
        metric_dict.update({
            'model': column,
            'mask_rate': mask_rate,
            'self_denoise': self_denoise,
            'task': "sst2",
            'data_construction': data_construction_method,
        })
        records.append(metric_dict)

    data_construction_df = pd.concat(
        [data_construction_df, pd.DataFrame(records)],
        ignore_index=True
    )

# Populate data_construction_df: for the unmasked file (recorded as mask_rate 0)
# and for every mask rate 0.1 .. 0.9 (masked and self-denoised files), compute
# metrics separately for the "word" and "sentence" subsets.
file_path = f'result/merge_advglue_sst2_hug_gen.csv'
extract_combined_metrics_by_data_construction(file_path, mask_rate = 0, data_construction_method = "word")
extract_combined_metrics_by_data_construction(file_path, mask_rate = 0, data_construction_method = "sentence")
for i in range(1, 10):
    # Kept as a string so the file name reads exactly '0.1', '0.2', ...
    mask_rate = f'0.{i}'
    file_path_mask = f'result/merge_advglue_sst2_hug_gen_maskrate_{mask_rate}.csv'

    extract_combined_metrics_by_data_construction(file_path_mask, mask_rate=float(mask_rate),data_construction_method = "word")
    extract_combined_metrics_by_data_construction(file_path_mask, mask_rate=float(mask_rate),data_construction_method = "sentence")

    file_path_mask = f'result/merge_advglue_sst2_hug_gen_maskrate_{mask_rate}_self-denoise.csv'
    extract_combined_metrics_by_data_construction(file_path_mask, mask_rate=float(mask_rate), self_denoise=True, data_construction_method = "word")
    extract_combined_metrics_by_data_construction(file_path_mask, mask_rate=float(mask_rate), self_denoise=True, data_construction_method = "sentence")

# Filter for 'pred-gemini-1.5-flash' model and 'word' data construction
filtered_df_flash_word = data_construction_df.loc[
    (data_construction_df['model'] == 'pred-gemini-1.5-flash') & 
    (data_construction_df['data_construction'] == 'word')
]

# Filter for 'pred-gemini-1.5-flash' model and 'sentence' data construction
filtered_df_flash_sentence = data_construction_df.loc[
    (data_construction_df['model'] == 'pred-gemini-1.5-flash') & 
    (data_construction_df['data_construction'] == 'sentence')
]

# Filter for 'pred-gemini-1.5-flash-8b' model and 'word' data construction
filtered_df_flash_8b_word = data_construction_df.loc[
    (data_construction_df['model'] == 'pred-gemini-1.5-flash-8b') & 
    (data_construction_df['data_construction'] == 'word')
]

# Filter for 'pred-gemini-1.5-flash-8b' model and 'sentence' data construction
filtered_df_flash_8b_sentence = data_construction_df.loc[
    (data_construction_df['model'] == 'pred-gemini-1.5-flash-8b') & 
    (data_construction_df['data_construction'] == 'sentence')
]


# Render each (model, data construction) table under its own HTML heading.
display(HTML('<h3>Flash Metrics Words DataFrame</h3>'))
display(filtered_df_flash_word.style.set_table_attributes('style="width: 100%; font-size: 12px;"'))

display(HTML('<h3>Flash Metrics Sentence DataFrame</h3>'))
display(filtered_df_flash_sentence.style.set_table_attributes('style="width: 100%; font-size: 12px;"'))

display(HTML('<h3>Flash 8B Metrics Words DataFrame</h3>'))
display(filtered_df_flash_8b_word.style.set_table_attributes('style="width: 100%; font-size: 12px;"'))

display(HTML('<h3>Flash 8B Metrics Sentence DataFrame</h3>'))
display(filtered_df_flash_8b_sentence.style.set_table_attributes('style="width: 100%; font-size: 12px;"'))

Unnamed: 0,num_examples,acc,asr,model,mask_rate,self_denoise,task,data_construction
0,80,70.0,30.0,pred-gemini-1.5-flash,0.0,False,sst2,word
4,80,71.25,28.75,pred-gemini-1.5-flash,0.1,False,sst2,word
8,80,75.0,25.0,pred-gemini-1.5-flash,0.1,True,sst2,word
12,80,68.75,31.25,pred-gemini-1.5-flash,0.2,False,sst2,word
16,80,70.0,30.0,pred-gemini-1.5-flash,0.2,True,sst2,word
20,80,58.75,41.25,pred-gemini-1.5-flash,0.3,False,sst2,word
24,80,63.75,36.25,pred-gemini-1.5-flash,0.3,True,sst2,word
28,80,56.25,43.75,pred-gemini-1.5-flash,0.4,False,sst2,word
32,80,65.0,35.0,pred-gemini-1.5-flash,0.4,True,sst2,word
36,80,51.25,48.75,pred-gemini-1.5-flash,0.5,False,sst2,word


Unnamed: 0,num_examples,acc,asr,model,mask_rate,self_denoise,task,data_construction
2,51,58.823529,41.176471,pred-gemini-1.5-flash,0.0,False,sst2,sentence
6,51,52.941176,47.058824,pred-gemini-1.5-flash,0.1,False,sst2,sentence
10,51,50.980392,49.019608,pred-gemini-1.5-flash,0.1,True,sst2,sentence
14,51,56.862745,43.137255,pred-gemini-1.5-flash,0.2,False,sst2,sentence
18,51,56.862745,43.137255,pred-gemini-1.5-flash,0.2,True,sst2,sentence
22,51,50.980392,49.019608,pred-gemini-1.5-flash,0.3,False,sst2,sentence
26,51,45.098039,54.901961,pred-gemini-1.5-flash,0.3,True,sst2,sentence
30,51,56.862745,43.137255,pred-gemini-1.5-flash,0.4,False,sst2,sentence
34,51,47.058824,52.941176,pred-gemini-1.5-flash,0.4,True,sst2,sentence
38,51,54.901961,45.098039,pred-gemini-1.5-flash,0.5,False,sst2,sentence


Unnamed: 0,num_examples,acc,asr,model,mask_rate,self_denoise,task,data_construction
1,80,67.5,32.5,pred-gemini-1.5-flash-8b,0.0,False,sst2,word
5,80,67.5,32.5,pred-gemini-1.5-flash-8b,0.1,False,sst2,word
9,80,68.75,31.25,pred-gemini-1.5-flash-8b,0.1,True,sst2,word
13,80,61.25,38.75,pred-gemini-1.5-flash-8b,0.2,False,sst2,word
17,80,61.25,38.75,pred-gemini-1.5-flash-8b,0.2,True,sst2,word
21,80,55.0,45.0,pred-gemini-1.5-flash-8b,0.3,False,sst2,word
25,80,62.5,37.5,pred-gemini-1.5-flash-8b,0.3,True,sst2,word
29,80,62.5,37.5,pred-gemini-1.5-flash-8b,0.4,False,sst2,word
33,80,50.0,50.0,pred-gemini-1.5-flash-8b,0.4,True,sst2,word
37,80,53.75,46.25,pred-gemini-1.5-flash-8b,0.5,False,sst2,word


Unnamed: 0,num_examples,acc,asr,model,mask_rate,self_denoise,task,data_construction
3,51,41.176471,58.823529,pred-gemini-1.5-flash-8b,0.0,False,sst2,sentence
7,51,45.098039,54.901961,pred-gemini-1.5-flash-8b,0.1,False,sst2,sentence
11,51,37.254902,62.745098,pred-gemini-1.5-flash-8b,0.1,True,sst2,sentence
15,51,49.019608,50.980392,pred-gemini-1.5-flash-8b,0.2,False,sst2,sentence
19,51,39.215686,60.784314,pred-gemini-1.5-flash-8b,0.2,True,sst2,sentence
23,51,49.019608,50.980392,pred-gemini-1.5-flash-8b,0.3,False,sst2,sentence
27,51,41.176471,58.823529,pred-gemini-1.5-flash-8b,0.3,True,sst2,sentence
31,51,47.058824,52.941176,pred-gemini-1.5-flash-8b,0.4,False,sst2,sentence
35,51,45.098039,54.901961,pred-gemini-1.5-flash-8b,0.4,True,sst2,sentence
39,51,41.176471,58.823529,pred-gemini-1.5-flash-8b,0.5,False,sst2,sentence
