In [None]:
import pandas as pd
import json
from evaluate import load

In [None]:
# Load the base table that every later cell extends; subsequent cells merge
# additional model outputs onto it by its 'ID' column.
df = pd.read_csv('gpt_best_prompts_summs.csv')

In [None]:
# Attach the base Llama 2 outputs to `df` as a 'llama2' column, matched on ID.
# Each JSONL record contributes its 'question_id' (as ID) and its 'text'.
jsonl_data = []

# JSON text is UTF-8; state it explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('llama2_task_a_note.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Tolerate blank/trailing lines instead of crashing in json.loads.
            continue
        data = json.loads(line)
        jsonl_data.append({
            'ID': data.get('question_id'),
            'llama2': data.get('text')  # None when the record has no "text" key
        })

# Naming the columns keeps the merge key present even if the file had no records.
temp_df = pd.DataFrame(jsonl_data, columns=['ID', 'llama2'])

# Drop any 'llama2' column left by a previous run of this cell so that
# re-running it does not produce 'llama2_x' / 'llama2_y' duplicates.
# NOTE(review): assumes 'question_id' values have the same dtype as df['ID'] — confirm.
df = df.drop(columns=['llama2'], errors='ignore').merge(temp_df, on='ID', how='left')

In [None]:
# Attach the fine-tuned Llama 2 outputs to `df` as a 'llama2_finetuned' column,
# matched on ID. Each JSONL record contributes its 'question_id' (as ID) and 'text'.
jsonl_data = []

# JSON text is UTF-8; state it explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('llama2_task_a_note_finetune.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Tolerate blank/trailing lines instead of crashing in json.loads.
            continue
        data = json.loads(line)
        jsonl_data.append({
            'ID': data.get('question_id'),
            'llama2_finetuned': data.get('text')  # None when the record has no "text" key
        })

# Naming the columns keeps the merge key present even if the file had no records.
temp_df = pd.DataFrame(jsonl_data, columns=['ID', 'llama2_finetuned'])

# Drop any 'llama2_finetuned' column left by a previous run of this cell so that
# re-running it does not produce '_x' / '_y' duplicates.
# NOTE(review): assumes 'question_id' values have the same dtype as df['ID'] — confirm.
df = df.drop(columns=['llama2_finetuned'], errors='ignore').merge(temp_df, on='ID', how='left')

In [None]:
# Load the generic-prompt outputs; the next cell keeps only 'ID' and the GPT
# model columns from this table before merging it onto `df`.
df2 = pd.read_csv('gpt_generic_prompt.csv')

In [None]:
# GPT model columns present in both tables; the generic-prompt versions are
# merged in with a "_generic" suffix while the existing columns keep their names.
generic_prompt_models = ["text-ada-001", "text-babbage-001", "text-curie-001", "text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
df2 = df2[["ID"] + generic_prompt_models]

# Drop "_generic" columns left by a previous run of this cell first; otherwise a
# re-run would collide with the suffix the merge is about to add.
previously_merged = [f"{model}_generic" for model in generic_prompt_models]
df = df.drop(columns=previously_merged, errors='ignore').merge(
    df2, on='ID', how='left', suffixes=("", "_generic")
)

Branch 1

In [None]:
from evaluate import load
import pandas as pd

# Score every model-output column of `df` against the reference text and store
# the result in a matching 'rouge_score_<column>' column.
models = [
    "text-ada-001", "text-babbage-001", "text-curie-001",
    "text-davinci-003", "gpt-3.5-turbo", "gpt-4",
    "llama2", "llama2_finetuned","text-ada-001_generic",
    "text-babbage-001_generic", "text-curie-001_generic",
    "text-davinci-003_generic", "gpt-3.5-turbo_generic", "gpt-4_generic"
]

rouge = load('rouge')
rouge_type = 'rougeLsum'

# The references are the same for every model column, so build the list once
# instead of once per loop iteration. Missing values become empty strings.
references = df['section_text'].fillna("").astype(str).tolist()

for column in models:
    # Missing predictions are scored as empty strings.
    predictions = df[column].fillna("").astype(str).tolist()

    # use_aggregator=False is requested so the metric hands back a list of
    # scores (stored below as one value per row) rather than a single aggregate.
    scores = rouge.compute(
        predictions=predictions,
        references=references,
        rouge_types=[rouge_type],
        use_aggregator=False,
    )

    df[f'rouge_score_{column}'] = scores[rouge_type]

In [None]:
# Derive 'win' / 'lose' label columns from the per-row 'rouge_score_<model>'
# columns. A row is a 'win' only when the left-hand score is strictly higher;
# ties are labelled 'lose'.

# Prompt-specific GPT models
comparison_models = ["text-ada-001", "text-babbage-001", "text-curie-001", "text-davinci-003", "gpt-3.5-turbo", "gpt-4"]

# Subset used for the one-vs-one Llama 2 comparisons
compare_against_models = ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]

# Suffix added to each model name for the generic version
generic_suffix = "_generic"

# Column-wise comparisons replace the row-by-row `df.apply(..., axis=1)` calls:
# the boolean mask is mapped straight to the stored labels.
outcome_labels = {True: 'win', False: 'lose'}
llama_models = ['llama2', 'llama2_finetuned']

# Highest score among the prompt-specific GPT models, per row
best_gpt_score = df[[f'rouge_score_{col}' for col in comparison_models]].max(axis=1)

# 'llama2_vs_all' and 'llama2_finetuned_vs_all': beat every comparison model at once
for llama_model in llama_models:
    df[f'{llama_model}_vs_all'] = (df[f'rouge_score_{llama_model}'] > best_gpt_score).map(outcome_labels)

# One-vs-one columns, all 'llama2' ones first and then all 'llama2_finetuned'
# ones, so the column order of `df` stays as before.
for llama_model in llama_models:
    for model in compare_against_models:
        new_col_name = f'{llama_model}_vs_{model}'
        rouge_col_name = f'rouge_score_{model}'
        df[new_col_name] = (df[f'rouge_score_{llama_model}'] > df[rouge_col_name]).map(outcome_labels)

# Comparing "gpt-3.5-turbo" against "gpt-4"
df['gpt3.5_turbo_vs_gpt4'] = (df['rouge_score_gpt-3.5-turbo'] > df['rouge_score_gpt-4']).map(outcome_labels)

# Compare each prompt-specific model with its generic-prompt version
for model in comparison_models:
    generic_model = f"{model}{generic_suffix}"
    new_col_name = f'{model}_vs_{generic_model}'
    rouge_col_name_original = f'rouge_score_{model}'
    rouge_col_name_generic = f'rouge_score_{generic_model}'
    df[new_col_name] = (df[rouge_col_name_original] > df[rouge_col_name_generic]).map(outcome_labels)

In [None]:
# Print, for every win/lose column, the share of rows labelled 'win'.
# Ties were labelled 'lose' when the columns were built, so they count against
# the left-hand model here.

# The 9 Llama 2 / GPT comparison columns
new_columns = [
    'llama2_vs_all',
    'llama2_finetuned_vs_all',
    'llama2_vs_text-davinci-003',
    'llama2_vs_gpt-3.5-turbo',
    'llama2_vs_gpt-4',
    'llama2_finetuned_vs_text-davinci-003',
    'llama2_finetuned_vs_gpt-3.5-turbo',
    'llama2_finetuned_vs_gpt-4',
    'gpt3.5_turbo_vs_gpt4'
]

# Prompt-specific vs generic-prompt columns, one per comparison model
generic_models = [f"{model}_generic" for model in comparison_models]

for original, generic in zip(comparison_models, generic_models):
    new_col_name = f"{original}_vs_{generic}"
    new_columns.append(new_col_name)  # Append to the list of new columns
    rouge_original_col_name = f"rouge_score_{original}"
    rouge_generic_col_name = f"rouge_score_{generic}"
    # (Re)compute the label column with a column-wise comparison instead of a
    # row-by-row apply, so this cell does not depend on it already existing.
    df[new_col_name] = (df[rouge_original_col_name] > df[rouge_generic_col_name]).map(
        {True: 'win', False: 'lose'}
    )

# Calculate the win percentage for each of the new columns
total_count = len(df)  # every label column has one entry per row
for column in new_columns:
    win_count = int(df[column].eq('win').sum())
    # An empty frame has no meaningful percentage; report nan instead of
    # raising ZeroDivisionError.
    win_percentage = (win_count / total_count) * 100 if total_count else float('nan')
    print(f"Win percentage for {column}: {win_percentage:.2f}%")



In [None]:
# Persist `df`, including the ROUGE-Lsum score and win/lose columns added above.
df.to_csv('comparisons_rouge_Lsum.csv')

Branch 2

In [None]:
# For every section header and GPT model, decide per ROUGE metric whether the
# prompt-specific or the generic-prompt output scores higher, then pivot the
# verdicts into one row per (section, model) with one column per metric.
known_models = [
    "text-ada-001", "text-babbage-001", "text-curie-001", "text-davinci-003",
    "gpt-3.5-turbo", "gpt-4"
]

rouge = load('rouge')

temp_rows = []  # one dict per (section, model, metric)

for name, group in df.groupby('section_header'):
    # References are shared by all models within a section: build them once per
    # group instead of once per model. Missing values become empty strings.
    references = group['section_text'].fillna("").astype(str).tolist()

    for model in known_models:
        predictions_original = group[model].fillna("").astype(str).tolist()
        predictions_generic = group[f"{model}_generic"].fillna("").astype(str).tolist()

        # One score per metric for the whole section (default aggregation).
        # NOTE(review): if the default aggregator is bootstrap-based, as it
        # appears to be in Hugging Face `evaluate`, these values can differ
        # slightly between runs and near-ties may flip — confirm, and consider
        # seeding if the verdicts must be reproducible.
        rouge_scores_original = rouge.compute(predictions=predictions_original, references=references)
        rouge_scores_generic = rouge.compute(predictions=predictions_generic, references=references)

        for metric in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']:
            original_score = rouge_scores_original.get(metric, 0)
            generic_score = rouge_scores_generic.get(metric, 0)
            # Strict comparison: a tie is recorded as 'generic'.
            winner = 'specific' if original_score > generic_score else 'generic'

            temp_rows.append({
                'section': name,
                'model': model,
                'metric': metric.replace('rouge', 'rouge-'),  # e.g. 'rouge1' -> 'rouge-1'
                'winner': winner
            })

# Build the long-format frame in one step; concatenating onto an empty
# placeholder frame is deprecated in recent pandas and adds nothing here.
long_format_df = pd.DataFrame(temp_rows, columns=['section', 'model', 'metric', 'winner'])

# Pivot the long-format DataFrame into the final comparison_df
comparison_df = long_format_df.pivot(index=['section', 'model'], columns='metric', values='winner').reset_index()


In [None]:
# Tally the per-metric verdicts of every model within a section and record
# which prompt style ('specific' or 'generic') collected more of them.

# Every column except the two key columns holds verdicts. Selecting them by
# name (rather than by position 3 onwards) makes sure the first metric column,
# which sits at position 2 right after 'section' and 'model', is counted too.
metric_columns = [col for col in comparison_df.columns if col not in ('section', 'model')]

overall_rows = []  # collected here and turned into a frame once (DataFrame.append no longer exists in pandas 2.x)

for section in comparison_df['section'].unique():
    # Rows (one per model) that belong to the current section
    section_df = comparison_df[comparison_df['section'] == section]

    # Count the 'specific' and 'generic' verdicts across all models and metrics
    count_original = (section_df[metric_columns] == 'specific').sum().sum()
    count_generic = (section_df[metric_columns] == 'generic').sum().sum()

    # Strict comparison: an equal split is recorded as 'generic'
    overall_winner = 'specific' if count_original > count_generic else 'generic'

    overall_rows.append({'section': section, 'overall_winner': overall_winner})

overall_winner_df = pd.DataFrame(overall_rows, columns=['section', 'overall_winner'])

# Attach each section's overall winner to its per-model rows
final_df = pd.merge(comparison_df, overall_winner_df, on='section', how='left')

In [None]:
# Persist the per-(section, model) verdicts together with each section's overall winner.
final_df.to_csv("comparison_per_section.csv")

In [None]:
# Loop through the rows of the overall_winner_df to print each section and its overall winner
for index, row in overall_winner_df.iterrows():
    print(f"{row['section']}: {row['overall_winner']}")


Branch 3


In [None]:
# Initialize an empty DataFrame to store the results
sampled_df = pd.DataFrame(columns=['section_header', 'dialogue', 'section_text', 'gpt-4', 'gpt-4_generic'])

# Group by 'section_header' and sample up to 3 random rows from each group
for name, group in df.groupby('section_header'):
    num_samples = min(len(group), 3)
    sampled_group = group.sample(n=num_samples, replace=False)
    sampled_df = pd.concat([sampled_df, sampled_group[['section_header', 'dialogue', 'section_text', 'gpt-4', 'gpt-4_generic']]])

# Reset the index for the new DataFrame
sampled_df.reset_index(drop=True, inplace=True)

In [None]:
# Persist the per-section sample of dialogue, reference text and both GPT-4 outputs.
sampled_df.to_csv("gpt4_sampled_generic_specific.csv")

Branch 4

In [None]:
# (Re)load the ROUGE metric used by the following cells.
rouge = load('rouge')

In [None]:
# Build parallel prediction / reference lists; missing values become empty
# strings and everything is cast to str.
# NOTE(review): no cell visible above this one creates 'section_text_x' or
# "Prompt: Generate a Meaningful Summary" on `df`; this cell presumably relies
# on a `df` produced by a different merge (leftover kernel state) — confirm
# which frame it expects before running the notebook top to bottom.
predictions_mine = df["gpt-4"].fillna("").astype(str).tolist()
predictions_theirs = df["Prompt: Generate a Meaningful Summary"].fillna("").astype(str).tolist()
references = df['section_text_x'].fillna("").astype(str).tolist()

In [None]:
# Score both prediction lists against the same references, "mine" first.
rouge_scores_mine, rouge_scores_theirs = [
    rouge.compute(predictions=candidate_predictions, references=references)
    for candidate_predictions in (predictions_mine, predictions_theirs)
]

In [None]:
# Print a label, then display the scores computed from the df["gpt-4"]
# predictions as this cell's output.
print('gpt-4 generic prompt')
rouge_scores_mine

In [None]:
# Print a label, then display the scores computed from the
# "Prompt: Generate a Meaningful Summary" predictions as this cell's output.
print('yours')
rouge_scores_theirs

Branch 5

In [None]:
# Reload the three input tables from disk. Note that this rebinds `df` and
# `df2`, discarding whatever earlier cells had added to them.
best_prompts_csv = 'gpt_best_prompts_summs.csv'
generic_prompt_csv = 'gpt_generic_prompt.csv'
task_a_val_csv = 'TaskA-val - Sheet1.csv'

df = pd.read_csv(best_prompts_csv)
df2 = pd.read_csv(generic_prompt_csv)
df3 = pd.read_csv(task_a_val_csv)

In [None]:
# Left-join the generic-prompt table and then the TaskA-val sheet onto `df` by
# ID. No suffixes are given, so columns shared by both sides of a merge receive
# pandas' default '_x' / '_y' suffixes.
for right_table in (df2, df3):
    df = pd.merge(df, right_table, on='ID', how='left')

In [None]:
# Keep only the columns needed for the three-way comparison and give them
# descriptive names.
selected_columns = [
    'ID',
    'section_header',
    'dialogue',
    'section_text',
    'gpt-4_x',
    'gpt-4_y',
    "Prompt: Generate a Meaningful Summary",
]
readable_names = {
    'section_text': 'ground_truth',
    'gpt-4_x': 'gpt-4_specific',
    'gpt-4_y': 'gpt-4_generic',
    "Prompt: Generate a Meaningful Summary": "2-step",
}
new_df = df[selected_columns].rename(columns=readable_names)

In [None]:
# Load the ROUGE metric
rouge = load('rouge')

# Create an empty DataFrame to store results
result_df = pd.DataFrame(columns=['section', 'model', 'rouge-1', 'rouge-2', 'rouge-L', 'rouge-Lsum'])

# Iterate over each section to group by 'section_header'
for section, group in new_df.groupby('section_header'):
    for model in ['gpt-4_specific', 'gpt-4_generic', '2-step']:
        predictions = group[model].fillna("").astype(str).tolist()
        references = group['ground_truth'].fillna("").astype(str).tolist()

        # Compute ROUGE scores
        rouge_scores = rouge.compute(predictions=predictions, references=references)

        # Append the scores to the result_df
        result_df = result_df.append({
            'section': section,
            'model': model,
            'rouge-1': rouge_scores.get('rouge1', 0),
            'rouge-2': rouge_scores.get('rouge2', 0),
            'rouge-L': rouge_scores.get('rougeL', 0),
            'rouge-Lsum': rouge_scores.get('rougeLsum', 0)
        }, ignore_index=True)

In [None]:
# Persist the per-section ROUGE table for the specific, generic and 2-step variants.
result_df.to_csv('section_rouge_specific_generic_2step.csv')