In [1]:
# Change the working directory to the project root so that the relative
# 'experiments/...' paths used by the cells below resolve.
# NOTE(review): this absolute path is machine-specific; the notebook cannot be
# re-run on another machine without editing it.
%cd /Users/samuelamouyal/PycharmProjects/reading_comprehension_research/

/Users/samuelamouyal/PycharmProjects/reading_comprehension_research


In [8]:
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.stats import pearsonr, zscore, kendalltau, spearmanr, rankdata
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import plotly.express as px
import json
from collections import defaultdict
from math import ceil, fabs


# Family lookup table (JSON), loaded first; the result tables follow.
with open('experiments/gardenpath_10_24/analysis/data/families.json', 'r') as f:
    families = json.load(f)

# Averaged LLM results, full LLM results, and human results, in that order.
df_llm, df_llm_distr, df_humans = (
    pd.read_csv(csv_path)
    for csv_path in (
        'experiments/gardenpath_10_24/results/llm_results/averaged_all.csv',
        'experiments/gardenpath_10_24/results/llm_results/all_results.csv',
        'experiments/gardenpath_10_24/results/human_results/sampled_results.csv',
    )
)

In [9]:
# Mean of `avg_correct` for every (model, quest_type) pair.
accuracy_df = (
    df_llm.groupby(['model', 'quest_type'])['avg_correct']
    .mean()
    .reset_index(name='avg_accuracy')
)

# Reshape so that each model is one row and each quest_type one column.
pivoted_accuracy_df = accuracy_df.pivot(
    index='model', columns='quest_type', values='avg_accuracy'
).reset_index()


def condition(row):
    """Return True when simple-question accuracy exceeds GP-question accuracy by at least 0.05."""
    gap = row['simple_question'] - row['GP_question']
    return gap >= 0.05


# Models that do NOT satisfy the condition are removed from df_llm.
passes_condition = pivoted_accuracy_df.apply(condition, axis=1)
filtered_models_accuracy = pivoted_accuracy_df.loc[~passes_condition, 'model'].unique()
df_llm = df_llm[~df_llm['model'].isin(filtered_models_accuracy)]

print(f"We filtered {len(filtered_models_accuracy)} models out of {len(df_llm['model'].unique()) + len(filtered_models_accuracy)} models")

We filtered 35 models out of 188 models


In [12]:
# Restrict all three tables to the rows whose quest_type is 'GP_question'.
df_llm = df_llm.loc[df_llm['quest_type'].eq('GP_question')]
df_humans = df_humans.loc[df_humans['quest_type'].eq('GP_question')]
df_llm_distr = df_llm_distr.loc[df_llm_distr['quest_type'].eq('GP_question')]

In [13]:
# Calculate average accuracy and standard error for each combination of ManipulationType and SentenceType
# Human accuracy: mean and standard error of `correct` for every
# (ManipulationType, SentenceType) combination.
avg_accuracy_humans = df_humans.groupby(['ManipulationType', 'SentenceType']).correct.mean().reset_index(name='correct_mean')
std_error_humans = df_humans.groupby(['ManipulationType', 'SentenceType']).correct.sem().reset_index(name='correct_sem')
avg_accuracy_humans = avg_accuracy_humans.merge(std_error_humans, on=['ManipulationType', 'SentenceType'])
avg_accuracy_humans['model'] = 'Human'


# Same summary for the single model 'google/gemma-2-9b', from the full LLM results.
df_gemma = df_llm_distr[df_llm_distr['model'] == 'google/gemma-2-9b']
avg_accuracy_gemma = df_gemma.groupby(['ManipulationType', 'SentenceType']).correct.mean().reset_index(name='correct_mean')
std_error_gemma = df_gemma.groupby(['ManipulationType', 'SentenceType']).correct.sem().reset_index(name='correct_sem')
avg_accuracy_gemma = avg_accuracy_gemma.merge(std_error_gemma, on=['ManipulationType', 'SentenceType'])
avg_accuracy_gemma['model'] = 'Gemma-2-9B'

# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# the index is duplicated (0..5, 0..5) and any index-aligned column assignment
# copies the human rows' values onto the Gemma rows.
avg_accuracy = pd.concat([avg_accuracy_humans, avg_accuracy_gemma], ignore_index=True)
# Vertical position of the value label: just above the top of the error bar.
avg_accuracy['text_position'] = avg_accuracy['correct_mean'] + avg_accuracy['correct_sem'] + 0.02
# Display names for the manipulation types; the mapped Series keeps the frame's
# own index, so every row receives the label derived from its own value.
avg_accuracy['manip'] = avg_accuracy['ManipulationType'].map({'prob': 'Plausible', 'improb': 'Implausible', 'reflexive': 'Reflexive'})
avg_accuracy['Sentence type'] = avg_accuracy['SentenceType']

In [None]:
# Grouped bar chart of mean accuracy per manipulation, one facet per model.
fig = px.bar(
    avg_accuracy,
    x='manip',
    y='correct_mean',
    error_y='correct_sem',
    color='Sentence type',
    pattern_shape="Sentence type",
    facet_col='model',
    facet_col_spacing=0.05,  # horizontal gap between the facets
    barmode='group',
    category_orders={"manip": ["Plausible", "Implausible", "Reflexive"]},
    color_discrete_sequence=['#E69F00', '#56B4E9'],
    labels={'model': ''},
)

# Base font, plus a horizontal legend centred below the plotting area.
fig.update_layout(
    font={"family": "Arial", "color": "black", "size": 15},
    legend={
        "orientation": 'h',
        "yanchor": 'bottom',
        "y": -0.23,
        "xanchor": 'center',
        "x": 0.5,
        "font": {"family": "American Typewriter", "color": "black", "size": 15},
    },
)


def _relabel_facet(annotation):
    """Keep only the text after '=' in an annotation and move it to y=-0.14."""
    annotation.update(
        text=annotation.text.split('=')[1],
        font=dict(size=20, color="black"),
        y=-0.14,
        yanchor='bottom',
    )


# At this point the figure only carries the facet-title annotations.
fig.for_each_annotation(_relabel_facet)


# x coordinate of each value label, listed in the row order of avg_accuracy.
x_pos = [0.80, 1.20, -0.2, 0.2, 1.80, 2.20, 0.80, 1.20, -0.2, 0.2, 1.80, 2.20]
for i in range(len(avg_accuracy)):
    row = avg_accuracy.iloc[i]
    # Gemma bars live in the second facet column, everything else in the first.
    facet_column = 2 if row['model'] == 'Gemma-2-9B' else 1
    fig.add_annotation(
        x=x_pos[i],
        y=row['text_position'],
        text=f"{row['correct_mean']:.3f}",
        showarrow=False,
        font={"color": 'black', "size": 18},
        row=1,
        col=facet_column,
    )

# Axis titles, theme and figure size.
fig.update_layout(
    yaxis={"title": 'Accuracy'},
    xaxis_title="",
    template='plotly_white',
    width=800,
    height=500,
)

fig.update_xaxes(tickfont={"size": 18, "color": "black"}, title_text="")

# Final font family override, applied after the template.
fig.update_layout(font={"family": "American Typewriter"})
fig.write_image('experiments/gardenpath_10_24/analysis/plots/humans_next_llm.pdf')

# Render the figure in the notebook.
fig.show()

In [17]:
# Load the drawing classifications. The path is relative to the project root
# (the working directory set by the `%cd` cell), like every other load in this
# notebook, instead of a machine-specific absolute path.
df_drawings = pd.read_csv('experiments/gardenpath_10_24/analysis/classification_progress.csv')

# `sent_type` is split on '_': the first piece is used as the sentence type,
# the last piece as the manipulation type.
df_drawings['ManipulationType'] = df_drawings['sent_type'].map(lambda x: x.split('_')[-1])
df_drawings['ManipulationType'] = df_drawings['ManipulationType'].map({'prob': 'Plausible', 'improb': 'Implausible', 'reflexive': 'Reflexive'})
df_drawings['SentenceType'] = df_drawings['sent_type'].map(lambda x: x.split('_')[0])

# 0/1 indicator columns for the two classification labels of interest.
df_drawings['correct'] = df_drawings['classification'] == 'correctly understood'
df_drawings['partial'] = df_drawings['classification'] == 'partial misunderstanding'
df_drawings['correct'] = df_drawings['correct'].astype(int)
df_drawings['partial'] = df_drawings['partial'].astype(int)

# Proportion of each label per (ManipulationType, SentenceType) combination.
avg_accuracy_drawings = df_drawings.groupby(['ManipulationType', 'SentenceType']).correct.mean().reset_index(name='correct_mean')
avg_misinterpretation = df_drawings.groupby(['ManipulationType', 'SentenceType']).partial.mean().reset_index(name='partial_mean')
avg_accuracy_drawings = avg_accuracy_drawings.merge(avg_misinterpretation, on=['ManipulationType', 'SentenceType'])
avg_accuracy_drawings['correct_text_position'] = avg_accuracy_drawings['correct_mean'] - 0.02
avg_accuracy_drawings['partial_text_position'] = avg_accuracy_drawings['partial_mean'] - 0.02

# Order the manipulations Plausible < Implausible < Reflexive.
category_order = ['Plausible', 'Implausible', 'Reflexive']
avg_accuracy_drawings['ManipulationType'] = pd.Categorical(
    avg_accuracy_drawings['ManipulationType'],
    categories=category_order,
    ordered=True
)
# Sort on both keys so that the GP / nonGP order inside each manipulation is
# explicit: a single-key sort does not specify the order of tied rows, and the
# plotting cell positions its value labels by row order.
avg_accuracy_drawings = avg_accuracy_drawings.sort_values(['ManipulationType', 'SentenceType'])
avg_accuracy_drawings

Unnamed: 0,ManipulationType,SentenceType,correct_mean,partial_mean,correct_text_position,partial_text_position
2,Plausible,GP,0.066667,0.8,0.046667,0.78
3,Plausible,nonGP,0.133333,0.755556,0.113333,0.735556
0,Implausible,GP,0.377778,0.4,0.357778,0.38
1,Implausible,nonGP,0.622222,0.177778,0.602222,0.157778
4,Reflexive,GP,0.125,0.75,0.105,0.73
5,Reflexive,nonGP,0.541667,0.208333,0.521667,0.188333


In [None]:
# Define colors and patterns
# Bar colours per sentence type: index 0 is used for the 'correct' bars,
# index 1 for the 'partial' bars.
colors = {
    'GP': ['#E69F00', '#F8E1B8'],
    'nonGP': ['#56B4E9', '#B0DFF8'],
}
# Fill pattern per sentence type: none for GP, 'x' hatching for nonGP.
patterns = {'GP': '', 'nonGP': 'x'}

fig = go.Figure()

# Two traces per sentence type, added in the order correct -> partial.
for sentence_type in ['GP', 'nonGP']:
    subset = avg_accuracy_drawings[avg_accuracy_drawings['SentenceType'] == sentence_type]
    for shade_index, (value_column, label) in enumerate(
        [('correct_mean', 'correct'), ('partial_mean', 'partial')]
    ):
        fig.add_trace(go.Bar(
            x=subset['ManipulationType'],
            y=subset[value_column],
            name=f'{sentence_type} ({label})',
            marker_color=colors[sentence_type][shade_index],
            marker_pattern_shape=patterns[sentence_type],
        ))

# Fonts, horizontal legend below the plot, axis titles, theme and size.
fig.update_layout(
    font={"family": "Arial", "size": 15, "color": "black"},
    legend={
        "orientation": 'h',
        "yanchor": 'bottom',
        "y": -0.2,
        "xanchor": 'center',
        "x": 0.5,
        "font": {"family": "American Typewriter", "size": 16, "color": "black"},
    },
    yaxis={"title": 'Proportion'},
    xaxis_title="",
    template='plotly_white',
    width=800,
    height=500,
)

# x coordinate of the 'correct' label for each row of avg_accuracy_drawings;
# the 'partial' label of the same row sits 0.2 to its right.
changes = [-0.3, 0.1, 0.7, 1.1, 1.7, 2.1]
for i in range(len(avg_accuracy_drawings)):
    row = avg_accuracy_drawings.iloc[i]
    correct_x = changes[i]
    partial_x = changes[i] + 0.2
    for label_x, position_column, value_column in (
        (correct_x, 'correct_text_position', 'correct_mean'),
        (partial_x, 'partial_text_position', 'partial_mean'),
    ):
        fig.add_annotation(
            x=label_x,
            y=row[position_column] + 0.04,
            text=f"{row[value_column]:.3f}",
            showarrow=False,
            font={"color": 'black', "size": 12},
        )

# Export to PDF, then render in the notebook.
fig.write_image('experiments/gardenpath_10_24/analysis/plots/drawings_understood.pdf')
fig.show()

In [20]:
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.stats import pearsonr, zscore, kendalltau, spearmanr, rankdata
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import plotly.express as px
import json
from collections import defaultdict
from math import ceil, fabs


# Reload the averaged LLM results, the human results and the family table.
df_llm = pd.read_csv('experiments/gardenpath_10_24/results/llm_results/averaged_all.csv')
df_humans = pd.read_csv('experiments/gardenpath_10_24/results/human_results/sampled_results.csv')

with open('experiments/gardenpath_10_24/analysis/data/families.json', 'r') as f:
    families = json.load(f)

# Mean of `avg_correct` per (model, quest_type), then one row per model.
# Neither table is used by the filtering further down in this cell.
accuracy_df = (
    df_llm.groupby(['model', 'quest_type'])['avg_correct']
    .mean()
    .reset_index(name='avg_accuracy')
)
pivoted_accuracy_df = accuracy_df.pivot(
    index='model', columns='quest_type', values='avg_accuracy'
).reset_index()


def condition(row):
    """Return True when simple-question accuracy exceeds GP-question accuracy by at least 0.001."""
    gap = row['simple_question'] - row['GP_question']
    return gap >= 0.001


# Explicit list of models to drop (the `condition` helper is not applied here).
filtered_models_accuracy = ['Qwen/Qwen2.5-0.5B', 'Qwen/Qwen2.5-0.5B-Instruct']
df_llm = df_llm[~df_llm['model'].isin(filtered_models_accuracy)]

# Keep only the rows whose quest_type is "GP_question".
df_llm = df_llm[df_llm['quest_type'] == "GP_question"]
df_humans = df_humans[df_humans['quest_type'] == "GP_question"]

# OLMo-1B checkpoints that are excluded from the remaining analysis.
olmo_1 = [
    'allenai/OLMo-1B-0724-hf_step1218000-tokens2553B',
    'allenai/OLMo-1B-0724-hf_step289000-tokens605B',
    'allenai/OLMo-1B-0724-hf_step468000-tokens981B',
    'allenai/OLMo-1B-0724-hf_step648000-tokens1358B',
    'allenai/OLMo-1B-0724-hf_step827000-tokens1733B',
]
a = df_llm['model'].unique()
b = a[np.isin(a, olmo_1, invert=True)]
df_llm = df_llm[df_llm['model'].isin(b)]

In [23]:
import pandas as pd
from scipy.stats import spearmanr, rankdata
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist
import numpy as np
import plotly.graph_objects as go
from collections import defaultdict


# Full model identifier -> display nickname.
# For OLMo checkpoints the nickname ends with the token count that appears
# after 'tokens' in the identifier.
model_mapping = {
    # GPT / o1
    "gpt-4": "GPT4",
    "gpt-4-turbo": "GPT4-Turbo",
    "gpt-4o": "GPT4-O",
    "gpt-4o-mini": "GPT4-O-Mini",
    "o1-mini": "O1-Mini",
    "o1-preview": "O1-Preview",

    # Llama 3.2
    'meta-llama/Llama-3.2-1B': 'Llama3.2-1B',
    'meta-llama/Llama-3.2-1B-Instruct': 'Llama3.2-1B-Inst',
    'meta-llama/Llama-3.2-3B': 'Llama3.2-3B',
    'meta-llama/Llama-3.2-3B-Instruct': 'Llama3.2-3B-Inst',
    'meta-llama/Llama-3.2-11B-Vision': 'Llama3.2-11B-Vis',
    'meta-llama/Llama-3.2-11B-Vision-Instruct': 'Llama3.2-11B-Vis-Inst',
    'meta-llama/Llama-3.2-90B-Vision': 'Llama3.2-90B-Vis',
    'meta-llama/Llama-3.2-90B-Vision-Instruct': 'Llama3.2-90B-Vis-Inst',

    # Gemma 2
    'google/gemma-2-2b': 'Gemma-2B',
    'google/gemma-2-2b-it': 'Gemma-2B-Inst',
    'google/gemma-2-9b': 'Gemma-9B',
    'google/gemma-2-9b-it': 'Gemma-9B-Inst',
    'google/gemma-2-27b': 'Gemma-27B',
    'google/gemma-2-27b-it': 'Gemma-27B-Inst',

    # Qwen 2.5
    "Qwen/Qwen2.5-0.5B": "Qwen-0.5B",
    "Qwen/Qwen2.5-0.5B-Instruct": "Qwen-0.5B-Inst",
    "Qwen/Qwen2.5-1.5B": "Qwen-1.5B",
    "Qwen/Qwen2.5-1.5B-Instruct": "Qwen-1.5B-Inst",
    "Qwen/Qwen2.5-3B": "Qwen-3B",
    "Qwen/Qwen2.5-3B-Instruct": "Qwen-3B-Inst",
    "Qwen/Qwen2.5-7B": "Qwen-7B",
    "Qwen/Qwen2.5-7B-Instruct": "Qwen-7B-Inst",
    "Qwen/Qwen2.5-14B": "Qwen-14B",
    "Qwen/Qwen2.5-14B-Instruct": "Qwen-14B-Inst",
    "Qwen/Qwen2.5-32B": "Qwen-32B",
    "Qwen/Qwen2.5-32B-Instruct": "Qwen-32B-Inst",
    "Qwen/Qwen2.5-72B": "Qwen-72B",
    "Qwen/Qwen2.5-72B-Instruct": "Qwen-72B-Inst",

    # OLMo 7B checkpoints
    'allenai/OLMo-7B-0724-hf_step2000-tokens8B': 'Olmo-7B-Tokens-8B',
    'allenai/OLMo-7B-0724-hf_step26500-tokens111B': 'Olmo-7B-Tokens-111B',
    'allenai/OLMo-7B-0724-hf_step106500-tokens446B': 'Olmo-7B-Tokens-446B',
    'allenai/OLMo-7B-0724-hf_step143000-tokens599B': 'Olmo-7B-Tokens-599B',
    'allenai/OLMo-7B-0724-hf_step330000-tokens1384B': 'Olmo-7B-Tokens-1384B',
    'allenai/OLMo-7B-0724-hf_step395000-tokens1656B': 'Olmo-7B-Tokens-1656B',
    'allenai/OLMo-7B-0724-hf_step458000-tokens1920B': 'Olmo-7B-Tokens-1920B',
    'allenai/OLMo-7B-0724-hf_step519000-tokens2176B': 'Olmo-7B-Tokens-2176B',
    'allenai/OLMo-7B-0724-hf_step582000-tokens2441B': 'Olmo-7B-Tokens-2441B',
    'allenai/OLMo-7B-0724-hf_step647650-tokens2716B': 'Olmo-7B-Tokens-2716B',
    'allenai/OLMo-7B-0724-hf_step650650-tokens2729B': 'Olmo-7B-Tokens-2729B',

    # OLMo 1B checkpoints
    'allenai/OLMo-1B-0724-hf_step20000-tokens41B': 'Olmo-1B-Tokens-41B',
    'allenai/OLMo-1B-0724-hf_step289000-tokens605B': 'Olmo-1B-Tokens-605B',
    'allenai/OLMo-1B-0724-hf_step379000-tokens794B': 'Olmo-1B-Tokens-794B',
    'allenai/OLMo-1B-0724-hf_step468000-tokens981B': 'Olmo-1B-Tokens-981B',
    'allenai/OLMo-1B-0724-hf_step558000-tokens1169B': 'Olmo-1B-Tokens-1169B',
    'allenai/OLMo-1B-0724-hf_step648000-tokens1358B': 'Olmo-1B-Tokens-1358B',
    'allenai/OLMo-1B-0724-hf_step738000-tokens1547B': 'Olmo-1B-Tokens-1547B',
    'allenai/OLMo-1B-0724-hf_step827000-tokens1733B': 'Olmo-1B-Tokens-1733B',
    'allenai/OLMo-1B-0724-hf_step917000-tokens1922B': 'Olmo-1B-Tokens-1922B',
    'allenai/OLMo-1B-0724-hf_step1038000-tokens2176B': 'Olmo-1B-Tokens-2176B',
    # Nickname previously read '2346B', which did not match the 2364B in the key.
    'allenai/OLMo-1B-0724-hf_step1128000-tokens2364B': 'Olmo-1B-Tokens-2364B',
    'allenai/OLMo-1B-0724-hf_step1218000-tokens2553B': 'Olmo-1B-Tokens-2553B',
    'allenai/OLMo-1B-0724-hf_step1308000-tokens2742B': 'Olmo-1B-Tokens-2742B',
    'allenai/OLMo-1B-0724-hf_step1399000-tokens2932B': 'Olmo-1B-Tokens-2932B',
    'allenai/OLMo-1B-0724-hf_step1454000-tokens3048B': 'Olmo-1B-Tokens-3048B',
}


def extract_step(model_name):
    """Return the trailing checkpoint identifier of a model name, as a string.

    - names containing 'allenai': the text after the last 'tokens', with every
      'B' removed;
    - names containing 'Eleuther': the text after the last 'step';
    - anything else: the text after the last 'ckpt_'.

    When the marker does not occur, the whole name is used instead of the text
    after it.
    """
    if 'allenai' in model_name:
        token_suffix = model_name.rsplit('tokens', 1)[-1]
        return token_suffix.replace('B', '')

    marker = 'step' if 'Eleuther' in model_name else 'ckpt_'
    return model_name.rsplit(marker, 1)[-1]


def convert_to_num(string):
    """Convert `string` to a float, or return None when it is not numeric.

    Only the errors that `float()` raises for unsuitable input are treated as
    "not a number"; anything else (e.g. KeyboardInterrupt) propagates instead
    of being silently swallowed.
    """
    try:
        return float(string)
    except (TypeError, ValueError, OverflowError):
        return None


def get_model_parameters(model_name, family_name = None):
    """Return the numeric value used to order a model on the x axis.

    For the checkpoint families listed below, this is the integer parsed from
    `extract_step(model_name)`. For every other model, the lower-cased name is
    split on '-' and the first piece that contains a 'b' and parses as a number
    once the 'b' characters are removed is returned as a float. Returns -1 when
    no piece qualifies.
    """
    checkpoint_families = ('Pythia-1.4b', 'Pythia-1b', 'Pythia-2.8b', 'Pythia-6.9b',
                           'OLMo-1b', 'OLMo-7b', 'Pythia-12b', "K2", 'Amber')
    if family_name in checkpoint_families:
        return int(extract_step(model_name))

    for piece in model_name.lower().split('-'):
        if 'b' not in piece:
            continue
        parsed = convert_to_num(piece.replace('b', ''))
        if parsed is not None:
            return parsed

    return -1


def is_instruction_tuned(model_name):
    """Heuristically decide from its name whether a model is instruction tuned.

    Matching is case-insensitive. The markers 'inst', 'gpt', 'chat', 'vicuna',
    'iml' and 'o1-' are matched as substrings. The short marker 'it' is matched
    only as a whole token delimited by '-', '_', '/' or '.', so that names that
    merely contain the letters "it" (e.g. '...-INCITE-...') are not
    misclassified.

    Returns:
        bool: True when any marker matches.
    """
    lowered = model_name.lower()

    substring_markers = ('inst', 'gpt', 'chat', 'vicuna', 'iml', 'o1-')
    if any(marker in lowered for marker in substring_markers):
        return True

    # Normalise every delimiter to '-' and look for a standalone 'it' token.
    tokens = lowered.replace('/', '-').replace('_', '-').replace('.', '-').split('-')
    return 'it' in tokens


def _kendall_tau_of_accuracies(first_scores, second_scores):
    """Kendall tau between two accuracy vectors; a NaN result is reported as 0.0.

    Both vectors are ranked on `1 - score` (higher accuracy gets the smaller
    rank, ties share their average rank) before the correlation is computed.
    """
    first_ranks = rankdata([1 - x for x in first_scores], method='average')
    second_ranks = rankdata([1 - x for x in second_scores], method='average')
    corr_coef, _p_value = kendalltau(first_ranks, second_ranks)
    if np.isnan(corr_coef):
        corr_coef = 0.0
    return corr_coef


def rank_correlation(llm_df, human_df, families):
    """Item-level Kendall tau between each model's accuracy and human accuracy.

    Args:
        llm_df: one row per (model, item); the columns read are 'model',
            'sent_type', 'quest_type', 'set_id' and 'avg_correct'.
            A 'Family' column is added in place.
        human_df: one row per human response; the columns read are 'sent_type',
            'quest_type', 'set_id', 'sentence', 'question' and 'correct'.
            'model' and 'Family' columns (both 'Human') are added in place.
        families: mapping family name -> list of model names. Only the families
            listed in `basic_keys` below are used to fill 'Family'.

    Returns:
        (merged_df, similarity_df)
        merged_df: `llm_df` (with 'avg_correct' renamed to 'llm_performance')
            inner-joined on item with the human accuracy and the accuracies of
            two random halves of the human responses.
        similarity_df: one row per model, plus 'Half_split' (correlation between
            the two human halves) and 'Human' (all 1.0). Columns are 'Global'
            (all items), 'Averaged' (weighted mean of the per-condition values)
            and one column per condition.
    """
    item_keys = ['sent_type', 'quest_type', 'set_id']

    basic_keys = ["OLMo-7b", "OLMo-1b", "Llama-3.2", "Qwen-2.5", "Gemma-2", "GPT"]
    model_to_family = {
        model: family
        for family, models in families.items()
        if family in basic_keys
        for model in models
    }

    # These in-place column additions are kept on purpose: code running after
    # this call may read them from the caller's frames.
    llm_df['Family'] = llm_df['model'].map(model_to_family)
    human_df['model'] = 'Human'
    human_df['Family'] = 'Human'

    human_item_perf = (
        human_df.groupby(item_keys)['correct'].mean().reset_index()
        .rename(columns={'correct': 'human_accuracy'})
    )
    llm_item_perf = llm_df.rename(columns={'avg_correct': 'llm_performance'})

    # Split the human responses to every (sentence, question) into two parts:
    # the first 5 shuffled responses and the remainder. The fixed seed keeps
    # the split reproducible.
    # NOTE(review): the fixed 5 yields equal halves only when each item has 10
    # responses — confirm against the sampled human data.
    first_halves, second_halves = [], []
    for _, responses in human_df.groupby(['sentence', 'question']):
        shuffled = responses.sample(frac=1, random_state=1)
        first_halves.append(shuffled.iloc[:5])
        second_halves.append(shuffled.iloc[5:])

    half_1_item_perf = (
        pd.concat(first_halves).groupby(item_keys)['correct'].mean().reset_index()
        .rename(columns={'correct': 'half_1_accuracy'})
    )
    half_2_item_perf = (
        pd.concat(second_halves).groupby(item_keys)['correct'].mean().reset_index()
        .rename(columns={'correct': 'half_2_accuracy'})
    )

    merged_df = llm_item_perf
    for item_table in (human_item_perf, half_1_item_perf, half_2_item_perf):
        merged_df = pd.merge(merged_df, item_table, on=item_keys, how='inner')

    # One row per item for the human split-half correlation, which does not
    # depend on any model (merged_df repeats every item once per model).
    item_level = (
        merged_df.drop_duplicates(subset=item_keys)[item_keys + ['half_1_accuracy', 'half_2_accuracy']]
        .fillna({'half_1_accuracy': 0, 'half_2_accuracy': 0})
    )

    conditions = merged_df[['sent_type', 'quest_type']].drop_duplicates()
    similarity_results = defaultdict(list)
    score_columns = ['human_accuracy', 'llm_performance']

    tasks = ['Global']

    sentence_mappings = {'GP_prob_GP_question': 'GP-Plaus.',
                         'nonGP_prob_GP_question': 'nonGP-Plaus.',
                         'GP_improb_GP_question': 'GP-Implaus.',
                         'nonGP_improb_GP_question': 'nonGP-Implaus.',
                         'GP_reflexive_GP_question': 'GP-Reflexive.',
                         'nonGP_reflexive_GP_question': 'nonGP-Reflexive.',
                         'Global': 'Global'}

    # --- Global: all items together -------------------------------------
    for model in merged_df['model'].unique():
        if model == 'Human':
            continue  # Skip human vs. human comparison
        # Missing scores count as 0; only the two score columns are touched.
        model_data = merged_df.loc[merged_df['model'] == model, score_columns].fillna(0)

        # Ensure there are enough items to compute a correlation
        if len(model_data) < 2:
            continue

        similarity_results[model].append(
            _kendall_tau_of_accuracies(model_data['human_accuracy'], model_data['llm_performance'])
        )

    similarity_results['Half_split'].append(
        _kendall_tau_of_accuracies(item_level['half_1_accuracy'], item_level['half_2_accuracy'])
    )

    # --- One correlation per (sent_type, quest_type) condition -----------
    for _, condition_row in conditions.iterrows():
        sent_type = condition_row['sent_type']
        quest_type = condition_row['quest_type']
        condition_name = f"{sent_type}_{quest_type}"

        tasks.append(sentence_mappings[condition_name])

        condition_data = merged_df[
            (merged_df['sent_type'] == sent_type) &
            (merged_df['quest_type'] == quest_type)
        ]
        condition_items = item_level[
            (item_level['sent_type'] == sent_type) &
            (item_level['quest_type'] == quest_type)
        ]

        similarity_results['Half_split'].append(
            _kendall_tau_of_accuracies(condition_items['half_1_accuracy'], condition_items['half_2_accuracy'])
        )

        for model in condition_data['model'].unique():
            if model == 'Human':
                continue  # Skip human vs. human comparison
            model_data = condition_data.loc[condition_data['model'] == model, score_columns].fillna(0)

            # Ensure there are enough items to compute a correlation
            if len(model_data) < 2:
                continue

            similarity_results[model].append(
                _kendall_tau_of_accuracies(model_data['human_accuracy'], model_data['llm_performance'])
            )

    # Weighted mean of the six per-condition correlations, inserted as the
    # second element of every list (right after 'Global').
    # NOTE(review): the weights presumably are item counts per condition
    # (45 or 24 out of 228) and assume the conditions appear in this exact
    # order with the two 24-item conditions last — confirm against the data.
    weights = [45 / 228, 45 / 228, 45 / 228, 45 / 228, 24 / 228, 24 / 228]
    for model in similarity_results:
        similarity_results[model].insert(1, np.average(similarity_results[model][1:], weights=weights))

    tasks.insert(1, 'Averaged')

    # One entry per output column, independent of which models are present.
    similarity_results['Human'] = [1.0] * len(tasks)

    similarity_df = pd.DataFrame.from_dict(similarity_results, orient='index', columns=tasks)

    return merged_df, similarity_df



In [24]:
# Build a sentence -> set_id lookup from the human data and use it to attach a
# set_id to every LLM row, so that both tables can be joined on item.
sentences = df_humans['sentence'].tolist()
set_ids = df_humans['set_id'].tolist()

set_mapping = {sentence: set_id for sentence, set_id in zip(sentences, set_ids)}
df_llm['set_id'] = df_llm['sentence'].map(set_mapping)

merged_df, similarity_df = rank_correlation(df_llm, df_humans, families)



Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [25]:
def output_distance_graph(distances_df, families, dist_name='L2 distance', y_axis_title=None):
    """Draw one bar subplot per model family from a per-model score table.

    Args:
        distances_df: frame indexed by model name. Must contain a 'Global'
            column (bar heights) and a 'nicknames' column (x labels of the GPT
            subplot). A 'Family' column is added to it in place.
        families: mapping family name -> list of model names.
        dist_name: name of the plotted measure, used in the figure title.
        y_axis_title: y-axis label of the left-hand subplots. Defaults to
            `dist_name` without its trailing parenthetical, so the label always
            names the measure that is actually plotted.

    Returns:
        The plotly figure (2 columns of subplots, one subplot per family that
        has at least one model in `distances_df`).

    Raises:
        ValueError: if no model belongs to any of the plotted families.
    """
    if y_axis_title is None:
        y_axis_title = dist_name.split(' (')[0]

    model_to_family = {}
    for family, models in families.items():
        for model in models:
            model_to_family[model] = family

    # Kept as an in-place addition on the caller's frame (existing behaviour).
    distances_df['Family'] = distances_df.index.map(model_to_family)
    basic_keys = ["GPT", "Llama-3.2", "Gemma-2", "Qwen-2.5", "OLMo-7b", "OLMo-1b"]
    # .copy() so that the columns added below go to an independent frame
    # rather than to a slice of the caller's frame.
    distances_df = distances_df[distances_df['Family'].isin(basic_keys)].copy()

    # Families appear in the same order as in basic_keys.
    distances_df['Family'] = pd.Categorical(distances_df['Family'], categories=basic_keys, ordered=True)

    model_names = distances_df.index
    distances_df['instruction_tuned'] = [is_instruction_tuned(name) for name in model_names]
    distances_df['model_name'] = model_names
    distances_df['num_params'] = [get_model_parameters(name, model_to_family[name]) for name in model_names]

    # Only families that actually have rows get a subplot, so that the grid
    # size, the subplot titles and the (row, col) positions always agree.
    present_families = [family for family in basic_keys if (distances_df['Family'] == family).any()]
    num_families = len(present_families)
    if num_families == 0:
        raise ValueError("distances_df contains no model from the plotted families")

    # Set up the subplot grid
    cols = 2
    rows = ceil(num_families / cols)
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=present_families,
        shared_xaxes=False,
        shared_yaxes=True,
        horizontal_spacing=0.05  # Reduce distance between columns
    )

    fig.update_layout(template="plotly_white")

    # Colour and hatch pattern for instruction-tuned / non-instruction-tuned bars
    colors = {True: 'lightcoral', False: 'lightcyan'}
    patterns = {True: "/", False: "\\"}

    for i, family in enumerate(present_families):
        family_df = distances_df[distances_df['Family'] == family].copy()

        row = (i // cols) + 1
        col = (i % cols) + 1

        # GPT models are labelled by nickname; every other family is ordered
        # by the numeric value from get_model_parameters, shown as a category.
        if family == 'GPT':
            x_var = 'nicknames'
            x_categories = sorted(family_df[x_var].unique())
            family_df[x_var] = pd.Categorical(family_df[x_var], categories=x_categories, ordered=True)
        else:
            x_var = 'num_params_str'
            num_params_list = sorted(family_df['num_params'].unique())
            x_categories = [str(n) for n in num_params_list]
            family_df['num_params_str'] = family_df['num_params'].astype(str)
            family_df['num_params_str'] = pd.Categorical(family_df['num_params_str'], categories=x_categories, ordered=True)

        for instruction_tuned_value in [True, False]:
            df_filtered = family_df[family_df['instruction_tuned'] == instruction_tuned_value]
            if df_filtered.empty:
                continue

            df_filtered = df_filtered.sort_values(x_var)

            trace_bar = go.Bar(
                x=df_filtered[x_var],
                y=df_filtered['Global'],
                name='Instruction Tuned' if instruction_tuned_value else 'Not Instruction Tuned',
                marker_color=colors[instruction_tuned_value],
                marker_pattern_shape=patterns[instruction_tuned_value],
                showlegend=(family == "Qwen-2.5"),  # Show legend only once
            )
            fig.add_trace(trace_bar, row=row, col=col)

        # x-axis title depends on what the x values mean for this family
        if family == "GPT":
            x_axis_title = 'Model Name'
        elif "OLM" in family:
            x_axis_title = 'Number of Tokens (Billions)'
        else:
            x_axis_title = 'Number of Parameters (Billions)'

        fig.update_xaxes(title_text=x_axis_title, row=row, col=col)
        # Only the left-hand column carries the y-axis title and tick labels.
        show_y_axis_title = (col == 1)
        fig.update_yaxes(
            title_text=y_axis_title if show_y_axis_title else None,
            showticklabels=show_y_axis_title,
            row=row, col=col
        )

    fig.update_layout(
        barmode='group',
        title=dict(
            text=f"{dist_name} by Model Family",
            x=0.5,
            xanchor='center'
        ),
        legend_title_text='Model Type',
        height=rows * 300,  # Adjust height for smaller subplots
        width=800,  # Adjust width
    )

    return fig

In [None]:
# Full model identifier -> short display nickname (used as x labels).
model_mapping_short = {
    # GPT / o1
    "gpt-4": "GPT4",
    "gpt-4-turbo": "GPT4-T",
    "gpt-4o": "GPT4-O",
    "gpt-4o-mini": "GPT4-O-M",
    "o1-mini": "O1-M",
    "o1-preview": "O1-Prev.",

    # Llama 3.2
    'meta-llama/Llama-3.2-1B': 'Llama3.2-1B',
    'meta-llama/Llama-3.2-1B-Instruct': 'Llama3.2-1B-Inst',
    'meta-llama/Llama-3.2-3B': 'Llama3.2-3B',
    'meta-llama/Llama-3.2-3B-Instruct': 'Llama3.2-3B-Inst',
    'meta-llama/Llama-3.2-11B-Vision': 'Llama3.2-11B-Vis',
    'meta-llama/Llama-3.2-11B-Vision-Instruct': 'Llama3.2-11B-Vis-Inst',
    'meta-llama/Llama-3.2-90B-Vision': 'Llama3.2-90B-Vis',
    'meta-llama/Llama-3.2-90B-Vision-Instruct': 'Llama3.2-90B-Vis-Inst',

    # Gemma 2
    'google/gemma-2-2b': 'Gemma-2B',
    'google/gemma-2-2b-it': 'Gemma-2B-Inst',
    'google/gemma-2-9b': 'Gemma-9B',
    'google/gemma-2-9b-it': 'Gemma-9B-Inst',
    'google/gemma-2-27b': 'Gemma-27B',
    'google/gemma-2-27b-it': 'Gemma-27B-Inst',

    # Qwen 2.5
    "Qwen/Qwen2.5-0.5B": "Qwen-0.5B",
    "Qwen/Qwen2.5-0.5B-Instruct": "Qwen-0.5B-Inst",
    "Qwen/Qwen2.5-1.5B": "Qwen-1.5B",
    "Qwen/Qwen2.5-1.5B-Instruct": "Qwen-1.5B-Inst",
    "Qwen/Qwen2.5-3B": "Qwen-3B",
    "Qwen/Qwen2.5-3B-Instruct": "Qwen-3B-Inst",
    "Qwen/Qwen2.5-7B": "Qwen-7B",
    "Qwen/Qwen2.5-7B-Instruct": "Qwen-7B-Inst",
    "Qwen/Qwen2.5-14B": "Qwen-14B",
    "Qwen/Qwen2.5-14B-Instruct": "Qwen-14B-Inst",
    "Qwen/Qwen2.5-32B": "Qwen-32B",
    "Qwen/Qwen2.5-32B-Instruct": "Qwen-32B-Inst",
    "Qwen/Qwen2.5-72B": "Qwen-72B",
    "Qwen/Qwen2.5-72B-Instruct": "Qwen-72B-Inst",

    # OLMo 7B checkpoints
    'allenai/OLMo-7B-0724-hf_step26500-tokens111B': 'Olmo-7B-Tokens-111B',
    'allenai/OLMo-7B-0724-hf_step106500-tokens446B': 'Olmo-7B-Tokens-446B',
    'allenai/OLMo-7B-0724-hf_step330000-tokens1384B': 'Olmo-7B-Tokens-1384B',
    'allenai/OLMo-7B-0724-hf_step519000-tokens2176B': 'Olmo-7B-Tokens-2176B',
    'allenai/OLMo-7B-0724-hf_step647650-tokens2716B': 'Olmo-7B-Tokens-2716B',
    'allenai/OLMo-7B-0724-hf_step650650-tokens2729B': 'Olmo-7B-Tokens-2729B',

    # OLMo 1B checkpoints
    'allenai/OLMo-1B-0724-hf_step20000-tokens41B': 'Olmo-1B-Tokens-41B',
    'allenai/OLMo-1B-0724-hf_step379000-tokens794B': 'Olmo-1B-Tokens-794B',
    'allenai/OLMo-1B-0724-hf_step558000-tokens1169B': 'Olmo-1B-Tokens-1169B',
    'allenai/OLMo-1B-0724-hf_step1038000-tokens2176B': 'Olmo-1B-Tokens-2176B',
    'allenai/OLMo-1B-0724-hf_step1218000-tokens2553B': 'Olmo-1B-Tokens-2553B',
    'allenai/OLMo-1B-0724-hf_step1454000-tokens3048B': 'Olmo-1B-Tokens-3048B',
}

# Short display names, used for the x labels of the GPT subplot.
similarity_df['nicknames'] = similarity_df.index.map(model_mapping_short)

fig = output_distance_graph(
    similarity_df,
    families,
    dist_name='Kendall Tau Rank Correlation (item level)',
)

# Black text everywhere and a horizontal legend centred below the subplots.
fig.update_layout(
    font={"color": 'black'},
    legend={
        "orientation": 'h',
        "yanchor": 'bottom',
        "y": -0.15,
        "xanchor": 'center',
        "x": 0.5,
        "font": {"family": "Arial", "color": "black", "size": 13},
    },
)
fig.write_image('experiments/gardenpath_10_24/analysis/plots/per_family_item_tau.pdf')
fig.show()

In [None]:
def prepare_data_distance(llm_df, human_df, families):
    """Build a (model x condition) table of mean accuracy for humans and LLMs.

    Parameters
    ----------
    llm_df : DataFrame with columns ``model``, ``sent_type``, ``quest_type``
        and ``avg_correct``.
    human_df : DataFrame with columns ``sent_type``, ``quest_type`` and
        ``correct``.
    families : accepted for signature compatibility; not used by this function.

    Returns
    -------
    DataFrame indexed by model id, with the human row first under the label
    ``'Human'``. Columns are ``"<sent_type>_<quest_type>"`` conditions and
    cells are mean accuracies. Conditions missing for a row are filled with 0.
    """
    # Humans: one mean per condition, all attributed to a single pseudo-model.
    human_long = (
        human_df.groupby(['sent_type', 'quest_type'])['correct']
        .mean()
        .reset_index()
        .assign(model='Human')
    )
    human_long['condition'] = human_long['sent_type'] + '_' + human_long['quest_type']

    # LLMs: one mean per (model, condition).
    llm_long = (
        llm_df.groupby(['model', 'sent_type', 'quest_type'])['avg_correct']
        .mean()
        .reset_index()
    )
    llm_long['condition'] = llm_long['sent_type'] + '_' + llm_long['quest_type']

    wide_tables = [
        human_long.pivot(index='model', columns='condition', values='correct'),
        llm_long.pivot(index='model', columns='condition', values='avg_correct'),
    ]
    return pd.concat(wide_tables).fillna(0)


# Model-by-condition mean-accuracy table; humans appear as the row labelled 'Human'.
performance_df = prepare_data_distance(df_llm, df_humans, families)

def spearman_distance(performance_df_norm):
    """Spearman correlation of every model's condition ranking with the human one.

    Each row of ``performance_df_norm`` is ranked across its columns (highest
    value gets rank 1, ties share the average rank). Every row other than
    ``'Human'`` is then correlated with the ``'Human'`` row.

    Despite the function name, the value returned is the correlation
    coefficient itself, not a distance.

    Returns
    -------
    DataFrame indexed by model, with a single column ``'Global'`` holding the
    coefficient, sorted in ascending order.
    """
    ranks = performance_df_norm.rank(axis=1, method='average', ascending=False)
    reference = ranks.loc['Human']

    # spearmanr returns (coefficient, p-value); only the coefficient is kept.
    coefficients = {
        name: spearmanr(reference, ranks.loc[name])[0]
        for name in ranks.index
        if name != 'Human'
    }

    result = pd.DataFrame.from_dict(coefficients, orient='index', columns=['Global'])
    return result.sort_values('Global')

# Correlation of each model's per-condition accuracy ranking with the 'Human' row.
spearman_distances_df = spearman_distance(performance_df)
spearman_distances_df['nicknames'] = spearman_distances_df.index.map(model_mapping_short)

fig = output_distance_graph(
    spearman_distances_df,
    families,
    "Spearman correlation (by category)",
)

# Black text everywhere; horizontal legend anchored by its bottom edge at
# y=-0.15 (paper coordinates) and centred horizontally.
legend_font = {'family': "Arial", 'color': "black", 'size': 13}
fig.update_layout(
    font={'color': 'black'},
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': -0.15,
        'xanchor': 'center',
        'x': 0.5,
        'font': legend_font,
    },
)

fig.write_image('experiments/gardenpath_10_24/analysis/plots/per_category_cat_spearman.pdf')
fig.show()



An input array is constant; the correlation coefficient is not defined.



In [28]:
# Model id -> short display name.
# The keys also act as an allow-list: the cell below keeps only the rows of
# performance_df whose model id is a key of this dict.
model_mapping_short = {
    # OpenAI
    "gpt-4": "GPT4",
    "gpt-4-turbo": "GPT4-T",
    "gpt-4o": "GPT4-O",
    "gpt-4o-mini": "GPT4-O-M",
    "o1-mini": "O1-Mini",
    "o1-preview": "O1-Prev.",

    # Llama 3.2
    'meta-llama/Llama-3.2-1B': 'Llama3.2-1B',
    'meta-llama/Llama-3.2-1B-Instruct': 'Llama3.2-1B-Inst',
    'meta-llama/Llama-3.2-3B': 'Llama3.2-3B',
    'meta-llama/Llama-3.2-3B-Instruct': 'Llama3.2-3B-Inst',
    'meta-llama/Llama-3.2-11B-Vision': 'Llama3.2-11B-Vis',
    'meta-llama/Llama-3.2-11B-Vision-Instruct': 'Llama3.2-11B-Vis-Inst',
    'meta-llama/Llama-3.2-90B-Vision': 'Llama3.2-90B-Vis',
    'meta-llama/Llama-3.2-90B-Vision-Instruct': 'Llama3.2-90B-Vis-Inst',

    # Gemma 2
    'google/gemma-2-2b': 'Gemma-2B',
    'google/gemma-2-2b-it': 'Gemma-2B-Inst',
    'google/gemma-2-9b': 'Gemma-9B',
    'google/gemma-2-9b-it': 'Gemma-9B-Inst',
    'google/gemma-2-27b': 'Gemma-27B',
    'google/gemma-2-27b-it': 'Gemma-27B-Inst',

    # Qwen 2.5
    "Qwen/Qwen2.5-0.5B": "Qwen-0.5B",
    "Qwen/Qwen2.5-0.5B-Instruct": "Qwen-0.5B-Inst",
    "Qwen/Qwen2.5-1.5B": "Qwen-1.5B",
    "Qwen/Qwen2.5-1.5B-Instruct": "Qwen-1.5B-Inst",
    "Qwen/Qwen2.5-3B": "Qwen-3B",
    "Qwen/Qwen2.5-3B-Instruct": "Qwen-3B-Inst",
    "Qwen/Qwen2.5-7B": "Qwen-7B",
    "Qwen/Qwen2.5-7B-Instruct": "Qwen-7B-Inst",
    "Qwen/Qwen2.5-14B": "Qwen-14B",
    "Qwen/Qwen2.5-14B-Instruct": "Qwen-14B-Inst",
    "Qwen/Qwen2.5-32B": "Qwen-32B",
    "Qwen/Qwen2.5-32B-Instruct": "Qwen-32B-Inst",
    "Qwen/Qwen2.5-72B": "Qwen-72B",
    "Qwen/Qwen2.5-72B-Instruct": "Qwen-72B-Inst",

    # OLMo-7B training checkpoints
    'allenai/OLMo-7B-0724-hf_step2000-tokens8B': 'Olmo-7B-Tokens-8B',
    'allenai/OLMo-7B-0724-hf_step26500-tokens111B': 'Olmo-7B-Tokens-111B',
    'allenai/OLMo-7B-0724-hf_step106500-tokens446B': 'Olmo-7B-Tokens-446B',
    'allenai/OLMo-7B-0724-hf_step143000-tokens599B': 'Olmo-7B-Tokens-599B',
    'allenai/OLMo-7B-0724-hf_step330000-tokens1384B': 'Olmo-7B-Tokens-1384B',
    'allenai/OLMo-7B-0724-hf_step395000-tokens1656B': 'Olmo-7B-Tokens-1656B',
    'allenai/OLMo-7B-0724-hf_step458000-tokens1920B': 'Olmo-7B-Tokens-1920B',
    'allenai/OLMo-7B-0724-hf_step519000-tokens2176B': 'Olmo-7B-Tokens-2176B',
    'allenai/OLMo-7B-0724-hf_step647650-tokens2716B': 'Olmo-7B-Tokens-2716B',
    'allenai/OLMo-7B-0724-hf_step650650-tokens2729B': 'Olmo-7B-Tokens-2729B',

    # OLMo-1B training checkpoints
    'allenai/OLMo-1B-0724-hf_step20000-tokens41B': 'Olmo-1B-Tokens-41B',
    'allenai/OLMo-1B-0724-hf_step379000-tokens794B': 'Olmo-1B-Tokens-794B',
    'allenai/OLMo-1B-0724-hf_step558000-tokens1169B': 'Olmo-1B-Tokens-1169B',
    'allenai/OLMo-1B-0724-hf_step738000-tokens1547B': 'Olmo-1B-Tokens-1547B',
    'allenai/OLMo-1B-0724-hf_step917000-tokens1922B': 'Olmo-1B-Tokens-1922B',
    'allenai/OLMo-1B-0724-hf_step1038000-tokens2176B': 'Olmo-1B-Tokens-2176B',
    'allenai/OLMo-1B-0724-hf_step1128000-tokens2364B': 'Olmo-1B-Tokens-2364B',
    # Restored: the 'OLMo-1b' entry of all_models selects 'Olmo-1B-Tokens-2553B',
    # and the other definitions of this mapping in the notebook contain this key.
    # Without it the checkpoint is silently filtered out before plotting.
    # NOTE(review): confirm the omission was not deliberate.
    'allenai/OLMo-1B-0724-hf_step1218000-tokens2553B': 'Olmo-1B-Tokens-2553B',
    'allenai/OLMo-1B-0724-hf_step1308000-tokens2742B': 'Olmo-1B-Tokens-2742B',
    'allenai/OLMo-1B-0724-hf_step1399000-tokens2932B': 'Olmo-1B-Tokens-2932B',
    'allenai/OLMo-1B-0724-hf_step1454000-tokens3048B': 'Olmo-1B-Tokens-3048B',
}


# Expose the index (model id) as an ordinary column so it survives melt().
performance_df['model'] = performance_df.index

# Invert families ({family: [model ids]}) into a model id -> family lookup.
model_to_family = {
    model: family
    for family, models in families.items()
    for model in models
}

performance_df['Family'] = performance_df.index.map(model_to_family)
basic_keys = ["GPT", "Llama-3.2", "Gemma-2", "Qwen-2.5", "OLMo-7b", "OLMo-1b"]
performance_df_short = performance_df[performance_df['Family'].isin(basic_keys)]

# Keep only the models that have a short display name. The explicit .copy()
# makes filtered_perf an independent frame, so adding 'nicknames' below does
# not write to a slice of performance_df (pandas SettingWithCopyWarning).
filtered_perf = performance_df[performance_df.model.isin(model_mapping_short.keys())].copy()
filtered_perf['nicknames'] = filtered_perf.model.map(model_mapping_short)

# Condition column name -> label used in the plots.
sent_type_mapping = {
    'GP_prob_GP_question': "GP-Plaus.",
    'nonGP_prob_GP_question': "NGP-Plaus.",
    'GP_improb_GP_question': "GP-Implaus.",
    'nonGP_improb_GP_question': "NGP-Implaus.",
    'GP_reflexive_GP_question': "GP-Reflexive",
    'nonGP_reflexive_GP_question': "NGP-Reflexive"
}

sent_type_order = ["GP-Plaus.", "NGP-Plaus.", "GP-Implaus.", "NGP-Implaus.", "GP-Reflexive", "NGP-Reflexive"]

def get_model_final_name(series):
    """Return the facet label for one row of the melted performance table.

    ``series`` must support key access to ``'Family'``, ``'nicknames'``,
    ``'num_parameters'`` and ``'instruction_tuned'``.

    * GPT family: the model nickname.
    * Families whose name contains ``'OLMo'``: ``num_parameters`` unchanged.
    * Any other family: ``"<num_parameters>-Inst"`` when the row is flagged as
      instruction tuned, otherwise ``num_parameters`` unchanged.
    """
    family = series['Family']
    if family == 'GPT':
        return series['nicknames']
    if 'OLMo' in family:
        return series['num_parameters']
    if series['instruction_tuned']:
        return str(series['num_parameters']) + '-Inst'
    return series['num_parameters']



# Long format: one row per (model, condition) pair, accuracy in 'performance'.
df_melted = filtered_perf.melt(
    id_vars=['nicknames', 'Family', 'model'],
    var_name='condition',
    value_name='performance',
)
df_melted['conds'] = df_melted['condition'].map(sent_type_mapping)

# Per-row model metadata, computed by helpers defined elsewhere in the notebook.
model_names = df_melted['model'].to_list()
num_params = [
    get_model_parameters(model_id, model_to_family[model_id])
    for model_id in model_names
]
df_melted['num_parameters'] = num_params
df_melted['instruction_tuned'] = df_melted['model'].map(is_instruction_tuned)

# 'GP_prob_GP_question' -> 'GP'; 'GP-Plaus.' -> 'Plausible' ('.' is expanded to 'ible').
df_melted['SentenceType'] = df_melted['condition'].map(lambda cond: cond.split('_')[0])
df_melted['Manipulation'] = df_melted['conds'].map(
    lambda label: label.split('-')[1].replace('.', 'ible')
)
df_melted['final_name'] = df_melted.apply(get_model_final_name, axis=1)


In [29]:
# Nicknames to plot, per family. The key order is the order in which the
# per-family figures are generated.
all_models = {
    'GPT': [
        'GPT4', 'GPT4-T', 'GPT4-O', 'GPT4-O-M', 'O1-Mini', 'O1-Prev.',
    ],
    'Qwen-2.5': [
        'Qwen-14B', 'Qwen-32B', 'Qwen-32B-Inst',
        'Qwen-72B', 'Qwen-72B-Inst', 'Qwen-7B',
    ],
    'Gemma-2': [
        'Gemma-27B-Inst', 'Gemma-27B', 'Gemma-2B',
        'Gemma-2B-Inst', 'Gemma-9B', 'Gemma-9B-Inst',
    ],
    'Llama-3.2': [
        'Llama3.2-11B-Vis', 'Llama3.2-11B-Vis-Inst', 'Llama3.2-1B',
        'Llama3.2-3B', 'Llama3.2-90B-Vis', 'Llama3.2-90B-Vis-Inst',
    ],
    'OLMo-7b': [
        'Olmo-7B-Tokens-446B', 'Olmo-7B-Tokens-111B', 'Olmo-7B-Tokens-1384B',
        'Olmo-7B-Tokens-2176B', 'Olmo-7B-Tokens-2716B', 'Olmo-7B-Tokens-2729B',
    ],
    'OLMo-1b': [
        'Olmo-1B-Tokens-2176B', 'Olmo-1B-Tokens-2553B', 'Olmo-1B-Tokens-3048B',
        'Olmo-1B-Tokens-41B', 'Olmo-1B-Tokens-794B', 'Olmo-1B-Tokens-1169B',
    ],
}

# Caption shown under each family's figure.
titles = {
    'Qwen-2.5': 'Number of Parameters (Billions)',
    'OLMo-1b': 'Number of Tokens (Billions)',
    'OLMo-7b': 'Number of Tokens (Billions)',
    'Gemma-2': 'Number of Parameters (Billions)',
    'GPT': 'Model name',
    'Llama-3.2': 'Number of Parameters (Billions)',
}

In [30]:
# Flat list of every nickname selected in all_models, and the matching rows.
all_models_interest = [nickname for nicknames in all_models.values() for nickname in nicknames]
df_melted_interest = df_melted[df_melted.nicknames.isin(all_models_interest)]

# Families whose figure is taller, shows the legend and keeps its x tick labels.
legend_families = ("OLMo-1b", "OLMo-7b")

figures = list()
for family, models in all_models.items():
    # Select the family's chosen models. The .copy() makes df_family an
    # independent frame, so adding the display column below never writes to a
    # slice of df_melted (pandas SettingWithCopyWarning).
    in_family = (df_melted.Family == family) & df_melted.nicknames.isin(models)
    df_family = df_melted[in_family].copy()
    df_family['Sentence type'] = df_family['SentenceType']
    df_family = df_family.sort_values(
        by=['Family', 'num_parameters', 'instruction_tuned'],
        ascending=[True, True, True],
    )

    # One facet per model; within a facet, bars grouped by manipulation and
    # coloured / hatched by sentence type.
    fig = px.bar(
        df_family,
        x='Manipulation',
        y='performance',
        color='Sentence type',
        pattern_shape="Sentence type",
        barmode='group',
        category_orders={"Manipulation": ["Plausible", "Implausible", "Reflexive"]},
        color_discrete_sequence=['#E69F00', '#56B4E9'],
        facet_col='final_name',
        facet_col_spacing=0.025,
        labels={'final_name': ''}
    )

    # Update layout for the custom font and legend
    fig.update_layout(
        font=dict(
            family="Arial",
            color="black",
            size=16
        ),
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=-0.67,
            xanchor='center',
            x=0.5,
            font=dict(
                family="Arial",
                color="black",
                size=13
            )
        )
    )

    fig.update_layout(
        yaxis=dict(title='Accuracy'),
        xaxis_title=titles[family],
        template='plotly_white',
        width=800,
        height=300
    )

    fig.update_layout(
        title=dict(
            text=family,
            x=0.5,
            xanchor='center',
            font=dict(
                size=20
            ),
        ),
        showlegend=False
    )

    # The two OLMo figures share one set of overrides; every other family uses
    # the compact layout with hidden tick labels.
    has_legend = family in legend_families
    if has_legend:
        fig.update_layout(
            showlegend=True,
            height=400
        )
    facet_label_y = -0.43 if has_legend else -0.14

    # Facet titles are split on '=' and only the part after it is kept; they are
    # moved below the axes.
    fig.for_each_annotation(
        lambda a: a.update(
            text=a.text.split('=')[1],
            font=dict(size=16, color="black"),
            y=facet_label_y,
            yanchor='bottom',
        )
    )
    fig.update_xaxes(
        tickfont=dict(size=16, color="black"),
        showticklabels=has_legend,
        title_text="",
        tickangle=40 if has_legend else 70,
    )

    # Single caption centred under the whole figure (per-axis titles were cleared above).
    fig.add_annotation(
        x=0.5,
        y=-0.45 if has_legend else -0.15,
        xref='paper',
        yref='paper',
        showarrow=False,
        text=titles[family],
        xanchor='center',
        yanchor='top',
        font=dict(size=16)
    )

    figures.append(fig)

In [None]:
from PIL import Image

# Parameters (pixels)
fig_height = 300            # height of the regular panels
tall_fig_height = 400       # height of the last two panels
fig_width = 800
shared_section_height = 35  # height of the bottom strip cropped from a tall panel
blank_width = 250           # width of the patches pasted over the bottom strip

# Paths are relative to the project root, which the notebook's first cell
# selects with %cd (the other cells already rely on this).
figure_paths = [f"temp_files/figure_{idx}.png" for idx in range(len(figures))]

# Save each figure as a PNG image; the last two figures are the tall ones.
for idx, (fig, path) in enumerate(zip(figures, figure_paths)):
    height = fig_height if idx < len(figures) - 2 else tall_fig_height
    fig.write_image(path, width=fig_width, height=height)

images = [Image.open(path) for path in figure_paths]

# Bottom strip of the second-to-last panel: the full-width strip that is
# re-pasted once, and a narrow patch from its left edge used to cover others.
strip_source = images[-2]
strip_top = strip_source.height - shared_section_height
shared_img = strip_source.crop((0, strip_top, fig_width, strip_source.height))
blank_img = strip_source.crop((0, strip_top, blank_width, strip_source.height))

# Canvas dimensions: `cols` panels per row, every row regular except the last.
cols = 2
rows = (len(images) + (cols - 1)) // cols
max_width = fig_width * cols
max_height = (fig_height * (rows - 1) + tall_fig_height) - shared_section_height

combined_image = Image.new('RGB', (max_width, max_height + shared_section_height), color='white')

# Paste the panels row by row.
y_offset = 0
for row in range(rows):
    x_offset = 0
    for col in range(cols):
        idx = row * cols + col
        if idx < len(images):
            combined_image.paste(images[idx], (x_offset, y_offset))
            x_offset += fig_width
    y_offset += fig_height if row < rows - 1 else tall_fig_height

# Cover three regions of the bottom strip with the patch, then paste the
# full-width strip horizontally centred on the canvas. The 3 px lift is a
# hand-tuned offset (strip_y evaluates to 962 for this 3-row layout).
# NOTE(review): presumably this replaces the per-panel legends with one shared legend - confirm.
strip_y = max_height - 3
for blank_x in (250, 1050, 1250):
    combined_image.paste(blank_img, (blank_x, strip_y))
combined_image.paste(shared_img, (400, strip_y))

# Save the combined image as PDF
combined_image.save('experiments/gardenpath_10_24/analysis/plots/combined_image.pdf', "PDF", resolution=100.0)

In [None]:
import pandas as pd
import spacy
from tqdm import tqdm 

# spaCy pipeline used by find_verb, find_noun and check_sentences below for
# part-of-speech tags and lemmas.
nlp = spacy.load("en_core_web_sm")

# df: rephrasing-experiment results (the cells below read its `sentence`, `txt` and `model` columns).
# df_data: rephrasing-experiment input data (the cells below read its `sentence` column).
df = pd.read_csv('experiments/gardenpath_10_24/results/llm_results/rephrasing_all_results.csv')
df_data = pd.read_csv('experiments/gardenpath_10_24/data/llm_data/rephrasing_experiment.csv')

In [None]:
def find_verb(sentence):
    """Return the lemma of the target verb of ``sentence``.

    For sentences starting with "While" the first token tagged VERB is used;
    for all other sentences the second one is used. When the sentence does not
    contain enough VERB tokens, the hard-coded fallback "swing" is returned.
    """
    doc = nlp(sentence)
    # Number of VERB tokens to pass over before returning one.
    verbs_to_skip = 0 if sentence.startswith("While") else 1
    for token in doc:
        if token.pos_ != 'VERB':
            continue
        if verbs_to_skip == 0:
            return token.lemma_
        verbs_to_skip -= 1
    return "swing"


def find_noun(sentence):
    """Return the lemma of the target noun of ``sentence``, or None.

    Tokens tagged NOUN or PROPN are considered. For sentences starting with
    "While" the second such token is used; for all other sentences the first
    one is used. None is returned when the sentence has too few of them.
    """
    doc = nlp(sentence)
    # Number of NOUN/PROPN tokens to pass over before returning one.
    nouns_to_skip = 1 if sentence.startswith("While") else 0
    for token in doc:
        if token.pos_ not in ('NOUN', 'PROPN'):
            continue
        if nouns_to_skip == 0:
            return token.lemma_
        nouns_to_skip -= 1
    return None

In [None]:
# Extract the target verb and noun lemma of every experiment sentence.
for column, extractor in (('verb', find_verb), ('noun', find_noun)):
    df_data[column] = df_data['sentence'].map(extractor)

In [None]:
# sentence -> noun / verb lookups built from the experiment data
# (a repeated sentence keeps its last entry).
sentences = df_data['sentence'].to_list()
nouns = df_data['noun'].to_list()
verbs = df_data['verb'].to_list()
noun_mappings = dict(zip(sentences, nouns))
verb_mappings = dict(zip(sentences, verbs))

# sentence -> sent_type lookup, taken from the gpt-4 rows of df_llm.
small_df = df_llm[df_llm.model == 'gpt-4']
sentences = small_df.sentence.to_list()
sent_types = small_df.sent_type.to_list()
sent_maps = dict(zip(sentences, sent_types))

# Annotate the results; sentences absent from a lookup become NaN.
for column, lookup in (('noun', noun_mappings), ('verb', verb_mappings), ('sent_type', sent_maps)):
    df[column] = df['sentence'].map(lookup)

# Drops, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

In [None]:
def get_predictions(output_txt):
    """Extract the two numbered sentences from a model's raw output.

    An optional first line equal to "Splitted:" (ignoring surrounding
    whitespace) is skipped. The next two lines are expected to hold the
    sentences introduced by "1. " and "2. ". From each, the text after the
    marker and up to the first ". " is kept and terminated with a single '.'.

    Returns a two-element list of strings. Raises IndexError when fewer than
    two lines are available.
    """
    rows = output_txt.split('\n')
    if rows[0].strip() == "Splitted:":
        rows = rows[1:]

    predictions = []
    for marker, row in (('1. ', rows[0]), ('2. ', rows[1])):
        first_sentence = row.split(marker)[-1].split('. ')[0] + '.'
        # A sentence that already ended with '.' now ends with '..'; collapse it.
        predictions.append(first_sentence.replace('..', '.'))
    return predictions


def check_sentences(sentences, verb, noun):
    """Score a list of rephrased sentences against the target verb and noun.

    Parameters
    ----------
    sentences : iterable of str, each parsed with the spaCy pipeline ``nlp``.
    verb : lemma to look for; the lemmas "swung" and "parking" are always
        accepted as well.
    noun : str that must be absent from a sentence containing the verb
        (plain substring test).

    Returns
    -------
    (found, correct) : ``found`` is True when any sentence contains a token
    with an accepted lemma; ``correct`` is True when at least one such
    sentence does not contain ``noun``.
    """
    accepted_lemmas = (verb, "swung", "parking")
    found = correct = False
    for sent in sentences:
        for token in nlp(sent):
            if token.lemma_ not in accepted_lemmas:
                continue
            found = True
            if noun not in sent:
                correct = True

    return found, correct


# Score every model output. Rows whose output cannot be processed are recorded
# as formatted=False, found=False, gp_correct=False.
founds, corrects, formatteds = [], [], []
for txt, verb, noun in zip(df['txt'], df['verb'], df['noun']):
    try:
        found, correct = check_sentences(get_predictions(txt), verb, noun)
        formatted = True
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and SystemExit still stop the loop.
        # NOTE(review): any error is counted as "badly formatted", not only the
        # IndexError raised by get_predictions on short outputs.
        found, correct = False, False
        formatted = False
    founds.append(found)
    corrects.append(correct)
    formatteds.append(formatted)

df['found'] = founds
df['gp_correct'] = corrects
df['formatted'] = formatteds

In [None]:
# Models whose outputs contain the target verb, or follow the expected format,
# in fewer than 90% of their rows are excluded from the analysis.
min_rate = 0.9
to_drop = [
    mod
    for mod in df.model.unique()
    if (df[df.model == mod].found.mean() < min_rate
        or df[df.model == mod].formatted.mean() < min_rate)
]

df_dropped = df[~df.model.isin(to_drop)]
print(len(to_drop))
print(to_drop)

6
['meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'Qwen/Qwen2.5-1.5B', 'Qwen/Qwen2.5-3B', 'lmsys/vicuna-13b-v1.5', 'lmsys/vicuna-7b-v1.5']


In [None]:
# Mean gp_correct per (model, sentence type), reshaped to one row per model.
llm_performance = (
    df_dropped
    .groupby(['model', 'sent_type'])['gp_correct']
    .mean()
    .reset_index()
)
paraphrase_performance_df = llm_performance.pivot(
    index='model',
    columns='sent_type',
    values='gp_correct',
)

# Model id -> short display name for the paraphrase analysis.
# This rebinds the name used earlier in the notebook. The keys also act as an
# allow-list: only models listed here are kept by the filter further down.
model_mapping_short = {
    # OpenAI
    "gpt-4": "GPT4",
    "gpt-4-turbo": "GPT4-T",
    "gpt-4o": "GPT4-O",
    "gpt-4o-mini": "GPT4-O-M",
    "o1-mini": "O1-Mini",
    "o1-preview": "O1-Prev.",

    # Llama 3.2
    'meta-llama/Llama-3.2-1B': 'Llama3.2-1B',
    'meta-llama/Llama-3.2-1B-Instruct': 'Llama3.2-1B-Inst',
    'meta-llama/Llama-3.2-3B': 'Llama3.2-3B',
    'meta-llama/Llama-3.2-3B-Instruct': 'Llama3.2-3B-Inst',
    'meta-llama/Llama-3.2-11B-Vision': 'Llama3.2-11B-Vis',
    'meta-llama/Llama-3.2-11B-Vision-Instruct': 'Llama3.2-11B-Vis-Inst',
    'meta-llama/Llama-3.2-90B-Vision': 'Llama3.2-90B-Vis',
    'meta-llama/Llama-3.2-90B-Vision-Instruct': 'Llama3.2-90B-Vis-Inst',

    # Gemma 2
    'google/gemma-2-2b': 'Gemma-2B',
    'google/gemma-2-2b-it': 'Gemma-2B-Inst',
    'google/gemma-2-9b': 'Gemma-9B',
    'google/gemma-2-9b-it': 'Gemma-9B-Inst',
    'google/gemma-2-27b': 'Gemma-27B',
    'google/gemma-2-27b-it': 'Gemma-27B-Inst',

    # Qwen 2.5
    "Qwen/Qwen2.5-0.5B": "Qwen-0.5B",
    "Qwen/Qwen2.5-0.5B-Instruct": "Qwen-0.5B-Inst",
    "Qwen/Qwen2.5-1.5B": "Qwen-1.5B",
    "Qwen/Qwen2.5-1.5B-Instruct": "Qwen-1.5B-Inst",
    "Qwen/Qwen2.5-3B": "Qwen-3B",
    "Qwen/Qwen2.5-3B-Instruct": "Qwen-3B-Inst",
    "Qwen/Qwen2.5-7B": "Qwen-7B",
    "Qwen/Qwen2.5-7B-Instruct": "Qwen-7B-Inst",
    "Qwen/Qwen2.5-14B": "Qwen-14B",
    "Qwen/Qwen2.5-14B-Instruct": "Qwen-14B-Inst",
    "Qwen/Qwen2.5-32B": "Qwen-32B",
    "Qwen/Qwen2.5-32B-Instruct": "Qwen-32B-Inst",
    "Qwen/Qwen2.5-72B": "Qwen-72B",
    "Qwen/Qwen2.5-72B-Instruct": "Qwen-72B-Inst",

    # OLMo-7B training checkpoints
    'allenai/OLMo-7B-0724-hf_step26500-tokens111B': 'Olmo-7B-Tokens-111B',
    'allenai/OLMo-7B-0724-hf_step106500-tokens446B': 'Olmo-7B-Tokens-446B',
    'allenai/OLMo-7B-0724-hf_step330000-tokens1384B': 'Olmo-7B-Tokens-1384B',
    'allenai/OLMo-7B-0724-hf_step519000-tokens2176B': 'Olmo-7B-Tokens-2176B',
    'allenai/OLMo-7B-0724-hf_step647650-tokens2716B': 'Olmo-7B-Tokens-2716B',
    'allenai/OLMo-7B-0724-hf_step650650-tokens2729B': 'Olmo-7B-Tokens-2729B',

    # OLMo-1B training checkpoints
    'allenai/OLMo-1B-0724-hf_step20000-tokens41B': 'Olmo-1B-Tokens-41B',
    'allenai/OLMo-1B-0724-hf_step379000-tokens794B': 'Olmo-1B-Tokens-794B',
    'allenai/OLMo-1B-0724-hf_step558000-tokens1169B': 'Olmo-1B-Tokens-1169B',
    'allenai/OLMo-1B-0724-hf_step1038000-tokens2176B': 'Olmo-1B-Tokens-2176B',
    'allenai/OLMo-1B-0724-hf_step1218000-tokens2553B': 'Olmo-1B-Tokens-2553B',
    'allenai/OLMo-1B-0724-hf_step1454000-tokens3048B': 'Olmo-1B-Tokens-3048B',
}


# Expose the index (model id) as an ordinary column so it survives melt().
paraphrase_performance_df['model'] = paraphrase_performance_df.index

# Invert families ({family: [model ids]}) into a model id -> family lookup.
model_to_family = {
    model: family
    for family, models in families.items()
    for model in models
}

paraphrase_performance_df['Family'] = paraphrase_performance_df.index.map(model_to_family)
basic_keys = ["GPT", "Llama-3.2", "Gemma-2", "Qwen-2.5"]
paraphrase_performance_df_short = paraphrase_performance_df[paraphrase_performance_df['Family'].isin(basic_keys)]

# Keep only the models that have a short display name. The explicit .copy()
# makes the result an independent frame, so adding 'nicknames' below does not
# write to a slice of paraphrase_performance_df (pandas SettingWithCopyWarning).
paraphrase_filtered_perf = paraphrase_performance_df[
    paraphrase_performance_df.model.isin(model_mapping_short.keys())
].copy()
paraphrase_filtered_perf['nicknames'] = paraphrase_filtered_perf.model.map(model_mapping_short)

# sent_type value -> label used in the plots.
sent_type_mapping = {
    'GP_prob': "GP-Plaus.",
    'nonGP_prob': "NGP-Plaus.",
    'GP_improb': "GP-Implaus.",
    'nonGP_improb': "NGP-Implaus.",
    'GP_reflexive': "GP-Reflexive",
    'nonGP_reflexive': "NGP-Reflexive"
}

sent_type_order = ["GP-Plaus.", "NGP-Plaus.", "GP-Implaus.", "NGP-Implaus.", "GP-Reflexive", "NGP-Reflexive"]


def get_model_final_name(series):
    """Return the facet label for one row of the melted paraphrase table.

    Redefinition of the helper of the same name that appears earlier in the
    notebook, with the same logic.

    * GPT family: the model nickname.
    * Families whose name contains ``'OLMo'``: ``num_parameters`` unchanged.
    * Any other family: ``"<num_parameters>-Inst"`` when the row is flagged as
      instruction tuned, otherwise ``num_parameters`` unchanged.
    """
    family = series['Family']
    if family == 'GPT':
        return series['nicknames']
    if 'OLMo' in family:
        return series['num_parameters']
    if series['instruction_tuned']:
        return str(series['num_parameters']) + '-Inst'
    return series['num_parameters']



# Long format: one row per (model, sentence type) pair, accuracy in 'performance'.
paraphrase_df_melted = paraphrase_filtered_perf.melt(
    id_vars=['nicknames', 'Family', 'model'],
    var_name='sent_type',
    value_name='performance',
)
paraphrase_df_melted['conds'] = paraphrase_df_melted['sent_type'].map(sent_type_mapping)

# Per-row model metadata, computed by helpers defined elsewhere in the notebook.
paraphrase_model_names = paraphrase_df_melted['model'].to_list()
paraphrase_num_params = [
    get_model_parameters(model_id, model_to_family[model_id])
    for model_id in paraphrase_model_names
]
paraphrase_df_melted['num_parameters'] = paraphrase_num_params
paraphrase_df_melted['instruction_tuned'] = paraphrase_df_melted['model'].map(is_instruction_tuned)

# 'GP_prob' -> 'GP'; 'GP-Plaus.' -> 'Plausible' ('.' is expanded to 'ible').
paraphrase_df_melted['SentenceType'] = paraphrase_df_melted['sent_type'].map(
    lambda kind: kind.split('_')[0]
)
paraphrase_df_melted['Manipulation'] = paraphrase_df_melted['conds'].map(
    lambda label: label.split('-')[1].replace('.', 'ible')
)
paraphrase_df_melted['final_name'] = paraphrase_df_melted.apply(get_model_final_name, axis=1)


In [None]:
# Nicknames to plot in the paraphrase figures, per family. The key order is
# the order in which the per-family figures are generated.
all_models = {
    'GPT': [
        'GPT4', 'GPT4-T', 'GPT4-O', 'GPT4-O-M', 'O1-Mini', 'O1-Prev.',
    ],
    'Qwen-2.5': [
        'Qwen-14B', 'Qwen-32B', 'Qwen-32B-Inst',
        'Qwen-72B', 'Qwen-72B-Inst', 'Qwen-7B',
    ],
    'Gemma-2': [
        'Gemma-27B-Inst', 'Gemma-2B', 'Gemma-2B-Inst',
        'Gemma-9B', 'Gemma-9B-Inst',
    ],
    'Llama-3.2': [
        'Llama3.2-11B-Vis', 'Llama3.2-11B-Vis-Inst', 'Llama3.2-1B',
        'Llama3.2-3B', 'Llama3.2-90B-Vis', 'Llama3.2-90B-Vis-Inst',
    ],
}

# Caption shown under each family's figure.
titles = {
    'Qwen-2.5': 'Number of Parameters (Billions)',
    'OLMo-1b': 'Number of Tokens (Billions)',
    'OLMo-7b': 'Number of Tokens (Billions)',
    'Gemma-2': 'Number of Parameters (Billions)',
    'GPT': 'Model name',
    'Llama-3.2': 'Number of Parameters (Billions)',
}

In [None]:
# Flat list of every nickname selected in all_models, and the matching rows.
all_models_interest = [
    nickname
    for nicknames in all_models.values()
    for nickname in nicknames
]

paraphrase_df_melted_interest = paraphrase_df_melted[
    paraphrase_df_melted.nicknames.isin(all_models_interest)
]

In [None]:
# Families whose figure is taller, shows the legend and keeps its x tick labels.
legend_families = ("Llama-3.2", "Gemma-2")

figures = list()
for family, models in all_models.items():
    # Rows of this family restricted to the selected nicknames, in plotting order.
    in_family = (paraphrase_df_melted.Family == family) & paraphrase_df_melted.nicknames.isin(models)
    paraphrase_df_family = paraphrase_df_melted[in_family].sort_values(
        by=['Family', 'num_parameters', 'instruction_tuned'],
        ascending=[True, True, True],
    )

    # One facet per model; within a facet, bars grouped by manipulation and
    # coloured / hatched by sentence type.
    fig = px.bar(
        paraphrase_df_family,
        x='Manipulation',
        y='performance',
        color='SentenceType',
        pattern_shape="SentenceType",
        barmode='group',
        category_orders={"Manipulation": ["Plausible", "Implausible", "Reflexive"]},
        color_discrete_sequence=['#E69F00', '#56B4E9'],
        facet_col='final_name',
        facet_col_spacing=0.025,
        labels={'final_name': ''},
    )

    # Custom font and horizontal legend.
    legend_font = {'family': "Arial", 'color': "black", 'size': 13}
    fig.update_layout(
        font={'family': "Arial", 'color': "black", 'size': 16},
        legend={
            'orientation': 'h',
            'yanchor': 'bottom',
            'y': -0.61,
            'xanchor': 'center',
            'x': 0.5,
            'font': legend_font,
        },
    )

    fig.update_layout(
        yaxis={'title': 'Accuracy'},
        xaxis_title=titles[family],
        template='plotly_white',
        width=800,
        height=300,
    )

    # Same 0-1 accuracy scale, with ticks every 0.1, on every facet.
    fig.update_yaxes(
        range=[0, 1],
        tickvals=[step * 0.1 for step in range(11)],
        tickfont={'size': 16, 'color': "black"},
    )

    fig.update_layout(
        title={'text': family, 'x': 0.5, 'xanchor': 'center', 'font': {'size': 20}},
        showlegend=False,
    )

    # Llama-3.2 and Gemma-2 share one set of overrides; every other family uses
    # the compact layout with hidden tick labels.
    has_legend = family in legend_families
    if has_legend:
        fig.update_layout(showlegend=True, height=400)
    facet_label_y = -0.37 if has_legend else -0.14

    # Facet titles are split on '=' and only the part after it is kept; they are
    # moved below the axes.
    fig.for_each_annotation(
        lambda a: a.update(
            text=a.text.split('=')[1],
            font={'size': 16, 'color': "black"},
            y=facet_label_y,
            yanchor='bottom',
        )
    )
    fig.update_xaxes(
        tickfont={'size': 16, 'color': "black"},
        showticklabels=has_legend,
        title_text="",
        tickangle=40 if has_legend else 70,
    )

    # Single caption centred under the whole figure (per-axis titles were cleared above).
    fig.add_annotation(
        x=0.5,
        y=-0.38 if has_legend else -0.15,
        xref='paper',
        yref='paper',
        showarrow=False,
        text=titles[family],
        xanchor='center',
        yanchor='top',
        font={'size': 16},
    )

    figures.append(fig)

In [None]:
from PIL import Image

# Parameters (pixels)
fig_height = 300            # height of the regular panels
tall_fig_height = 400       # height of the last two panels
fig_width = 800
shared_section_height = 35  # height of the bottom strip cropped from a tall panel
blank_width = 250           # width of the patches pasted over the bottom strip

# Paths are relative to the project root, which the notebook's first cell
# selects with %cd (the other cells already rely on this).
figure_paths = [f"temp_files/paraphrase_figure_{idx}.png" for idx in range(len(figures))]

# Save each figure as a PNG image; the last two figures are the tall ones.
for idx, (fig, path) in enumerate(zip(figures, figure_paths)):
    height = fig_height if idx < len(figures) - 2 else tall_fig_height
    fig.write_image(path, width=fig_width, height=height)

images = [Image.open(path) for path in figure_paths]

# Bottom strip of the second-to-last panel: the full-width strip that is
# re-pasted once, and a narrow patch from its left edge used to cover others.
strip_source = images[-2]
strip_top = strip_source.height - shared_section_height
shared_img = strip_source.crop((0, strip_top, fig_width, strip_source.height))
blank_img = strip_source.crop((0, strip_top, blank_width, strip_source.height))

# Canvas dimensions: `cols` panels per row, every row regular except the last.
cols = 2
rows = (len(images) + (cols - 1)) // cols
max_width = fig_width * cols
max_height = (fig_height * (rows - 1) + tall_fig_height) - shared_section_height

combined_image = Image.new('RGB', (max_width, max_height + shared_section_height), color='white')

# Paste the panels row by row.
y_offset = 0
for row in range(rows):
    x_offset = 0
    for col in range(cols):
        idx = row * cols + col
        if idx < len(images):
            combined_image.paste(images[idx], (x_offset, y_offset))
            x_offset += fig_width
    y_offset += fig_height if row < rows - 1 else tall_fig_height

# Cover three regions of the bottom strip with the patch, then paste the
# full-width strip horizontally centred on the canvas. The 3 px lift is a
# hand-tuned offset (strip_y evaluates to 662 for this 2-row layout).
# NOTE(review): presumably this replaces the per-panel legends with one shared legend - confirm.
strip_y = max_height - 3
for blank_x in (250, 1050, 1250):
    combined_image.paste(blank_img, (blank_x, strip_y))
combined_image.paste(shared_img, (400, strip_y))

# Save the combined image as PDF
combined_image.save('experiments/gardenpath_10_24/analysis/plots/combined_paraphrase_image.pdf', "PDF", resolution=100.0)