### MELU type data

- To create MELU data (where each entity's forget samples are paired with its respective retain samples), we combine the forget and retain sets with simple DataFrame operations. First we pair only the directly and indirectly connected retain data, and then we randomly pair the general-knowledge retain samples with forget samples.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the two QA tables used below: the full retain set and the forget set.
retain = pd.read_csv('./data/full_retain_qa.csv')
forget = pd.read_csv('./data/dpo_forget_idk.csv')

In [None]:
# The `idk` column is not needed for now, so remove it from the forget table in place.
forget.drop(columns=['idk'], inplace=True)

# Split the retain rows on `type`: everything that is not 'general'
# (the direct + indirect retain samples) versus the 'general' rows.
other_retain = retain[retain['type'].ne('general')]
general_retain = retain[retain['type'].eq('general')]

In [None]:
# Outer-join the forget rows with the non-general retain rows on `title`.
# Every forget row is matched with every retain row sharing its title, so the
# result is a very large table full of repeated rows.
merged_df1 = forget.merge(
    other_retain,
    how='outer',
    on='title',
    suffixes=('_forget', '_retain'),
)

In [None]:
def cyclic_pair_and_concat(forget: pd.DataFrame,
                           retain: pd.DataFrame,
                           on: str = 'title',
                           suffixes=('_forget', '_retain')) -> pd.DataFrame:
    """
    Pair forget rows with retain rows that share the same value of `on`.

    For every key present in BOTH frames, the shorter of the two per-key
    sub-frames is repeated cyclically (row 0, 1, ..., 0, 1, ...) until it is as
    long as the longer one; the two are then placed side by side, row i of one
    next to row i of the other.

    Columns coming from `forget` get `suffixes[0]`, columns coming from
    `retain` get `suffixes[1]`. The key column appears once, unsuffixed.

    Keys that occur in only one of the frames are skipped. Keys are processed
    in the order they first appear in `forget`, so the output row order is
    reproducible from run to run.

    Returns the concatenation of all per-key pairings with a fresh 0..n-1
    index. If no key is shared, an empty DataFrame with the expected columns
    is returned.
    """
    fg_suffix, rt_suffix = suffixes[0], suffixes[1]
    fg_key_col = f"{on}{fg_suffix}"
    rt_key_col = f"{on}{rt_suffix}"

    # Split retain once up front instead of re-filtering it for every key.
    retain_by_key = {key: grp for key, grp in retain.groupby(on, sort=False)}

    out_dfs = []
    # sort=False keeps first-appearance order, unlike iterating over a set.
    for key, fg in forget.groupby(on, sort=False):
        rt = retain_by_key.get(key)
        if rt is None:
            # Nothing on the retain side to pair with.
            continue

        n_fg, n_rt = len(fg), len(rt)
        n_rows = max(n_fg, n_rt)

        # i % n is the identity for the longer side and cycles the shorter one.
        fg_cycle = fg.iloc[[i % n_fg for i in range(n_rows)]].reset_index(drop=True)
        rt_cycle = rt.iloc[[i % n_rt for i in range(n_rows)]].reset_index(drop=True)

        # Suffix everything, then restore the plain key name on the forget side
        # and drop the redundant key column on the retain side.
        fg_cycle = fg_cycle.add_suffix(fg_suffix).rename(columns={fg_key_col: on})
        rt_cycle = rt_cycle.add_suffix(rt_suffix).drop(columns=[rt_key_col])

        out_dfs.append(pd.concat([fg_cycle, rt_cycle], axis=1))

    if not out_dfs:
        # pd.concat([]) raises; hand back an empty frame with the same layout.
        fg_cols = [on if col == on else f"{col}{fg_suffix}" for col in forget.columns]
        rt_cols = [f"{col}{rt_suffix}" for col in retain.columns if col != on]
        return pd.DataFrame(columns=fg_cols + rt_cols)

    return pd.concat(out_dfs, ignore_index=True)

In [None]:
# Build the title-wise forget/retain pairing from the non-general retain rows.
new_df = cyclic_pair_and_concat(forget=forget, retain=other_retain)

extending to general retain

- each general retain sample is paired with a randomly chosen forget sample

In [None]:
def extend_with_general_retain_only_paired(
    new_df: pd.DataFrame,
    general_retain_df: pd.DataFrame,
    question_key: str = "question",
    answer_key: str   = "answer",
    random_state: int = None
) -> pd.DataFrame:
    """
    Append one forget/retain pair to `new_df` for every general retain row.

    For each row of `general_retain_df`, one (question, answer) pair is drawn
    at random from the forget-side columns of `new_df`
    (`{question_key}_forget`, `{answer_key}_forget`) and combined with that
    row's `question_key` / `answer_key` values, which are stored under
    `{question_key}_retain` / `{answer_key}_retain`.

    Parameters
    ----------
    new_df : frame that already holds the four `*_forget` / `*_retain` columns.
    general_retain_df : frame with `question_key` and `answer_key` columns.
    question_key, answer_key : base column names.
    random_state : seed for the sampling; the same seed and inputs give the
        same output.

    Returns
    -------
    `new_df` (index reset) followed by the new rows. The new rows only carry
    the four QA columns; any other column of `new_df` is NaN for them.

    Raises
    ------
    ValueError
        If there are general retain rows but `new_df` has no rows to sample.
    """
    q_forget, a_forget = f"{question_key}_forget", f"{answer_key}_forget"
    q_retain, a_retain = f"{question_key}_retain", f"{answer_key}_retain"

    base = new_df.reset_index(drop=True)
    if general_retain_df.empty:
        # Nothing to append.
        return base

    # Forget-side pool the random pairs are drawn from.
    fg_pool = new_df[[q_forget, a_forget]]
    if fg_pool.empty:
        raise ValueError("new_df has no forget rows to sample from")

    # `pd.np` no longer exists in current pandas; use numpy directly.
    # One shared generator so successive draws differ but stay reproducible.
    rng = np.random.RandomState(random_state)

    extras = []
    for retain_q, retain_a in zip(general_retain_df[question_key],
                                  general_retain_df[answer_key]):
        src = fg_pool.sample(n=1, random_state=rng).iloc[0]
        extras.append({
            q_forget: src[q_forget],
            a_forget: src[a_forget],
            q_retain: retain_q,
            a_retain: retain_a,
        })

    return pd.concat([base, pd.DataFrame(extras)], ignore_index=True)

In [None]:
# Add one randomly paired row per general retain sample; the seed is fixed at 42.
extended_df = extend_with_general_retain_only_paired(
    new_df,
    general_retain,
    question_key="question",
    answer_key="answer",
    random_state=42,
)

In [None]:
# Write the paired dataset to disk without the index column.
extended_df.to_csv(path_or_buf='melu_data.csv', index=False)

### balanced dataset creation

In [None]:
# Read the full retain QA table again and select its 'domain'-type rows.
full_r = pd.read_csv('./data/full_retain_qa.csv')
domain_r = full_r[full_r['type'].eq('domain')]


In [None]:
def downsample_group(g, reference=None):
    """
    Downsample a 'domain' group to the number of 'entity' rows of its title.

    Meant to be used with `groupby(['title', 'type']).apply(...)`, which sets
    `g.name` to the `(title, type)` tuple of the group.

    Parameters
    ----------
    g : one group of rows; `g.name` must be a `(title, type)` tuple.
    reference : frame with `title` and `type` columns in which the entity
        rows are counted. Defaults to the module-level `full_r`.

    Returns
    -------
    For a 'domain' group: a random sample (seed 42) of at most as many rows
    as the title has 'entity' rows. If the group has fewer rows than that,
    the whole group is kept rather than raising. Any other group is returned
    unchanged.
    """
    title, group_type = g.name[0], g.name[1]
    if group_type != 'domain':
        return g

    source = full_r if reference is None else reference
    is_entity_of_title = (source['title'] == title) & (source['type'] == 'entity')
    n_entity = int(is_entity_of_title.sum())

    # sample() without replacement cannot draw more rows than the group holds.
    return g.sample(n=min(n_entity, len(g)), random_state=42)


In [None]:
# Run downsample_group on every (title, type) group of the retain table and
# renumber the rows of the combined result from 0.
df_balanced2 = full_r.groupby(['title', 'type'], group_keys=False).apply(downsample_group).reset_index(drop=True)

In [None]:
# Save the balanced retain table to disk without the index column.
df_balanced2.to_csv(path_or_buf='balanced_retain.csv', index=False)