In [None]:
import pandas as pd

In [None]:
# Read the WikiEdits corpus from its CSV export and display the loaded frame.
wiki_edits_csv = "../../datasets/wikiedits_multigec/wikiedits_multigec.csv"
wiki_edits_df = pd.read_csv(wiki_edits_csv)
wiki_edits_df

In [None]:
# Full language name -> short language code.
LANG_TO_CODE: dict[str, str] = {
    "czech": "cs",
    "english": "en",
    "estonian": "et",
    "german": "de",
    "greek": "el",
    "icelandic": "is",
    "italian": "it",
    "latvian": "lv",
    "slovene": "sl",
    "swedish": "sv",
    "ukrainian": "uk"
}

# Reverse lookup: short language code -> full language name.
CODE_TO_LANG = dict(zip(LANG_TO_CODE.values(), LANG_TO_CODE.keys()))

# Reshape the WikiEdits frame into the (language, feature, target, corpora) layout:
#   * `language` is looked up from `code_lang`; a code missing from
#     CODE_TO_LANG raises KeyError instead of silently producing NaN,
#   * the deleted text becomes `feature` and the inserted text becomes `target`,
#   * every row is tagged with its source corpus.
wiki_edits_df = (
    wiki_edits_df
    .assign(language=wiki_edits_df["code_lang"].map(CODE_TO_LANG.__getitem__))
    .loc[:, ["language", "text_del_clean", "text_ins_clean"]]
    .rename(columns={"text_del_clean": "feature", "text_ins_clean": "target"})
    .assign(corpora="wikiedits")
)

wiki_edits_df

In [None]:
def split_by_language(df, lang_col='language', split_col='split', train_frac=0.8, val_frac=0.1, seed=42):
    """Label every row as train/val/test, splitting each language independently.

    Rows are grouped by ``lang_col``. Each group is shuffled with ``seed`` and
    cut by position: the first ``int(n * train_frac)`` rows are labelled
    ``'train'``, the rows up to ``int(n * (train_frac + val_frac))`` are
    labelled ``'val'`` and the remainder ``'test'``. Boundaries are truncated,
    so very small groups may get no train/val rows (a single-row group ends up
    entirely in ``'test'``).

    Args:
        df: Input DataFrame. It is not modified.
        lang_col: Name of the column to group by.
        split_col: Name of the label column to write (replaced if it exists).
        train_frac: Fraction of each group labelled ``'train'``.
        val_frac: Fraction of each group labelled ``'val'``.
        seed: ``random_state`` used for the shuffle of every group.

    Returns:
        A new DataFrame holding the groups in sorted key order, shuffled
        within each group, with ``split_col`` added. The index restarts at 0
        inside each group, so it is not unique across groups. Rows whose
        ``lang_col`` value is missing are left out, because ``groupby`` skips
        NaN keys by default.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac!r}, val_frac={val_frac!r}"
        )

    def _assign_split(group):
        # Shuffle the group; the fresh 0..n-1 index makes positions explicit.
        shuffled = group.sample(frac=1, random_state=seed).reset_index(drop=True)

        total = len(shuffled)
        train_end = min(int(total * train_frac), total)
        val_end = min(int(total * (train_frac + val_frac)), total)

        # Build the labels positionally so the three segments always cover
        # every row exactly once, whatever the boundaries turn out to be.
        shuffled[split_col] = (
            ['train'] * train_end
            + ['val'] * (val_end - train_end)
            + ['test'] * (total - val_end)
        )
        return shuffled

    # Iterate over the groups instead of using groupby(...).apply(...):
    # applying over the grouping column is deprecated since pandas 2.2 and
    # pandas 3.0 drops that column from the frames handed to the callback,
    # which would remove `lang_col` from the result.
    parts = [_assign_split(group) for _, group in df.groupby(lang_col)]

    if not parts:
        # Nothing to split: return a copy that still carries the split column.
        out = df.copy()
        out[split_col] = pd.Series(dtype=object)
        return out

    return pd.concat(parts)

# Attach per-language train/val/test labels, keeping the function's default
# fractions and seed.
wiki_edits_df = split_by_language(df=wiki_edits_df, lang_col="language")
wiki_edits_df

In [None]:
# Number of non-null `feature` values per (language, split) pair.
wiki_edits_df.groupby(["language", "split"])["feature"].count()

In [None]:
# Number of non-null `feature` values per split, across all languages.
wiki_edits_df.groupby(["split"])["feature"].count()

In [None]:
# Track name; it selects the MultiGEC input file here and is reused in the
# name of the exported CSV at the end of the notebook.
track = "fluency"

multigec_csv = f"../../datasets/multigec/multigec_{track}.csv"
multigec_df = pd.read_csv(multigec_csv)
multigec_df

In [None]:
# Keep only the columns shared with the WikiEdits frame and tag the source corpus.
multigec_df = (
    multigec_df[["language", "feature", "target", "split"]]
    .assign(corpora="multigec")
)
multigec_df

In [None]:
# Stack both corpora row-wise, MultiGEC first. Each frame keeps its own index
# values here, so the combined index is not renumbered at this point.
corpus_frames = [multigec_df, wiki_edits_df]
omnigec_df = pd.concat(corpus_frames)
omnigec_df

In [None]:
# Number of non-null `feature` values per (split, corpus) pair in the combined data.
omnigec_df.groupby(["split", "corpora"])["feature"].count()

In [None]:
# Renumber the rows 0..n-1 and export the combined dataset for this track.
# NOTE(review): the index is written as an unnamed first column; confirm that
# downstream readers expect it before switching to index=False.
omnigec_csv = f"../../datasets/omnigec_model_training/multigec_wikiedits_{track}.csv"
omnigec_export_df = omnigec_df.reset_index(drop=True)
omnigec_export_df.to_csv(omnigec_csv)