In [None]:
import pandas as pd
import hvplot.pandas

# Read omnigec_minimal.csv (path is relative to the notebook's working
# directory) into a DataFrame and show it as the cell output.
omnigec_df = pd.read_csv("../../datasets/omnigec/omnigec_minimal.csv")
omnigec_df

In [None]:
# Display the current contents of omnigec_df.
omnigec_df

In [None]:
import tiktoken

# Encoding that tiktoken associates with the "gpt-4o-mini" model; count_tokens
# below uses it to measure text length in tokens.
tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

def count_tokens(x) -> int:
    """Return the number of tokens the module-level `tokenizer` yields for `x`.

    Args:
        x: A cell value from a text column. Anything that is not a `str`
            (e.g. NaN, list, int) is treated as zero-length.

    Returns:
        The token count of `x`, or 0 when `x` is not a string.
    """
    if not isinstance(x, str):
        return 0
    # By default tiktoken raises ValueError when the text contains a
    # special-token string such as "<|endoftext|>". Corpus text is plain data,
    # so disable that check and let such strings be encoded as ordinary text.
    return len(tokenizer.encode(x, disallowed_special=()))

# Token count for every value in the `feature` column, stored as `tokens`;
# the frame is then shown as the cell output.
omnigec_df['tokens'] = omnigec_df['feature'].map(count_tokens)
omnigec_df

In [None]:
import hvplot.pandas  # register hvplot
import holoviews as hv

# One overlay per language, each holding one KDE curve of `tokens` per corpus.
plots = []
for lang in omnigec_df['language'].unique():
    lang_rows = omnigec_df.loc[omnigec_df['language'] == lang]
    # Each curve carries its corpus name as label, which gives it its own
    # legend entry in the overlay.
    corpus_curves = [
        lang_rows.loc[lang_rows['corpora'] == corp].hvplot.kde(
            y='tokens',
            label=str(corp),
            hover=False,
        )
        for corp in lang_rows['corpora'].unique()
    ]
    plots.append(
        hv.Overlay(corpus_curves).opts(
            title=str(lang),
            # sharex=True,
            # sharey=True,
            legend_position='top_right',
            width=1000,
            height=200,
        )
    )

# Single column => the per-language overlays are stacked vertically.
layout = hv.Layout(plots).cols(1)
layout


In [None]:
# Rebuild the working frame from disk: the minimal-edit data plus the
# "multigec" rows of the fluency-edit data. The two MultiGEC variants get
# distinct `corpora` labels so they can be told apart after the merge.
omnigec_df = pd.read_csv("../../datasets/omnigec/omnigec_minimal.csv")
omnigec_df.loc[omnigec_df["corpora"] == "multigec", "corpora"] = "multigec-minimal"

# Filter and relabel in one chain. `.assign` returns a new frame, so nothing is
# written into a slice of the loaded CSV (avoids SettingWithCopyWarning).
# After the filter every remaining row is "multigec", so all rows are relabelled.
omnigec_fluency_df = pd.read_csv("../../datasets/omnigec/omnigec_fluency.csv")
omnigec_fluency_df = omnigec_fluency_df.loc[
    omnigec_fluency_df["corpora"] == "multigec"
].assign(corpora="multigec-fluency")

# ignore_index=True gives the combined frame unique row labels instead of
# repeating each source frame's 0..n index.
omnigec_df = pd.concat([omnigec_df, omnigec_fluency_df], ignore_index=True)

# The plotting cells below read `tokens`. The frame that carried the earlier
# token counts was just replaced by the fresh read above, and the fluency rows
# were never counted, so compute the column for the combined frame here.
# NOTE: this re-tokenizes every row and can be slow on large corpora.
omnigec_df["tokens"] = omnigec_df["feature"].apply(count_tokens)

In [None]:
# Display omnigec_df after the minimal/fluency merge.
omnigec_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FuncFormatter

# 0) Trim extreme outliers per language: keep only rows whose token count is
#    at or below that language's 99.99th percentile.
#    NOTE(review): 0.9999 drops roughly the top 0.01% per language; if a 0.5%
#    trim is what is wanted, the cutoff should be 0.995 — confirm.
TOKEN_QUANTILE_CUTOFF = 0.9999

# `transform` broadcasts each language's cutoff back onto its rows, so the
# result lines up row-for-row with omnigec_df for the comparison below.
token_cutoff_per_row = omnigec_df.groupby('language')['tokens'].transform(
    lambda lang_tokens: lang_tokens.quantile(TOKEN_QUANTILE_CUTOFF)
)
df_filtered = omnigec_df[omnigec_df['tokens'] <= token_cutoff_per_row]

# 1) Seaborn theme with grid lines on a white background.
sns.set_style("whitegrid")

# 2) One facet row per language, one hue per corpus. legend_out=False keeps
#    the legend inside the plotting area.
g = sns.FacetGrid(
    df_filtered,
    row="language",
    hue="corpora",
    height=1.25,
    aspect=6,
    sharex=True,
    sharey=True,
    legend_out=False,
)

# 3) KDE of `tokens` for each corpus; clip=(0, None) keeps the estimate from
#    extending below zero.
g.map(sns.kdeplot, "tokens", clip=(0, None), common_norm=False)

# 4) Blank out per-facet axis labels; figure-level labels are added in step 7.
g.set_axis_labels("", "")

# 5) Minor ticks plus minor/major grid lines on every facet, and pin the left
#    x-limit at 0 (the right limit is not set here).
grid_styles = {
    "minor": {"linestyle": ":", "linewidth": 0.5},
    "major": {"linestyle": "--", "linewidth": 0.8},
}
for facet_ax in g.axes.flat:
    facet_ax.minorticks_on()
    for which, line_style in grid_styles.items():
        facet_ax.grid(which=which, **line_style)
    facet_ax.set_xlim(left=0)

# 6) Bottom facet: major ticks every 500, minor ticks every 100, minor ticks
#    labelled as integers, and both label sets rotated by 45 degrees.
last_row_ax = g.axes[-1, 0]
x_axis = last_row_ax.xaxis
x_axis.set_major_locator(MultipleLocator(500))
x_axis.set_minor_locator(MultipleLocator(100))
x_axis.set_minor_formatter(FuncFormatter(lambda value, _pos: str(int(value))))
x_tick_style = dict(axis='x', labelbottom=True, labelrotation=45, labelsize=12)
last_row_ax.tick_params(which='major', **x_tick_style)
last_row_ax.tick_params(which='minor', pad=5, **x_tick_style)

# 7) Figure-level axis labels, with a wider left margin and tighter row spacing.
#    NOTE(review): plt.tight_layout() in step 10 recomputes the subplot
#    parameters, so these manual left/hspace values are probably overridden —
#    confirm against the rendered figure.
fig = g.fig
fig.subplots_adjust(left=0.18, hspace=0.1)
fig.supylabel("Token Length Probability", x=0.02, fontsize=14)
fig.supxlabel("Token Length", y=0, fontsize=14)

# 8) Row titles show only the language name, left-aligned, in a larger font.
title_style = dict(size=14, x=0.0, y=1.0, ha='left')
g.set_titles(row_template="{row_name}", **title_style)

# 9) One framed legend for the corpora, with larger text.
legend_style = dict(
    loc="upper left",
    bbox_to_anchor=(0.95, 0.95),
    frameon=True,
    fontsize=12,
    title_fontsize=14,
)
g.add_legend(title="Corpora", **legend_style)

# 10) Final layout pass, write the PNG, then render.
plt.tight_layout()
plt.savefig("kde_per_language_and_corpora_clipped_largerfonts.png")
plt.show()


In [None]:
# English rows of the WikiEdits corrections file, with the two text columns
# renamed to `feature` / `target` and tagged with fixed `corpora` / `language`
# values. Everything is built in one chain ending in `.assign`, which returns a
# new frame, so no column is written into a filtered slice (the pattern that
# triggers pandas' SettingWithCopyWarning).
wiki_english = (
    pd.read_csv("../../datasets/wikiedits/wiki_correction_v2.csv")
    .rename(columns={
        "text_del_clean": "feature",
        "text_ins_clean": "target",
    })
    .loc[:, ["feature", "target", "code_lang"]]
    .loc[lambda frame: frame["code_lang"] == "en"]
    .assign(corpora="wikiedits", language="english")
)
wiki_english

In [None]:
omnigec_ua_df = pd.read_csv("../../datasets/omnigec/omnigec_plus_ua_minimal.csv")

# Rows to append: the "ubertext" corpus from the UA file plus the English
# WikiEdits rows, in that order.
extra_rows = pd.concat(
    [
        omnigec_ua_df.loc[omnigec_ua_df["corpora"] == "ubertext"],
        wiki_english,
    ],
    ignore_index=True,
)

# Nothing earlier in the notebook computes `tokens` for these rows. Without
# this they would enter the combined frame with missing token counts and
# contribute nothing to the token-length plots.
# Assumes both sources expose the text in a `feature` column — confirm for
# omnigec_plus_ua_minimal.csv.
extra_rows["tokens"] = extra_rows["feature"].apply(count_tokens)

# ignore_index=True on both steps leaves a clean 0..n-1 index after the
# duplicates are dropped.
omnigec_df = pd.concat(
    [omnigec_df, extra_rows],
    ignore_index=True,
).drop_duplicates(ignore_index=True)

In [None]:
# Display omnigec_df after appending the ubertext and wikiedits rows.
omnigec_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# 0) Display names for the corpora. The dict is written in plotting order, so
#    `order` below is simply its values.
rename_map = {
    'multigec-minimal': 'MultiGEC-minimal',
    'multigec-fluency': 'MultiGEC-fluency',
    'wikiedits': 'WikiEdits-MultiGEC',
    'reddit': 'Reddit-MultiGEC',
    'ubertext': "UberText-GEC",
}
order = list(rename_map.values())

# Corpora without an entry in rename_map end up as NaN in `corpora_display`.
omnigec_df['corpora_display'] = omnigec_df['corpora'].map(rename_map)

# 1) Language names with the first letter upper-cased, used as facet titles.
omnigec_df['language_display'] = omnigec_df['language'].str.capitalize()

sns.set_style("whitegrid")

# 2) Fixed palette keyed by display name, so the manual legend in step 6 can
#    reuse exactly the colors the boxes are drawn with.
palette = dict(zip(order, sns.color_palette(n_colors=len(order))))

# 3) Horizontal box plots of token length: one facet row per language, one box
#    per corpus. `order`/`hue_order` fix the corpus order in every facet.
g = sns.catplot(
    data=omnigec_df,
    row="language_display",
    y="corpora_display",
    x="tokens",
    hue="corpora_display",
    palette=palette,
    kind="box",
    orient="h",
    sharex=True,
    sharey=True,
    order=order,
    hue_order=order,
    height=1.5,
    aspect=8,
)

# 4) Row titles show only the language name; per-facet axis labels are blanked
#    in favour of the figure-level labels below.
g.set_titles(row_template="{row_name}", size=14, y=1.0, x=0.0, ha='left')
g.set_axis_labels("", "")

# NOTE(review): plt.tight_layout() at the end recomputes the subplot
# parameters, so the manual left/top values here are probably overridden —
# confirm against the rendered figure.
fig = g.fig
fig.subplots_adjust(left=0.18, top=0.92)
fig.supylabel("Corpora", x=0, fontsize=14)
fig.suptitle("Boxplot of Token Length by Corpora and Language", fontsize=16)
# A single supxlabel call: a second call would only overwrite the text and
# position set by the first, so just the final placement (y=0) is kept.
fig.supxlabel("Token Length", y=0, fontsize=14)

# 5) Drop the legend seaborn may have attached, so it can be replaced by the
#    manually placed one below.
if g._legend:
    g._legend.remove()

# 6) Manual legend: one colored patch per corpus, anchored in figure
#    coordinates near the top-right corner.
proxies = [Patch(facecolor=palette[label]) for label in order]
fig.legend(
    proxies,
    order,
    title="Corpora",
    loc="upper right",
    bbox_to_anchor=(0.95, 0.95),
    bbox_transform=fig.transFigure,
    frameon=True,
    fontsize=12,
    title_fontsize=14,
    borderaxespad=0.0,
)

# 7) Final layout pass, write the PNG at 300 dpi, then render.
plt.tight_layout()
plt.savefig("box_per_language_and_corpora.png", dpi=300)
plt.show()
