In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

# 1) Build the DataFrame
# Each corpus carries its own row of counts, listed in the same order as
# `languages`. Keeping a corpus next to its numbers (instead of three parallel
# lists sized by hand) lets the alignment be checked explicitly below.
languages = ['Czech', 'English', 'Estonian', 'German', 'Greek',
             'Icelandic', 'Italian', 'Latvian', 'Slovenian', 'Swedish', 'Ukrainian']
counts_by_dataset = {
    'MultiGEC-minimal':   [12564,  5050,   258,  1033, 1289,   0,   813, 1015,  109,  502,   1872],
    'MultiGEC-fluency':   [    0,     0,  1761,     0,    0, 369,     0,    0,    0,    0,   1872],
    # NOTE(review): for every language except English this row equals
    # WikiEdits-MultiGEC + Reddit-MultiGEC. For English, 6807 + 51671 != 51671,
    # so one of the three English figures is probably a copy-paste error.
    # Confirm against the source statistics before publishing the figure.
    'OmniGEC':            [18402, 51671, 10728, 46937, 8196, 220, 16440, 6869, 9160, 2546,   7551],
    'WikiEdits-MultiGEC': [  511,  6807,    55,  2386,  212,   0,  3142,   33,   38,  293,   2519],
    'Reddit-MultiGEC':    [17891, 51671, 10673, 44551, 7984, 220, 13298, 6836, 9122, 2253,   5032],
    'UberText-GEC':       [    0,     0,     0,     0,    0,   0,     0,    0,    0,    0, 199951],
}

# Fail loudly if a row has the wrong number of entries; otherwise counts
# would silently shift onto the wrong language.
misaligned = {
    name: len(counts)
    for name, counts in counts_by_dataset.items()
    if len(counts) != len(languages)
}
if misaligned:
    raise ValueError(
        f"Expected {len(languages)} counts per dataset, got {misaligned}"
    )

# Long-form columns: one (Dataset, Language, Count) triple per table cell.
data = {
    'Dataset': [name for name in counts_by_dataset for _ in languages],
    'Language': languages * len(counts_by_dataset),
    'Count': [count for counts in counts_by_dataset.values() for count in counts],
}
df = pd.DataFrame(data)

# 2) Drop the real "OmniGEC" rows (we use it as a meta-group)
df = df[df['Dataset'] != 'OmniGEC']

# 3) Pivot to a Language x Dataset table and sort languages by descending total
pivot = df.pivot(index='Language', columns='Dataset', values='Count')
order = pivot.sum(axis=1).sort_values(ascending=False).index
pivot = pivot.loc[order]

# 4) Reorder so UberText-GEC is the last column
cols = [c for c in pivot.columns if c != 'UberText-GEC'] + ['UberText-GEC']
pivot = pivot[cols]

# 5) Mask zero counts: they cannot be placed on a log color scale, and masked
#    cells are drawn with the colormap's "bad" color instead of a data color.
masked = np.ma.masked_equal(pivot.values, 0)

# 6) Plot heatmap
# The color scale is logarithmic and spans the smallest to the largest
# unmasked count; the limits are computed once and reused for the colorbar.
fig, ax = plt.subplots(figsize=(10, 8))
vmin, vmax = masked.min(), masked.max()
norm = LogNorm(vmin=vmin, vmax=vmax)
cax = ax.matshow(masked, aspect='auto', cmap='Blues', norm=norm, alpha=0.5)

# 7) Colorbar with 5 log-spaced ticks (formatted with commas)
cbar = fig.colorbar(cax, ax=ax, label='Samples (log scale)')
tick_locs = np.logspace(np.log10(vmin), np.log10(vmax), num=5)
cbar.set_ticks(tick_locs)
# Round instead of truncating: 10 ** log10(x) can land a hair below x, and
# int() would then label the end ticks one less than the true min/max count.
cbar.set_ticklabels([f"{round(float(t)):,}" for t in tick_locs])

# 8) Individual corpus labels at bottom
n_cols = pivot.shape[1]
ax.set_xticks(np.arange(n_cols))
ax.set_xticklabels(pivot.columns, ha='center', rotation=0)
ax.xaxis.set_ticks_position('bottom')
ax.tick_params(axis='x', bottom=True, top=False)

# 9) Meta-group labels above
# The first `first_group` columns are labelled 'MultiGEC-25 Comp.', the
# remaining ones 'OmniGEC'. This relies on the column order of `pivot`.
first_group = 2
second_group = n_cols - first_group
frac1 = first_group / n_cols
frac2 = second_group / n_cols

# Each label is centred over its group, in axes-fraction coordinates,
# slightly above the top edge of the heatmap.
ax.text(frac1/2, 1.025, 'MultiGEC-25 Comp.',
        transform=ax.transAxes, ha='center', va='bottom', weight='bold')
ax.text(frac1 + frac2/2, 1.025, 'OmniGEC',
        transform=ax.transAxes, ha='center', va='bottom', weight='bold')

# Cell centres sit on integer coordinates, so the boundary between the two
# groups lies half a cell to the left of the second group's first column.
ax.axvline(x=first_group - 0.5, color='black', linewidth=1)

# 10) Language labels on left
ax.set_yticks(np.arange(pivot.shape[0]))
ax.set_yticklabels(pivot.index)

# 11) Annotate cells with commas for thousands; zero counts show a dash
for i in range(pivot.shape[0]):
    for j in range(n_cols):
        val = pivot.iat[i, j]
        txt = f"{val:,}" if val > 0 else "–"
        ax.text(j, i, txt, ha='center', va='center', fontsize=9)

plt.tight_layout()
plt.savefig("corpus_data.png")
plt.show()


In [None]:
# Load the UberText CSV and display it as the cell output.
# Note: this rebinds `df`, which the plotting cell above also assigns.
df = pd.read_csv(
    filepath_or_buffer="../../datasets/ubertext/ubertext_social_silver_post_slim.csv",
)
df

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns

# re‑use the same df_long from above

# NOTE(review): `df_long` is not created in any cell visible in this notebook;
# it must already exist in the kernel with the columns used below ('Split',
# 'Language', 'Dataset', 'Count'). Confirm it is defined before this cell on a
# fresh Restart & Run All.
splits = ['train', 'val', 'test']

# One Language x Dataset matrix per split.
mats = [
    df_long[df_long['Split'] == split]
    .pivot(index='Language', columns='Dataset', values='Count')
    for split in splits
]

# Only the last panel draws a colorbar, so all panels must share one color
# scale. Without explicit limits each heatmap picks its own range and the
# single colorbar would be wrong for the other panels.
vmin = min(mat.min().min() for mat in mats)
vmax = max(mat.max().max() for mat in mats)

# NOTE(review): sharey=True assumes every split has the same set of languages
# in the same order; verify against df_long.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)
for ax, split, mat in zip(axes, splits, mats):
    sns.heatmap(mat,
                ax=ax,
                annot=True, fmt=".0f",
                vmin=vmin, vmax=vmax,       # shared scale across panels
                cbar=ax is axes[-1],        # only one colorbar
                cmap="viridis",
                linewidths=.5)
    ax.set_title(split.capitalize())
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()
