In [2]:
# Text Normalization Challenge - English Language
# Plan:
# - Inspect provided files and sample submission format.
# - Load train/test from zipped CSVs using pandas.
# - EDA: show columns, sizes, and a few samples.
# - Build a frequency-based mapper: for each (class, before) choose the most frequent 'after'.
# - Apply mapper to test; default backoff: after = before when unseen.
# - Save predictions to submission.csv in required format.

import os, zipfile, pandas as pd, numpy as np
pd.set_option('display.max_columns', 100)
print('Files in CWD:', os.listdir())

from zipfile import ZipFile

# Peek sample submission structure
ss_zip = 'en_sample_submission_2.csv.zip'
with ZipFile(ss_zip) as z:
    ss_name = z.namelist()[0]
    with z.open(ss_name) as f:
        sample_sub = pd.read_csv(f)
print('Sample submission shape:', sample_sub.shape)
print(sample_sub.head())

# Peek train/test columns
def read_head(zip_path, n=5):
    with ZipFile(zip_path) as z:
        name = z.namelist()[0]
        with z.open(name) as f:
            return pd.read_csv(f, nrows=n)

train_head = read_head('en_train.csv.zip', n=10)
test_head = read_head('en_test_2.csv.zip', n=10)
print('Train head:')
print(train_head)
print('Test head:')
print(test_head)

Files in CWD: ['main.ipynb', 'agent_metadata', 'en_test_2.csv.zip', 'en_sample_submission_2.csv.zip', 'requirements.txt', 'task.txt', 'en_train.csv.zip', 'docker_run.log', 'description.md']


Sample submission shape: (993465, 2)
    id     after
0  0_0     Rocky
1  0_1  Mountain
2  0_2      News
3  0_3         .
4  1_0         "
Train head:
   sentence_id  token_id  class         before          after
0            0         0  PLAIN  Brillantaisia  Brillantaisia
1            0         1  PLAIN             is             is
2            0         2  PLAIN              a              a
3            0         3  PLAIN          genus          genus
4            0         4  PLAIN             of             of
5            0         5  PLAIN          plant          plant
6            0         6  PLAIN             in             in
7            0         7  PLAIN         family         family
8            0         8  PLAIN    Acanthaceae    Acanthaceae
9            0         9  PUNCT              .              .
Test head:
   sentence_id  token_id     before
0            0         0      Rocky
1            0         1   Mountain
2            0         2       News
3           

In [None]:
import time
from collections import Counter, defaultdict
from zipfile import ZipFile
import pandas as pd
import numpy as np

t0 = time.time()
print('Building memorizer from train...')
cnt = Counter()
rows_processed = 0
with ZipFile('en_train.csv.zip') as z:
    name = z.namelist()[0]
    for i, ch in enumerate(pd.read_csv(z.open(name), usecols=['before','after'], dtype='string', chunksize=2_000_000)):
        # Drop rows with missing before/after to avoid pd.NA comparison issues
        ch = ch.dropna(subset=['before','after'])
        cnt.update(zip(ch['before'].tolist(), ch['after'].tolist()))
        rows_processed += len(ch)
        if i % 1 == 0:
            print(f'  Chunk {i}, cumulative rows: {rows_processed:,}, unique pairs: {len(cnt):,}, elapsed: {time.time()-t0:.1f}s', flush=True)

print('Consolidating most frequent after per before...')
by_before = defaultdict(list)
for (b,a), c in cnt.items():
    by_before[b].append((c, a))
most = {b: max(v)[1] for b, v in by_before.items()}
# Unchanged-majority set (safe after NA drop) for potential rule gating later
unchanged = set()
for b, v in by_before.items():
    total = 0
    top_c, top_a = 0, None
    for c, a in v:
        total += c
        if c > top_c:
            top_c, top_a = c, a
    if total > 0 and top_a == b and (top_c / total) >= 0.9:
        unchanged.add(b)
print(f'Memorizer size: {len(most):,} befores. Unchanged-majority size: {len(unchanged):,}. Elapsed: {time.time()-t0:.1f}s')

print('Loading test and generating baseline predictions...')
with ZipFile('en_test_2.csv.zip') as zt:
    name_t = zt.namelist()[0]
    test = pd.read_csv(zt.open(name_t), dtype={'sentence_id':np.int32,'token_id':np.int32,'before':'string'})
print('Test shape:', test.shape)
pred = test['before'].map(most).fillna(test['before'])
sub = pd.DataFrame({
    'id': test['sentence_id'].astype(str) + '_' + test['token_id'].astype(str),
    'after': pred
})
sub.to_csv('submission.csv', index=False)
changed = (sub['after'] != test['before']).mean()
covered = test['before'].isin(most.keys()).mean()
print(f'Saved submission.csv. Covered by memorizer: {covered:.3%}. Changed fraction: {changed:.3%}. Total time: {time.time()-t0:.1f}s')

Building memorizer from train...


  Chunk 0, cumulative rows: 1,999,973, unique pairs: 175,079, elapsed: 2.9s


  Chunk 1, cumulative rows: 3,999,946, unique pairs: 274,252, elapsed: 5.8s
