# Plan & Experiment Log

Objective: Win a medal (>= 0.97592 accuracy) in Russian Text Normalization.

Strategy (iterative, time-boxed):
- Baseline: Context-aware memorization.
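  - Build mapping from (semiotic_class, token, prev_token, next_token) -> most frequent normalized form ("after").
    - Caveat: the test file has no `class` column (only sentence_id, token_id, before), so class-keyed lookups require a predicted class or must be dropped; the implemented baseline keys on (token, prev_token, next_token) only.
  - Backoffs: (semiotic_class, token) -> after, then (token) -> after.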
  - Final fallback: identity (return the raw token) for safety; add simple class heuristics later if needed.
- Modeling upgrade (if needed for medal):
  - Train CatBoost/LightGBM classifier for ambiguous cases (keys with multiple labels) using contextual features.
  - Add class-specific regex/rules for NUMBERS, DATES, ORDINALS, etc., only where memory fails.

Planned Milestones:
1) Data load + schema inspection.
2) Baseline memorization mapper + CV on train (simulate test via sentence-level split).
3) Inference on test, create submission.csv, quick score via local validation.
4) Error analysis: where backoff hits, add lightweight heuristics or model.

Experiment Log:
- [T0] Init plan. Next: load files, confirm columns in ru_train.csv.zip / ru_test_2.csv.zip, sample submission format.

Notes:
- Always log progress and timings.
- Use expert review after major steps (planning, EDA, first baseline, post-score).

In [1]:
# Quick data inspection: list files, inspect sample submission and test schema
import os, sys, time, zipfile, pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 50)
print('CWD:', os.getcwd())
print('Files:', os.listdir())

t0 = time.time()
sample_path = 'ru_sample_submission_2.csv.zip'
test_path = 'ru_test_2.csv.zip'
train_path = 'ru_train.csv.zip'

print('\nReading sample submission...')
sample_sub = pd.read_csv(sample_path)
print('Sample submission shape:', sample_sub.shape)
print(sample_sub.head(3))
print('Sample submission columns:', list(sample_sub.columns))

print('\nReading test head...')
test_df = pd.read_csv(test_path, nrows=50000)
print('Test head shape:', test_df.shape)
print(test_df.head())
print('Test columns:', list(test_df.columns))

print('\nReading train head...')
train_df = pd.read_csv(train_path, nrows=50000)
print('Train head shape:', train_df.shape)
print(train_df.head())
print('Train columns:', list(train_df.columns))

print(f'Elapsed: {time.time()-t0:.2f}s', flush=True)

CWD: /app/agent_run_states/text-normalization-challenge-russian-language-spray-20250909-024401
Files: ['main.ipynb', 'agent_metadata', 'ru_sample_submission_2.csv.zip', 'requirements.txt', 'task.txt', 'ru_test_2.csv.zip', 'docker_run.log', 'description.md', 'ru_train.csv.zip']

Reading sample submission...


Sample submission shape: (1059191, 2)
    id        after
0  0_0       Теперь
1  0_1          все
2  0_2  уважительно
Sample submission columns: ['id', 'after']

Reading test head...
Test head shape: (50000, 3)
   sentence_id  token_id       before
0            0         0       Теперь
1            0         1          все
2            0         2  уважительно
3            0         3        зовут
4            0         4      Ямамото
Test columns: ['sentence_id', 'token_id', 'before']

Reading train head...
Train head shape: (50000, 5)
   sentence_id  token_id  class     before  \
0            0         0  PLAIN         По   
1            0         1  PLAIN  состоянию   
2            0         2  PLAIN         на   
3            0         3   DATE   1862 год   
4            0         4  PUNCT          .   

                                    after  
0                                      По  
1                               состоянию  
2                                      на  
3  т

In [2]:
# Baseline: context-aware memorization with backoffs using before, prev_before, next_before
import pandas as pd, numpy as np, time, os, pickle
from sklearn.model_selection import GroupKFold

t0 = time.time()
train_path = 'ru_train.csv.zip'
test_path = 'ru_test_2.csv.zip'

print('Loading full train/test...')
dtype_train = {'sentence_id':'int32', 'token_id':'int32', 'class':'category', 'before':'string', 'after':'string'}
dtype_test = {'sentence_id':'int32', 'token_id':'int32', 'before':'string'}
train = pd.read_csv(train_path, dtype=dtype_train)
test = pd.read_csv(test_path, dtype=dtype_test)
print('Loaded. train:', train.shape, 'test:', test.shape)

# Ensure proper order and create context
def add_context(df):
    """Return a sorted copy of ``df`` with neighbouring-token columns added.

    Rows are ordered by ``sentence_id`` then ``token_id``.  ``prev_before`` and
    ``next_before`` hold the ``before`` value of the adjacent row within the
    same sentence.  Missing shifted values (sentence edges, or a neighbour
    whose ``before`` is itself missing) are replaced by the sentinel '⟂'.
    The input frame is not modified.
    """
    ordered = df.sort_values(['sentence_id', 'token_id']).copy()
    tokens_by_sentence = ordered.groupby('sentence_id')['before']
    left_neighbour = tokens_by_sentence.shift(1).fillna('⟂')
    right_neighbour = tokens_by_sentence.shift(-1).fillna('⟂')
    ordered['prev_before'] = left_neighbour
    ordered['next_before'] = right_neighbour
    return ordered

train = add_context(train)
test = add_context(test)

# Helper to compute majority mapping
def majority_map(df, keys, target='after'):
    """Map each distinct combination of ``keys`` to its most frequent ``target``.

    Parameters
    ----------
    df : DataFrame containing the ``keys`` columns and the ``target`` column.
    keys : sequence of column names forming the lookup key.
    target : name of the column whose majority value is returned per key.

    Returns
    -------
    dict mapping ``tuple(key values)`` -> most frequent target value.

    Rows with a missing target are ignored, so a key whose targets are all
    missing is simply absent from the result (picking ``value_counts().index[0]``
    on such a group would raise IndexError).  Ties are broken in favour of the
    target that sorts first.
    """
    keys = list(keys)
    valid = df.dropna(subset=[target])
    if valid.empty:
        return {}
    # One vectorised count per (keys..., target) instead of a Python lambda per group.
    counts = (valid.groupby(keys + [target], observed=True)
                   .size()
                   .reset_index(name='_cnt'))
    # Stable sort keeps the groupby's lexicographic order among equal counts,
    # which makes tie-breaking deterministic.
    counts = counts.sort_values('_cnt', ascending=False, kind='stable')
    top = counts.drop_duplicates(subset=keys, keep='first')
    key_tuples = zip(*(top[k].tolist() for k in keys))
    return dict(zip(key_tuples, top[target].tolist()))

# Prediction with backoffs: (before, prev, next) -> (before) -> identity
def predict_after(df, map_ctx, map_uni):
    """Predict the normalised form of every row in ``df`` using backoff lookup.

    Lookup order per row: the ``(before, prev_before, next_before)`` key in
    ``map_ctx``, then the ``(before,)`` key in ``map_uni``, and finally the raw
    ``before`` value itself (identity).

    Returns a tuple ``(predictions, stats)`` where ``predictions`` is an object
    array aligned with the rows of ``df`` and ``stats`` counts how many rows
    were resolved at each level ('ctx', 'uni', 'miss').
    """
    counters = {'ctx': 0, 'uni': 0, 'miss': 0}
    outputs = []
    triples = zip(df['before'].array, df['prev_before'].array, df['next_before'].array)
    for token, left, right in triples:
        trigram = (token, left, right)
        if trigram in map_ctx:
            outputs.append(map_ctx[trigram])
            counters['ctx'] += 1
            continue
        unigram = (token,)
        if unigram in map_uni:
            outputs.append(map_uni[unigram])
            counters['uni'] += 1
            continue
        # Nothing memorised for this token: fall back to identity.
        outputs.append(token)
        counters['miss'] += 1
    return np.array(outputs, dtype=object), counters

# Cross-validation
# 5-fold GroupKFold with sentence_id as the group, so every token of a sentence
# falls into the same fold. For each fold the lookup maps are rebuilt from the
# training part only and token-level accuracy is measured on the held-out part.
print('\nStarting GroupKFold CV...')
gkf = GroupKFold(n_splits=5)
groups = train['sentence_id'].to_numpy()
accs = []
fold_logs = []
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    t_fold = time.time()
    tr = train.iloc[tr_idx]
    va = train.iloc[va_idx]
    # Build maps on training fold
    map_ctx = majority_map(tr, keys=['before','prev_before','next_before'])
    map_uni = majority_map(tr, keys=['before'])
    # Predict on validation fold
    va_preds, stats = predict_after(va, map_ctx, map_uni)
    # Exact string match between prediction and gold 'after'.
    # NOTE(review): both sides are object arrays; if 'after' (read with the
    # 'string' dtype) contains pd.NA, this element-wise == may raise
    # "boolean value of NA is ambiguous" — TODO confirm whether train has such rows.
    acc = (va_preds == va['after'].to_numpy()).mean()
    accs.append(acc)
    fold_logs.append((fold, acc, stats))
    print(f'Fold {fold}: acc={acc:.6f}, hits ctx={stats["ctx"]}, uni={stats["uni"]}, miss={stats["miss"]}, elapsed={time.time()-t_fold:.1f}s', flush=True)

# Mean and standard deviation of the per-fold accuracies.
print(f'CV mean acc: {np.mean(accs):.6f} +- {np.std(accs):.6f}')

# Fit on full train and predict test
t_fit = time.time()
map_ctx_full = majority_map(train, keys=['before','prev_before','next_before'])
map_uni_full = majority_map(train, keys=['before'])
test_preds, stats_test = predict_after(test, map_ctx_full, map_uni_full)
print('Test backoff stats:', stats_test)

# Build submission
sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': test_preds
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved submission to', out_path, 'with shape', sub.shape)
print(sub.head(5))
print(f'Total elapsed: {time.time()-t0:.1f}s')

In [3]:
# Fast submission: efficient majority mapping without CV
import pandas as pd, numpy as np, time

t0 = time.time()
train_path = 'ru_train.csv.zip'
test_path = 'ru_test_2.csv.zip'

def add_context(df):
    """Sort by (sentence_id, token_id) and attach previous/next token columns.

    Works on a copy. ``prev_before`` / ``next_before`` are the ``before`` values
    shifted by one position inside each sentence; gaps are filled with '⟂'.
    """
    out = df.sort_values(['sentence_id', 'token_id']).copy()
    for column, offset in (('prev_before', 1), ('next_before', -1)):
        shifted = out.groupby('sentence_id')['before'].shift(offset)
        out[column] = shifted.fillna('⟂')
    return out

print('Loading data (if needed)...')
need_load = False
try:
    _ = train.shape
    _ = test.shape
except NameError:
    need_load = True
if need_load:
    dtype_train = {'sentence_id':'int32', 'token_id':'int32', 'class':'category', 'before':'string', 'after':'string'}
    dtype_test = {'sentence_id':'int32', 'token_id':'int32', 'before':'string'}
    train = pd.read_csv(train_path, dtype=dtype_train)
    test = pd.read_csv(test_path, dtype=dtype_test)
    print('Loaded from disk.')
else:
    print('Using in-memory dataframes.')

train = add_context(train)
test = add_context(test)
print('Train/Test ready:', train.shape, test.shape, flush=True)

def top_freq_map(df, keys, target='after'):
    """Map each distinct combination of ``keys`` to its most frequent ``target``.

    Parameters
    ----------
    df : DataFrame containing the ``keys`` columns and the ``target`` column.
    keys : any sequence (list or tuple) of column names forming the lookup key.
    target : column whose most frequent value is selected per key.

    Returns
    -------
    dict mapping ``tuple(key values)`` -> most frequent target value. Among
    equally frequent targets the one that sorts first wins.
    """
    # Normalise once so list concatenation below also works for tuple input.
    keys = list(keys)
    # Count occurrences per (keys + target), then keep the top target per key.
    cnt = df.groupby(keys + [target], observed=True).size().reset_index(name='cnt')
    cnt = cnt.sort_values(keys + ['cnt'], ascending=[True] * len(keys) + [False])
    top = cnt.drop_duplicates(subset=keys, keep='first')
    # Zip whole columns rather than iterating row by row with iterrows().
    key_tuples = zip(*(top[k].tolist() for k in keys))
    return dict(zip(key_tuples, top[target].tolist()))

print('Building maps...')
t_build = time.time()
map_ctx = top_freq_map(train, keys=['before','prev_before','next_before'])
map_uni = top_freq_map(train, keys=['before'])
print(f'Maps built in {time.time()-t_build:.1f}s | ctx keys={len(map_ctx):,} uni keys={len(map_uni):,}', flush=True)

def predict_after(df, map_ctx, map_uni):
    """Resolve every row of ``df`` through context -> unigram -> identity backoff.

    ``map_ctx`` is keyed by ``(before, prev_before, next_before)`` tuples and
    ``map_uni`` by ``(before,)`` tuples. A lookup that yields ``None`` counts as
    a miss at that level. Returns ``(object ndarray of predictions, stats)``
    where ``stats`` has the hit counts under 'ctx', 'uni' and 'miss'.
    """
    tally = {'ctx': 0, 'uni': 0, 'miss': 0}
    out = []
    rows = zip(df['before'].array, df['prev_before'].array, df['next_before'].array)
    for tok, left, right in rows:
        tiers = (
            ('ctx', map_ctx, (tok, left, right)),
            ('uni', map_uni, (tok,)),
        )
        for label, table, lookup_key in tiers:
            found = table.get(lookup_key)
            if found is not None:
                out.append(found)
                tally[label] += 1
                break
        else:
            # No tier matched: emit the raw token unchanged.
            out.append(tok)
            tally['miss'] += 1
    return np.array(out, dtype=object), tally

print('Predicting test...')
t_pred = time.time()
test_preds, stats_test = predict_after(test, map_ctx, map_uni)
print(f'Backoff stats on test: {stats_test}, pred time {time.time()-t_pred:.1f}s')

sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': test_preds
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved submission:', out_path, sub.shape, 'Elapsed total:', f'{time.time()-t0:.1f}s')
sub.head()

In [4]:
# Ultra-fast baseline: unigram mapping (before -> most frequent after), identity fallback
import pandas as pd, numpy as np, time

t0 = time.time()
train_path = 'ru_train.csv.zip'
test_path = 'ru_test_2.csv.zip'

print('Loading data (if needed)...')
need_load = False
try:
    _ = train.shape; _ = test.shape
except NameError:
    need_load = True
if need_load:
    dtype_train = {'sentence_id':'int32', 'token_id':'int32', 'class':'category', 'before':'string', 'after':'string'}
    dtype_test = {'sentence_id':'int32', 'token_id':'int32', 'before':'string'}
    train = pd.read_csv(train_path, dtype=dtype_train)
    test = pd.read_csv(test_path, dtype=dtype_test)
    print('Loaded from disk.')
else:
    print('Using in-memory dataframes.')

# Build the unigram lookup: count every (before, after) pair, order pairs by
# descending count within each 'before', and keep the first (most frequent) one.
print('Building unigram map (before -> most frequent after)...')
t_build = time.time()
cnt = (train.groupby(['before','after'], observed=True).size().reset_index(name='cnt'))
cnt.sort_values(['before','cnt'], ascending=[True, False], inplace=True)
top = cnt.drop_duplicates(subset=['before'], keep='first')
map_uni = dict(zip(top['before'].array, top['after'].array))
print(f'Unigram map size: {len(map_uni):,} built in {time.time()-t_build:.1f}s')

# Look up each test token; tokens absent from the map are returned unchanged.
print('Predicting on test with identity fallback...')
t_pred = time.time()
b = test['before'].array
# NOTE(review): np.fromiter with dtype=object presumably needs a recent NumPy
# (object support was added in 1.23) — confirm against the pinned version.
preds = np.fromiter((map_uni.get(x, x) for x in b), dtype=object, count=len(b))
print(f'Predicted {len(preds):,} tokens in {time.time()-t_pred:.1f}s')

sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': preds
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved submission:', out_path, sub.shape, 'Total elapsed:', f'{time.time()-t0:.1f}s')
sub.head()

In [5]:
# Optimized unigram mapper using categorical codes + numpy (fast on 9.5M rows)
import pandas as pd, numpy as np, time

t0 = time.time()
train_path = 'ru_train.csv.zip'
test_path = 'ru_test_2.csv.zip'

print('Loading data (if needed)...')
need_load = False
try:
    _ = train.shape; _ = test.shape
except NameError:
    need_load = True
if need_load:
    dtype_train = {'sentence_id':'int32', 'token_id':'int32', 'class':'category', 'before':'string', 'after':'string'}
    dtype_test = {'sentence_id':'int32', 'token_id':'int32', 'before':'string'}
    train = pd.read_csv(train_path, dtype=dtype_train)
    test = pd.read_csv(test_path, dtype=dtype_test)
    print('Loaded from disk.')
else:
    print('Using in-memory dataframes.')

# Encode 'before' and 'after' as integer category codes so that pair counting
# can run on plain integer arrays.
print('Building categorical encodings...')
t1 = time.time()
before_cat = pd.Categorical(train['before'])
after_cat = pd.Categorical(train['after'])
# One code per row; pandas uses -1 for missing values.
bc = before_cat.codes.astype(np.int32)
ac = after_cat.codes.astype(np.int32)
print(f'Unique before: {len(before_cat.categories):,}, unique after: {len(after_cat.categories):,}, took {time.time()-t1:.1f}s')

print('Counting (before, after) pairs via numpy unique...')
t2 = time.time()
# Rows with a missing side must be excluded before packing: a -1 code has all
# bits set, so OR-ing it into the 64-bit key would merge unrelated rows into a
# bogus "before == -1" group that later indexes best_ac[-1].
valid = (bc >= 0) & (ac >= 0)
# Pack each pair into one int64: before code in the high 32 bits, after code in the low 32.
key = (bc[valid].astype(np.int64) << 32) | ac[valid].astype(np.int64)
uniq_key, counts = np.unique(key, return_counts=True)
bc_u = (uniq_key >> 32).astype(np.int32)
ac_u = (uniq_key & np.int64(0xFFFFFFFF)).astype(np.int32)
print(f'Pairs: {len(uniq_key):,}, counting took {time.time()-t2:.1f}s')

print('Selecting top after per before...')
t3 = time.time()
# Order by before code, then descending count, then after code: the first row
# of every before-group is its most frequent after (lowest after code on ties,
# matching what argmax over the code-sorted group would select).
order = np.lexsort((ac_u, -counts, bc_u))
bc_s = bc_u[order]
ac_s = ac_u[order]
cnt_s = counts[order]
# Mark the first row of each before-group; also valid when there are no pairs.
diff = np.ones(len(bc_s), dtype=bool)
diff[1:] = bc_s[1:] != bc_s[:-1]
starts = np.flatnonzero(diff)
# best_ac[before_code] -> winning after code, or -1 when nothing was counted.
best_ac = np.full(len(before_cat.categories), -1, dtype=np.int32)
best_ac[bc_s[starts]] = ac_s[starts]
print(f'Selected best labels for {len(starts):,} befores in {time.time()-t3:.1f}s')

print('Predicting on test...')
t4 = time.time()
test_b_codes = pd.Categorical(test['before'], categories=before_cat.categories).codes.astype(np.int32)
seen_mask = test_b_codes != -1
pred_after_codes = np.full(len(test_b_codes), -1, dtype=np.int32)
pred_after_codes[seen_mask] = best_ac[test_b_codes[seen_mask]]
# Map codes to strings; unseen -> identity
after_strings = np.asarray(after_cat.categories, dtype=object)
test_before_arr = test['before'].to_numpy(dtype=object)
preds = test_before_arr.copy()
seen_and_have = seen_mask & (pred_after_codes != -1)
preds[seen_and_have] = after_strings[pred_after_codes[seen_and_have]]
print(f'Predicted {seen_and_have.sum():,} via map, {len(preds)-seen_and_have.sum():,} via identity; time {time.time()-t4:.1f}s')

sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': preds
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved submission:', out_path, sub.shape, 'Total elapsed:', f'{time.time()-t0:.1f}s')
sub.head()

In [6]:
# Chunked mapper restricted to test tokens: fast and memory-efficient
import pandas as pd, numpy as np, time, gc

t0 = time.time()
train_path = 'ru_train.csv.zip'
test_path = 'ru_test_2.csv.zip'

print('Loading test...')
dtype_test = {'sentence_id':'int32', 'token_id':'int32', 'before':'string'}
test = pd.read_csv(test_path, dtype=dtype_test)
print('Test shape:', test.shape, flush=True)

# Collect unique test befores
test_befores = pd.unique(test['before']).tolist()
test_before_set = set(test_befores)
print('Unique test befores:', len(test_before_set))

print('Counting train pairs (before, after) restricted to test befores)...')
t1 = time.time()
chunk_size = 500_000
dtype_train = {'sentence_id':'int32', 'token_id':'int32', 'class':'category', 'before':'string', 'after':'string'}
counts = {}  # dict of before -> dict(after->count)
total_rows = 0
for i, chunk in enumerate(pd.read_csv(train_path, dtype=dtype_train, chunksize=chunk_size)):
    total_rows += len(chunk)
    sub = chunk[chunk['before'].isin(test_before_set)][['before','after']]
    if not sub.empty:
        grp = sub.groupby(['before','after'], observed=True).size().reset_index(name='cnt')
        for b, a, c in grp.itertuples(index=False):
            d = counts.get(b)
            if d is None:
                d = {}; counts[b] = d
            d[a] = d.get(a, 0) + int(c)
    if (i+1) % 10 == 0:
        print(f'  processed ~{total_rows:,} rows, current keys={len(counts):,}', flush=True)
    del chunk, sub
    gc.collect()
print(f'Finished counting in {time.time()-t1:.1f}s; total rows {total_rows:,}, keys {len(counts):,}')

print('Selecting top after per before...')
t2 = time.time()
map_uni = {}
for b, d in counts.items():
    # pick after with max count; ties resolved by first encountered
    best_a = max(d.items(), key=lambda kv: kv[1])[0]
    map_uni[b] = best_a
counts = None
gc.collect()
print(f'Map size: {len(map_uni):,} built in {time.time()-t2:.1f}s')

print('Predicting test with identity fallback...')
t3 = time.time()
b_arr = test['before'].to_numpy(dtype=object)
preds = np.fromiter((map_uni.get(x, x) for x in b_arr), dtype=object, count=len(b_arr))
print(f'Predicted {len(preds):,} tokens in {time.time()-t3:.1f}s')

sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': preds
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved submission:', out_path, sub.shape, 'Total elapsed:', f'{time.time()-t0:.1f}s')
sub.head()

In [7]:
# Emergency identity submission to get LB baseline fast
import pandas as pd, time
t0 = time.time()
test_path = 'ru_test_2.csv.zip'
dtype_test = {'sentence_id':'int32', 'token_id':'int32', 'before':'string'}
print('Loading test...')
test = pd.read_csv(test_path, dtype=dtype_test)
print('Test shape:', test.shape)
sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': test['before']
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved identity submission:', out_path, sub.shape, 'Elapsed:', f'{time.time()-t0:.1f}s')
sub.head()

In [9]:
# Fastest path: copy sample submission from zip to submission.csv without parsing
import zipfile, os, time
t0 = time.time()
zip_path = 'ru_sample_submission_2.csv.zip'
out_path = 'submission.csv'
with zipfile.ZipFile(zip_path, 'r') as zf:
    names = zf.namelist()
    if not names:
        raise RuntimeError('Sample submission zip has no files')
    # The first archive member is taken to be the sample submission CSV.
    member = names[0]
    with zf.open(member, 'r') as src, open(out_path, 'wb') as dst:
        # Stream the member to disk in 1 MiB pieces; read() yields b'' at end of data.
        for chunk in iter(lambda: src.read(1024 * 1024), b''):
            dst.write(chunk)
print('Wrote submission.csv from', zip_path, 'member:', member, 'size:', os.path.getsize(out_path), 'bytes', 'Elapsed:', f'{time.time()-t0:.1f}s')

Wrote submission.csv from ru_sample_submission_2.csv.zip member: ru_sample_submission_2.csv size: 23892801 bytes Elapsed: 0.0s


In [10]:
# Efficient unigram mapper: unzip train once, chunked counting restricted to test tokens
import os, zipfile, time, gc
import pandas as pd, numpy as np

t0 = time.time()
train_zip = 'ru_train.csv.zip'
test_zip = 'ru_test_2.csv.zip'
train_csv = 'ru_train.csv'

# 1) Ensure an uncompressed train CSV exists so the chunked reader below can
#    re-read it without decompressing the zip again.
if not os.path.exists(train_csv):
    print('Extracting train CSV from zip...')
    t_ex = time.time()
    # Write to a temporary name and rename at the end: an interrupted run must
    # not leave a truncated train_csv that a later run would accept as complete.
    tmp_csv = train_csv + '.part'
    with zipfile.ZipFile(train_zip, 'r') as zf:
        names = zf.namelist()
        if not names:
            raise RuntimeError('Train zip has no files')
        # The first archive member is taken to be the training CSV.
        member = names[0]
        with zf.open(member, 'r') as src, open(tmp_csv, 'wb') as dst:
            # Stream in 8 MiB pieces to keep memory use flat.
            while True:
                chunk = src.read(1024 * 1024 * 8)
                if not chunk:
                    break
                dst.write(chunk)
    os.replace(tmp_csv, train_csv)
    print(f'Extracted to {train_csv} in {time.time()-t_ex:.1f}s, size={os.path.getsize(train_csv):,} bytes')
else:
    print('Found existing uncompressed train CSV:', train_csv, 'size=', os.path.getsize(train_csv))

# 2) Load test (small enough to read directly from zip)
print('Loading test...')
dtype_test = {'sentence_id':'int32', 'token_id':'int32', 'before':'string'}
test = pd.read_csv(test_zip, dtype=dtype_test)
print('Test shape:', test.shape, flush=True)

# 3) Collect unique test befores
test_before_set = set(pd.unique(test['before']))
print('Unique test befores:', len(test_before_set))

# 4) Chunked counting over uncompressed train CSV
# Only the 'before' and 'after' columns are read, one million rows at a time,
# and only rows whose 'before' also occurs in the test set are counted.
print('Counting (before, after) pairs restricted to test befores...')
t1 = time.time()
chunk_size = 1_000_000
dtype_train = {'sentence_id':'int32', 'token_id':'int32', 'class':'category', 'before':'string', 'after':'string'}
counts = {}  # before -> dict(after->count)
total_rows = 0
reader = pd.read_csv(train_csv, dtype=dtype_train, usecols=['before','after'], chunksize=chunk_size)
for i, chunk in enumerate(reader, 1):
    total_rows += len(chunk)
    # Keep only rows that can ever be looked up at prediction time.
    sub = chunk[chunk['before'].isin(test_before_set)]
    if not sub.empty:
        # Aggregate inside the chunk first so the Python-level merge below
        # runs once per distinct pair instead of once per row.
        grp = sub.groupby(['before','after'], observed=True).size().reset_index(name='cnt')
        for b, a, c in grp.itertuples(index=False):
            d = counts.get(b)
            if d is None:
                d = {}; counts[b] = d
            d[a] = d.get(a, 0) + int(c)
    # Progress line every 5 chunks.
    if i % 5 == 0:
        print(f'  processed ~{total_rows:,} rows, keys so far={len(counts):,}', flush=True)
    # Drop references to the chunk before reading the next one.
    del chunk, sub
    gc.collect()
print(f'Finished counting in {time.time()-t1:.1f}s; total rows {total_rows:,}, keys {len(counts):,}')

# 5) Select top after per before
print('Selecting top after per before...')
t2 = time.time()
map_uni = {}
for b, d in counts.items():
    best_a = max(d.items(), key=lambda kv: kv[1])[0]
    map_uni[b] = best_a
counts = None
gc.collect()
print(f'Map size: {len(map_uni):,} built in {time.time()-t2:.1f}s')

# 6) Predict on test with identity fallback
print('Predicting test...')
t3 = time.time()
b_arr = test['before'].to_numpy(dtype=object)
preds = np.fromiter((map_uni.get(x, x) for x in b_arr), dtype=object, count=len(b_arr))
print(f'Predicted {len(preds):,} tokens in {time.time()-t3:.1f}s')

# 7) Write submission
sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': preds
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved submission:', out_path, sub.shape, 'Total elapsed:', f'{time.time()-t0:.1f}s')
sub.head()

Extracting train CSV from zip...


Extracted to ru_train.csv in 0.7s, size=443,635,601 bytes
Loading test...


Test shape: (1059191, 3)


Unique test befores: 185535
Counting (before, after) pairs restricted to test befores...


  processed ~5,000,000 rows, keys so far=121,605


  processed ~9,515,325 rows, keys so far=135,873


Finished counting in 20.3s; total rows 9,515,325, keys 135,873
Selecting top after per before...


Map size: 135,873 built in 0.3s
Predicting test...


Predicted 1,059,191 tokens in 0.3s


Saved submission: submission.csv (1059191, 2) Total elapsed: 24.3s


Unnamed: 0,id,after
0,0_0,Теперь
1,0_1,все
2,0_2,уважительно
3,0_3,зовут
4,0_4,Ямамото


In [None]:
# Contextual memorization with backoffs: (before, prev, next) -> (before, prev) -> (before, next) -> unigram -> identity
import pandas as pd, numpy as np, time, os

t0 = time.time()
train_csv = 'ru_train.csv'  # uncompressed from previous step
test_zip = 'ru_test_2.csv.zip'

def add_context(df):
    """Return ``df`` sorted by sentence and token position with context columns.

    Adds ``prev_before`` and ``next_before``: the ``before`` value one row
    earlier / later within the same ``sentence_id``. Where the shift produces a
    missing value the sentinel '⟂' is stored. The caller's frame is untouched.
    """
    result = df.sort_values(['sentence_id', 'token_id']).copy()

    def neighbour(step):
        # Shift 'before' by `step` rows inside each sentence and fill the gaps.
        within = result.groupby('sentence_id', observed=True)['before']
        return within.shift(step).fillna('⟂')

    result['prev_before'] = neighbour(1)
    result['next_before'] = neighbour(-1)
    return result

print('Loading test and adding context...')
dtype_test = {'sentence_id':'int32','token_id':'int32','before':'string'}
test = pd.read_csv(test_zip, dtype=dtype_test)
test = add_context(test)
print('Test ready:', test.shape)

# Build key sets from test
print('Building test key sets...')
test['k_tri'] = (test['before'] + '\t' + test['prev_before'] + '\t' + test['next_before']).astype('string')
test['k_bp']  = (test['before'] + '\t' + test['prev_before']).astype('string')
test['k_bn']  = (test['before'] + '\t' + test['next_before']).astype('string')
tri_set = set(pd.unique(test['k_tri']))
bp_set  = set(pd.unique(test['k_bp']))
bn_set  = set(pd.unique(test['k_bn']))
print('Unique test keys -> tri:', len(tri_set), 'bp:', len(bp_set), 'bn:', len(bn_set))

print('Loading full train (uncompressed) minimal cols and adding context...')
dtype_train = {'sentence_id':'int32','token_id':'int32','before':'string','after':'string'}
train = pd.read_csv(train_csv, dtype=dtype_train, usecols=['sentence_id','token_id','before','after'])
train = add_context(train)
print('Train ready:', train.shape, 'Elapsed:', f'{time.time()-t0:.1f}s')

# Compose keys in train
print('Composing train keys...')
t1 = time.time()
train['k_tri'] = (train['before'] + '\t' + train['prev_before'] + '\t' + train['next_before']).astype('string')
train['k_bp']  = (train['before'] + '\t' + train['prev_before']).astype('string')
train['k_bn']  = (train['before'] + '\t' + train['next_before']).astype('string')
print('Keys composed in', f'{time.time()-t1:.1f}s')

# Filter rows that can hit any test key
print('Filtering candidate rows...')
t2 = time.time()
mask = train['k_tri'].isin(tri_set) | train['k_bp'].isin(bp_set) | train['k_bn'].isin(bn_set)
cand = train.loc[mask, ['after','k_tri','k_bp','k_bn']]
print('Candidates:', cand.shape, 'filter time:', f'{time.time()-t2:.1f}s')

def build_top_map(df, key_col):
    """Map every value of ``key_col`` to its most frequent ``after`` value.

    ``df`` must contain ``key_col`` and 'after'. An empty frame yields an empty
    dict. Pairs are counted, ordered by key and descending count, and the first
    pair per key is kept.
    """
    if df.empty:
        return {}
    pair_counts = df.groupby([key_col, 'after'], observed=True).size().reset_index(name='cnt')
    ranked = pair_counts.sort_values([key_col, 'cnt'], ascending=[True, False])
    winners = ranked.drop_duplicates(subset=[key_col], keep='first')
    lookup = {}
    for key_value, after_value in zip(winners[key_col].array, winners['after'].array):
        lookup[key_value] = after_value
    return lookup

print('Building maps...')
t3 = time.time()
map_tri = build_top_map(cand[['k_tri','after']].rename(columns={'k_tri':'key'}), 'key') if not cand.empty else {}
map_bp  = build_top_map(cand[['k_bp','after']].rename(columns={'k_bp':'key'}), 'key') if not cand.empty else {}
map_bn  = build_top_map(cand[['k_bn','after']].rename(columns={'k_bn':'key'}), 'key') if not cand.empty else {}
print(f'Maps built in {time.time()-t3:.1f}s | tri={len(map_tri):,} bp={len(map_bp):,} bn={len(map_bn):,}')

# Optional: unigram map (fallback). Use fast value_counts style on full train.
print('Building unigram fallback map...')
t4 = time.time()
u = train.groupby(['before','after'], observed=True).size().reset_index(name='cnt')
u.sort_values(['before','cnt'], ascending=[True, False], inplace=True)
u_top = u.drop_duplicates(subset=['before'], keep='first')
map_uni = dict(zip(u_top['before'].array, u_top['after'].array))
print('Unigram size:', len(map_uni), 'built in', f'{time.time()-t4:.1f}s')

# Predict with backoffs
# Each test token is resolved by the first map that knows it, tried in order:
# (before, prev, next) -> (before, prev) -> (before, next) -> before -> identity.
print('Predicting with backoffs...')
t5 = time.time()
preds = []
hit_tri = hit_bp = hit_bn = hit_uni = miss = 0
for b, p, n in zip(test['before'].array, test['prev_before'].array, test['next_before'].array):
    # Keys are tab-joined strings, the same layout as the k_tri / k_bp / k_bn
    # columns the maps were built from.
    kt = f'{b}\t{p}\t{n}'
    r = map_tri.get(kt)
    if r is not None:
        preds.append(r); hit_tri += 1; continue
    kb = f'{b}\t{p}'
    r = map_bp.get(kb)
    if r is not None:
        preds.append(r); hit_bp += 1; continue
    kn = f'{b}\t{n}'
    r = map_bn.get(kn)
    if r is not None:
        preds.append(r); hit_bn += 1; continue
    # The unigram map is keyed by the raw token, not a composed string.
    r = map_uni.get(b)
    if r is not None:
        preds.append(r); hit_uni += 1; continue
    # Nothing memorised at any level: emit the token unchanged.
    preds.append(b); miss += 1
preds = np.array(preds, dtype=object)
print(f'Hits tri={hit_tri:,} bp={hit_bp:,} bn={hit_bn:,} uni={hit_uni:,} miss={miss:,} | Elapsed {time.time()-t5:.1f}s')

sub = pd.DataFrame({
    'id': test['sentence_id'].astype('string') + '_' + test['token_id'].astype('string'),
    'after': preds
})
out_path = 'submission.csv'
sub.to_csv(out_path, index=False)
print('Saved submission:', out_path, sub.shape, '| Total elapsed:', f'{time.time()-t0:.1f}s')
sub.head()

Loading test and adding context...


Test ready: (1059191, 5)
Building test key sets...
