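In this notebook I build a random baseline: for every string position I sample a character from the character frequencies observed at that position in the training InChI labels.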

In [None]:
from collections import Counter
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Seed NumPy's global RNG; every np.random.choice call in this notebook draws from it.
np.random.seed(42)

In [None]:
# Training labels; the InChI column holds the strings whose character statistics are collected below.
df = pd.read_csv("/kaggle/input/bms-molecular-translation/train_labels.csv")


In [None]:
# Sample submission: used as the template for the output; its InChI column is overwritten later.
test_df = pd.read_csv("/kaggle/input/bms-molecular-translation/sample_submission.csv")

In [None]:
# Peek at the first rows of the training labels.
df.head()

In [None]:
# Length of every training InChI string; print the longest, then the shortest.
df_lengths = df["InChI"].apply(len)
print(df_lengths.max(), df_lengths.min(), sep="\n")

In [None]:
# Histogram of the InChI string lengths.
df_lengths.plot.hist()

In [None]:
# One Counter per character position, up to the longest training string.
longest_inchi = df_lengths.max()
positions_freqs = [Counter() for _ in range(longest_inchi)]
for inchi in df["InChI"]:
    # Pair each character with the counter of its position; no string is
    # longer than the counter list, so every character is counted.
    for position_counter, symbol in zip(positions_freqs, inchi):
        position_counter[symbol] += 1

In [None]:
# Number of per-position counters (one per index up to the longest training string).
len(positions_freqs)

In [None]:
# Character counts at index 0, i.e. the first character of every training string.
positions_freqs[0]

In [None]:
# Character counts at index 10.
positions_freqs[10]

In [None]:
# Character counts at index 20.
positions_freqs[20]

In [None]:
# Sanity check: normalising the counts at index 20 must give probabilities
# that sum to 1 (up to floating-point rounding).
cnts = np.array(list(positions_freqs[20].values()))
(cnts / cnts.sum()).sum()

In [None]:
# Relative frequency of each InChI length in the training labels (index: length, value: share of rows).
df_inchi_lens_freqs = df_lengths.value_counts(normalize=True)
def sample_length(size=1):
    """Draw `size` random string lengths.

    Reads the module-level ``df_inchi_lens_freqs`` series at call time: its
    index supplies the candidate lengths and its values the probability of
    each candidate. Returns a NumPy array with `size` entries.
    """
    candidate_lengths = df_inchi_lens_freqs.index.to_list()
    weights = df_inchi_lens_freqs.values.tolist()
    return np.random.choice(candidate_lengths, size=size, p=weights)
# Quick check: draw four lengths.
sample_length(4)

In [None]:
# Convert every per-position Counter into a (characters, probabilities) pair
# that can be handed straight to np.random.choice.
pos2char_proba = []
for position_counter in positions_freqs:
    alphabet = list(position_counter.keys())
    occurrences = np.array(list(position_counter.values()))
    pos2char_proba.append((alphabet, occurrences / occurrences.sum()))

In [None]:
def sample_characters(leng: int) -> str:
    """Build a random string of `leng` characters, one independent draw per position.

    Position ``i`` is sampled from the ``(characters, probabilities)`` pair
    stored at ``pos2char_proba[i]``. Raises ``IndexError`` when `leng`
    exceeds the number of positions available in ``pos2char_proba``.
    """
    per_position = (pos2char_proba[position] for position in range(leng))
    return "".join(
        np.random.choice(alphabet, p=weights) for alphabet, weights in per_position
    )

# Smoke test: sample one 400-character string (raises IndexError if pos2char_proba has fewer than 400 entries).
sample_characters(400)

The sampled string is ridiculous, but it is okay for a baseline.

In [None]:
# (rows, columns) of the submission template; the row count is the number of strings to generate.
test_df.shape

In [None]:
# Peek at the first rows of the submission template.
test_df.head()

In [None]:
# Draw one target length per test row.
test_lengths = sample_length(size=len(test_df))

In [None]:
# Sample one string per test row in parallel.
#
# With the fork start method each worker begins with a copy of the parent's
# global NumPy RNG state, so without reseeding all workers replay the same
# random stream and their samples are correlated. Using np.random.seed as the
# pool initializer calls it with no argument in every worker, which reseeds
# that worker from fresh OS entropy.
# NOTE: the output is therefore not repeatable between runs; it already
# depended on which worker happened to pick up which chunk.
#
# pool.imap yields results in input order, so result[i] belongs to test_lengths[i].
with Pool(16, initializer=np.random.seed) as pool:
    sampled = pool.imap(sample_characters, test_lengths, chunksize=60)
    result = list(tqdm(sampled, total=len(test_lengths)))

In [None]:
# Overwrite the InChI column with the sampled strings (result is expected to hold one string per row of test_df).
test_df["InChI"] = result

In [None]:
# Write the submission file, leaving out the DataFrame index.
test_df.to_csv("output.csv", index=False)