# Library

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import Levenshtein
import cv2
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns

# Data Loading

In [None]:
%%time

# Label table for the training images.
train_labels_csv = '../input/bms-molecular-translation/train_labels.csv'
# The sample submission is loaded as the test table (its image ids are used below).
sample_submission_csv = '../input/bms-molecular-translation/sample_submission.csv'

train = pd.read_csv(train_labels_csv)
test = pd.read_csv(sample_submission_csv)

def get_train_file_path(image_id):
    """Build the path of the training PNG that belongs to ``image_id``.

    The file lives three directory levels deep; each level is named after
    one of the first three characters of the id, in order.

    Raises IndexError when ``image_id`` has fewer than three characters.
    """
    path_template = "../input/bms-molecular-translation/train/{}/{}/{}/{}.png"
    shard_chars = (image_id[0], image_id[1], image_id[2])
    return path_template.format(*shard_chars, image_id)

def get_test_file_path(image_id):
    """Build the path of the test PNG that belongs to ``image_id``.

    The file lives three directory levels deep; each level is named after
    one of the first three characters of the id, in order.

    Raises IndexError when ``image_id`` has fewer than three characters.
    """
    path_template = "../input/bms-molecular-translation/test/{}/{}/{}/{}.png"
    shard_chars = (image_id[0], image_id[1], image_id[2])
    return path_template.format(*shard_chars, image_id)

# Attach the on-disk image path to every row of both tables; each table
# uses its own path builder because train and test live in separate folders.
path_builders = ((train, get_train_file_path), (test, get_test_file_path))
for frame, build_path in path_builders:
    frame['file_path'] = frame['image_id'].progress_apply(build_path)

print(f'train.shape: {train.shape}  test.shape: {test.shape}')
display(train.head(), test.head())

# Quick EDA

In [None]:
# Show the first five training images together with their InChI label.
for i in range(5):
    image_path = train.loc[i, 'file_path']
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise on a missing/unreadable file; it returns None,
    # which would otherwise surface as an opaque TypeError on the next line.
    if image is None:
        raise FileNotFoundError(f'Could not read image: {image_path}')
    image = 255 - image  # invert the pixel intensities
    image = image[:, :, np.newaxis]  # append a trailing channel axis
    label = train.loc[i, 'InChI']
    print(image.shape)
    plt.imshow(image)
    plt.title(label)
    plt.show()

# Preprocessing

In [None]:
%%time

# Split every InChI string into its '/'-separated layers and record how many
# layers each one has.
train['InChI_list'] = train['InChI'].progress_apply(lambda x: x.split('/'))
train['InChI_length'] = train['InChI_list'].progress_apply(len)

# Expand the layer lists into one column per position with a single
# constructor call. The previous `progress_apply(pd.Series)` built one Series
# object per row, which is extremely slow on a large table. Rows with fewer
# layers are padded with missing values (None here rather than NaN; both are
# treated as missing by pandas' isna/fillna).
InChI_df = pd.DataFrame(train['InChI_list'].tolist(), index=train.index)
layer_columns = InChI_df.add_prefix('InChI_')

# Drop layer columns left over from an earlier run of this cell so that
# re-running it does not append duplicate `InChI_<i>` columns.
train = pd.concat(
    [train.drop(columns=layer_columns.columns, errors='ignore'), layer_columns],
    axis=1,
)
display(train)

In [None]:
# Persist both tables to the working directory.
for frame, pickle_name in ((train, 'train.pkl'), (test, 'test.pkl')):
    frame.to_pickle(pickle_name)

# Scoring function

In [None]:
def get_score(y_true, y_pred):
    """Return the mean Levenshtein distance between paired strings.

    Args:
        y_true: iterable of reference strings.
        y_pred: iterable of predicted strings, paired positionally with
            ``y_true``.

    Returns:
        The NumPy mean of the per-pair edit distances. Pairs are formed with
        ``zip``, so surplus items in the longer input are ignored; with no
        pairs at all the mean of an empty list (NaN) is returned.
    """
    distances = [
        Levenshtein.distance(reference, candidate)
        for reference, candidate in zip(y_true, y_pred)
    ]
    return np.mean(distances)

# Naive baseline

In [None]:
# sample submission baseline: score one fixed InChI string predicted for every row
constant_prediction = 'InChI=1S/H2O/h1H2'
y_true = train['InChI'].values
y_pred = [constant_prediction] * len(train)
score = get_score(y_true, y_pred)
print(score)

In [None]:
# mode baseline: predict, for every row, the string built from the most
# frequent value of each InChI layer position.

# Take the number of layer columns from the data instead of hard-coding 11,
# so the loop neither raises KeyError when there are fewer `InChI_<i>` columns
# nor ignores columns when there are more.
layer_count = int(train['InChI_length'].max())

mode_layers = []
for i in range(layer_count):
    # Missing layers are mapped to the sentinel 'nan' so that "layer absent"
    # can itself be the most frequent outcome for a position.
    mode_string = train[f'InChI_{i}'].fillna('nan').mode()[0]
    if mode_string != 'nan':
        mode_layers.append(mode_string)
mode_concat_string = '/'.join(mode_layers)
print(mode_concat_string)

y_true = train['InChI'].values
y_pred = [mode_concat_string] * len(train)
score = get_score(y_true, y_pred)
print(score)

# Submission

In [None]:
# Use the mode-baseline string as the prediction for every test row.
test['InChI'] = mode_concat_string

output_cols = ['image_id', 'InChI']
submission = test[output_cols]
display(submission)
submission.to_csv('submission.csv', index=False)