In [None]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

In [None]:
# Root directory of the competition data; the other paths in this notebook
# are built by appending to it.
path = "../input/bms-molecular-translation/"

# Label table; later cells read its `image_id` and `InChI` columns.
labels_path = f"{path}train_labels.csv"
df_train_labels = pd.read_csv(labels_path)
df_train_labels.head()

In [None]:
# Sample submission with its first column as the index, so that `test.index`
# supplies the ids handed to the visualisation helpers below.
test = pd.read_csv(path + 'sample_submission.csv', index_col=0)

In [None]:
# Inspect the last rows of the label table.
df_train_labels.tail()

In [None]:
# Template for a training image: the first three characters of the id are
# used as nested directory names, followed by "<id>.png".
fully_qualified_path = path + "train/{}/{}/{}/{}.png"


def convert_image_id_to_path(image_id_details):
    """Return the path of the training PNG that belongs to an image id.

    Args:
        image_id_details: the image id; it must be indexable and hold at
            least three characters, because characters 0, 1 and 2 become
            directory names.

    Returns:
        ``fully_qualified_path`` filled in with the three leading characters
        and the full id.

    Raises:
        IndexError: if the id has fewer than three characters.
    """
    return fully_qualified_path.format(
        image_id_details[0],
        image_id_details[1],
        image_id_details[2],
        image_id_details,
    )

In [None]:
# Store the on-disk path of every training image next to its id.
df_train_labels['image_path']=df_train_labels['image_id'].apply(convert_image_id_to_path)

In [None]:
# Check that the new `image_path` column was added.
df_train_labels.head()

### Denoise images

In [None]:
def convert_image_id_2_path(image_id: str, split: str = "test", root=None) -> str:
    """Build the PNG path of an image from its id.

    The layout is ``<root><split>/<c0>/<c1>/<c2>/<image_id>.png`` where
    ``c0``, ``c1`` and ``c2`` are the first three characters of the id.

    Args:
        image_id: image id with at least three characters.
        split: directory directly under the root. Defaults to ``"test"``,
            the value that used to be hard-coded.
        root: path prefix, expected to end with a path separator. When
            omitted, the notebook-level ``path`` is used.

    Returns:
        The path as a string.

    Raises:
        IndexError: if ``image_id`` has fewer than three characters.
    """
    if root is None:
        # Fall back to the notebook-wide data root
        root = path
    return "{}{}/{}/{}/{}/{}.png".format(
        root, split, image_id[0], image_id[1], image_id[2], image_id
    )

In [None]:
def visualize_image(image_id, label):
    """Display one test image with a title.

    Args:
        image_id: id resolved to a file with ``convert_image_id_2_path``.
        label: value shown as the figure title.

    Raises:
        FileNotFoundError: if OpenCV cannot read the image file.
    """
    image_path = convert_image_id_2_path(image_id)
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # stop here with the offending path rather than failing in cvtColor.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads BGR, matplotlib expects RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(10, 8))
    plt.imshow(image)
    plt.title(f"{label}", fontsize=14)
    plt.axis("off")

    plt.show()

In [None]:
def _remove_small_components(gray, threshold=127, min_area=2):
    """Return a black-on-white uint8 image without tiny dark specks.

    Pixels at or below ``threshold`` are treated as ink. Ink is grouped into
    8-connected components and every component whose area is below
    ``min_area`` pixels is erased.
    """
    # Inverted threshold: ink becomes 255 so it is the foreground that gets labelled
    _, ink_mask = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY_INV)
    nlabels, labels, stats, _ = cv2.connectedComponentsWithStats(
        ink_mask, None, None, None, 8, cv2.CV_32S
    )
    # One keep-flag per label; label 0 is the background and is never kept
    keep = np.zeros(nlabels, dtype=bool)
    keep[1:] = stats[1:, cv2.CC_STAT_AREA] >= min_area
    # Look the flag up per pixel: kept ink -> 0 (black), everything else -> 255
    return np.where(keep[labels], 0, 255).astype(np.uint8)


def visualize_image_denoise(image_id, threshold=127, min_area=2):
    """Display a test image after removing small isolated dark regions.

    Args:
        image_id: id resolved to a file with ``convert_image_id_2_path``.
        threshold: grey level at or below which a pixel counts as ink.
        min_area: smallest connected component (in pixels) that is kept.

    Raises:
        FileNotFoundError: if OpenCV cannot read the image file.
    """
    image_path = convert_image_id_2_path(image_id)
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # cv2.imread returns None on failure instead of raising
        raise FileNotFoundError(f"Could not read image: {image_path}")

    image = _remove_small_components(gray, threshold, min_area)

    plt.figure(figsize=(10, 8))
    # A 2-D array is otherwise drawn with matplotlib's default colormap,
    # which is not greyscale; pin the map and the value range.
    plt.imshow(image, cmap="gray", vmin=0, vmax=255)
    plt.axis("off")
    plt.show()

In [None]:
# Test image at position 0 of `test.index`: as stored, then denoised.
i=0
visualize_image(test.index[i], test.index[i])
visualize_image_denoise(test.index[i])

In [None]:
# Test image at position 1 of `test.index`: as stored, then denoised.
i=1
visualize_image(test.index[i], test.index[i])
visualize_image_denoise(test.index[i])

In [None]:
# Test image at position 3 of `test.index`: as stored, then denoised.
i=3
visualize_image(test.index[i], test.index[i])
visualize_image_denoise(test.index[i])

In [None]:
# Test image at position 4 of `test.index`: as stored, then denoised.
i=4
visualize_image(test.index[i], test.index[i])
visualize_image_denoise(test.index[i])

In [None]:
def visualize_train_batch(image_ids, labels):
    """Show training images in a three-column grid, titled with their labels.

    Args:
        image_ids: iterable of training image ids, resolved to files with
            ``convert_image_id_to_path``.
        labels: iterable of strings paired with ``image_ids`` by position.
            Titles longer than 30 characters are cut and suffixed with "...".

    Raises:
        FileNotFoundError: if OpenCV cannot read one of the image files.
    """
    samples = list(zip(image_ids, labels))
    n_cols = 3
    # Never fewer than three rows, so up to nine images look as before;
    # larger batches get extra rows instead of an invalid subplot index.
    n_rows = max(3, -(-len(samples) // n_cols))
    plt.figure(figsize=(16, 4 * n_rows))

    for ind, (image_id, label) in enumerate(samples):
        image_path = convert_image_id_to_path(image_id)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None on failure instead of raising
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads BGR, matplotlib expects RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.subplot(n_rows, n_cols, ind + 1)
        plt.imshow(image)
        # Only mark the title as truncated when something was cut off
        title = label if len(label) <= 30 else f"{label[:30]}..."
        plt.title(title, fontsize=10)
        plt.axis("off")

    plt.show()

In [None]:
# First nine labelled images, one per cell of the 3x3 preview grid.
preview_rows = df_train_labels.head(9)
image_ids = preview_rows['image_id']
labels = preview_rows["InChI"].values
visualize_train_batch(image_ids, labels)

In [None]:
# NOTE(review): this cell repeats the previous one statement for statement
# (same first nine rows, same plot); presumably a leftover -- confirm before removing.
tmp_df = df_train_labels[:9]
image_ids = tmp_df['image_id']
labels = tmp_df["InChI"].values
visualize_train_batch(image_ids, labels)

In [None]:
# Count the occurrences of each InChI once and reuse the result for both
# statistics instead of recomputing it per print.
inchi_counts = df_train_labels['InChI'].value_counts()
print('Length of training-data:',len(df_train_labels))
print('Number of unique chemical identifier:',len(inchi_counts))
print('Max count of any chemical identifier in trainign data:',inchi_counts.max())

In [None]:
# Height, width and width/height ratio of the first 1000 training images.
h_shape = []
w_shape = []
aspect_ratio = []
# Iterate over the paths by position; no index-label lookup is needed.
for image_path in df_train_labels['image_path'].values[:1000]:
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None on failure instead of raising
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # Only the shape is used, so no colour conversion is required
    height, width = image.shape[:2]
    h_shape.append(height)
    w_shape.append(width)
    aspect_ratio.append(width / height)

In [None]:
# Histograms of the sampled image sizes, one panel per quantity.
areas = np.array(h_shape) * np.array(w_shape)
panels = [
    (areas, "Area Image Distribution"),
    (h_shape, "Height Image Distribution"),
    (w_shape, "Width Image Distribution"),
    (aspect_ratio, "Aspect Ratio Distribution"),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
fig.subplots_adjust(top=0.5, bottom=0.01, hspace=1, wspace=0.4)
for ax, (values, title) in zip(axes.ravel(), panels):
    ax.hist(values, bins=50)
    ax.set_title(title, fontsize=14)
# Area values are long numbers; tilt them so neighbouring ticks do not overlap
axes[0, 0].tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.applications import DenseNet121, ResNet50
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.densenet import preprocess_input
from keras.models import Model

In [None]:
# extract features from each image
def extract_features(limit=100):
    """Compute DenseNet121 features for the first training images.

    NOTE(review): a later cell defines another ``extract_features`` (built on
    ResNet50). When the notebook runs top to bottom that definition replaces
    this one, so this version is only used if it is called before that cell.

    Args:
        limit: number of leading rows of ``df_train_labels`` to process;
            ``None`` processes every row. Defaults to 100.

    Returns:
        dict mapping each ``image_id`` to the array returned by
        ``model.predict`` for that single image.
    """
    # load the model
    base_model = DenseNet121()
    # re-structure the model: expose the output of the second-to-last layer
    model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print() (that printed an extra "None")
    model.summary()

    # Pair ids and paths by position so the result does not depend on the
    # frame's index labels
    subset = df_train_labels.iloc[:limit]
    features = dict()
    for image_id, filename in zip(subset['image_id'], subset['image_path']):
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # add a leading batch axis of size one for the model
        image = image.reshape((1,) + image.shape)
        # prepare the image for the DenseNet121 model
        image = preprocess_input(image)
        # get and store the feature
        features[image_id] = model.predict(image, verbose=0)
    return features

In [None]:
#same but with ResNet50
# extract features from each image
def extract_features(limit=100):
    """Compute ResNet50 features for the first training images.

    Args:
        limit: number of leading rows of ``df_train_labels`` to process;
            ``None`` processes every row. Defaults to 100.

    Returns:
        dict mapping each ``image_id`` to the array returned by
        ``model.predict`` for that single image.
    """
    # The notebook-level `preprocess_input` is imported from
    # keras.applications.densenet; ResNet50 has its own preprocessing
    # function, so import that one locally and use it here.
    from keras.applications.resnet50 import preprocess_input as resnet_preprocess_input

    base_model = ResNet50()
    # re-structure the model: expose the output of the second-to-last layer
    model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print() (that printed an extra "None")
    model.summary()

    # Pair ids and paths by position so the result does not depend on the
    # frame's index labels
    subset = df_train_labels.iloc[:limit]
    features = dict()
    for image_id, filename in zip(subset['image_id'], subset['image_path']):
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # add a leading batch axis of size one for the model
        image = image.reshape((1,) + image.shape)
        # prepare the image for the ResNet50 model
        image = resnet_preprocess_input(image)
        # get and store the feature
        features[image_id] = model.predict(image, verbose=0)
    return features

In [None]:
features = extract_features()
print('Extracted Features: %d' % len(features))
# save to file; the context manager closes the handle once the dump is done
with open('features.pkl', 'wb') as features_file:
    dump(features, features_file)

In [None]:
# extract texts for images
def load_text(limit=101, df=None):
    """Map image ids to their InChI strings for the leading rows of a frame.

    Ids and texts are paired by row position, so the result does not depend
    on the frame's index labels.

    NOTE(review): the default of 101 rows is kept from the original code,
    while ``extract_features`` processes 100 images -- confirm which count
    is intended.

    Args:
        limit: number of leading rows to include; ``None`` includes all rows.
        df: frame with ``image_id`` and ``InChI`` columns. When omitted, the
            notebook-level ``df_train_labels`` is used.

    Returns:
        dict of ``image_id`` -> ``InChI``.
    """
    if df is None:
        df = df_train_labels
    subset = df.iloc[:limit]
    return dict(zip(subset['image_id'], subset['InChI']))

In [None]:
def to_vocabulary(descriptions):
    """Return the set of distinct values stored in a mapping.

    Each value is taken as a whole (it is not split into tokens), so the
    result holds one entry per unique description.
    """
    return set(descriptions.values())

In [None]:
# image_id -> InChI mapping, and the set of distinct InChI strings in it.
texts = load_text()
vocabulary  = to_vocabulary(texts)

In [None]:
# Number of mapped images versus number of distinct strings among them.
print('Loaded: %d ' % len(texts))
print('Vocabulary Size: %d' % len(vocabulary))

In [None]:
from tqdm.auto import tqdm
import Levenshtein

In [None]:
# Hook tqdm into pandas; the `progress_apply` calls further down rely on it.
tqdm.pandas()

In [None]:
# Re-read the sample submission, this time without `index_col`, so its first
# column stays a regular column (presumably `image_id`, which is selected by
# name when the submission is written -- confirm against the CSV header).
# NOTE(review): this rebinds `test`; the earlier cells that pass `test.index`
# to the visualisation helpers will receive different values if they are
# re-run after this cell.
test = pd.read_csv(path + 'sample_submission.csv')

In [None]:
# Split every InChI string on '/' using pandas' vectorised string methods.
inchi_layers = df_train_labels['InChI'].str.split('/')

# assign() builds a new frame, so `df_train_labels` itself is left untouched
# (a plain `train = df_train_labels` would alias it and add columns to both).
train = df_train_labels.assign(
    InChI_list=inchi_layers,
    InChI_length=inchi_layers.str.len(),
)

# One column per layer position (InChI_0, InChI_1, ...); rows with fewer
# layers get missing values in the trailing columns. expand=True does this
# in one call instead of constructing a pd.Series for every row.
InChI_df = df_train_labels['InChI'].str.split('/', expand=True)
train = pd.concat([train, InChI_df.add_prefix('InChI_')], axis=1)

In [None]:
# Render the frame with the added InChI_list, InChI_length and per-layer InChI_* columns.
display(train)

In [None]:
def get_score(y_true, y_pred):
    """Mean Levenshtein distance between paired true and predicted strings.

    The two sequences are paired with ``zip``, so surplus items of the
    longer one are ignored. The mean is taken with ``np.mean``.
    """
    distances = [
        Levenshtein.distance(expected, predicted)
        for expected, predicted in zip(y_true, y_pred)
    ]
    return np.mean(distances)

In [None]:
# Build a constant prediction from the most frequent value of each of the
# first eleven layer columns. Missing layers are filled with the string
# 'nan'; a layer whose most frequent value is that filler is left out.
mode_parts = []
for layer in range(11):
    most_common = train[f'InChI_{layer}'].fillna('nan').mode()[0]
    if most_common == 'nan':
        continue
    # Every layer except the first is introduced by a '/'
    mode_parts.append(most_common if layer == 0 else '/' + most_common)
mode_concat_string = ''.join(mode_parts)

In [None]:
# Show the assembled constant prediction.
print(mode_concat_string)

In [None]:
# Score the constant prediction against every training label: the same
# string is repeated once per row and compared with the true InChI.
y_true = train['InChI'].values
y_pred = [mode_concat_string] * len(train)
score = get_score(y_true, y_pred)

In [None]:
# Mean edit distance of the constant prediction on the training labels.
print(score)

In [None]:
# Fill every test row with the constant prediction, then show and save the
# two submission columns.
output_cols = ['image_id', 'InChI']
test['InChI'] = mode_concat_string
submission = test[output_cols]
display(submission)
submission.to_csv('submission.csv', index=False)