In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
import keras
from keras import backend as K
from tensorflow.keras import layers, initializers, models, metrics, optimizers
import string
import itertools
import pickle

In [None]:
def get_bag_of_words(sentence):
    """Return the distinct word tokens of `sentence` as a list of strings.

    The sentence is tokenized by a default-configured ``CountVectorizer``;
    the returned list is that vectorizer's learned vocabulary.

    Args:
        sentence: A single text string.

    Returns:
        A list of token strings, or ``None`` when fitting raises
        ``ValueError`` (scikit-learn raises it for an empty vocabulary).
    """
    cv = CountVectorizer()
    docs = np.array([sentence])
    try:
        cv.fit(docs)
    except ValueError:
        return None
    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when available and fall back on old installs.
    if hasattr(cv, "get_feature_names_out"):
        # The replacement returns an ndarray; callers expect a plain list.
        return list(cv.get_feature_names_out())
    return cv.get_feature_names()

def get_char_trigrams(tokens):
    """Build the letter-trigram string for a collection of word tokens.

    Every token is expanded into its character trigrams with the
    ``char_wb`` analyzer (which pads word edges with spaces); those padding
    spaces are rewritten as ``#``.

    Args:
        tokens: Iterable of word strings, or ``None``.

    Returns:
        A one-element list holding the trigrams of all tokens joined by
        spaces; ``['']`` when `tokens` is ``None`` or empty.
    """
    if tokens is None:
        return ['']
    # A single vectorizer is enough: every fit() relearns the vocabulary.
    vectorizer = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    per_token = []
    for token in tokens:
        vectorizer.fit(np.array([token]))
        # `get_feature_names` was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on old installations.
        if hasattr(vectorizer, "get_feature_names_out"):
            names = vectorizer.get_feature_names_out()
        else:
            names = vectorizer.get_feature_names()
        per_token.append(' '.join(name.replace(' ', '#') for name in names))
    return [' '.join(per_token).strip()]

def words_hashing(string):
    """Convert raw text into its letter-trigram representation.

    Chains `get_bag_of_words` and `get_char_trigrams`; the result is whatever
    `get_char_trigrams` returns for the tokens extracted from `string`.

    Note: inside this function the parameter `string` shadows the imported
    `string` module.
    """
    return get_char_trigrams(get_bag_of_words(string))

DSSM (Deep Structured Semantic Model) implementation.

In [None]:
def get_all_trigrams():
    """Enumerate the trigram vocabulary as a 1-D string tensor.

    The vocabulary lists every 3-character combination of lowercase ASCII
    letters and digits first, followed, for each 2-character combination
    ``xy``, by the boundary trigrams ``xy#`` and ``#xy`` (in that order).
    """
    symbols = string.ascii_lowercase + string.digits
    vocabulary = [a + b + c for a, b, c in itertools.product(symbols, repeat=3)]
    for first, second in itertools.product(symbols, repeat=2):
        pair = first + second
        vocabulary.append(f"{pair}#")
        vocabulary.append(f"#{pair}")
    return tf.convert_to_tensor(vocabulary, dtype=tf.string)

In [None]:
# Trigram vocabulary, built once and passed as `vocabulary=` to the
# TextVectorization layers created in `create_semantic_feature_layer`.
TRIGRAMS = get_all_trigrams()

In [None]:
# Number of units in each of the two hidden Dense layers of a tower.
HIDDEN_NODES = 300
# Number of units in the final "semantic feature" Dense layer of a tower.
SEMANTIC_NODES = 128

In [None]:
def create_semantic_feature_layer(input_data, name):
    """Build one tower of the network and return its output tensor.

    The tower counts trigrams from `TRIGRAMS` in the input strings, then
    applies two tanh Dense layers of `HIDDEN_NODES` units and a final tanh
    Dense layer of `SEMANTIC_NODES` units.

    Args:
        input_data: Keras string input tensor feeding the tower.
        name: Suffix appended to every layer name so that the two towers
            do not clash.

    Returns:
        The output tensor of the final Dense layer.
    """
    tower = layers.TextVectorization(
        standardize=None,
        output_mode='count',
        vocabulary=TRIGRAMS,
        name=f"words_hashing_{name}",
    )(input_data)

    # Two identically configured hidden layers, numbered 1 and 2.
    for index in (1, 2):
        tower = layers.Dense(
            HIDDEN_NODES,
            kernel_initializer="glorot_uniform",
            activation="tanh",
            name=f"hidden_layer_{index}_{name}",
        )(tower)

    return layers.Dense(
        SEMANTIC_NODES,
        kernel_initializer="glorot_uniform",
        activation="tanh",
        name=f"semantic_feature_{name}",
    )(tower)

In [None]:
# One string per row for each side; the input names are the keys used when
# the inputs are supplied as a dict.
input_query = layers.Input(shape=(1,), dtype=tf.string, name="vectorized_query")
input_document = layers.Input(shape=(1,), dtype=tf.string, name="vectorized_documents")
semantic_feature_query = create_semantic_feature_layer(input_query, "query")
semantic_feature_document = create_semantic_feature_layer(input_document, "document")

# normalize=True L2-normalizes both vectors, so the dot product is their
# cosine similarity.
cosine_similarity = layers.Dot(axes=-1, normalize=True)([semantic_feature_query, semantic_feature_document])

# Drop only the trailing unit axis. A bare tf.squeeze() would also remove the
# batch axis whenever a batch holds a single row.
# The affine map sends cosine values in [-1, 1] to scores in [-1, 3].
relevance = 2 * tf.squeeze(cosine_similarity, axis=-1) + 1

# outputs = layers.Softmax(axis=-1, name="posterior_probability")(relevance)

# Use the Model class from the same package as the layers (tensorflow.keras)
# rather than the standalone `keras` package.
model = models.Model(inputs=[input_query, input_document], outputs=relevance, name="dssm")

In [None]:
model.compile(
    optimizer=optimizers.Adam(),
    # The model emits a single real-valued relevance score per pair, so this
    # is a regression: categorical cross-entropy (which expects a probability
    # distribution over classes) does not apply. MSE is the squared form of
    # the RMSE metric tracked below.
    loss="mean_squared_error",
    metrics=[metrics.RootMeanSquaredError()]
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

Data loading.

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# `train_inputs` and `train_target` are later passed to model.fit as x and y.
with open("../input/traindata/train_inputs.pickle", "rb") as f:
    train_inputs = pickle.load(f)

with open("../input/traindata/train_target.pickle", "rb") as f:
    train_target = pickle.load(f)

Model training.

In [None]:
# Train for 20 epochs with batches of 1024 rows.
# validation_split=0.5 holds out half of the provided data for validation.
# Per the Keras documentation the held-out part is taken from the end of the
# arrays, before shuffling.
model.fit(
    x=train_inputs,
    y=train_target,
    validation_split=0.5,
    batch_size=1024,
    epochs=20
)

Predicting.

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("../input/test-data/test_inputs.pickle", "rb") as f:
    test_inputs = pickle.load(f)
    
# Keep only element 1 of the unpickled object.
# NOTE(review): the commented-out `prepare_test_data` at the end of this
# notebook returns [queries, documents]; for that layout index 1 would be the
# documents alone. Confirm the layout of the pickled file.
test_inputs = test_inputs[1]
# Raw test table; its "id" column is used when building the submission frame.
test = pd.read_csv("../input/home-depot-product-search-relevance/test.csv.zip", encoding="ISO-8859-1")

In [None]:
# Replace missing (None) rows of the first input with a one-element list
# holding an empty string, and store the result as an object array.
test_inputs[0] = np.asarray(
    [row if row is not None else [''] for row in test_inputs[0]],
    dtype=object,
)

In [None]:
# Score the prepared test inputs with the trained model.
predictions = model.predict(
    x=test_inputs
)

In [None]:
# Pair every test id with its predicted relevance.
# DataFrame columns must be 1-D: np.ravel leaves a (N,) prediction array
# unchanged and flattens a (N, 1) one, so either output shape works.
predictions_df = pd.DataFrame(
    data={"id": test["id"], "relevance": np.ravel(predictions)}
)

In [None]:
# Write the submission file without the DataFrame index.
# newline="" leaves line endings to the CSV writer instead of having the
# file object translate them.
with open("./submission.csv", "w", newline="") as f:
    predictions_df.to_csv(f, index=False)

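Training data preprocessing and saving. The cells below are commented out; they build the training inputs and targets and write them to `train_inputs.pickle` and `train_target.pickle`.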


In [None]:
# train = pd.read_csv("../input/home-depot-product-search-relevance/train.csv.zip", encoding="ISO-8859-1")
# descriptions = pd.read_csv("../input/home-depot-product-search-relevance/product_descriptions.csv.zip")

In [None]:
# train_df = train.merge(descriptions, on="product_uid")
# train_df["product"] = train_df["product_title"] + " " + train_df["product_description"]

In [None]:
# def prepare_train_data(df):
#     prepared_documents = []
#     prepared_queries = []
#     prepared_relevances = []
#     for _, row in df.iterrows():
#         hashed_query = words_hashing(row["search_term"])
#         if hashed_query is None:
#             continue
#         hashed_document = words_hashing(row["product"])
#         if hashed_document is None:
#             continue
#         relevance = row["relevance"]
#         prepared_queries.append(hashed_query)
#         prepared_documents.append(hashed_document)
#         prepared_relevances.append(relevance)
#     inputs = {
#         "vectorized_documents": np.asarray(prepared_documents),
#         "vectorized_query": np.asarray(prepared_queries)
#     }
#     target = np.asarray(prepared_relevances)
#     return inputs, target

In [None]:
# train = prepare_train_data(train_df)
# train_inputs, train_target = train[0], train[1]

In [None]:
# with open("./train_inputs.pickle", "wb") as f:
#     pickle.dump(train_inputs, f)
    
# with open("./train_target.pickle", "wb") as f:
#     pickle.dump(train_target, f)

Test data preprocessing and saving.

In [None]:
# test = pd.read_csv("../input/home-depot-product-search-relevance/test.csv.zip", encoding="ISO-8859-1")
# descriptions = pd.read_csv("../input/home-depot-product-search-relevance/product_descriptions.csv.zip")

In [None]:
# test_df = test.merge(descriptions, on="product_uid")
# test_df["product"] = test_df["product_title"] + " " + test_df["product_description"]

In [None]:
# def prepare_test_data(df):
#     prepared_documents = []
#     prepared_queries = []
#     for _, row in df.iterrows():
#         prepared_queries.append(words_hashing(row["search_term"]))
#         prepared_documents.append(words_hashing(row["product"]))
#     inputs = [np.asarray(prepared_queries, dtype=object), np.asarray(prepared_documents, dtype=object)]
#     return inputs

In [None]:
# test_inputs = prepare_test_data(test_df)

In [None]:
# with open("./test_inputs.pickle", "wb") as f:
#     pickle.dump(test_inputs, f)