In [None]:
!pip install vizard

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tqdm.auto import tqdm

import vizard

# Never truncate long text cells when a frame is displayed (titles/descriptions are long).
pd.set_option('display.max_colwidth', None)
# Seaborn theme used by every figure below.
sns.set_style(style='darkgrid')

# Data

In [None]:
!unzip ../input/home-depot-product-search-relevance/product_descriptions.csv.zip
!unzip ../input/home-depot-product-search-relevance/train.csv.zip
!unzip ../input/home-depot-product-search-relevance/test.csv.zip

In [None]:
# Product descriptions; joined onto train/test through `product_uid` in the next cells.
product_description = pd.read_csv(filepath_or_buffer='product_descriptions.csv')
print(product_description.shape)
product_description.head(n=5)

In [None]:
# Training rows, each joined with the description of its product
# (default inner join on `product_uid`).
dtrain = pd.merge(
    pd.read_csv('train.csv', encoding='latin-1'),
    product_description,
    on='product_uid',
)
print(dtrain.shape)
dtrain.head(n=5)

In [None]:
# Test rows, each joined with the description of its product
# (default inner join on `product_uid`).
dtest = pd.merge(
    pd.read_csv('test.csv', encoding='latin-1'),
    product_description,
    on='product_uid',
)
print(dtest.shape)
dtest.head(n=5)

# Cleaning

In [None]:
from tqdm.auto import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
import re, spacy


class TextCleaner(BaseEstimator, TransformerMixin):
    """A general purpose text cleaning transformer built on `spacy` and regex.

    For every input document it:
        * lower cases the text and strips surrounding white space
        * removes urls and emails
        * removes html, css and js
        * removes stop words
        * performs lemmatization
        * removes numbers and punctuation
        * collapses runs of white space into a single space

    Args:
        model (str): spacy language model to load, default: en

    Attributes:
        model (str): the constructor argument, kept so that scikit-learn's
            `get_params` / `clone` / `repr` can introspect the estimator.
        nlp: the loaded spacy pipeline (parser and NER disabled).
    """

    # Substitutions applied, in this order, before the text is handed to spacy.
    # Compiled once at class-creation time instead of once per document.
    _MARKUP_PATTERNS = (
        re.compile(r"""((http[s]?://)[^ <>'"{}|\^`[\]]*)"""),  # urls
        re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"),  # emails
        # Non-greedy bodies: with several <style>/<script> elements in one
        # document, only the elements themselves are removed, not the text
        # sitting between the first opening tag and the last closing tag.
        re.compile(r"<style[^>]*>[\s\S]*?</style>"),  # css
        re.compile(r"<script[^>]*>[\s\S]*?</script>"),  # js
        re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"),  # html tags/entities
    )
    _NON_WORD = re.compile(r"\W")
    _DIGIT = re.compile(r"\d")
    _WHITESPACE = re.compile(r"\s+")

    def __init__(self, model="en"):
        # scikit-learn reads constructor arguments back from same-named attributes.
        self.model = model
        disabled = ["parser", "ner"]
        try:
            self.nlp = spacy.load(model, disable=disabled)
        except OSError:
            # spaCy v3 removed shortcut names such as "en"; fall back to the
            # small English package for the default only. Any other model name
            # that fails to load is reported to the caller unchanged.
            if model != "en":
                raise
            self.nlp = spacy.load("en_core_web_sm", disable=disabled)

    def fit(self, X=None, y=None):
        """No-op: the cleaner is stateless. `y` is accepted for Pipeline compatibility.

        Returns:
            TextCleaner: self
        """
        return self

    def _clean(self, x):
        """Clean a single document and return the cleaned string."""
        text = str(x).strip().lower()  # Lower case the data
        for pattern in self._MARKUP_PATTERNS:
            text = pattern.sub(" ", text)

        if not text:
            # Nothing left to parse; skip the spacy call.
            return text

        parsed = self.nlp(text)
        lemmatized = " ".join(w.lemma_ for w in parsed if not w.is_stop)

        # Remove punctuation, then digits, then collapse the resulting gaps.
        cleaned = self._NON_WORD.sub(" ", lemmatized)
        cleaned = self._DIGIT.sub(" ", cleaned)
        return self._WHITESPACE.sub(" ", cleaned)

    def transform(self, X):
        """Clean every element of `X`.

        Args:
            X (iterable): documents; each element is converted with `str()`.

        Returns:
            list[str]: cleaned documents, in input order.
        """
        return [self._clean(x) for x in tqdm(X)]

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to `fit(X, y).transform(X)`; extra fit params are ignored."""
        return self.fit(X, y).transform(X)

In [None]:
# The cleaning pass is not re-run here. The commented snippet below documents how the
# raw -> cleaned lookup file that is read afterwards was generated.
# textcleaner = TextCleaner()
# all_text = pd.DataFrame({
#     'raw_text': dtrain['product_title'].values.tolist() + dtrain['search_term'].values.tolist() + 
#     dtest['product_title'].values.tolist() + dtest['search_term'].values.tolist() + 
#     product_description['product_description'].values.tolist()
# }).drop_duplicates().reset_index(drop=True)
# all_text['cleaned_text'] = textcleaner.fit_transform(all_text['raw_text'])

# all_text.to_csv('home-depot-product-search-relevance.csv', index=False)

# Load the cached cleaning result and turn it into a raw_text -> cleaned_text dictionary.
text_map = pd.read_csv('../input/qqp-cleaned/home-depot-product-search-relevance.csv')
text_map = dict(zip(text_map['raw_text'].values, text_map['cleaned_text'].values))

# Replace every raw text column, in both frames, with its cleaned counterpart.
# A raw value missing from the lookup raises KeyError.
for frame in (dtrain, dtest):
    for column in ('product_title', 'product_description', 'search_term'):
        frame[column] = frame[column].apply(lambda raw: text_map[raw])

# Drop the lookup (and the loop helpers) once the columns are replaced.
del text_map, frame, column

# Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def _fit_tfidf(column):
    """Fit a sublinear-tf TF-IDF vectorizer on the distinct, non-null values
    that `column` takes across `dtrain` and `dtest`."""
    corpus = pd.concat([dtrain[column], dtest[column]], axis=0).drop_duplicates().dropna()
    return TfidfVectorizer(sublinear_tf=True).fit(corpus)


# One vocabulary per text field.
title_vectorizer = _fit_tfidf('product_title')
description_vectorizer = _fit_tfidf('product_description')
search_vectorizer = _fit_tfidf('search_term')

# Vectorize train then test for each field; missing text is treated as an empty document.
train_title, test_title = (
    title_vectorizer.transform(df['product_title'].fillna('')) for df in (dtrain, dtest)
)
train_description, test_description = (
    description_vectorizer.transform(df['product_description'].fillna('')) for df in (dtrain, dtest)
)
train_search, test_search = (
    search_vectorizer.transform(df['search_term'].fillna('')) for df in (dtrain, dtest)
)

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.base import BaseEstimator, TransformerMixin


class ConvertToSparseTensor(BaseEstimator, TransformerMixin):
    """A utility to convert sparse matrices into `tf.SparseTensor` objects."""

    def fit(self, X=None, y=None):
        """No-op: the converter is stateless. `X` and `y` are ignored.

        Returns:
            ConvertToSparseTensor: self
        """
        return self

    def transform(self, X):
        """Convert `X` into a `tf.SparseTensor` with canonically ordered indices.

        Args:
            X: an object exposing `.tocoo()` (e.g. a scipy sparse matrix).

        Returns:
            tf.SparseTensor: same shape and values as `X`, passed through
            `tf.sparse.reorder`.
        """
        coo = X.tocoo()
        # (nnz, 2) array of [row, col] pairs. Built with a plain ndarray because
        # `np.mat` no longer exists in NumPy 2.0; int64 is the index dtype
        # `tf.SparseTensor` expects.
        indices = np.array([coo.row, coo.col], dtype=np.int64).T
        return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))


In [None]:
# Stateless converter; `fit` is called only to follow the transformer protocol.
cvt_to_tensors = ConvertToSparseTensor().fit(None)

# Replace each TF-IDF matrix with its sparse-tensor equivalent, field by field.
train_title, test_title = map(cvt_to_tensors.transform, (train_title, test_title))
train_description, test_description = map(
    cvt_to_tensors.transform, (train_description, test_description)
)
train_search, test_search = map(cvt_to_tensors.transform, (train_search, test_search))

# Regression target.
y_train = dtrain['relevance'].to_numpy()

# Modelling

In [None]:
def _dense_tower(inputs, units=(128, 32), dropout_rate=0.3):
    """Apply one Dense(relu) -> Dropout -> BatchNormalization stage per entry in `units`.

    Args:
        inputs: Keras tensor to feed into the first Dense layer.
        units: output width of each successive Dense layer.
        dropout_rate: rate passed to every Dropout layer.

    Returns:
        The Keras tensor produced by the last BatchNormalization layer.
    """
    x = inputs
    for n_units in units:
        x = tf.keras.layers.Dense(n_units, activation='relu')(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        x = tf.keras.layers.BatchNormalization()(x)
    return x


# One input per text field, sized by that field's TF-IDF vocabulary.
# Shapes are passed as 1-tuples, which every Keras version accepts.
title_input = tf.keras.Input(shape=(train_title.shape[1],), name='title')
description_input = tf.keras.Input(shape=(train_description.shape[1],), name='description')
search_input = tf.keras.Input(shape=(train_search.shape[1],), name='search')

# Each field gets its own 128 -> 32 tower before the representations are merged.
title_x = _dense_tower(title_input)
description_x = _dense_tower(description_input)
search_x = _dense_tower(search_input)

# Joint 64 -> 32 head on the concatenated field representations.
x = tf.keras.layers.concatenate([title_x, description_x, search_x])
x = _dense_tower(x, units=(64, 32))

# Single linear unit: relevance is predicted as a real-valued score.
output_layer = tf.keras.layers.Dense(1)(x)

model = tf.keras.models.Model([title_input, description_input, search_input], output_layer)
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases
# ignore or reject, so the intended 1e-4 step size is actually applied.
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Text overview of the model built above.
model.summary()

In [None]:
# Both callbacks monitor the training loss ('loss'); no validation data is passed to fit().
es = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)
rlp = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    patience=2,
    verbose=1,
)

# Inputs are given in the same order as the model's input list: title, description, search.
history = model.fit(
    x=[train_title, train_description, train_search],
    y=y_train,
    batch_size=64,
    epochs=250,
    callbacks=[es, rlp],
)

In [None]:
# Training-loss curve, one point per completed epoch (the frame index is the epoch number).
fig, ax = plt.subplots(figsize=(20,6))
pd.DataFrame(history.history)[['loss']].plot(ax=ax)
# Label the figure so it can be read on its own; the model is compiled with MSE loss.
ax.set(title='Training loss per epoch', xlabel='epoch', ylabel='mean squared error');

In [None]:
# Predict on the test inputs and flatten the (n, 1) output to one score per row.
test_predictions = np.ravel(model.predict([test_title, test_description, test_search]))

# Clamp the scores to the [1, 3] interval before writing the submission file.
submission = pd.DataFrame({
    'id': dtest['id'].values,
    'relevance': np.clip(test_predictions, 1, 3),
})
submission.to_csv('submission.csv', index=False)

# Reference
[Neural Collaborative Filtering](https://arxiv.org/pdf/1708.05031.pdf)