# Report link
 Notion link: hvn2706.notion.site/INT3405E_20-B-o-c-o-b-i-t-p-cu-i-kh-a-8ddde04d9d7e400383ece7927e884036

# Initialize

In [None]:
%pip install nltk
%pip install bs4
%pip install textdistance
%pip install catboost

In [None]:
#data visualize
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import math
import time

# preprocess
import string
import re

from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import ast

# extract feature
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy import spatial
import textdistance

In [None]:
# Fetch the NLTK stopword corpus that stopwords.words('english') reads later on
nltk.download('stopwords')

In [None]:
# Directory holding the competition files; every table is read directly from its zipped CSV
PATH = '../input/home-depot-product-search-relevance/'
attributes = pd.read_csv(f'{PATH}attributes.csv.zip')
descriptions = pd.read_csv(f'{PATH}product_descriptions.csv.zip')
# test.csv and train.csv are decoded as latin-1
question_test = pd.read_csv(f'{PATH}test.csv.zip', encoding='latin-1')
train = pd.read_csv(f'{PATH}train.csv.zip', encoding='latin-1')

# Data exploration

In [None]:
# Summary statistics of the target column
train['relevance'].describe()

In [None]:
# Count plot of the relevance scores. The Series is passed as `x=` explicitly:
# current seaborn reads a positional argument as `data`, so a bare positional
# Series is not treated as the variable whose values are counted.
sns.countplot(x=train['relevance'])

In [None]:
# Rows of train that contain at least one missing value
train[train.isna().any(axis=1)]

In [None]:
# Rows of descriptions that contain at least one missing value
descriptions[descriptions.isna().any(axis=1)]

In [None]:
# Rows of attributes that contain at least one missing value
attributes[attributes.isna().any(axis=1)]

In [None]:
# Number of distinct product_uid values, printed in this order:
# description table, test set, train set
print(len(descriptions['product_uid'].unique()), len(question_test['product_uid'].unique()), len(train['product_uid'].unique()))

In [None]:
# Character length of every product title (values are coerced with str() first),
# computed once and reused for the summary and the plot.
title_lengths = train['product_title'].apply(lambda x: len(str(x)))
print('Longest title:', title_lengths.max())
print('Shortest title:', title_lengths.min())
# Passed as `x=`: seaborn reads a positional argument as `data`, not as the variable to count
sns.countplot(x=title_lengths)

In [None]:
# Character length of every search term (values are coerced with str() first),
# computed once and reused for the summary and the plot.
search_term_lengths = train['search_term'].apply(lambda x: len(str(x)))
print('longest search_term:', search_term_lengths.max())
print('shortest search_term:', search_term_lengths.min())
# Passed as `x=`: seaborn reads a positional argument as `data`, not as the variable to count
sns.countplot(x=search_term_lengths)

In [None]:
# Character length of each attribute value after str() coercion
attribute_lengths = attributes['value'].apply(lambda value: len(str(value)))
print('longest attribute:', attribute_lengths.max())
print('shortest attribute:', attribute_lengths.min())
# Line plot of the lengths in row order
plt.plot(attribute_lengths)

In [None]:
# Character length of each product description after str() coercion
description_lengths = descriptions['product_description'].apply(lambda text: len(str(text)))
print('longest description:', description_lengths.max())
print('shortest description:', description_lengths.min())
# Line plot of the lengths in row order
plt.plot(description_lengths)

# Preprocessing

Replace null values

In [None]:
# DataFrame.fillna returns a new frame; the result must be assigned back,
# otherwise the call has no effect on the data.
train = train.fillna(' ')
descriptions = descriptions.fillna(' ')
attributes[['name', 'value']] = attributes[['name', 'value']].fillna(' ')
# Attribute rows without a product id get the placeholder id 0
attributes['product_uid'] = attributes['product_uid'].fillna(0)

Fixing typos in search_term

In [None]:
# load typos file from https://www.kaggle.com/steubk/fixing-typos
# The context manager closes the file even when reading fails.
with open("../input/home-depot-typos/search_term_typo.txt", "r") as file:
    contents = file.read()

# literal_eval parses the text as a Python literal; it does not execute it
correct_typo = ast.literal_eval(contents)

In [None]:
# Swap each search term for its correction when one is listed; other terms stay unchanged
train['search_term'] = train['search_term'].map(lambda term: correct_typo.get(term, term))
question_test['search_term'] = question_test['search_term'].map(lambda term: correct_typo.get(term, term))

Merge the data into one data frame

In [None]:
# Cast the product id to int32 so it can serve as the merge key
attributes['product_uid'] = attributes['product_uid'].astype(np.int32)
# One "<name> <value>" string per attribute row
attributes['name_value'] = attributes['name'].map(str) + ' ' + attributes['value'].map(str)
# Join all attribute strings of a product into one space-separated text, indexed by product_uid
att_tmp = pd.pivot_table(attributes, index=['product_uid'], values=['name_value'], aggfunc=lambda x: ' '.join(x))

In [None]:
# Preview the aggregated attribute text
att_tmp.head(5)

In [None]:
# Left-join the product description and the aggregated attribute text onto
# every train row, matching on product_uid.
train = (
    train
    .merge(descriptions, how='left', on='product_uid')
    .merge(att_tmp, how='left', on='product_uid')
)

# Same joins for the test set
question_test = (
    question_test
    .merge(descriptions, how='left', on='product_uid')
    .merge(att_tmp, how='left', on='product_uid')
)

In [None]:
# Preview the merged training frame
train.head()

Tokenize the text into words, strip HTML tags, remove stopwords and apply stemming

In [None]:
# Shared text-processing objects used by the helper functions below
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters
stop_words = set(stopwords.words('english'))  # kept as a set for membership tests
ps = PorterStemmer()

def tokenize(text: str):
    """Split *text* into word tokens with the module-level ``tokenizer``.

    The argument is coerced with ``str()`` first, so a non-string value is
    tokenized by its string form.

    Returns:
        The list of tokens produced by ``tokenizer.tokenize``.
    """
    return tokenizer.tokenize(str(text))


def remove_html(text):
    """Return the plain text of *text* without markup or the return-policy sentence.

    The value is coerced with ``str()``, parsed by BeautifulSoup with the
    ``lxml`` parser, and reduced to its text content. The fixed sentence
    about the return policy is then removed wherever it occurs.
    """
    # reference from https://www.kaggle.com/yowtshjhj/hdp-search-relevant-from-doananh020418?scriptVersionId=82967241&cellId=44
    boilerplate = 'Click here to review our return policy for additional information regarding returns'
    plain_text = BeautifulSoup(str(text), 'lxml').get_text()
    return plain_text.replace(boilerplate, '')


def remove_stopwords(token_list: list):
    """Return the tokens of *token_list* that are not in ``stop_words``.

    Order is preserved. The check is an exact membership test against the
    module-level ``stop_words`` set, so it is case-sensitive.
    """
    return [token for token in token_list if token not in stop_words]


def stemming(token_list: list):
    """Return a new list with ``ps.stem`` applied to each token, in order."""
    return [ps.stem(token) for token in token_list]


def preprocess(df):
    """Clean the four text columns of *df* and add a token list for each.

    For ``name_value``, ``product_description``, ``product_title`` and
    ``search_term`` the text column is overwritten with its HTML-free text,
    and a matching ``token_*`` column is added holding the tokenized,
    stopword-filtered and stemmed words. A progress message is printed after
    each column.

    Args:
        df: DataFrame containing the four text columns named above.

    Returns:
        The same DataFrame, modified in place.
    """
    # (text column, token column, progress message)
    text_columns = (
        ('name_value', 'token_name_value', 'attribute done!'),
        ('product_description', 'token_description', 'description done!'),
        ('product_title', 'token_title', 'title done!'),
        ('search_term', 'token_search_term', 'search term done!'),
    )

    # NOTE(review): missing values are deliberately not filled here. The helpers
    # coerce every value with str(), so a missing text is processed as its
    # string form. Filling with '' instead would yield empty token lists and
    # therefore zero denominators in the tf_* features computed later -
    # change both together if that is wanted.
    for text_col, token_col, message in text_columns:
        df[text_col] = df[text_col].apply(remove_html)
        # Every column, search_term included, goes through the same
        # tokenize -> remove stopwords -> stem chain.
        df[token_col] = df[text_col].apply(
            lambda text: stemming(remove_stopwords(tokenize(text)))
        )
        print(message)

    return df

In [None]:
# Preprocess the training frame and report the elapsed wall-clock time in seconds
start_time = time.time()
train = preprocess(train) #might take some minutes to run
run_time = time.time() - start_time
print(run_time)

In [None]:
# Preview the training frame after preprocessing
train.head(5)

# Feature extraction

## Get text and token length

In [None]:
def cal_length_text(df):
    """Add character-length columns for the text fields of *df*.

    Adds ``len_name_value``, ``len_description``, ``len_title`` and
    ``len_search_term``, plus ``len_all`` as the sum of the first three
    (the search term is not part of the total).

    Returns:
        The same DataFrame, modified in place.
    """
    # name_value is measured through its str() form; the other columns are measured directly
    df['len_name_value'] = df['name_value'].apply(lambda value: len(str(value)))

    for length_col, text_col in (
        ('len_description', 'product_description'),
        ('len_title', 'product_title'),
        ('len_search_term', 'search_term'),
    ):
        df[length_col] = df[text_col].apply(len)

    df['len_all'] = df['len_name_value'] + df['len_description'] + df['len_title']
    return df

def cal_length_token(df):
    """Add token-count columns for the tokenized fields of *df*.

    Adds ``token_len_name_value``, ``token_len_description``,
    ``token_len_title`` and ``token_len_search_term``, plus ``token_len_all``
    as the sum of the first three (the search term is not part of the total).

    Returns:
        The same DataFrame, modified in place.
    """
    for count_col, token_col in (
        ('token_len_name_value', 'token_name_value'),
        ('token_len_description', 'token_description'),
        ('token_len_title', 'token_title'),
        ('token_len_search_term', 'token_search_term'),
    ):
        df[count_col] = df[token_col].apply(len)

    df['token_len_all'] = (
        df['token_len_name_value'] + df['token_len_description'] + df['token_len_title']
    )
    return df

In [None]:
# Character-length features first, then token-count features
train = cal_length_token(cal_length_text(train))

## Calculate tf-idf

In [None]:
def count_words(term, docs: list):
    """Count how often the words of *term* are found in the entries of *docs*.

    Every (word, entry) pair is tested with ``word in entry`` and each hit
    adds one. When the entries are strings this is a substring test, so the
    word 'dog' also matches the entry 'dogs'.

    Args:
        term: iterable of search words.
        docs: list of entries to test against.

    Returns:
        The total number of hits as an int (0 when either input is empty).
    """
    return sum(1 for word in term for entry in docs if word in entry)

In [None]:
def calIDF(term, docs: list):
    """Return ``log(N / (1 + df))`` for the words of *term* over *docs*.

    ``N`` is ``len(docs)``. ``df`` is the number of entries of *docs* that
    have at least one element ``e`` for which ``word in e`` holds for some
    word of *term*. Each entry of *docs* is itself iterated, so an entry is
    expected to be a list of tokens; if an entry is a plain string, its
    elements are single characters.

    Raises:
        ValueError: when *docs* is empty, because the logarithm of 0 is taken.
    """
    total_docs = len(docs)
    matching_docs = sum(
        1
        for doc in docs
        if any(word in element for element in doc for word in term)
    )
    return math.log(total_docs / (1 + matching_docs))

# calIDF(['hello'], [['hello', 'i', 'am', 'good'], ['your', 'dog'], ['my', 'dog', 'say', 'hi']])

In [None]:
def cal_tfidf_features(df):
    """Add match-count, tf, idf and tf-idf columns for the search term of each row.

    Added columns:
        cnt_name_value, cnt_description, cnt_title, cnt_all:
            ``count_words`` of the search-term tokens against the tokens of
            each field, and against the three fields concatenated.
        tf_name_value, tf_description, tf_title, tf_all:
            the counts divided by the matching ``token_len_*`` column.
        idf_all: ``calIDF`` of the search-term tokens against the concatenated tokens.
        tf_idf: ``tf_all * idf_all``.

    Requires the ``token_*`` and ``token_len_*`` columns to exist already.

    Returns:
        The same DataFrame, modified in place.
    """
    cnt_name_value = []
    cnt_description = []
    cnt_title = []
    cnt_all = []
    idf_all = []

    # Walk the rows by position. Looking rows up with df[col][i] would treat
    # the loop counter as an index label and fail or pick the wrong row on
    # any frame that does not have the default 0..n-1 index.
    rows = zip(
        df['token_search_term'],
        df['token_name_value'],
        df['token_description'],
        df['token_title'],
    )
    for term_tokens, name_value_tokens, description_tokens, title_tokens in rows:
        all_tokens = name_value_tokens + description_tokens + title_tokens

        cnt_name_value.append(count_words(term_tokens, name_value_tokens))
        cnt_description.append(count_words(term_tokens, description_tokens))
        cnt_title.append(count_words(term_tokens, title_tokens))
        cnt_all.append(count_words(term_tokens, all_tokens))

        # NOTE(review): calIDF iterates over the elements of every entry of its
        # second argument. It receives one flat token list here, so those
        # elements are the characters of each token. Confirm whether a list
        # of token lists (one per field) was intended.
        idf_all.append(calIDF(term_tokens, all_tokens))

    df['cnt_name_value'] = cnt_name_value
    df['cnt_description'] = cnt_description
    df['cnt_title'] = cnt_title
    df['cnt_all'] = cnt_all

    # Series division; a zero token count does not raise here
    df['tf_name_value'] = df['cnt_name_value'] / df['token_len_name_value']
    df['tf_description'] = df['cnt_description'] / df['token_len_description']
    df['tf_title'] = df['cnt_title'] / df['token_len_title']
    df['tf_all'] = df['cnt_all'] / df['token_len_all']

    df['idf_all'] = idf_all
    df['tf_idf'] = df['tf_all'] * df['idf_all']

    return df

In [None]:
# Compute the count / tf / idf features for train and report the elapsed seconds
start_time = time.time()
train = cal_tfidf_features(train)
run_time = time.time() - start_time
print(run_time)

## Convert documents into vectors and calculate similarity

Join each token list back into a single document string

In [None]:
def demerge(df):
    """Join every token list back into one space-separated string.

    For each ``token_*`` column (name_value, description, title, search_term)
    a ``doc_*`` column with the joined text is added, so the rows can be fed
    to a text vectorizer.

    Returns:
        The same DataFrame, modified in place.
    """
    column_pairs = (
        ('token_name_value', 'doc_name_value'),
        ('token_description', 'doc_description'),
        ('token_title', 'doc_title'),
        ('token_search_term', 'doc_search_term'),
    )
    for token_col, doc_col in column_pairs:
        df[doc_col] = [' '.join(tokens) for tokens in df[token_col]]

    return df

Convert documents into vectors

In [None]:
# Module-level vectorizers with default settings; both are passed to vectorize() below
tfidfvectorizer = TfidfVectorizer()
countvectorizer = CountVectorizer()

def vectorize(df, vectorizer):
    """Vectorize the four ``doc_*`` texts of every row and store the vectors.

    The vectorizer is refitted on each row's four texts (attributes,
    description, title, search term), so the vocabulary is local to that row
    and the four vectors of one row share the same dimensions.

    Args:
        df: DataFrame with ``doc_name_value``, ``doc_description``,
            ``doc_title`` and ``doc_search_term`` columns.
        vectorizer: a ``TfidfVectorizer`` (columns get the prefix ``'t'``)
            or any other vectorizer such as ``CountVectorizer`` (prefix ``'c'``).

    Returns:
        The same DataFrame with ``{prefix}v_name_value``,
        ``{prefix}v_description``, ``{prefix}v_title`` and
        ``{prefix}v_search_term`` added.
    """
    # Decide by type instead of comparing against one particular global
    # instance, so any TfidfVectorizer gets the 't' prefix.
    prefix = 't' if isinstance(vectorizer, TfidfVectorizer) else 'c'

    v_name_value = []
    v_description = []
    v_title = []
    v_search_term = []

    # Walk the rows by position rather than by index label
    rows = zip(
        df['doc_name_value'],
        df['doc_description'],
        df['doc_title'],
        df['doc_search_term'],
    )
    for row_texts in rows:
        vectors = vectorizer.fit_transform(list(row_texts)).toarray()

        v_name_value.append(vectors[0])
        v_description.append(vectors[1])
        v_title.append(vectors[2])
        v_search_term.append(vectors[3])

    df[f'{prefix}v_name_value'] = v_name_value
    df[f'{prefix}v_description'] = v_description
    df[f'{prefix}v_title'] = v_title
    df[f'{prefix}v_search_term'] = v_search_term

    return df
    

def cosine_similarity(data1, data2):
    """Return the cosine similarity of two 1-D vectors.

    If either vector consists only of zeros the result is 0, which avoids
    computing a cosine distance against a zero vector. Otherwise the result
    is ``1 - spatial.distance.cosine(data1, data2)``.
    """
    if all(np.array(data1) == 0):
        return 0
    if all(np.array(data2) == 0):
        return 0

    distance = spatial.distance.cosine(data1, data2)
    return 1 - distance

    
def get_cosine_similarity(df, prefix = 't'):
    """Add the cosine similarity between the search-term vector and each field vector.

    Args:
        df: DataFrame holding the ``{prefix}v_*`` vector columns.
        prefix: ``'t'`` for the tf-idf vectors, ``'c'`` for the count vectors.

    Returns:
        The same DataFrame with ``{prefix}cosine_sl_name_value``,
        ``{prefix}cosine_sl_description`` and ``{prefix}cosine_sl_title`` added.
    """
    term_vectors = df[f'{prefix}v_search_term']

    # Rows are paired by position with zip; looking them up with df[col][i]
    # would treat the counter as an index label and break on a frame without
    # the default 0..n-1 index.
    df[f'{prefix}cosine_sl_name_value'] = [
        cosine_similarity(vec, term_vec)
        for vec, term_vec in zip(df[f'{prefix}v_name_value'], term_vectors)
    ]
    df[f'{prefix}cosine_sl_description'] = [
        cosine_similarity(vec, term_vec)
        for vec, term_vec in zip(df[f'{prefix}v_description'], term_vectors)
    ]
    df[f'{prefix}cosine_sl_title'] = [
        cosine_similarity(vec, term_vec)
        for vec, term_vec in zip(df[f'{prefix}v_title'], term_vectors)
    ]

    return df


def get_jaccard_similarity(df):
    """Add the normalized Jaccard similarity between the search term and each field.

    The comparison is made on the joined ``doc_*`` strings (not on the token
    lists) with ``textdistance.jaccard.normalized_similarity``.

    Returns:
        The same DataFrame with ``jaccard_sl_name_value``,
        ``jaccard_sl_description`` and ``jaccard_sl_title`` added.
    """
    similarity = textdistance.jaccard.normalized_similarity
    search_terms = df['doc_search_term']

    # Rows are paired by position with zip instead of being looked up by
    # index label, so the function also works on a frame without the default index.
    df['jaccard_sl_name_value'] = [
        similarity(term, doc) for term, doc in zip(search_terms, df['doc_name_value'])
    ]
    df['jaccard_sl_description'] = [
        similarity(term, doc) for term, doc in zip(search_terms, df['doc_description'])
    ]
    df['jaccard_sl_title'] = [
        similarity(term, doc) for term, doc in zip(search_terms, df['doc_title'])
    ]

    return df


def get_hamming_similarity(df):
    """Add the normalized Hamming similarity between the search term and each field.

    The comparison is made on the joined ``doc_*`` strings (not on the token
    lists) with ``textdistance.hamming.normalized_similarity``.

    Returns:
        The same DataFrame with ``hamming_sl_name_value``,
        ``hamming_sl_description`` and ``hamming_sl_title`` added.
    """
    similarity = textdistance.hamming.normalized_similarity
    search_terms = df['doc_search_term']

    # Rows are paired by position with zip instead of being looked up by
    # index label, so the function also works on a frame without the default index.
    df['hamming_sl_name_value'] = [
        similarity(term, doc) for term, doc in zip(search_terms, df['doc_name_value'])
    ]
    df['hamming_sl_description'] = [
        similarity(term, doc) for term, doc in zip(search_terms, df['doc_description'])
    ]
    df['hamming_sl_title'] = [
        similarity(term, doc) for term, doc in zip(search_terms, df['doc_title'])
    ]

    return df

In [None]:
# Join the token lists of name_value, description, title and search_term
# back into strings (doc_* columns) so they can be vectorized
train = demerge(train)

In [None]:
# Build the tf-idf vectors and then the count vectors for train; print the elapsed seconds
start_time = time.time()
train = vectorize(train, tfidfvectorizer)
train = vectorize(train, countvectorizer)
run_time = time.time() - start_time
print(run_time)

In [None]:
# Similarity features for train: cosine on the tf-idf vectors ('t') and on the
# count vectors ('c'), then Jaccard and Hamming on the joined strings
train = get_cosine_similarity(train, prefix='t')
train = get_cosine_similarity(train, prefix='c')
train = get_jaccard_similarity(train)
train = get_hamming_similarity(train)

# Apply to test data set

Run the same preprocessing and feature-extraction steps on the test set

In [None]:
# Repeat, in the same order, every preprocessing and feature step that was
# applied to train, so the test frame ends up with the same feature columns.
start_time = time.time()

question_test = preprocess(question_test) #might take some minutes to run

# length, count, tf and idf features
question_test = cal_length_text(question_test)
question_test = cal_length_token(question_test)
question_test = cal_tfidf_features(question_test)
print('tfidf done')

# token lists -> joined strings
question_test = demerge(question_test)
print('demerge done')

question_test = vectorize(question_test, tfidfvectorizer)
print('tfidfvectorize done')

question_test = vectorize(question_test, countvectorizer)
print('countvectorize done')

question_test = get_cosine_similarity(question_test, prefix='t') # tfidf vectorizer
question_test = get_cosine_similarity(question_test, prefix='c') # count vectorizer
question_test = get_jaccard_similarity(question_test) # jaccard
question_test = get_hamming_similarity(question_test) # hamming
print('similarity done')

# elapsed wall-clock time of the whole test pipeline, in seconds
run_time = time.time() - start_time
print(run_time)

# Fit the data to the model

## Features

In [None]:
# Feature columns fed to the models, one group per line: character lengths,
# token counts, match counts, tf / idf / tf-idf, cosine similarity on tf-idf
# vectors, cosine similarity on count vectors, Jaccard and Hamming similarity
features = ['len_name_value', 'len_description', 'len_title', 'len_search_term', 'len_all',
            'token_len_name_value', 'token_len_description', 'token_len_title', 'token_len_search_term', 'token_len_all',
            'cnt_name_value', 'cnt_description', 'cnt_title', 'cnt_all', 
            'tf_name_value', 'tf_description', 'tf_title', 'tf_all', 'idf_all', 'tf_idf', 
            'tcosine_sl_name_value', 'tcosine_sl_description', 'tcosine_sl_title', 
            'ccosine_sl_name_value', 'ccosine_sl_description', 'ccosine_sl_title',
            'jaccard_sl_name_value', 'jaccard_sl_description', 'jaccard_sl_title',
            'hamming_sl_name_value', 'hamming_sl_description', 'hamming_sl_title']

In [None]:
# Show the product id, the target and every feature column
train[['product_uid', 'relevance'] + features]

In [None]:
# Hold out 10% of the training rows as a validation set (fixed seed for repeatability)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X = train[features]
y = train['relevance']

train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.1, random_state=1)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

# Bagging ensemble whose base estimator is a random forest limited to depth 6
rf_model = RandomForestRegressor(random_state=1, max_depth=6)
clf = BaggingRegressor(rf_model, random_state=1)
clf.fit(train_X, train_y)

# RMSE on the held-out validation split
rf_home_depot_preds = clf.predict(val_X)
rf_val_mse = mean_squared_error(val_y, rf_home_depot_preds)
rf_val_rmse = math.sqrt(rf_val_mse)

print("Validation RMSE for Random Forest Model: {}".format(rf_val_rmse))

## Gradient Boosting lightgbm

In [None]:
import lightgbm as lgb
from lightgbm import LGBMRegressor

# LightGBM regressor with its default hyper-parameters
lgb_model = LGBMRegressor()
lgb_model.fit(train_X, train_y)

# RMSE on the held-out validation split
lgb_home_depot_preds = lgb_model.predict(val_X)
lgb_val_mse = mean_squared_error(val_y, lgb_home_depot_preds)
lgb_val_rmse = math.sqrt(lgb_val_mse)

print("Validation RMSE for Light GBM: {}".format(lgb_val_rmse))

## Catboost

In [None]:
import catboost as cb

# CatBoost regressor with default hyper-parameters; silent=True turns off the training log
cb_model = cb.CatBoostRegressor(silent=True)
cb_model.fit(train_X, train_y)

# RMSE on the held-out validation split
cb_home_depot_preds = cb_model.predict(val_X)
cb_val_mse = mean_squared_error(val_y, cb_home_depot_preds)
cb_val_rmse = math.sqrt(cb_val_mse)

print("Validation RMSE for Cat Boost: {}".format(cb_val_rmse))

# Model evaluation

Plotting the sorted differences between the predicted and the actual values

In [None]:
# Random forest: validation errors (prediction - actual), sorted ascending
plt.plot(np.sort(np.array(rf_home_depot_preds - val_y)))

In [None]:
# LightGBM: validation errors (prediction - actual), sorted ascending
plt.plot(np.sort(np.array(lgb_home_depot_preds - val_y)))

In [None]:
# CatBoost: validation errors (prediction - actual), sorted ascending
cb_diff = np.sort(np.array(cb_home_depot_preds - val_y))
plt.plot(cb_diff)

# Save submission

In [None]:
final_model = cb_model # best result

# Refit the chosen model on the full training set (train + validation rows)
# before predicting the test set.
final_model.fit(train[features], train['relevance'])
testX = question_test[features]
test_predict = final_model.predict(testX)

answer = pd.DataFrame(data={'id': question_test['id'], 'relevance': test_predict})
# A regressor can overshoot the score range on either side (e.g. 3.01 or 0.98),
# so clamp both ends. NOTE(review): assumes the valid relevance scale is 1 to 3,
# per the competition description - confirm against train['relevance'].describe().
answer['relevance'] = answer['relevance'].clip(lower=1, upper=3)

answer.to_csv('submission.csv', index=False)