In [34]:
import numpy as np
import pandas as pd
import random
import xgboost as xgb
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [35]:
hackathon_data = pd.read_excel('hackathon_data.xlsx')
hackathon_data['brand'] = hackathon_data['brand'].astype('category')
hackathon_data.shape

(24799, 26)

In [36]:
meta_dict = pd.DataFrame(hackathon_data.columns.to_list(), columns = ['Colname'])

# Adding metrics such as descriptions, missing rate, number of levels & data type of columns taken from the data dictionary
for idx, cols in enumerate(meta_dict.Colname):
    meta_dict.loc[idx, 'Missing_%'] = round(100*hackathon_data[cols].isna().sum()/hackathon_data.shape[0],2)
    meta_dict.loc[idx, 'Levels'] = hackathon_data[cols].nunique()
    meta_dict.loc[idx, 'Datatype'] = hackathon_data[cols].dtype

meta_dict

Unnamed: 0,Colname,Missing_%,Levels,Datatype
0,id,0.0,24799.0,int64
1,brand,0.05,190.0,category
2,bullet_point,7.73,17997.0,object
3,color,50.47,1399.0,object
4,item_id,0.0,24726.0,object
5,item_name,0.0,22041.0,object
6,model_name,89.1,1364.0,object
7,model_number,39.01,12706.0,object
8,model_year,0.0,11.0,int64
9,product_type,0.0,391.0,object


In [37]:
# we will consider a subset of features that contains enough hit-rates & enough levels
meta_dict[(meta_dict['Missing_%'] <= 50) &(meta_dict.Levels > 5)]

Unnamed: 0,Colname,Missing_%,Levels,Datatype
0,id,0.0,24799.0,int64
1,brand,0.05,190.0,category
2,bullet_point,7.73,17997.0,object
4,item_id,0.0,24726.0,object
5,item_name,0.0,22041.0,object
7,model_number,39.01,12706.0,object
8,model_year,0.0,11.0,int64
9,product_type,0.0,391.0,object
11,item_keywords,17.22,13027.0,object
13,marketplace,0.0,6.0,object


In [38]:
hackathon_data.groupby('product_type').count()['id'].sort_values(ascending=False)

product_type
GROCERY                 5619
SHOES                   1507
CHAIR                   1421
RUG                      817
SOFA                     777
                        ... 
KITCHEN_KNIFE              1
KNIFE_BLOCK_SET            1
LIP_COLOR                  1
MANUAL_SHAVING_RAZOR       1
KITCHEN_TOOLS              1
Name: id, Length: 391, dtype: int64

In [39]:
hackathon_data.groupby('product_category').count()['id'].sort_values(ascending=False)

product_category
GROCERY_SNACK_BEVERAGE       7595
FURNITURE                    4031
SHOES_CLOTHING_PERSONAL      2280
HOME                         1506
HOME_DECOR                   1310
MEDICATION_PERSONAL_CARE     1122
JEWELRY                      1006
LIGHTING                      983
BED_BATH                      871
OFFICE_STUDY                  432
KITCHEN                       391
OUTDOOR                       384
SUPPLEMENT                    358
BEAUTY                        347
TOOLS_HARDWARE_HANDYMAN       298
CE_ACCESSORY                  212
GARDENING                     206
JANITORIAL_CLEANING           163
HOME_ELECTRONIC_APPLIANCE     162
SPORTS_GYM                    158
PHONE_ACCESSORY               155
PET                           154
BAG_LUGGAGE                   137
COMPUTER_ACCESSORY            130
AUTO_ACCESSORY                103
ORGANIZER_STORAGE              98
SAFETY_SECURITY                72
MUSICAL_INSTRUMENT             37
GAMING_HOBBY                   

In [40]:
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
import re

punctuations_removal = str.maketrans('','', string.punctuation) # creating a mapping to remove punctuations
stops = stopwords.words('english') # creating a list of stop words in English

def clean_text(input_text):
    
    if type(input_text) == float:
        return ""
    # punctuations_removal = str.maketrans('','', string.punctuation) # creating a mapping to remove punctuations
    # stops = stopwords.words('english') # creating a list of stop words in English
    temp = input_text.lower() # converting to lowercase
    temp = temp = re.sub(r'http\S+', '', temp) #removing links
    
    tokens = word_tokenize(temp) # converting strings to list of word
    stripped_tokens = [words.translate(punctuations_removal) for words in tokens] # removing punctuations
    stripped_tokens = [word for word in stripped_tokens if word.isalpha()]# removing non-alphanumeric words 
    stripped_tokens = [lemmatizer.lemmatize(word) for word in stripped_tokens] # lemmatizing
    stripped_tokens = [word for word in stripped_tokens if word not in stops] # removing stop words  
    return ' '.join(stripped_tokens) # joining words to a text list

In [41]:
relevant_cols = ['item_id', 'item_name', 'brand', 'model_year', 'product_type', 'item_keywords', 'marketplace', 'product_category', 'bullet_point']
clean_df = hackathon_data[relevant_cols]
# applying the cleaner function on the dataset
clean_df['unclean_text'] = clean_df['item_keywords']  
clean_df['final_text'] = clean_df.apply(lambda x: clean_text(x['unclean_text']), axis = 1)
clean_df = clean_df[(clean_df.final_text != '') & (clean_df.marketplace != ' TDS')].reset_index().drop(['index'], axis = 1)

In [42]:
clean_df[['unclean_text', 'final_text']]

Unnamed: 0,unclean_text,final_text
0,"'double', 'crystal'",double crystal
1,"'freeze', 'plate', 'excluder', '80x40', 'balle...",freeze plate excluder ballerina owl hunting mi...
2,"'tripods', 'quick', 'mount', 'stands', 'stage'...",tripod quick mount stand stage proline system ...
3,"'zapatos shoe para de ladies mujer womans', 'd...",zapatos shoe para de lady mujer woman designer...
4,"'villeroy', 'crystal', 'shot', 'wine', 'coors'...",villeroy crystal shot wine coors light glass b...
...,...,...
20518,"'monoprice', 'accessories', 'cutter', 'pen'",monoprice accessory cutter pen
20519,"'whole foods, whole food, Whole Foods,365 Ever...",whole food whole food whole everyday value
20520,"'apple', 'MiFi', 'charging', '5', 'charge', 'm...",apple mifi charging charge mac ipod accessory ...
20521,"'energy efficient', 'amazon basics', 'A21', 'L...",energy efficient amazon basic led energy star ...


In [49]:
## IMPORTING WORD2VEC MODEL
import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# #Sentences are encoded by calling model.encode()
# keyword_vector = model.encode(clean_df.final_text)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def word_embedding(X, method, use_idf = True):
    
    if method == 'tfidf':
        vectorizer = TfidfVectorizer()
        vectorizer.fit(X)
        embedded_vectors = vectorizer.transform(X).toarray()
        return embedded_vectors

    elif method == 'count':
        vectorizer = CountVectorizer()
        vectorizer.fit(X)
        embedded_vectors = vectorizer.transform(X)
        return embedded_vectors

    elif method == 'wv':
        wv = api.load('word2vec-google-news-300')
        tf = TfidfVectorizer(use_idf=True)
        tf.fit_transform(X)
        # whether we should use IDF to weight it
        if use_idf:
            embeddings = clean_df['final_text'].apply(lambda text: np.mean([wv[word] for word in text.split() if word in wv], axis = 0)).reset_index()['final_text']
        else:
            embeddings = clean_df['final_text'].apply(lambda text: np.mean([wv[word]*tf.idf_[tf.vocabulary_[word]] for word in text.split() if word in wv], axis = 0)).reset_index()['final_text']

        embedded_vectors = np.zeros((len(embeddings),300))
        for i in range(embeddings.shape[0]):
            embedded_vectors[i] = embeddings[i]
        return embedded_vectors
    
    elif method == 'distilbert':
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        return model.encode(X)
    else:
        raise ValueError()

In [72]:
bert_keyword_vector = word_embedding(clean_df["final_text"], 'distilbert')
wv_keyword_vector = word_embedding(clean_df["final_text"], 'wv')

In [73]:
from sklearn.preprocessing import OneHotEncoder
ohe_vectors = {}
for colname in ['product_type', 'marketplace', 'product_category']:
    vector_name = colname + '_vector'
    print(vector_name)
    ohe = OneHotEncoder(categories='auto',  # Categories per feature
        drop='first', # Whether to drop one of the features
        sparse=False, # Will return sparse matrix if set True
        handle_unknown='error' # Whether to raise an error
        )
    transformed = ohe.fit_transform(clean_df[[colname]])
    ohe_vectors[vector_name] = transformed

product_type_vector
marketplace_vector
product_category_vector


In [74]:
print(bert_keyword_vector.shape)
print(wv_keyword_vector.shape)
print(ohe_vectors['product_type_vector'].shape)
print(ohe_vectors['marketplace_vector'].shape)
print(ohe_vectors['product_category_vector'].shape)


(20523, 384)
(20523, 300)
(20523, 375)
(20523, 5)
(20523, 32)


In [75]:
w1, w2, w3, w4 = 0.5, 0, 0.25, 0.25
final_array_bert = np.concatenate((w1 * bert_keyword_vector, #, w2 * ohe_vectors['product_type_vector']
    w3 * ohe_vectors['marketplace_vector'], w4 * ohe_vectors['product_category_vector']), axis = 1)
final_array_wv = np.concatenate((w1 * wv_keyword_vector, #, w2 * ohe_vectors['product_type_vector']
    w3 * ohe_vectors['marketplace_vector'], w4 * ohe_vectors['product_category_vector']), axis = 1)

In [82]:
from scipy.spatial.distance import cosine as cosine_distance
  
def get_top_recommendations(product_id, original_df = clean_df, numerical_array = final_array_bert, n_rows = 10):
    """Returns the top 5 most similar movies to a specified movie """
    
    # Get the index
    idx = original_df[original_df.item_id == product_id].index.tolist()[0]
    # Get the corresponding vector
    product_vector = numerical_array[idx]
    product_name = original_df[original_df.item_id == product_id].item_name.tolist()[0]

    # initilising lists for similarity table
    similarity_score_list = []
    product_id_list = []
    product_name_list = []
    # Iterate over every possible movie and calculate similarity
    for other_product_id in original_df.item_id.to_list():
        other_product_idx = original_df[original_df.item_id == other_product_id].index.tolist()[0]
        # Get the second product vector, and calculate distance
        other_product_vector = numerical_array[other_product_idx]
        similarity_score = 1- cosine_distance(other_product_vector, product_vector)
        product_id_list.append(other_product_id)
        other_product_name = original_df[original_df.item_id == other_product_id].item_name.tolist()[0]
        product_name_list.append(other_product_name)
        similarity_score_list.append(similarity_score)
        
    # sort movies by ascending similarity
    similarity_table = pd.DataFrame({'product_id': product_id_list, 'product_name': product_name_list, 'cosine similarity': similarity_score_list}).sort_values('cosine similarity', ascending=False)
    similarity_table = similarity_table[similarity_table.product_name != product_name] #.drop_duplicates(subset = 'cosine similarity', keep = 'first')
    print(f'Printing top {n_rows} similar products to {product_name}')
    return similarity_table.iloc[0 :n_rows]

Bluetooth Speakers

In [77]:
pd.set_option('max_colwidth', 60)
get_top_recommendations(product_id = 'B00GUTY132', original_df = clean_df, numerical_array = final_array_bert)

Printing top 10 similar products to AmazonBasics Large Portable Bluetooth Speaker


Unnamed: 0,product_id,product_name,cosine similarity
14680,B01FJD4SB4,AmazonBasics Ëø∑‰Ω†‰æøÊê∫ÂºèËìùÁâôÈü≥ÁÆ± Á¥´Ëâ≤,0.737276
19100,B00JZVPGD6,AmazonBasics Wireless Bluetooth Dual 3W Speaker with Bui...,0.708503
11252,B01FJC752O,AmazonBasics ‰∫öÈ©¨ÈÄäÂÄçÊÄùÊó†Á∫øËìùÁâô 3W Êâ¨Â£∞Âô®,0.696139
19077,B01FJC8FXC,AmazonBasics ‰∫öÈ©¨ÈÄäÂÄçÊÄùÊó†Á∫øËìùÁâô 3W Êâ¨Â£∞Âô®,0.693452
18257,B01FJCEAWM,AmazonBasics ‰∫öÈ©¨ÈÄäÂÄçÊÄùÊó†Á∫øËìùÁâô 3W Êâ¨Â£∞Âô®,0.685978
15489,B01FJEEW1Y,AmazonBasics ÎßàÏù¥ÌÅ¨Î°ú Ïö∏Ìä∏Îùº Ìú¥ÎåÄÏö© Î∏îÎ£®Ìà¨Ï...,0.662078
15452,B00LLJ58ME,AmazonBasics ÎßàÏù¥ÌÅ¨Î°ú Ïö∏Ìä∏Îùº Ìú¥ÎåÄÏö© Î∏îÎ£®Ìà¨Ï...,0.661351
8734,B01FJFANUM,AmazonBasics ‰∫öÈ©¨ÈÄäÂÄçÊÄùÂ§ßÂûã‰æøÊê∫ÂºèËìùÁâôÈü≥ÁÆ±,0.65844
1024,B01FJEZSDU,AmazonBasics ÎßàÏù¥ÌÅ¨Î°ú Ïö∏Ìä∏Îùº Ìú¥ÎåÄÏö© Î∏îÎ£®Ìà¨Ï...,0.655093
3800,B01FJF3OJ4,AmazonBasics Micro ultra-portable ËìùÁâôÈü≥ÁÆ± ÁôΩËâ≤,0.654634


In [78]:
pd.set_option('max_colwidth', 60)
get_top_recommendations(product_id = 'B00GUTY132', original_df = clean_df, numerical_array = final_array_wv)

Printing top 10 similar products to AmazonBasics Large Portable Bluetooth Speaker


Unnamed: 0,product_id,product_name,cosine similarity
19100,B00JZVPGD6,AmazonBasics Wireless Bluetooth Dual 3W Speaker with Bui...,0.877607
12844,B08H8TVBCD,AmazonBasics Computer Speakers for Desktop or Laptop PC ...,0.817273
5214,B07BLDWLS2,AmazonBasics Altavoz Bluetooth inal√°mbrico port√°til (r...,0.816823
17306,B08H8G6KRL,AmazonBasics USB-Powered PC Computer Speakers with Dynam...,0.814994
19824,B00JZVPKE6,AmazonBasics Ëø∑‰Ω†‰æøÊê∫ÂºèËìùÁâôÈü≥ÁÆ± ÁÅ∞Ëâ≤,0.798256
7657,B08HKG694G,AmazonBasics AC Powered PC Multimedia External Speakers ...,0.78918
15452,B00LLJ58ME,AmazonBasics ÎßàÏù¥ÌÅ¨Î°ú Ïö∏Ìä∏Îùº Ìú¥ÎåÄÏö© Î∏îÎ£®Ìà¨Ï...,0.78634
9394,B07D7TV5J3,AmazonBasics ‰∫öÈ©¨ÈÄäÂÄçÊÄù ÁîµËÑëÊâ¨Â£∞Âô® ÈÄÇÁî®‰∫éÂè...,0.786159
14680,B01FJD4SB4,AmazonBasics Ëø∑‰Ω†‰æøÊê∫ÂºèËìùÁâôÈü≥ÁÆ± Á¥´Ëâ≤,0.768368
8734,B01FJFANUM,AmazonBasics ‰∫öÈ©¨ÈÄäÂÄçÊÄùÂ§ßÂûã‰æøÊê∫ÂºèËìùÁâôÈü≥ÁÆ±,0.759289


Milk

In [79]:
get_top_recommendations(product_id = 'B07W6ZQWH1', original_df = clean_df, numerical_array = bert_keyword_vector)

Printing top 10 similar products to Amazon Brand - Happy Belly 2% Reduced Fat Milk, Gallon, 128 Ounces


Unnamed: 0,product_id,product_name,cosine similarity
18127,B07W842VMV,"Amazon Brand - Happy Belly 2% Reduced Fat Milk, Half Gal...",0.986396
20044,B07WC9MMPD,"Amazon Brand - Happy Belly 1% Low Fat Milk, Gallon, 128 ...",0.960517
17142,B07W5Z8SJ8,"Amazon Brand - Happy Belly 1% Low Fat Milk, Half Gallon,...",0.944055
5109,B07W97F9DN,"Amazon Brand - Happy Belly 1% Low Fat Chocolate Milk, Ha...",0.926453
15191,B07WC9NK19,"Amazon Brand - Happy Belly Whole Milk, Half Gallon, 64 O...",0.91931
15778,B07WC9MMNN,"Amazon Brand - Happy Belly 1% Low Fat Chocolate Milk, Ga...",0.918151
5886,B075JYTT94,"Amazon Brand - Happy Belly 2% Reduced Fat Milk, Half Gal...",0.874792
4446,B075K1Z1YD,"Amazon Brand - Happy Belly 1% Low Fat Milk, Half Gallon,...",0.870134
11075,B07WLXNBZ8,"Amazon Brand - Happy Belly Fat Free Skim Milk, Gallon, 1...",0.864102
11856,B07W5Z8SHM,"Amazon Brand - Happy Belly Fat Free Skim Milk, Half Gall...",0.84124


In [80]:
get_top_recommendations(product_id = 'B07W6ZQWH1', original_df = clean_df, numerical_array = wv_keyword_vector)

Printing top 10 similar products to Amazon Brand - Happy Belly 2% Reduced Fat Milk, Gallon, 128 Ounces


Unnamed: 0,product_id,product_name,cosine similarity
18127,B07W842VMV,"Amazon Brand - Happy Belly 2% Reduced Fat Milk, Half Gal...",0.998289
15778,B07WC9MMNN,"Amazon Brand - Happy Belly 1% Low Fat Chocolate Milk, Ga...",0.985276
5109,B07W97F9DN,"Amazon Brand - Happy Belly 1% Low Fat Chocolate Milk, Ha...",0.984751
20044,B07WC9MMPD,"Amazon Brand - Happy Belly 1% Low Fat Milk, Gallon, 128 ...",0.984708
17142,B07W5Z8SJ8,"Amazon Brand - Happy Belly 1% Low Fat Milk, Half Gallon,...",0.984409
5350,B075JYTT8P,"Amazon Brand - Happy Belly 2% Reduced Fat Milk, Gallon, ...",0.964728
15793,B075JYTT9H,"Amazon Brand - Happy Belly 1% Low Fat Milk, Gallon, 128 ...",0.958159
16368,B07W97CVMS,"Amazon Brand - Happy Belly Whole Milk, Gallon, 128 Ounces",0.954634
15885,B07Z8LZZ5W,"Producers 2% Reduced Fat Milk, 32 oz., Pasteurized",0.938274
11075,B07WLXNBZ8,"Amazon Brand - Happy Belly Fat Free Skim Milk, Gallon, 1...",0.936253


In [81]:
get_top_recommendations(product_id = 'B07B4MRV2H', original_df = clean_df, numerical_array = wv_keyword_vector)

Printing top 10 similar products to ‰∫öÈ©¨ÈÄäÂìÅÁâå ‚Äì ÈìÜÈíâËæπÁºòÁé∞‰ª£‰ΩéËÉåÁöÆÈù©Â∫ßÊ§ÖÁ≥ªÂàó, ÁöÆÈù©, ÁÇ≠ÈªëËâ≤


Unnamed: 0,product_id,product_name,cosine similarity
7345,B07B4MG2MD,Stone & Beam Andover Ïä¨Î¶ΩÏª§Î≤Ñ ÏãúÌä∏ Ïª¨Î†âÏÖò,1.0
4174,B07F4PXFWZ,"HomePop Modern Swoop Arm Accent Chair, Gray Geometric Pa...",0.908407
13971,B07F4D1TDW,HomePop K6908-B269 Miller Modern Swoop Arm Velvet Accent...,0.896915
2590,B07FK69CKR,"Artum Hill Olivia Accent Chair, Espresso",0.77764
14245,B07D4F6ZKH,Red Hook Martina Contemporary Upholstered Armless Accent...,0.76318
18089,B07D4FFWC8,Red Hook Martina Contemporary Upholstered Armless Accent...,0.754189
10173,B07DBG25XB,Ravenna Home Justin Â∑•‰∏öÊ°åÁ≥ªÂàó,0.735987
9175,B07BWJMB9F,‰∫öÈ©¨ÈÄäÂìÅÁâå ‚Äì Stone & Beam Cheyanne Áé∞‰ª£ÂÆ¢ÂéÖË£...,0.735395
12351,B075YP336C,Î¶¨Î≤≥ Î†àÏù¥Ï≤º 2Í∞ú ÏÑ∏Ìä∏ Í≥µÍ∞Ñ ÏãúÎåÄ Î™®Îçò Ï§ëÎ∂Ä...,0.731301
12826,B075YV7V1P,Rivet Ray ‰∏≠‰∏ñÁ∫™Áé∞‰ª£Âé®ÊàøÈ§êÂéÖÊ§ÖÂ≠êÔºå2 ‰ª∂Â•óÔº...,0.728193


In [83]:
get_top_recommendations(product_id = 'B07B4MRV2H', original_df = clean_df, numerical_array = wv_keyword_vector)

Printing top 10 similar products to ‰∫öÈ©¨ÈÄäÂìÅÁâå ‚Äì ÈìÜÈíâËæπÁºòÁé∞‰ª£‰ΩéËÉåÁöÆÈù©Â∫ßÊ§ÖÁ≥ªÂàó, ÁöÆÈù©, ÁÇ≠ÈªëËâ≤


Unnamed: 0,product_id,product_name,cosine similarity
7345,B07B4MG2MD,Stone & Beam Andover Ïä¨Î¶ΩÏª§Î≤Ñ ÏãúÌä∏ Ïª¨Î†âÏÖò,1.0
16338,B07B4MTFSC,ÏïÑÎßàÏ°¥ Î∏åÎûúÎìú - Ïä§ÌÜ§ & Îπî Ïï§ÎèÑÎ≤Ñ Ï¢åÏÑù Ïª¨Î...,1.0
6826,B07B4MPXJP,ÏïÑÎßàÏ°¥ Î∏åÎûúÎìú - Ïä§ÌÜ§ Ïï§ Îπî Ïï§ÎèÑÎ≤Ñ Î™®Îçò Ïã...,1.0
9564,B07B4MRKQG,‰∫öÈ©¨ÈÄäÂìÅÁâå ‚Äì ÈìÜÈíâËæπÁºòÁé∞‰ª£‰ΩéËÉåÁöÆÈù©Â∫ßÊ§Ö...,1.0
16021,B07B4MDK2G,ÏïÑÎßàÏ°¥ Î∏åÎûúÎìú - Ïä§ÌÜ§ & Îπî Ïï§ÎèÑÎ≤Ñ Ï¢åÏÑù Ïª¨Î...,1.0
11730,B07B4GXXDV,Amazon Brand - Stone & Beam Bradbury Chesterfield Á∞áÁªí...,1.0
7974,B07B4CZP32,ÏïÑÎßàÏ°¥ Î∏åÎûúÎìú - Ïä§ÌÜ§ & Îπî Ïï§ÎèÑÎ≤Ñ Ï¢åÏÑù Ïª¨Î...,1.0
2163,B07B4MFR9Q,Stone & Beam Andover Ïä¨Î¶ΩÏª§Î≤Ñ ÏãúÌä∏ Ïª¨Î†âÏÖò,1.0
20451,B07B4MH6L6,Stone & Beam Andover Ïä¨Î¶ΩÏª§Î≤Ñ ÏãúÌä∏ Ïª¨Î†âÏÖò,1.0
16910,B07B4D46TB,ÏïÑÎßàÏ°¥ Î∏åÎûúÎìú ‚Äì Ïä§ÌÜ§ & Îπî Ïï§ÎèÑÎ≤Ñ Ïä¨Î¶ΩÏª§...,1.0
