In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Load the three catalogue tables (products, aisles, departments) from CSV
# files located in the current working directory.
products, aisles, departments = map(
    pd.read_csv,
    ('products.csv', 'aisles.csv', 'departments.csv'),
)

In [6]:
# Preview the first rows of the products table.
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [8]:
# Preview the first rows of the aisles lookup table.
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [11]:
# Preview the first rows of the departments lookup table.
departments.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [15]:
# Attach the aisle and department names to every product by inner-joining the
# lookup tables on their id columns.
product_desc = (
    products
    .merge(aisles, on='aisle_id', how='inner')
    .merge(departments, on='department_id', how='inner')
)

In [16]:
# Preview the merged table: each product row now carries its aisle and department names.
product_desc.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,cookies cakes,snacks
2,102,Danish Butter Cookies,61,19,cookies cakes,snacks
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,cookies cakes,snacks
4,285,Mini Nilla Wafers Munch Pack,61,19,cookies cakes,snacks


In [17]:
# Build one searchable text field per product: aisle, department and product
# name joined by single spaces. The columns are concatenated element-wise
# (vectorized) rather than through a row-wise apply(axis=1), which would call a
# Python lambda once for every row of the table.
product_desc['metadata'] = (
    product_desc['aisle']
    + ' ' + product_desc['department']
    + ' ' + product_desc['product_name']
)

In [18]:
# Preview the table again to check the new 'metadata' column.
product_desc.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department,metadata
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks,cookies cakes snacks Chocolate Sandwich Cookies
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,cookies cakes,snacks,cookies cakes snacks Nutter Butter Cookie Bite...
2,102,Danish Butter Cookies,61,19,cookies cakes,snacks,cookies cakes snacks Danish Butter Cookies
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,cookies cakes,snacks,cookies cakes snacks Gluten Free All Natural C...
4,285,Mini Nilla Wafers Munch Pack,61,19,cookies cakes,snacks,cookies cakes snacks Mini Nilla Wafers Munch Pack


In [19]:
# Bag-of-words model over the 'metadata' text, using scikit-learn's built-in
# English stop-word list.
count_vec = CountVectorizer(stop_words='english')
# Learn the vocabulary and build the product-by-term count matrix in one step;
# the fitted `count_vec` is reused later to vectorize search queries.
count_vec_matrix = count_vec.fit_transform(product_desc['metadata'])

In [21]:
# Dimensions of the count matrix: (number of products, number of vocabulary terms).
count_vec_matrix.shape

(49688, 10624)

In [24]:
# Free-text product search: the query is vectorized with the fitted
# CountVectorizer and compared against every row of count_vec_matrix.

def metadata_search_engine(product_input, top_n=10):
    """Return the names of the products most similar to a free-text query.

    The query is transformed with the already-fitted ``count_vec`` and scored
    against every row of ``count_vec_matrix`` using cosine similarity. Only
    products with a strictly positive score are eligible.

    Parameters
    ----------
    product_input : str
        Search terms, e.g. ``'Oreo'``.
    top_n : int, default 10
        Maximum number of products to return.

    Returns
    -------
    pandas.Series or None
        ``product_name`` values of the best matches, ordered by descending
        similarity and indexed by their ``product_desc`` labels. If nothing
        matches, a message is printed and ``None`` is returned.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError('top_n must be at least 1')

    # transform() expects an iterable of documents, so wrap the query in a Series.
    query_vec = count_vec.transform(pd.Series(product_input))

    # cosine_similarity returns a (1, n_products) array; flatten it instead of
    # reshaping to a hard-coded catalogue size so the function keeps working
    # when the product table changes.
    scores = cosine_similarity(query_vec, count_vec_matrix).ravel()
    similarity = pd.Series(scores, index=product_desc.index, name='score')

    match_count = int((similarity > 0).sum())
    if match_count == 0:
        print('No similar products found.  Please refine your search terms and try again')
        return None

    # Never return zero-score rows: cap the result size at the number of matches.
    item_count = min(top_n, match_count)
    best = similarity.sort_values(ascending=False).head(item_count)

    # `best.index` holds index *labels* of product_desc, so look them up with
    # label-based .loc rather than positional .iloc.
    return product_desc['product_name'].loc[best.index]

In [25]:
# Example search with the single keyword 'Oreo'.
metadata_search_engine('Oreo')

582                           Spring Oreo
206                       Peppermint Oreo
514                      Mini Oreo Go Pak
790                     Golden Oreo Thins
43233                 Chocolate Chip Oreo
554             Mini Oreo Golden Snak Sak
273      Coconut Delight Creme Fudge Oreo
43395            Oreo Chocolate Candy Bar
48811                 Oreo Filled Cupcake
4511       Oreo Cookie Sticks & Creme Dip
Name: product_name, dtype: object

In [26]:
# Example search with the single keyword 'Biscuits'.
metadata_search_engine('Biscuits')

3451                                          Tea Biscuits
137                                       HobNobs Biscuits
480                                  The Original Biscuits
233                                       Caramel Biscuits
767                                        Butter Biscuits
43470                                             Biscuits
766                                  Biscuits, Gingerspice
37256    Flavor Snacks Dog Biscuits - For Small/Medium ...
530                                 European Biscuits Dark
140                            The Original Wheat Biscuits
Name: product_name, dtype: object

In [27]:
# Example search with a lower-case keyword; the output below shows it still
# matches product names containing 'Milk'.
metadata_search_engine('milk')

29312                                           Milk
29256                                     Milk Whole
29367                                        1% Milk
29307                                        2% Milk
29257                                     Whole Milk
29431     Milk Magic Chocolate Milk Flavoring Straws
29349    Milk Magic Strawberry Milk Flavoring Straws
29276                                 2 Percent Milk
29377                           Vitamin D Whole Milk
29428                                    Nonfat Milk
Name: product_name, dtype: object