In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_colwidth', None)
import warnings
warnings.simplefilter("ignore", category=Warning)

## Open Data

In [3]:
df = pd.read_excel('database\perfume_database.xlsx', 
                   usecols=['brand', 'perfume', 'notes'])

In [4]:
# Drop perfumes with no notes
df = df[df['notes'].notna()]
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,brand,perfume,notes
0,18 21 Man Made,Sweet Tobacco Spirits,"[""Citruses"", ""Saffron"", ""Tonka Bean"", ""Vanilla"", ""Exotic Fruits"", ""Red wine"", ""Musk"", ""Woodsy Notes""]"
1,40 Notes Perfume,Cashmere Musk,"[""Sandalwood"", ""Cedar"", ""White Musk"", ""Cashmere Wood""]"
2,40 Notes Perfume,Exotic Ylang Ylang,"[""Ylang-Ylang"", ""Gardenia"", ""Musk""]"
3,40 Notes Perfume,Exquisite Amber,"[""Labdanum"", ""Styrax"", ""Benzoin"", ""Vanilla"", ""Musk""]"
4,40 Notes Perfume,Oudwood Veil,"[""Kephalis"", ""Agarwood (Oud)""]"
...,...,...,...
36964,Urban Rituelle,Lemongrass Blend,"[""Lemongrass"", ""Myrtle"", ""Grapefruit"", ""Eucalyptus""]"
36965,Urban Rituelle,Peach Blossom,"[""Peach"", ""Honey"", ""Sweet Pea"", ""Mimosa""]"
36966,Urban Rituelle,Pomegranate,"[""Pomegranate"", ""Citruses"", ""Red Berries""]"
36967,Urban Rituelle,Vanilla,"[""Vanilla"", ""Caramel"", ""Milk""]"


## Clean Data

In [5]:
corpus = pd.DataFrame(df['notes'])

In [6]:
itens_to_remove = [
    '[', ']', '"', '{', '}',
    'middle: ', 'top: ', 'base: ', 'null'
]
def remove_items(text):
    for item in itens_to_remove:
        text = text.replace(item, "")
    return text

In [7]:
# Cleaning text
corpus['notes'] = corpus['notes'].astype(str)
corpus['notes'] = corpus['notes'].str.lower()
corpus['notes'] = corpus['notes'].apply(remove_items)

In [8]:
df['notes'] = corpus['notes']
df.head()

Unnamed: 0,brand,perfume,notes
0,18 21 Man Made,Sweet Tobacco Spirits,"citruses, saffron, tonka bean, vanilla, exotic fruits, red wine, musk, woodsy notes"
1,40 Notes Perfume,Cashmere Musk,"sandalwood, cedar, white musk, cashmere wood"
2,40 Notes Perfume,Exotic Ylang Ylang,"ylang-ylang, gardenia, musk"
3,40 Notes Perfume,Exquisite Amber,"labdanum, styrax, benzoin, vanilla, musk"
4,40 Notes Perfume,Oudwood Veil,"kephalis, agarwood (oud)"


## Vectorize Data

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
def custom_tokenizer(text):
    return text.split(',')

In [11]:
count_vectorizer = CountVectorizer(tokenizer=custom_tokenizer)

In [12]:
bag_of_words = count_vectorizer.fit_transform(corpus.notes)

In [13]:
bag_of_words.shape

(36969, 2145)

## Calculate similarity

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
import pickle, os, sys

In [15]:
similarity_matrix_sparse = cosine_similarity(bag_of_words, dense_output=False)

In [16]:
similarity_matrix_sparse.shape

(36969, 36969)

In [17]:
type(similarity_matrix_sparse)

scipy.sparse._csr.csr_matrix

In [18]:
max_values = 6
num_rows = similarity_matrix_sparse.shape[0]

top_index = []
top_values = []

for index in range(num_rows):
    if index%5000 == 0 and index !=0: print(f'{index} calculated values')
    if index+1 == num_rows: print(f'{index} calculated values')
    perfume_search = similarity_matrix_sparse.getrow(index)
    top_similarity = np.argsort(perfume_search.data)[-max_values:][::-1]
    top_index.append(perfume_search.indices[top_similarity])
    top_values.append(perfume_search.data[top_similarity])

5000 calculated values
10000 calculated values
15000 calculated values
20000 calculated values
25000 calculated values
30000 calculated values
35000 calculated values
36968 calculated values


In [26]:
def filter_by_perfume(selected_perfume):
    perfume_index = df.query(f"perfume=='{selected_perfume}'").index[0]
    df_filter = df.iloc[top_index[perfume_index], :].reset_index(drop=True)
    df_filter['similarity'] = top_values[perfume_index]
    df_filter['similarity'] = df_filter['similarity'].map("{:.2%}".format)
    df_filter['notes'] = df_filter['notes'].str.split(',').apply(sorted, 1).str.join(',').str.strip(',')
    df_filter = df_filter[['brand', 'perfume', 'similarity', 'notes']]
    
    return df_filter

In [27]:
filter_by_perfume('Boss Soul')

Unnamed: 0,brand,perfume,similarity,notes
0,Hugo Boss,Boss Soul,100.00%,"amber, anise, bergamot, cardamom, cinnamon, coriander, mandarin orange, musk, nutmeg, pepper, tonka bean, vanilla, vetiver,lavender"
1,Shirley May,Compass,80.58%,"amber, bergamot, cardamom, coriander, mandarin orange, musk, nutmeg, pepper, tonka bean, vetiver,cinnamon"
2,Carolina Herrera,212 Men White,64.47%,"bergamot, cardamom, coriander, mandarin orange, musk, pepper, sandalwood, tobacco, vetiver, virginia cedar,lavender"
3,Kristiansand New York,Kristiansand,64.29%,"amber, cinnamon, clary sage, fig, mandarin orange, musk, nutmeg, oak moss, pepper, plum, tonka bean, vetiver, virginia cedar,lavender"
4,Lobogal,Naceo Noir,62.36%,"amber, bergamot, cinnamon, coriander, myrrh, nutmeg, vanilla, vetiver,sandalwood"
5,Benetton,Let s Move,62.36%,"amber, bergamot, cedar, geranium, mandarin orange, pepper, tonka bean, vanilla,lavender"


## App

In [22]:
import gradio as gr

In [28]:
# V2.0
brand_options = list(df['brand'].unique())
perfume_options = list(df['perfume'].unique())

def perfume_change(brand):
    names = list(df.query(f"brand=='{brand}'")['perfume'])
    return gr.update(choices=names, value=None)

with gr.Blocks() as demo:
    with gr.Row():
        brand_dropdown = gr.Dropdown(choices=brand_options, value='Hugo Boss', label="Brand")
        perfume_dropdown = gr.Dropdown(choices=perfume_options, value='Boss Soul', label="Perfume Name")
        brand_dropdown.change(fn=perfume_change, inputs=[brand_dropdown], outputs=[perfume_dropdown])
        btn = gr.Button(value="Search similar", scale=.1)
        
    with gr.Row():
        output_df = gr.outputs.Dataframe(type='pandas')
        btn.click(filter_by_perfume, inputs=[perfume_dropdown], outputs=[output_df])
        
demo.launch()

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


