In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Open Data

In [2]:
df = pd.read_excel('database\perfume_database.xlsx', 
                   usecols=['brand', 'perfume', 'notes'])

In [3]:
# Drop perfumes with no notes
df = df[df['notes'].notna()]
df

Unnamed: 0,brand,perfume,notes
0,18 21 Man Made,Sweet Tobacco Spirits,"[""Citruses"", ""Saffron"", ""Tonka Bean"", ""Vanilla..."
1,40 Notes Perfume,Cashmere Musk,"[""Sandalwood"", ""Cedar"", ""White Musk"", ""Cashmer..."
2,40 Notes Perfume,Exotic Ylang Ylang,"[""Ylang-Ylang"", ""Gardenia"", ""Musk""]"
3,40 Notes Perfume,Exquisite Amber,"[""Labdanum"", ""Styrax"", ""Benzoin"", ""Vanilla"", ""..."
4,40 Notes Perfume,Oudwood Veil,"[""Kephalis"", ""Agarwood (Oud)""]"
...,...,...,...
37921,Urban Rituelle,Lemongrass Blend,"[""Lemongrass"", ""Myrtle"", ""Grapefruit"", ""Eucaly..."
37922,Urban Rituelle,Peach Blossom,"[""Peach"", ""Honey"", ""Sweet Pea"", ""Mimosa""]"
37923,Urban Rituelle,Pomegranate,"[""Pomegranate"", ""Citruses"", ""Red Berries""]"
37924,Urban Rituelle,Vanilla,"[""Vanilla"", ""Caramel"", ""Milk""]"


In [4]:
corpus = pd.DataFrame(df['notes'])
corpus.head(10)

Unnamed: 0,notes
0,"[""Citruses"", ""Saffron"", ""Tonka Bean"", ""Vanilla..."
1,"[""Sandalwood"", ""Cedar"", ""White Musk"", ""Cashmer..."
2,"[""Ylang-Ylang"", ""Gardenia"", ""Musk""]"
3,"[""Labdanum"", ""Styrax"", ""Benzoin"", ""Vanilla"", ""..."
4,"[""Kephalis"", ""Agarwood (Oud)""]"
5,"[""Green Notes"", ""Jasmine"", ""Tuberose"", ""Honeys..."
6,"[""Grapefruit"", ""Black Currant"", ""Honeysuckle"",..."
7,"[""Orange Blossom"", ""Neroli"", ""White Musk""]"
8,"{""middle"": [""Woodsy Notes"", ""Coriander"", ""Nutm..."
9,"{""middle"": [""Damask Rose"", ""Rose""], ""base"": [""..."


## Clean Data

In [5]:
itens_to_remove = [
    '[', ']', '"', '{', '}',
    'middle: ', 'top: ', 'base: ', 'null'
]
def remove_items(text):
    for item in itens_to_remove:
        text = text.replace(item, "")
    return text

In [6]:
# Cleaning text
corpus['notes'] = corpus['notes'].astype(str)
corpus['notes'] = corpus['notes'].str.lower()
corpus['notes'] = corpus['notes'].apply(remove_items)
corpus.head(10)

Unnamed: 0,notes
0,"citruses, saffron, tonka bean, vanilla, exotic..."
1,"sandalwood, cedar, white musk, cashmere wood"
2,"ylang-ylang, gardenia, musk"
3,"labdanum, styrax, benzoin, vanilla, musk"
4,"kephalis, agarwood (oud)"
5,"green notes, jasmine, tuberose, honeysuckle"
6,"grapefruit, black currant, honeysuckle, orchid..."
7,"orange blossom, neroli, white musk"
8,"woodsy notes, coriander, nutmeg, patchouli, oa..."
9,"damask rose, rose, amber, ginger, apricot, cle..."


## Vectorize Data

In [9]:
def custom_tokenizer(text):
    return text.split(',')

In [10]:
count_vectorizer = CountVectorizer(tokenizer=custom_tokenizer)

In [51]:
bag_of_words = count_vectorizer.fit_transform(corpus.notes)

In [52]:
bag_of_words.shape

(36969, 2145)

## Calculate similarity

In [None]:
#similarity_matrix = cosine_similarity(bag_of_words)

In [None]:
similarity_matrix.shape